In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import re
from scipy.spatial.distance import cdist

In [None]:
# Single place to change if the competition data is mounted elsewhere.
DATA_DIR = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"

# Competition tables, each read from its CSV under DATA_DIR.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
misconception_mapping = pd.read_csv(f"{DATA_DIR}/misconception_mapping.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")


In [None]:
def preprocess_text(text):
    """Reduce ``text`` to lowercase ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace (digits,
    punctuation, symbols, ...) is deleted, then whitespace runs are collapsed
    and leading/trailing whitespace is removed.

    Args:
        text: Input string.

    Returns:
        The cleaned string (possibly empty).
    """
    # Lowercase first, then keep only letters and whitespace.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # split() with no argument drops empty pieces, so joining normalises spacing.
    return ' '.join(letters_only.split())


def make_all_question_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``all_question_text`` column built from the question fields.

    The column holds ``ConstructName`` and ``QuestionText`` joined by a single
    space and cleaned with ``preprocess_text``.

    Args:
        df: Frame with ``ConstructName`` and ``QuestionText`` columns.

    Returns:
        The same frame object, modified in place with the new column.
    """
    combined = df["ConstructName"] + " " + df["QuestionText"]
    df["all_question_text"] = combined.apply(preprocess_text)
    return df

# Only the test set gets the combined question text; the equivalent call for
# `train` is left disabled.
# train = make_all_question_text(train)
test = make_all_question_text(test)
# Bare expression so the notebook renders the frame as the cell output.
test

In [None]:
def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Unpivot the four answer-text columns into one row per answer option.

    Args:
        df: Frame with ``QuestionId``, ``all_question_text``, ``CorrectAnswer``
            and ``AnswerAText`` .. ``AnswerDText`` columns.

    Returns:
        A new frame with the three identifier columns plus ``Answer`` (the
        name of the source answer column) and ``value`` (that column's
        content).
    """
    id_columns = ["QuestionId", "all_question_text", "CorrectAnswer"]
    answer_columns = ["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]

    # Restrict to the needed columns first so melt unpivots only the answers.
    selected = df[id_columns + answer_columns]
    return selected.melt(
        id_vars=id_columns,
        var_name='Answer',
        value_name='value',
    )

# One row per (question, answer option) for the test set.
test_long = wide_to_long(test)
# Bare expression so the notebook renders the frame as the cell output.
test_long

In [None]:

def make_all_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``all_text`` column joining question text and cleaned answer text.

    For each row the available parts (``all_question_text`` and the
    ``preprocess_text``-cleaned ``value``) are joined with a single space.
    Missing parts are skipped rather than rendered as ``"nan"``.

    Args:
        df: Frame containing ``all_question_text`` and/or ``value`` columns.

    Returns:
        The same frame object, modified in place with the new column.
    """
    text_components = []
    if "all_question_text" in df.columns:
        text_components.append(df["all_question_text"])
    if "value" in df.columns:
        # na_action="ignore" keeps missing answers as NaN instead of handing
        # them to preprocess_text, which calls .lower() and would raise on a
        # float NaN. The dropna() below then leaves them out of the join.
        text_components.append(df["value"].map(preprocess_text, na_action="ignore"))

    df["all_text"] = pd.concat(text_components, axis=1).apply(
        lambda row: ' '.join(row.dropna().astype(str)), axis=1
    )
    return df

# Attach the combined text, then order rows by question and answer option and
# renumber the index from zero.
test_long = (
    make_all_text(test_long)
    .sort_values(["QuestionId", "Answer"])
    .reset_index(drop=True)
)
test_long


In [None]:
# NOTE(review): same sort + index reset as the last statement of the previous
# cell; this looks redundant when the cells are run in order.
test_long = test_long.sort_values(["QuestionId", "Answer"]).reset_index(drop=True)
test_long

In [None]:
# Spot-check one assembled text: the row at position 3.
test_long["all_text"].iloc[3]

In [None]:
# Misconception names from the mapping table.
# NOTE(review): `labels` is not referenced again in the visible cells - confirm it is still needed.
labels = misconception_mapping['MisconceptionName'].values

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Local checkpoint directory holding both the tokenizer and the encoder
# weights (attached as a Kaggle input, not downloaded from the Hub).
MODEL_DIR = '/kaggle/input/fine-tuning-bge-train-version9/checkpoint-1668'

# Use the first GPU when one is available; otherwise fall back to CPU so the
# notebook still runs (slowly) instead of failing on model.to(device).
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModel.from_pretrained(MODEL_DIR)
# Inference only: switch off training-time behaviour such as dropout.
model.eval()
model.to(device)
print("finish")

In [None]:
def preprocess_text1(x):
    """Normalise a string before it is tokenised for embedding.

    Steps, in order: lowercase; delete ``@``-prefixed tokens; delete digit
    runs (together with a directly preceding apostrophe); delete ``http``
    followed by word characters; collapse whitespace runs to one space;
    collapse repeated periods and repeated commas to a single character;
    strip leading and trailing whitespace.

    Args:
        x: Input string.

    Returns:
        The cleaned string.
    """
    # Patterns are raw strings so that ``\w`` / ``\d`` reach the regex engine
    # without relying on Python tolerating invalid string escapes.
    # Convert words to lowercase
    x = x.lower()
    # Delete strings starting with @
    x = re.sub(r"@\w+", '', x)
    # Delete numbers; the first pattern also swallows a leading apostrophe
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    # Delete "http" plus the word characters directly after it.
    # NOTE(review): this stops at the first non-word character, so the rest
    # of a URL such as "://host/path" is left in place - confirm intended.
    x = re.sub(r"http\w+", '', x)
    # Replace consecutive whitespace with a single space character
    x = re.sub(r"\s+", " ", x)
    # Replace consecutive periods / commas with a single period / comma
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    # Remove whitespace at the beginning and end
    return x.strip()

In [None]:
def generate_embeddings(texts, model, tokenizer, device, batch_size=8):
    """Encode ``texts`` into L2-normalised embedding vectors.

    Each text is cleaned with ``preprocess_text1``, tokenised in batches of
    ``batch_size`` (padded, truncated to at most 1024 tokens) and passed
    through ``model`` without gradient tracking. The hidden state at sequence
    position 0 (the CLS token) is taken as the embedding and scaled to unit
    L2 norm. Mean pooling over the attention mask was an earlier alternative
    and is not used here.

    Args:
        texts: Iterable of input strings.
        model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer matching ``model``.
        device: Device the tokenised batch is moved to.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A numpy array with the per-batch embeddings concatenated along axis 0.
    """
    cleaned = [preprocess_text1(raw) for raw in texts]
    chunks = []
    for start in range(0, len(cleaned), batch_size):
        batch = cleaned[start:start + batch_size]
        encoded = tokenizer(
            batch,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=1024,
        ).to(device)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        cls_vectors = hidden_states[:, 0, :]  # CLS token
        unit_vectors = torch.nn.functional.normalize(cls_vectors, p=2, dim=1)
        chunks.append(unit_vectors.cpu().numpy())
    return np.concatenate(chunks, axis=0)

# Embed every (question, answer) text of the test set.
test_texts = test_long['all_text'].tolist()
all_text_vector = generate_embeddings(test_texts, model, tokenizer, device=device)

# Embed every misconception name with the same encoder.
MisconceptionName = misconception_mapping['MisconceptionName'].tolist()
all_ctx_vector = generate_embeddings(MisconceptionName, model, tokenizer, device=device)

In [None]:
def cdist_similarity(all_text_vector, all_ctx_vector, m):
    """Turn pairwise distances into similarity scores via ``1 / (1 + d)``.

    Args:
        all_text_vector: 2-D array of query vectors, one per row.
        all_ctx_vector: 2-D array of candidate vectors, one per row.
        m: Metric passed through to ``scipy.spatial.distance.cdist``.

    Returns:
        Array of shape (number of queries, number of candidates); larger
        values correspond to smaller distances.
    """
    pairwise = cdist(all_text_vector, all_ctx_vector, metric=m)
    return 1 / (pairwise + 1)

In [None]:
# Earlier variant, kept for reference: score by cosine similarity.
# test_cos_sim_arr = cosine_similarity(all_text_vector, all_ctx_vector)
# Score every (test row, misconception) pair as 1 / (1 + Euclidean distance).
# Despite its name, `test_cos_sim_arr` holds these distance-derived scores,
# not cosines.
test_cos_sim_arr = cdist_similarity(all_text_vector, all_ctx_vector, 'euclidean')
# Negating the scores makes argsort order each row from highest to lowest score.
test_sorted_indices = np.argsort(-test_cos_sim_arr, axis=1)

In [None]:
# Peek at the 25 best-ranked misconception positions for each row.
test_sorted_indices[:, :25]

In [None]:
# Number of ranked misconceptions written per (question, answer) row.
TOP_K = 25

# "AnswerAText" -> "A": recover the option letter from the melted column name.
test_long["Answer_alphabet"] = test_long["Answer"].str.extract(r'Answer([A-Z])Text$')
test_long["QuestionId_Answer"] = test_long["QuestionId"].astype("str") + "_" + test_long["Answer_alphabet"]

# test_sorted_indices holds row positions within misconception_mapping.
# Translate them to the table's MisconceptionId values when that column is
# present, so the output stays correct even if ids are not simply 0..N-1 in
# row order; otherwise fall back to the positions themselves.
top_k_positions = test_sorted_indices[:, :TOP_K]
if "MisconceptionId" in misconception_mapping.columns:
    top_k_ids = misconception_mapping["MisconceptionId"].to_numpy()[top_k_positions]
else:
    top_k_ids = top_k_positions
# Space-separated ids, best match first.
test_long["MisconceptionId"] = [' '.join(map(str, row)) for row in top_k_ids.tolist()]

# Keep only the incorrect answer options; the row for the correct answer is dropped.
test_long = test_long[test_long["CorrectAnswer"] != test_long["Answer_alphabet"]]
submission = test_long[["QuestionId_Answer", "MisconceptionId"]].reset_index(drop=True)

In [None]:
# Write the submission file without the index column.
submission.to_csv('/kaggle/working/submission.csv', index=False)