> A fork of https://www.kaggle.com/code/pshikk/similarity-preprocessing

> Training notebook at https://www.kaggle.com/code/abdullahmeda/eedi-train-finetune-bge-embedding-model

In [None]:
import re
import numpy as np
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModel

from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Directory of the embedding checkpoint; both the tokenizer and the model are
# loaded from this path.
embed_model_pth = '/kaggle/input/eedi-bge-large-en-v1-5-fintetuned-exp-1/eedi_model'

In [None]:
# Competition inputs: the test questions, the training questions (which carry
# the per-answer Misconception*Id label columns), and the misconception table
# (MisconceptionId / MisconceptionName).
test = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")
train = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv")
misconception_mapping = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")

In [None]:
# Label columns holding the misconception id attached to each answer option.
columns = [
    'MisconceptionAId',
    'MisconceptionBId',
    'MisconceptionCId',
    'MisconceptionDId',
]
# Stack the four label columns on top of each other into a single Series.
all_identifiers = pd.melt(train[columns])['value']

In [None]:
# Distinct, non-missing misconception ids that appear as labels in train,
# converted to a plain list of ints.
train_mc = all_identifiers.dropna().unique().astype(int).tolist()

In [None]:
# Sanity check: show the first five collected ids.
train_mc[:5]

In [None]:
# Keep only the misconceptions whose id occurs in train_mc and renumber the
# rows from 0, so row position can later serve as an index into the
# embedding matrix.
# NOTE(review): this means misconceptions never labelled in train cannot be
# retrieved for the test set — confirm this restriction is intended.
misconception_mapping = misconception_mapping[misconception_mapping.MisconceptionId.isin(train_mc)].reset_index(drop=True)

In [None]:
# Number of rows in the misconception table at this point.
len(misconception_mapping)

In [None]:
# Display the misconception table for inspection.
misconception_mapping

In [None]:
# Map each row position of the misconception table to its MisconceptionId, so
# that positional indices can be translated back to real ids.
# The mapping is built from the id column alone: unpacking whole rows would
# break as soon as the table gained or lost a column, and the name column is
# not needed here.
temp_mapping = dict(enumerate(misconception_mapping["MisconceptionId"].tolist()))

In [None]:
# Load the tokenizer and the encoder from the checkpoint directory, then move
# the encoder onto the first CUDA device.
tokenizer = AutoTokenizer.from_pretrained(embed_model_pth)
model = AutoModel.from_pretrained(embed_model_pth)
model = model.to("cuda:0")

In [None]:
# https://www.kaggle.com/code/pshikk/similarity-preprocessing

def preprocess_text(x):
    """Normalise a text string before it is embedded.

    The steps are applied in order: lowercase, drop ``@word`` tokens, drop
    digits (including a leading apostrophe, as in ``'90``), drop ``http``
    followed by word characters, collapse whitespace runs to one space,
    collapse repeated periods and repeated commas, and strip the ends.

    Args:
        x: The input string.

    Returns:
        The cleaned string.
    """
    # All patterns are raw strings: "\w" / "\d" in a normal string literal are
    # invalid escape sequences and trigger a warning on recent Python versions.
    x = x.lower()                   # Convert to lowercase
    x = re.sub(r"@\w+", '', x)      # Delete tokens starting with @
    x = re.sub(r"'\d+", '', x)      # Delete digits preceded by an apostrophe
    x = re.sub(r"\d+", '', x)       # Delete all remaining digits
    # Only "http" plus the word characters that follow it is removed;
    # the rest of a URL (e.g. "://host") is left in place.
    x = re.sub(r"http\w+", '', x)
    x = re.sub(r"\s+", " ", x)      # Collapse any whitespace run (incl. newlines) to one space
    x = re.sub(r"\.+", ".", x)      # Collapse repeated periods into one
    x = re.sub(r"\,+", ",", x)      # Collapse repeated commas into one
    x = x.strip()                   # Remove leading/trailing whitespace
    return x

In [None]:
def generate_embeddings(texts, model, tokenizer, device, batch_size=8, max_length=1024):
    """Encode texts into L2-normalised CLS-token embeddings.

    Every text is first passed through ``preprocess_text``; the texts are then
    tokenised and encoded in batches without gradient tracking.

    Args:
        texts: Iterable of strings to embed.
        model: Encoder whose output exposes ``last_hidden_state``.
        tokenizer: Callable tokenizer compatible with ``model``.
        device: Device the tokenised batch is moved to; the model is expected
            to already live there.
        batch_size: Number of texts encoded per forward pass.
        max_length: Truncation length in tokens handed to the tokenizer.
            NOTE(review): 1024 is kept as the default for backward
            compatibility; confirm it does not exceed the number of positions
            the checkpoint supports.

    Returns:
        A 2-D NumPy array with one row per input text. For empty input an
        array with zero rows is returned instead of raising.
    """
    texts = [preprocess_text(text) for text in texts]  # This was absent in the original code

    if not texts:
        # np.concatenate raises on an empty list, so answer directly. The
        # width is taken from the model config when available, otherwise 0.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=max_length,
        ).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
            embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
            # Unit-length rows keep distance/similarity comparisons on a common scale.
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        all_embeddings.append(embeddings.cpu().numpy())
    return np.concatenate(all_embeddings, axis=0)

In [None]:
# Embed every misconception name once; the rows of the result follow the row
# order of the misconception table.
MisconceptionName = misconception_mapping['MisconceptionName'].tolist()
all_ctx_vector = generate_embeddings(MisconceptionName, model, tokenizer, "cuda:0")

In [None]:
# Prepare test data
def make_all_question_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``all_question_text`` column built from subject, construct and question.

    The three source columns are joined with blank lines and the result is
    run through ``preprocess_text``. The frame is modified in place and also
    returned.
    """
    separator = "\n\n"
    df["all_question_text"] = (
        df["SubjectName"] + separator + df["ConstructName"] + separator + df["QuestionText"]
    )
    cleaned = df["all_question_text"].apply(preprocess_text)
    df["all_question_text"] = cleaned
    return df

In [None]:
def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape one-row-per-question data into one row per answer option.

    The question id, question text and correct answer are kept as identifier
    columns. The four answer-text columns are unpivoted into an ``Answer``
    column (the source column name) and a ``value`` column (the answer text).
    Any other column of ``df`` is dropped.
    """
    id_columns = ["QuestionId", "all_question_text", "CorrectAnswer"]
    answer_columns = ["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]

    subset = df[id_columns + answer_columns]
    return subset.melt(
        id_vars=id_columns,
        value_vars=answer_columns,
        var_name='Answer',
        value_name='value',
    )

In [None]:
def make_all_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``all_text`` column joining the question text and the answer text.

    ``all_question_text`` is used as-is and ``value`` is passed through
    ``preprocess_text``; whichever of the two columns exist are joined per row
    with a blank line, skipping missing entries. The frame is modified in
    place and also returned.
    """
    def _join_row(row):
        # Drop missing pieces, stringify the rest and join with a blank line.
        return '\n\n'.join(row.dropna().astype(str))

    pieces = []
    available = df.columns
    if "all_question_text" in available:
        pieces.append(df["all_question_text"])
    if "value" in available:
        pieces.append(df["value"].apply(preprocess_text))

    combined = pd.concat(pieces, axis=1)
    df["all_text"] = combined.apply(_join_row, axis=1)
    return df

In [None]:
def simple_cosine(all_text_vector, all_ctx_vector):
    """Return the pairwise cosine similarity between the rows of both inputs."""
    return cosine_similarity(all_text_vector, all_ctx_vector)

def cdist_similarity(all_text_vector, all_ctx_vector, m ):
    """Return pairwise similarities derived from a ``cdist`` distance.

    Distances under metric ``m`` are mapped to similarities via
    ``1 / (1 + distance)``, so a distance of 0 gives 1 and larger distances
    give smaller values.
    """
    distances = cdist(all_text_vector, all_ctx_vector, metric=m)
    similarities = 1 / (distances + 1)
    return similarities

### Submission

In [None]:
# Build one row per (question, answer option) carrying the combined text,
# ordered by question and answer column.
test = make_all_question_text(test)
test_long = make_all_text(wide_to_long(test))
test_long = test_long.sort_values(["QuestionId", "Answer"]).reset_index(drop=True)

# Embed the combined texts with the same encoder used for the misconceptions.
test_texts = test_long['all_text'].tolist()
all_text_vector = generate_embeddings(test_texts, model, tokenizer, "cuda:0")

# Euclidean-distance-based similarity between every test row and every misconception.
e_sim = cdist_similarity(all_text_vector, all_ctx_vector, 'euclidean')

si = e_sim

# Column indices sorted from most to least similar, per row.
sim = np.argsort(-si, axis=1)

# Submission fields: the answer letter parsed from the column name, the
# "<QuestionId>_<letter>" key, and the 25 best-ranked positions per row.
test_long["Answer_alphabet"] = test_long["Answer"].str.extract(r'Answer([A-Z])Text$')
test_long["QuestionId_Answer"] = test_long["QuestionId"].astype("str") + "_" + test_long["Answer_alphabet"]
test_long["MisconceptionId"] = sim[:, :25].tolist()

In [None]:
# Translate each ranked position into the corresponding entry of temp_mapping
# (row position -> MisconceptionId).
test_long["MisconceptionId"] = test_long["MisconceptionId"].apply(lambda x: [temp_mapping[int(i)] for i in x])

In [None]:
# Serialise each list of ids into one space-separated string.
test_long["MisconceptionId"] = test_long["MisconceptionId"].apply(
    lambda ids: ' '.join(str(value) for value in ids)
)

# Drop the rows whose answer letter equals the question's CorrectAnswer.
test_long = test_long[test_long["CorrectAnswer"].ne(test_long["Answer_alphabet"])]

In [None]:
# Keep only the two submission columns and renumber the rows from 0.
submission = test_long[["QuestionId_Answer", "MisconceptionId"]].reset_index(drop=True)

In [None]:
# Preview the first ten submission rows.
submission.head(10)

In [None]:
# Write the submission to the working directory without the index column.
submission.to_csv("submission.csv", index=False)
print("Submission file created successfully!")