# Import

Direct copy of: https://www.kaggle.com/code/jaejohn/eedi-bge-starter

The only change is better text preprocessing, which I borrowed a while ago from a notebook in this competition: https://www.kaggle.com/competitions/lmsys-chatbot-arena

The last 3 versions have similar scores.

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

import re
from scipy.spatial.distance import cdist

In [None]:
# Data Loading
# Read the three competition CSV files in one pass; the tuple order below
# matches the order of the names being assigned.
train, test, misconception_mapping = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv",
        "/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv",
        "/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv",
    )
)

In [None]:
# Keep only the columns of `train` that contain no missing values at all
# (any column with at least one NaN is dropped).
train = train.dropna(axis="columns")

In [None]:
# Notebook display: show the training frame after the column drop above.
train

In [None]:
# Load the model and tokenizer
# Both artefacts come from the same local checkpoint directory, so the path
# is named once and reused.
BGE_CHECKPOINT_DIR = '/kaggle/input/bge-small-en-v1.5/transformers/bge/2'

# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(BGE_CHECKPOINT_DIR)
model = AutoModel.from_pretrained(BGE_CHECKPOINT_DIR)
# Move the weights to the selected device (also the cell's displayed value).
model.to(device)

In [None]:
def removeHTML(x):
    """Return ``x`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag-like span is deleted on its own and
    the text between two tags is kept. ``.`` does not match a newline, so a
    span that is split across lines is left untouched.
    """
    return re.sub(r'<.*?>', r'', x)
def preprocess_text(x):
    """Normalise a text string before it is tokenised.

    Steps, in order:

    1. lowercase the text;
    2. delete URLs (``http`` followed by any run of non-space characters);
    3. delete ``@word`` mentions;
    4. delete runs of digits, including an apostrophe directly in front;
    5. collapse any whitespace run to a single space;
    6. collapse repeated periods / commas to a single one;
    7. strip leading and trailing whitespace.

    Args:
        x: The string to clean.

    Returns:
        The cleaned string.

    NOTE(review): step 4 discards all numeric content; confirm this is
    intended for the texts being embedded.
    """
    # Convert words to lowercase
    x = x.lower()
    # HTML stripping via removeHTML() is currently disabled.
    # Delete URLs first, while they are still intact. The previous pattern
    # (``http\w+``) stopped at the first ':' or '/', so it removed at most
    # the scheme and left the rest of the address in the text.
    x = re.sub(r"http\S+", "", x)
    # Delete strings starting with @
    x = re.sub(r"@\w+", "", x)
    # Delete numbers (an apostrophe directly before the digits goes too)
    x = re.sub(r"'\d+", "", x)
    x = re.sub(r"\d+", "", x)
    # Replace consecutive whitespace with a single space character
    x = re.sub(r"\s+", " ", x)
    # Replace consecutive periods / commas with a single one
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r",+", ",", x)
    # Remove whitespace at the beginning and end
    return x.strip()

In [None]:
# Function to generate embeddings
def generate_embeddings(texts, model, tokenizer, device, batch_size=8, max_length=1024):
    """Embed ``texts`` with ``model`` and return L2-normalised vectors.

    Every text is first cleaned with ``preprocess_text``. The texts are then
    tokenised and run through the model in batches; the hidden state of the
    first token of each sequence is taken as the embedding and normalised to
    unit L2 length.

    Args:
        texts: Iterable of strings to embed.
        model: Model whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer callable matching ``model``.
        device: Device the tokenised inputs are moved to; the caller is
            expected to have placed ``model`` on the same device.
        batch_size: Number of texts per forward pass.
        max_length: Upper bound on tokens per text. It is lowered to
            ``model.config.max_position_embeddings`` when the model reports
            that value, because longer sequences cannot be position-encoded
            by models with a fixed position table.

    Returns:
        A 2-D NumPy array with one row per input text, in input order. For
        empty input an array with zero rows is returned.
    """
    texts = [preprocess_text(text) for text in texts]  # This was absent in the original code
    config = getattr(model, "config", None)

    if not texts:
        # np.concatenate raises on an empty list, so return an empty matrix.
        # Width comes from the model config when available (0 otherwise);
        # float32 is assumed to match the model output, TODO confirm.
        width = getattr(config, "hidden_size", 0) or 0
        return np.empty((0, width), dtype=np.float32)

    # Never ask the tokenizer for more tokens than the model can position.
    position_limit = getattr(config, "max_position_embeddings", None)
    if isinstance(position_limit, int) and position_limit > 0:
        max_length = min(max_length, position_limit)

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=max_length,
        ).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        all_embeddings.append(embeddings.cpu().numpy())
    return np.concatenate(all_embeddings, axis=0)

In [None]:
# Generate embeddings for misconceptions
# Embed every misconception name; the rows of all_ctx_vector follow the row
# order of misconception_mapping.
MisconceptionName = misconception_mapping['MisconceptionName'].tolist()
all_ctx_vector = generate_embeddings(
    texts=MisconceptionName,
    model=model,
    tokenizer=tokenizer,
    device=device,
)

In [None]:
# Prepare test data
def make_all_question_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``all_question_text`` column built from two existing columns.

    The new column holds ``ConstructName`` and ``QuestionText`` joined by a
    single space and then cleaned with ``preprocess_text``. The frame is
    modified in place and also returned.
    """
    joined = df["ConstructName"] + " " + df["QuestionText"]
    df["all_question_text"] = joined.apply(preprocess_text)
    return df

In [None]:
def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape the four answer-text columns into one row per answer.

    The result has the identifier columns ``QuestionId``,
    ``all_question_text`` and ``CorrectAnswer``, plus ``Answer`` (the name of
    the source column) and ``value`` (the text from that column). All other
    columns of ``df`` are dropped.
    """
    id_columns = ["QuestionId", "all_question_text", "CorrectAnswer"]
    answer_columns = ["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]

    # Select first so that only the answer columns are left to be unpivoted.
    selected = df[id_columns + answer_columns]
    return selected.melt(
        id_vars=id_columns,
        var_name='Answer',
        value_name='value',
    )

In [None]:
def make_all_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``all_text`` column combining question text and answer text.

    Uses whichever of ``all_question_text`` and ``value`` are present in
    ``df``. ``value`` is cleaned with ``preprocess_text``;
    ``all_question_text`` is used as it is. For each row the non-missing
    parts are joined with a single space. The frame is modified in place and
    also returned.

    Raises:
        ValueError: From ``pd.concat`` when neither source column is present.
    """
    text_components = []
    if "all_question_text" in df.columns:
        text_components.append(df["all_question_text"])
    if "value" in df.columns:
        # na_action="ignore" leaves missing answers as NaN instead of passing
        # them to preprocess_text (which would fail on a non-string); the
        # dropna() in the join below then skips them.
        text_components.append(df["value"].map(preprocess_text, na_action="ignore"))

    combined = pd.concat(text_components, axis=1)
    df["all_text"] = combined.apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
    return df

In [None]:
def simple_cosine(all_text_vector, all_ctx_vector):
    """Return the pairwise cosine similarity of the two sets of vectors.

    Thin wrapper around ``cosine_similarity``: entry ``[i, j]`` compares row
    ``i`` of ``all_text_vector`` with row ``j`` of ``all_ctx_vector``.
    """
    return cosine_similarity(all_text_vector, all_ctx_vector)

def cdist_similarity(all_text_vector, all_ctx_vector, m):
    """Return pairwise similarities derived from a ``cdist`` distance.

    Args:
        all_text_vector: 2-D array of query vectors, one per row.
        all_ctx_vector: 2-D array of candidate vectors, one per row.
        m: Distance metric passed to ``cdist`` as ``metric``.

    Returns:
        Array whose entry ``[i, j]`` is ``1 / (1 + d)``, where ``d`` is the
        distance between query row ``i`` and candidate row ``j``. A distance
        of 0 maps to 1, and larger distances map towards 0.
    """
    distances = cdist(all_text_vector, all_ctx_vector, metric=m)
    similarities = 1 / (1 + distances)  # Convert distance to similarity
    return similarities

<h2>SUBMISSION</h2>

In [None]:
# Number of ranked misconceptions written per question/answer row.
TOP_K = 25

# Build one row per (question, answer option) with the combined text.
test = make_all_question_text(test)
test_long = wide_to_long(test)
test_long = make_all_text(test_long)
test_long = test_long.sort_values(["QuestionId", "Answer"]).reset_index(drop=True)

# Generate embeddings for test data
test_texts = list(test_long['all_text'].values)
all_text_vector = generate_embeddings(test_texts, model, tokenizer, device)

# Similarity of every test row to every misconception (higher = closer).
e_sim = cdist_similarity(all_text_vector, all_ctx_vector, 'euclidean')

si = e_sim  # alias kept so cells that refer to `si` keep working

# Row-wise ranking: sim[i, 0] is the ROW POSITION (in misconception_mapping)
# of the misconception most similar to test row i.
sim = np.argsort(-si, axis=1)

# Prepare submission
test_long["Answer_alphabet"] = test_long["Answer"].str.extract(r'Answer([A-Z])Text$', expand=False)
test_long["QuestionId_Answer"] = test_long["QuestionId"].astype("str") + "_" + test_long["Answer_alphabet"]

# `sim` holds row positions, not IDs. Translate them through the mapping
# table so the output stays correct even if MisconceptionId is not simply
# 0..N-1 in file order. Fall back to the positions themselves when the
# mapping has no such column.
if "MisconceptionId" in misconception_mapping.columns:
    misconception_ids = misconception_mapping["MisconceptionId"].to_numpy()
else:
    misconception_ids = np.arange(len(misconception_mapping))
top_k_ids = misconception_ids[sim[:, :TOP_K]]
test_long["MisconceptionId"] = [' '.join(map(str, row)) for row in top_k_ids.tolist()]

# filter correct row
# Only the incorrect answer options are kept for the submission.
test_long = test_long[test_long["CorrectAnswer"] != test_long["Answer_alphabet"]]

In [None]:
# Keep just the two output columns and renumber the rows from zero.
submission_columns = ["QuestionId_Answer", "MisconceptionId"]
submission = test_long.loc[:, submission_columns].reset_index(drop=True)

In [None]:
# Notebook display: the first 25 ranked columns of `sim` for every row.
sim[:,:25]

In [None]:
# Notebook display: preview the first 10 rows of the submission frame.
submission.head(10)

In [None]:
# Write the submission to the working directory without the index column.
output_path = "submission.csv"
submission.to_csv(output_path, index=False)
print("Submission file created successfully!")