# Starter with SBERT

Shout-out and kudos to this [notebook](https://www.kaggle.com/code/pingfan/baseline-bge-cos-sim), which serves as the baseline.

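You can find out how to load the **Sentence-BERT paraphrase-MiniLM-L6-v2** model offline [here](https://www.kaggle.com/code/cindybtari/loading-pretrained-transformers-offline). Note that the code below loads **BAAI/bge-m3** from a local Kaggle input directory.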

## ☘️ Import

In [None]:
!pip install -q /kaggle/input/dependencies/evaluate/evaluate-0.4.3-py3-none-any.whl

In [None]:
import os
import numpy as np
import pandas as pd
import numpy as np
import evaluate
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist
from scipy.stats import pearsonr
from tqdm import tqdm

## ☘️ Load Data

In [None]:
INPUT = '/kaggle/input/eedi-mining-misconceptions-in-mathematics'

In [None]:
train = pd.read_csv(os.path.join(INPUT, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT, 'test.csv'))

In [None]:
misconception_mapping = pd.read_csv(os.path.join(INPUT, 'misconception_mapping.csv'))

In [None]:
sample_submission = pd.read_csv(os.path.join(INPUT, 'sample_submission.csv'))

In [None]:
# check train

train.sort_values(by='ConstructId').head(10)

In [None]:
# check test 

test.head()

In [None]:
# check misconception mapping

misconception_mapping.head()

In [None]:
# check sample submission

sample_submission.head()

### Check Data

In [None]:
train.iloc[0,2]

In [None]:
train.iloc[0,6]

## ☘️ Preprocess

In [None]:
def get_complete_question(df: pd.DataFrame, construct_col: str, question_col: str) -> pd.Series:
    """Join two columns of ``df`` row by row, separated by a single space.

    Args:
        df: Frame holding both columns.
        construct_col: Name of the column placed first in the result.
        question_col: Name of the column placed after the space.

    Returns:
        A Series of ``"<construct> <question>"`` values, one per row of ``df``.
    """
    construct_part = df[construct_col]
    question_part = df[question_col]
    return construct_part + " " + question_part

In [None]:
def pivot_long(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape the four answer-text columns into one row per (question, answer).

    The identifier columns are repeated for every answer. The name of the
    originating answer column is stored in ``Answer`` and its content in
    ``value``.

    Args:
        df: Wide frame with ``QuestionId``, ``complete_question``,
            ``CorrectAnswer`` and the four ``Answer?Text`` columns.

    Returns:
        The melted, long-format frame.
    """
    identifier_columns = ["QuestionId", "complete_question", "CorrectAnswer"]
    answer_columns = ["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]
    return df.melt(
        id_vars=identifier_columns,
        value_vars=answer_columns,
        var_name='Answer',
        value_name='value',
    )

In [None]:
def get_complete_text(df: pd.DataFrame, question_col: str, answer_col:str) -> pd.Series:
    """Append the answer column to the question column, separated by one space.

    Args:
        df: Frame holding both columns.
        question_col: Name of the column placed first in the result.
        answer_col: Name of the column placed after the space.

    Returns:
        A Series of ``"<question> <answer>"`` values, one per row of ``df``.
    """
    question_part = df[question_col]
    answer_part = df[answer_col]
    return question_part + " " + answer_part

In [None]:
test['complete_question'] = get_complete_question(test, 'ConstructName', 'QuestionText')

In [None]:
test.head()

In [None]:
test_long = pivot_long(test)

In [None]:
test_long.head()

In [None]:
test_long['complete_text'] = get_complete_text(test_long, 'complete_question', 'value')

In [None]:
test_long.head()

In [None]:
test_long.iloc[0,-1]

In [None]:
# sort values by QuestionId and Answer
# The texts that get embedded later are read from this frame's `complete_text`
# column after this cell, so the row order fixed here is also the row order of
# the resulting similarity matrix.

test_long.sort_values(["QuestionId", "Answer"], inplace = True)
test_long = test_long.reset_index(drop=True)  # discard the pre-sort index and renumber from 0
test_long

## ☘️ Calculate the embedding of the label MisconceptionName

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so that `model.to(device)` and the tensor transfers still work without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
# Load the tokenizer and the encoder weights from a local Kaggle input directory.
# NOTE(review): the path points at BAAI/bge-m3, not the paraphrase-MiniLM-L6-v2
# model named in the introduction — confirm which one is intended.
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/sentence-transformers/BAAI/bge-m3')
model = AutoModel.from_pretrained('/kaggle/input/sentence-transformers/BAAI/bge-m3')

In [None]:
model.eval()

In [None]:
model.to(device)
print('Ok!')

In [None]:
def prepare_inputs(texts, tokenizer, device, max_length=520):
    """Tokenize a batch of texts and move the model inputs to ``device``.

    Args:
        texts: Batch of strings to encode.
        tokenizer: Object exposing ``batch_encode_plus``.
        device: Target device passed to ``.to()`` for each returned tensor.
        max_length: Truncation length handed to the tokenizer.

    Returns:
        A dict with exactly the ``input_ids`` and ``attention_mask`` entries
        of the tokenizer output, each moved to ``device``.
    """
    encoded = tokenizer.batch_encode_plus(
        texts,
        truncation=True,
        max_length=max_length,
        padding=True,
        return_tensors='pt',
    )
    # Only these two entries are forwarded; anything else the tokenizer
    # returns is dropped.
    wanted_keys = ('input_ids', 'attention_mask')
    return {key: encoded[key].to(device) for key in wanted_keys}

In [None]:
def compute_sentence_embeddings(texts, tokenizer, model, device, per_gpu_batch_size=8, max_length=520):
    """Embed ``texts`` in batches and return L2-normalized vectors.

    Each batch is tokenized with ``prepare_inputs``, run through ``model``
    with gradients disabled, and reduced to the vector at position 0 of the
    model's first output (presumably the [CLS] token of the last hidden
    state — confirm for the model in use). Each vector is then scaled to
    unit L2 norm.

    Args:
        texts: Sliceable sequence of strings (e.g. a list).
        tokenizer: Tokenizer forwarded to ``prepare_inputs``.
        model: Callable model accepting the keyword inputs produced by
            ``prepare_inputs``.
        device: Device the inputs are moved to before the forward pass.
        per_gpu_batch_size: Number of texts per forward pass.
        max_length: Truncation length forwarded to ``prepare_inputs``. The
            default matches ``prepare_inputs``' own default, so existing
            callers see no change.

    Returns:
        A NumPy array with one row per input text.

    Raises:
        ValueError: If ``texts`` is empty or ``per_gpu_batch_size`` is not
            positive.
    """
    # `len()` rather than truthiness so NumPy arrays are also accepted.
    if len(texts) == 0:
        raise ValueError("texts must contain at least one item to embed")
    if per_gpu_batch_size < 1:
        raise ValueError("per_gpu_batch_size must be a positive integer")

    batch_embeddings = []

    # iterate over the text data in batches
    for start_idx in tqdm(range(0, len(texts), per_gpu_batch_size), desc="Processing batches"):
        # Slicing clips at the end of the sequence, so the last batch may be shorter.
        batch_texts = texts[start_idx:start_idx + per_gpu_batch_size]

        encoded_inputs = prepare_inputs(batch_texts, tokenizer, device, max_length=max_length)

        # compute embeddings without tracking gradients (inference only)
        with torch.no_grad():
            first_token_output = model(**encoded_inputs)[0][:, 0]
            normalized_embeddings = torch.nn.functional.normalize(first_token_output, p=2, dim=1)

        # Move each batch to host memory right away so results do not pile up on the device.
        batch_embeddings.append(normalized_embeddings.cpu().numpy())

    return np.concatenate(batch_embeddings, axis=0)

In [None]:
# parameters

# Number of texts per forward pass; passed to compute_sentence_embeddings below.
per_gpu_batch_size = 8
# NOTE(review): this value is not passed to any call in the cells shown here;
# prepare_inputs uses its own default (also 520). Confirm whether it should be
# threaded through.
max_length = 520

In [None]:
labels = misconception_mapping['MisconceptionName'].values

In [None]:
MisconceptionName = list(labels)

In [None]:
MisconceptionName[:2]

In [None]:
sentence_embeddings = compute_sentence_embeddings(MisconceptionName, tokenizer, model, device, per_gpu_batch_size)

In [None]:
print(f"Sentence embeddings: {sentence_embeddings.shape}")

In [None]:
test_texts = list(test_long.complete_text.values)

In [None]:
all_text_vector = compute_sentence_embeddings(test_texts, tokenizer, model, device, per_gpu_batch_size)

In [None]:
print(f"All vectors: {all_text_vector.shape}")

## ☘️ Predict

In [None]:
# Three similarity matrices between every test text (rows) and every
# misconception name (columns).
# compute_sentence_embeddings L2-normalizes each vector, so the raw dot product
# and the cosine similarity should agree up to floating-point error.
dot_product_sim_arr = np.dot(all_text_vector, sentence_embeddings.T)
# Turn Euclidean distance into a similarity in (0, 1]: smaller distance -> larger value.
euclidean_sim_arr = 1 / (1 + cdist(all_text_vector, sentence_embeddings, metric='euclidean'))
test_cos_sim_arr = cosine_similarity(all_text_vector, sentence_embeddings)

In [None]:
# Unweighted mean of the three similarity matrices, element by element.
avg_sim_arr = (test_cos_sim_arr+euclidean_sim_arr+dot_product_sim_arr)/3

In [None]:
# Negating before argsort orders each row from most to least similar.
# The stored values are column positions of avg_sim_arr, i.e. row positions in
# sentence_embeddings (built from misconception_mapping['MisconceptionName']).
test_sorted_indices = np.argsort(-avg_sim_arr, axis=1)

## ☘️ Submission

In [None]:
# Answer letter taken from the melted column name, e.g. "AnswerAText" -> "A".
test_long["Answer_Alph"] = test_long['Answer'].str.extract(r'(?i)Answer(\w)', expand=False)
test_long["QuestionId_Answer"] = test_long["QuestionId"].astype("str") + "_" + test_long["Answer_Alph"]

# test_sorted_indices holds row positions in misconception_mapping, not ids.
# Translate the 25 best positions per row into the actual MisconceptionId
# values so the output stays correct even if the ids are not exactly 0..n-1
# in file order.
misconception_ids = misconception_mapping["MisconceptionId"].to_numpy()
top_misconception_ids = misconception_ids[test_sorted_indices[:, :25]]

# Space-separated id list, best match first.
test_long["MisconceptionId"] = [' '.join(map(str, row)) for row in top_misconception_ids.tolist()]

In [None]:
test_long

In [None]:
# Keep only the rows whose answer letter differs from the correct answer,
# i.e. drop the row for the correct answer of each question.

is_wrong_answer = test_long["CorrectAnswer"] != test_long["Answer_Alph"]
test_long = test_long.loc[is_wrong_answer]
test_long

In [None]:
submission = test_long[["QuestionId_Answer", "MisconceptionId"]].reset_index(drop=True)
submission

In [None]:
submission.to_csv('submission.csv', index = False)