# Overview
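In this notebook, `ConstructName` + `QuestionText` + each `Answer[A-D]Text` are concatenated and embedded with the BGE (`bge-large-en`) sentence-embedding model, and every `MisconceptionName` is embedded the same way. For each incorrect answer option, the 25 misconceptions with the highest cosine similarity are submitted as inference results.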

Please let me know if there are any mistakes.

# Import

In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

# Data Load

In [None]:
# Single place to change when the competition data is mounted somewhere else.
DATA_DIR = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"

# Question tables: one row per question, with the four answer texts as columns.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# Lookup table of misconception names, and the expected submission layout.
misconception_mapping = pd.read_csv(f"{DATA_DIR}/misconception_mapping.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

# Preprocess

In [None]:
def make_all_question_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``all_question_text`` column joining construct name and question text.

    The new column is ``ConstructName``, a single space, then ``QuestionText``.
    The frame is modified in place and the same object is returned.
    """
    construct_part = df["ConstructName"]
    question_part = df["QuestionText"]
    df["all_question_text"] = construct_part + " " + question_part
    return df

# train = make_all_question_text(train)
# Only the test frame is processed; the train call above is left disabled.
test = make_all_question_text(test)

In [None]:
# Display the test frame, which now carries the all_question_text column.
test

In [None]:
def wide_to_long(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape the question table so every answer option gets its own row.

    Keeps ``QuestionId``, ``all_question_text`` and ``CorrectAnswer`` as
    identifier columns and melts the four ``Answer?Text`` columns into two
    columns: ``Answer`` (the source column name) and ``value`` (its text).
    Any other columns of ``df`` are dropped.
    """
    key_columns = ["QuestionId", "all_question_text", "CorrectAnswer"]
    answer_columns = ["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]

    selected = df[key_columns + answer_columns]
    return selected.melt(
        id_vars=key_columns,
        var_name="Answer",
        value_name="value",
    )

# train_long = wide_to_long(train)
# Melt the test frame into one row per (question, answer column), then display it.
test_long = wide_to_long(test)
test_long

In [None]:
def make_all_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``all_text`` column joining the question text and one answer text.

    The new column is ``all_question_text``, a single space, then ``value``.
    The frame is modified in place and the same object is returned.
    """
    question_part = df["all_question_text"]
    answer_part = df["value"]
    df["all_text"] = question_part + " " + answer_part
    return df

# train_long = make_all_text(train_long)
# Build the text that will be embedded for every answer row, then display the frame.
test_long = make_all_text(test_long)
test_long

In [None]:
# Look at the first combined text as a sanity check on the concatenation.
test_long.all_text.values[0]


In [None]:
# Order rows by question and then by answer column name, and renumber the
# index from 0 so row positions match the sorted order.
# train_long = train_long.sort_values(["QuestionId", "Answer"]).reset_index(drop=True)
test_long = test_long.sort_values(["QuestionId", "Answer"])
test_long = test_long.reset_index(drop=True)
test_long

# Compute embeddings for the MisconceptionName labels

In [None]:
# Data preparation: array of every misconception name from the mapping table.
# NOTE(review): `labels` is not referenced by any later cell visible in this notebook.

labels = misconception_mapping['MisconceptionName'].values


In [None]:
# Load the embedding model
from transformers import AutoTokenizer, AutoModel
import torch

# Use the first GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs (slowly) on a kernel without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and encoder weights from the attached Kaggle dataset directory
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/baai-bge-large-en')
model = AutoModel.from_pretrained('/kaggle/input/baai-bge-large-en')
# Inference only: switch off dropout and similar training-time behaviour.
model.eval()
model.to(device)
print("finish")

In [None]:
from tqdm import tqdm
# Plain Python list of every misconception name, in mapping-table row order.
MisconceptionName = misconception_mapping['MisconceptionName'].tolist()
# Number of texts encoded per forward pass.
per_gpu_batch_size = 8


def prepare_inputs(text, tokenizer, device, max_length=512):
    """Tokenize a batch of strings and move the resulting tensors to ``device``.

    Args:
        text: list of strings to encode as one batch.
        tokenizer: tokenizer exposing ``batch_encode_plus``.
        device: target device passed to ``Tensor.to``.
        max_length: maximum number of tokens per sequence; longer inputs are
            truncated. Defaults to 512, the sequence limit of the BGE/BERT
            encoder loaded in this notebook. A larger value would let
            over-long inputs through to the model.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` tensors on ``device``,
        padded to the longest sequence in the batch.
    """
    tokenizer_outputs = tokenizer.batch_encode_plus(
        text,
        padding=True,
        return_tensors='pt',
        max_length=max_length,
        truncation=True)
    return {
        'input_ids': tokenizer_outputs.input_ids.to(device),
        'attention_mask': tokenizer_outputs.attention_mask.to(device),
    }


# Embed every misconception name in mini-batches.
# Inference only, so gradient tracking is disabled: without it each forward
# pass would build an autograd graph and hold extra memory.
all_ctx_vector = []
with torch.no_grad():
    for mini_batch in tqdm(
            range(0, len(MisconceptionName), per_gpu_batch_size)):
        mini_context = MisconceptionName[mini_batch:mini_batch + per_gpu_batch_size]
        encoded_input = prepare_inputs(mini_context, tokenizer, device)
        # Take the first output of the model and keep only the vector at
        # token position 0 of every sequence as the sentence embedding.
        sentence_embeddings = model(**encoded_input)[0][:, 0]
        # L2-normalise each row so all embeddings have unit length.
        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

        all_ctx_vector.append(sentence_embeddings.cpu().numpy())

# Stack the per-batch arrays into a single (n_misconceptions, dim) matrix.
all_ctx_vector = np.concatenate(all_ctx_vector, axis=0)
# Report the shape of the full matrix rather than of the last mini-batch.
print("Sentence embeddings:", all_ctx_vector.shape)

In [None]:
# Embed every (question + answer) text of the test set in mini-batches,
# using the same pooling and normalisation as for the misconception names.
test_texts = list(test_long.all_text.values)
all_text_vector = []
per_gpu_batch_size = 8

# Inference only, so gradient tracking is disabled: without it each forward
# pass would build an autograd graph and hold extra memory.
with torch.no_grad():
    for mini_batch in tqdm(
            range(0, len(test_texts), per_gpu_batch_size)):
        mini_context = test_texts[mini_batch:mini_batch + per_gpu_batch_size]
        encoded_input = prepare_inputs(mini_context, tokenizer, device)
        # Take the first output of the model and keep only the vector at
        # token position 0 of every sequence as the sentence embedding.
        sentence_embeddings = model(**encoded_input)[0][:, 0]
        # L2-normalise each row so all embeddings have unit length.
        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

        all_text_vector.append(sentence_embeddings.cpu().numpy())

# Stack the per-batch arrays into one matrix whose rows follow test_long's row order.
all_text_vector = np.concatenate(all_text_vector, axis=0)
print(all_text_vector.shape)

# Predict

In [None]:
# Similarity of every test text (rows) against every misconception (columns).
test_cos_sim_arr = cosine_similarity(X=all_text_vector, Y=all_ctx_vector)
# Negate before the ascending argsort so that, within each row, column
# positions come out ordered from most to least similar.
test_sorted_indices = (-test_cos_sim_arr).argsort(axis=1)

In [None]:
# Show the 25 highest-ranked misconception positions for every test row.
test_sorted_indices[:, :25]

# Make Submit File

In [None]:
# Recover the option letter (A-D) from the melted column name, e.g. "AnswerBText" -> "B".
test_long["Answer_alphabet"] = test_long["Answer"].str.extract(r'Answer([A-Z])Text$', expand=False)
# Submission key: "<QuestionId>_<letter>".
test_long["QuestionId_Answer"] = test_long["QuestionId"].astype("str") + "_" + test_long["Answer_alphabet"]

# test_sorted_indices holds column positions of the similarity matrix, i.e.
# row positions in misconception_mapping (the order the names were embedded in).
# Look up the real MisconceptionId for each position instead of assuming that
# the id always equals the row position.
misconception_ids = misconception_mapping["MisconceptionId"].to_numpy()
top_misconception_ids = misconception_ids[test_sorted_indices[:, :25]]
# One space-separated string of the top-25 ids per answer row.
test_long["MisconceptionId"] = [' '.join(map(str, row)) for row in top_misconception_ids]

# Drop the rows of the correct answer: only wrong options are kept for submission.
test_long = test_long[test_long["CorrectAnswer"] != test_long["Answer_alphabet"]]
submission = test_long[["QuestionId_Answer", "MisconceptionId"]].reset_index(drop=True)

In [None]:
# Preview the first rows of the generated submission.
submission.head(10)

In [None]:
# Preview the provided sample submission for a side-by-side format comparison.
sample_submission.head(10)

In [None]:
# Write the submission to submission.csv, leaving out the DataFrame index column.
submission.to_csv("submission.csv", index=False)