# Overview
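In this notebook, each question–answer text (`ConstructName` + `SubjectName` + `QuestionText` + `Answer[A-D]Text`) and each misconception name is vectorized using TF-IDF, and for every question–answer text the 25 misconceptions with the highest cosine similarity are submitted as the inference result.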

- MAP@25 (train rows with a labelled misconception): 0.1378
- Recall@25 (same rows): 0.4530

Please let me know if there are any mistakes.

# Import

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Load

In [None]:
# Question tables. `train` also provides the Misconception[A-D]Id columns that
# are melted and used for evaluation further down; `test` is only used for inference.
train = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv")
test = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")

# `misconception_mapping` supplies the MisconceptionName texts that are ranked
# against each question; `sample_submission` is only displayed for comparison.
misconception_mapping = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")
sample_submission = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/sample_submission.csv")

# Preprocess

In [None]:
def make_all_question_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``AllQuestionText`` column joining construct, subject and question.

    The three source columns are concatenated with single spaces. The frame
    is modified in place and the same object is returned.
    """
    construct_and_subject = df["ConstructName"] + " " + df["SubjectName"]
    df["AllQuestionText"] = construct_and_subject + " " + df["QuestionText"]
    return df

# Adds the AllQuestionText column to both frames; the helper mutates its
# argument and returns it, so the re-assignment keeps the same objects.
train = make_all_question_text(train)
test = make_all_question_text(test)

In [None]:
from typing import Literal
def wide_to_long(
        df: pd.DataFrame, col: Literal["AnswerText", "MisconceptionId"]
    ) -> pd.DataFrame:
    """Melt the four per-option columns (A-D) into one row per question/option.

    Args:
        df: Frame containing ``QuestionId``, ``AllQuestionText``,
            ``CorrectAnswer`` and the four option columns selected by ``col``.
        col: ``"AnswerText"`` melts ``Answer[A-D]Text``; ``"MisconceptionId"``
            melts ``Misconception[A-D]Id``. It is also the name of the
            resulting value column.

    Returns:
        A long frame with the three id columns, a variable column
        (``AnswerType`` or ``MisconceptionType``) holding the original column
        name, and the value column named ``col``.

    Raises:
        ValueError: If ``col`` is not one of the two supported names.
    """
    options = ["A", "B", "C", "D"]
    id_vars = ["QuestionId", "AllQuestionText", "CorrectAnswer"]

    if col == "AnswerText":
        add_col = [f"Answer{alpha}Text" for alpha in options]
        var_name = "AnswerType"
    elif col == "MisconceptionId":
        add_col = [f"Misconception{alpha}Id" for alpha in options]
        var_name = "MisconceptionType"
    else:
        # ValueError is still an Exception, so existing broad handlers keep working.
        raise ValueError(
            f"col must be 'AnswerText' or 'MisconceptionId', got {col!r}"
        )

    return pd.melt(
        df[id_vars + add_col],
        id_vars=id_vars,
        var_name=var_name,
        value_name=col,
    )

# One row per (question, answer option) for both splits.
train_long = wide_to_long(train, col="AnswerText")
test_long = wide_to_long(test, col="AnswerText")

# One row per (question, option) holding the misconception label; only built for train.
train_long_mis = wide_to_long(train, col="MisconceptionId")

In [None]:
# Recover the option letter from the melted column name (e.g. "AnswerBText" -> "B")
# so that answers and misconceptions can be joined per option.
train_long["AnswerAlphabet"] = train_long["AnswerType"].str.extract(r'Answer([A-Z])Text$')
test_long["AnswerAlphabet"] = test_long["AnswerType"].str.extract(r'Answer([A-Z])Text$')

# Same for the misconception frame (e.g. "MisconceptionBId" -> "B").
train_long_mis["MisconceptionAlphabet"] = train_long_mis["MisconceptionType"].str.extract(r'Misconception([A-Z])Id$')

In [None]:
# Attach each answer option's misconception label by matching on the question
# and on the option letter (inner join, which is the pandas default).
train_long = train_long.merge(
    train_long_mis[["QuestionId", "MisconceptionId", "MisconceptionAlphabet"]],
    how="inner",
    left_on=["QuestionId", "AnswerAlphabet"],
    right_on=["QuestionId", "MisconceptionAlphabet"],
)

In [None]:
def make_all_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``AllText`` column: ``AllQuestionText``, a space, then ``AnswerText``.

    The frame is modified in place and the same object is returned.
    """
    question_with_separator = df["AllQuestionText"] + " "
    df["AllText"] = question_with_separator + df["AnswerText"]
    return df

# AllText (question context + one answer option) is the text that gets vectorized.
train_long = make_all_text(train_long)
test_long = make_all_text(test_long)

In [None]:
# sort
train_long = train_long.sort_values(["QuestionId", "AnswerType"]).reset_index(drop=True)
test_long = test_long.sort_values(["QuestionId", "AnswerType"]).reset_index(drop=True)

# Train tfidf

In [None]:
# Fit one TF-IDF vocabulary on the train question/answer texts stacked on top of
# the misconception names, so both live in the same vector space. The first
# len(train_long) rows of the result are the train texts; the rest are the
# misconception names. Test texts are not part of the fit; they are only
# transformed later.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(
    pd.concat([train_long["AllText"], misconception_mapping["MisconceptionName"]])
)

In [None]:
# Split the stacked TF-IDF matrix back into its two sources.
# The rows are sliced directly from the sparse matrix instead of calling
# .toarray() twice: each call materialised the whole matrix densely only to
# keep part of it. cosine_similarity accepts sparse input, so the downstream
# cells work unchanged.
train_long_vec = tfidf_matrix[:len(train_long)]
misconception_mapping_vec = tfidf_matrix[len(train_long):]

print(train_long_vec.shape)
print(misconception_mapping_vec.shape)

In [None]:
# Similarity of every train question/answer row to every misconception name.
train_cos_sim_arr = cosine_similarity(train_long_vec, misconception_mapping_vec)
# argsort is ascending, so sort the negated scores to get the most similar
# misconception (by row position in misconception_mapping) first.
train_sorted_indices = np.argsort(-train_cos_sim_arr, axis=1)

In [None]:
train_sorted_indices[:, :25]

In [None]:
# example
def print_example(df: pd.DataFrame, sorted_indices: np.ndarray, idx: int) -> None:
    """Print one query text and the names of its two top-ranked misconceptions.

    Args:
        df: Frame with an ``AllText`` column; ``idx`` is looked up as an index label.
        sorted_indices: Per-row ranking whose entries index the rows of the
            module-level ``misconception_mapping`` frame.
        idx: Row to show, used both for ``df`` and for ``sorted_indices``.
    """
    print(f"Query idx{idx}")
    print(df["AllText"][idx])
    names = misconception_mapping["MisconceptionName"]
    for rank in (1, 2):
        print(f"\nCos Sim No.{rank}")
        print(names[sorted_indices[idx, rank - 1]])

print_example(train_long, train_sorted_indices, 0)

In [None]:
print_example(train_long, train_sorted_indices, 1)

# Evaluate

In [None]:
# Keep the 25 best-ranked candidates per row as a Python list.
# NOTE(review): these are row positions in misconception_mapping, used as if
# they were MisconceptionId values - confirm that the two coincide.
train_long["PredictMisconceptionId"] = train_sorted_indices[:, :25].tolist()

In [None]:
# https://www.kaggle.com/code/cdeotte/how-to-train-open-book-model-part-1#MAP@3-Metric
def map_at_25(predictions, labels):
    """Mean average precision at 25 for one true label per row.

    Args:
        predictions: Sized iterable of ranked candidate sequences (best first).
            Only the first 25 candidates of each row are scored.
        labels: Iterable of true labels, aligned with ``predictions``.

    Returns:
        The mean over rows of ``1 / rank`` for the positions (1-based) where
        the candidate equals the label, or ``0.0`` when ``predictions`` is
        empty (previously a ``ZeroDivisionError``).
    """
    n_rows = len(predictions)
    # len() rather than truthiness: a pandas Series has no unambiguous bool value.
    if n_rows == 0:
        return 0.0

    map_sum = 0.0
    for ranked, label in zip(predictions, labels):
        # zip with range(1, 26) caps the scored candidates at 25.
        map_sum += sum(
            1 / rank
            for rank, candidate in zip(range(1, 26), ranked)
            if candidate == label
        )
    return map_sum / n_rows

In [None]:
# Score only the rows that have a misconception label; rows whose
# MisconceptionId is null are excluded from both arguments.
map_at_25(
    train_long["PredictMisconceptionId"][train_long["MisconceptionId"].notnull()],
    train_long["MisconceptionId"][train_long["MisconceptionId"].notnull()],
)

In [None]:
def recall(predictions, labels):
    """Fraction of rows whose true label appears among that row's candidates.

    Args:
        predictions: Sized iterable of candidate collections, one per row.
        labels: Iterable of true labels, aligned with ``predictions``.

    Returns:
        Hits divided by the number of rows, or ``0.0`` when ``predictions`` is
        empty (previously a ``ZeroDivisionError``).
    """
    n_rows = len(predictions)
    # len() rather than truthiness: a pandas Series has no unambiguous bool value.
    if n_rows == 0:
        return 0.0
    hits = sum(1 for candidates, label in zip(predictions, labels) if label in candidates)
    return hits / n_rows

# Same row selection as for MAP@25: only rows with a non-null MisconceptionId.
recall(
    train_long["PredictMisconceptionId"][train_long["MisconceptionId"].notnull()],
    train_long["MisconceptionId"][train_long["MisconceptionId"].notnull()],
)

# Predict

In [None]:
# Project the test texts into the already-fitted TF-IDF space (transform only, no refit).
test_long_vec = vectorizer.transform(test_long["AllText"])

In [None]:
# Similarity of every test question/answer row to every misconception name.
test_cos_sim_arr = cosine_similarity(test_long_vec, misconception_mapping_vec)
# Negate so that argsort's ascending order puts the most similar misconception first.
test_sorted_indices = np.argsort(-test_cos_sim_arr, axis=1)

In [None]:
test_sorted_indices[:, :25]

In [None]:
print_example(test_long, test_sorted_indices, 0)

In [None]:
print_example(test_long, test_sorted_indices, 1)

# Make Submit File

In [None]:
# Submission key: "<QuestionId>_<option letter>".
test_long["QuestionId_Answer"] = test_long["QuestionId"].astype("str") + "_" + test_long["AnswerAlphabet"]
# Top-25 candidates per row, assigned by position (test_long was sorted and re-indexed above).
# NOTE(review): these are row positions in misconception_mapping, used as if
# they were MisconceptionId values - confirm that the two coincide.
test_long["MisconceptionId"] = test_sorted_indices[:, :25].tolist()
# Serialise each candidate list as a space-separated string.
test_long["MisconceptionId"] = test_long["MisconceptionId"].apply(lambda x: ' '.join(map(str, x)))
test_long = test_long[test_long["CorrectAnswer"] != test_long["AnswerAlphabet"]] # drop rows for the correct answer option; only the other options are submitted
submission = test_long[["QuestionId_Answer", "MisconceptionId"]].reset_index(drop=True)

In [None]:
submission.head(10)

In [None]:
sample_submission.head(10)

In [None]:
# Write the two submission columns to the working directory without the index column.
submission.to_csv("submission.csv", index=False)