# Overview
I prepared 3 notebooks.

1. Train Tfidf Retriever (Recall: 0.4530, CV: 0.1378, LB: 0.128) <- Now

2. [Train DeBERTa Reranker](https://www.kaggle.com/code/sinchir0/retriever-tfidf-reranker-deberta-2-trn-rerank)(CV: 0.1740)

3. [Infer by Tfidf Retriever And DeBERTa Reranker](https://www.kaggle.com/code/sinchir0/retriever-tfidf-reranker-deberta-3-infer) (LB: 0.189)

Please let me know if there are any mistakes.

# Setting

In [None]:
# Directory that holds the competition CSV files read in the cells below.
DATA_PATH = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"
# Number of top-ranked candidates kept per query row; also embedded in the
# name of the CSV written at the end of the notebook.
RETRIEVE_NUM = 25  # TODO: increase

# Install

In [None]:
%pip install -qq scikit-learn==1.5.2

# Import

In [None]:
import pickle

import polars as pl
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import sklearn

# Fail fast when the installed scikit-learn is not the pinned release, and say
# which version was actually found so the mismatch is easy to diagnose.
# NOTE(review): presumably pinned because the fitted vectorizer is pickled
# below and loaded by another notebook -- confirm.
assert sklearn.__version__ == "1.5.2", (
    f"expected scikit-learn 1.5.2, found {sklearn.__version__}"
)

# Data Load

In [None]:
# Training questions table (train.csv).
train = pl.read_csv(f"{DATA_PATH}/train.csv")
# Misconception catalogue; its MisconceptionId / MisconceptionName columns are
# used for vectorizing and for the joins further down.
misconception_mapping = pl.read_csv(f"{DATA_PATH}/misconception_mapping.csv")

In [None]:
# Preview the first rows of the raw training table.
train.head()

# Preprocess

In [None]:
# Question-level columns shared by all answer options; they are kept as the
# identifier columns when the per-answer columns are unpivoted into rows.
common_col = [
    "QuestionId",
    "ConstructName",
    "SubjectName",
    "QuestionText",
    "CorrectAnswer",
]

# Reshape to one row per (question, answer option).
# Reuses the `train` frame loaded in the Data Load cell rather than parsing
# train.csv a second time.
train_long = (
    train.select(
        pl.col(common_col + [f"Answer{alpha}Text" for alpha in ["A", "B", "C", "D"]])
    )
    # AnswerAText..AnswerDText columns -> (AnswerType, AnswerText) rows.
    .unpivot(
        index=common_col,
        variable_name="AnswerType",
        value_name="AnswerText",
    )
    .with_columns(
        # AllText is the retrieval query: construct, subject, question and
        # answer text joined by single spaces.
        pl.concat_str(
            [
                pl.col("ConstructName"),
                pl.col("SubjectName"),
                pl.col("QuestionText"),
                pl.col("AnswerText"),
            ],
            separator=" ",
        ).alias("AllText"),
        # Recover the option letter from the original column name.
        pl.col("AnswerType").str.extract(r"Answer([A-D])Text$").alias("AnswerAlphabet"),
    )
    .with_columns(
        # "<QuestionId>_<letter>" key, used later to join the misconception ids.
        pl.concat_str(
            [pl.col("QuestionId"), pl.col("AnswerAlphabet")], separator="_"
        ).alias("QuestionId_Answer"),
    )
    .sort("QuestionId_Answer")
)
train_long.head()

In [None]:
# Reshape the four Misconception{A-D}Id columns to one row per
# (question, answer option), keyed by the same "<QuestionId>_<letter>" string
# that train_long uses.
# Reuses the `train` frame loaded in the Data Load cell rather than parsing
# train.csv again.
train_misconception_long = (
    train.select(
        pl.col(
            common_col + [f"Misconception{alpha}Id" for alpha in ["A", "B", "C", "D"]]
        )
    )
    # MisconceptionAId..MisconceptionDId columns -> (MisconceptionType, MisconceptionId) rows.
    .unpivot(
        index=common_col,
        variable_name="MisconceptionType",
        value_name="MisconceptionId",
    )
    .with_columns(
        # Recover the option letter from the original column name.
        pl.col("MisconceptionType")
        .str.extract(r"Misconception([A-D])Id$")
        .alias("AnswerAlphabet"),
    )
    .with_columns(
        pl.concat_str(
            [pl.col("QuestionId"), pl.col("AnswerAlphabet")], separator="_"
        ).alias("QuestionId_Answer"),
    )
    .sort("QuestionId_Answer")
    # Keep only the join key and the label so the join adds a single column.
    .select(pl.col(["QuestionId_Answer", "MisconceptionId"]))
    # Integer ids, so they compare equal to the integer predictions later on.
    .with_columns(pl.col("MisconceptionId").cast(pl.Int64))
)

train_misconception_long.head()

In [None]:
# Attach the labelled MisconceptionId to every answer row through the shared
# "<QuestionId>_<letter>" key (inner join, which is the polars default).
train_long = train_long.join(
    train_misconception_long,
    on="QuestionId_Answer",
    how="inner",
)

# Train tfidf

In [None]:
# Fit a single TF-IDF vocabulary on the query texts and the misconception
# names together, so both are embedded in the same feature space.
vectorizer = TfidfVectorizer()
# Rows are stacked vertically: the train_long queries first, then the
# misconception names. Later cells split the matrix at len(train_long).
tfidf_matrix = vectorizer.fit_transform(
    pl.concat(
        [train_long["AllText"], misconception_mapping["MisconceptionName"]],
        how="vertical",
    )
)

In [None]:
# Persist the fitted vectorizer to vectorizer.pkl in the working directory.
# NOTE(review): presumably consumed by the inference notebook -- confirm.
with open("vectorizer.pkl", "wb") as file:
    pickle.dump(vectorizer, file)

In [None]:
# Split the stacked TF-IDF matrix back into its two parts: the first
# len(train_long) rows are the queries, the remaining rows are the
# misconception names.
# Slice the sparse matrix by rows *before* densifying, so each part is
# converted once instead of materialising the full dense matrix twice.
train_long_vec = tfidf_matrix[: len(train_long)].toarray()
misconception_mapping_vec = tfidf_matrix[len(train_long) :].toarray()

print(train_long_vec.shape)
print(misconception_mapping_vec.shape)

In [None]:
# Save the dense misconception-name vectors to misconception_mapping_vec.npy.
np.save("misconception_mapping_vec.npy", misconception_mapping_vec)

In [None]:
# Similarity of every query row (axis 0) to every misconception name (axis 1).
train_cos_sim_arr = cosine_similarity(train_long_vec, misconception_mapping_vec)
# Sorting the negated scores ascending yields, per query, the column indices in
# descending-similarity order. Each index is a row position in
# misconception_mapping.
train_sorted_indices = np.argsort(-train_cos_sim_arr, axis=1)

In [None]:
def print_example(df: pl.DataFrame, sorted_indices: np.ndarray, idx: int) -> None:
    """Print one query text followed by its two best-ranked misconception names.

    Args:
        df: Frame with an "AllText" column holding the query texts.
        sorted_indices: Per-row ranking of row positions in the module-level
            ``misconception_mapping`` frame, best match first.
        idx: Row of ``df`` / ``sorted_indices`` to display.
    """
    print(f"Query idx{idx}")
    print(df["AllText"][idx])
    names = misconception_mapping["MisconceptionName"]
    # Ranks are 1-based in the output, 0-based as column positions.
    for rank in (1, 2):
        print(f"\nCos Sim No.{rank}")
        print(names[int(sorted_indices[idx, rank - 1])])

In [None]:
# Sanity check: show the two top-ranked misconceptions for the first query row.
print_example(train_long, train_sorted_indices, 0)

In [None]:
# Sanity check: show the two top-ranked misconceptions for the second query row.
print_example(train_long, train_sorted_indices, 1)

# Evaluate

In [None]:
# Keep the RETRIEVE_NUM best-ranked indices per row as a list column.
# NOTE(review): these values are row positions in misconception_mapping; the
# cells below compare them to MisconceptionId, which assumes each
# MisconceptionId equals its row position in that file -- confirm.
train_long = train_long.with_columns(
    pl.Series(train_sorted_indices[:, :RETRIEVE_NUM].tolist()).alias(
        "PredictMisconceptionId"
    )
)

In [None]:
# https://www.kaggle.com/code/cdeotte/how-to-train-open-book-model-part-1#MAP@3-Metric
def map_at_25(predictions, labels, k=25):
    """Return the mean average precision at ``k`` for single-label rows.

    Args:
        predictions: Sized iterable of ranked candidate-id sequences, best
            candidate first.
        labels: Iterable holding the one true id per row, aligned with
            ``predictions``.
        k: Number of leading ranks that can score. Defaults to 25, the cut-off
            this function is named after.

    Returns:
        The mean over rows of ``1 / rank`` of the first correct candidate
        within the top ``k`` (0 for a miss), or 0.0 when ``predictions`` is
        empty.
    """
    n_rows = len(predictions)
    if n_rows == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    map_sum = 0.0
    for ranked, label in zip(predictions, labels):
        # zip() against the rank range caps scoring at the first k candidates.
        for rank, candidate in zip(range(1, k + 1), ranked):
            if candidate == label:
                map_sum += 1 / rank
                # A label only scores once, even if the id is repeated.
                break
    return map_sum / n_rows

In [None]:
# Score MAP@25 on labelled rows only. The same null filter is applied to both
# arguments so predictions and labels stay aligned row by row.
map_at_25_score = map_at_25(
    train_long.filter(pl.col("MisconceptionId").is_not_null())[
        "PredictMisconceptionId"
    ],
    train_long.filter(pl.col("MisconceptionId").is_not_null())["MisconceptionId"],
)
# Bare expression so the notebook displays the score.
map_at_25_score

In [None]:
def recall(predictions, labels):
    """Return the fraction of rows whose label is among its candidates.

    Args:
        predictions: Sized iterable of candidate-id collections, one per row.
        labels: Iterable holding the one true id per row, aligned with
            ``predictions``.

    Returns:
        Number of rows whose label appears in its candidates divided by the
        number of rows, or 0.0 when ``predictions`` is empty.
    """
    n_rows = len(predictions)
    if n_rows == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    hits = sum(
        1 for candidates, label in zip(predictions, labels) if label in candidates
    )
    return hits / n_rows


# Recall over labelled rows only: the share of rows whose true MisconceptionId
# appears among the retrieved candidates. The same null filter is applied to
# both arguments so they stay aligned.
recall_score = recall(
    train_long.filter(pl.col("MisconceptionId").is_not_null())[
        "PredictMisconceptionId"
    ],
    train_long.filter(pl.col("MisconceptionId").is_not_null())["MisconceptionId"],
)
# Bare expression so the notebook displays the score.
recall_score

# Make Retrieved Train File

In [None]:
# Build the retrieved training table: one row per (labelled answer row,
# retrieved candidate) pair, with a binary relevance flag and both
# misconception names attached.
train_retrieved = (
    train_long.filter(
        pl.col(
            "MisconceptionId"
        ).is_not_null()  # TODO: Consider ways to utilize data where MisconceptionId is NaN.
    )
    # One row per candidate in the PredictMisconceptionId list.
    .explode("PredictMisconceptionId")
    .with_columns(
        # target is 1 when the candidate is the labelled misconception, else 0.
        (pl.col("MisconceptionId") == pl.col("PredictMisconceptionId"))
        .cast(pl.Int64)
        .alias("target")
    )
    # Bring in the columns of misconception_mapping for the true label.
    .join(
        misconception_mapping,
        on="MisconceptionId",
    )
    # Same lookup for the candidate; every mapping column gets a "Predict"
    # prefix so it joins on PredictMisconceptionId and does not collide.
    .join(
        misconception_mapping.rename(lambda x: "Predict" + x),
        on="PredictMisconceptionId",
    )
)
train_retrieved.shape

In [None]:
# Class balance of the binary relevance flag. The pipeline that builds
# train_retrieved names this column "target"; there is no "label" column.
train_retrieved["target"].value_counts()

In [None]:
# Same class balance as proportions. The pipeline that builds train_retrieved
# names this column "target"; there is no "label" column.
train_retrieved["target"].value_counts(normalize=True)

In [None]:
# Write the retrieved training table to the working directory. The file name
# records RETRIEVE_NUM and both scores, each formatted to four decimals.
train_retrieved.write_csv(
    f"train_ret{RETRIEVE_NUM}_map{map_at_25_score:.4f}_recall{recall_score:.4f}.csv",
)