# Overview
I prepared 3 notebooks.

1. [Train TF-IDF Retriever](https://www.kaggle.com/code/sinchir0/retriever-tfidf-reranker-deberta-1-trn-ret) (Recall: 0.4530, CV: 0.1378, LB: 0.128)

2. [Train DeBERTa Reranker](https://www.kaggle.com/code/sinchir0/retriever-tfidf-reranker-deberta-2-trn-rerank) (CV: 0.1740)

3. Infer with TF-IDF Retriever and DeBERTa Reranker (LB: 0.189)  <- this notebook

Please let me know if there are any mistakes.

# Install

In [None]:
# Remove any preinstalled copies of these packages; the next cell reinstalls
# them from the offline wheel directory.
!pip uninstall -qq -y \
scikit-learn \
polars \
transformers \
accelerate \
datasets

In [None]:
# Offline install: `--no-index` disables package-index lookups and
# `--find-links` points pip at /kaggle/input/eedi-library instead.
!python -m pip install -qq --no-index --find-links=/kaggle/input/eedi-library \
scikit-learn \
polars \
transformers \
accelerate \
datasets

# Setting

In [None]:
# --- Inference settings ---
RETRIEVE_NUM = 50  # candidates kept per question/answer row after retrieval
EVAL_BS = 4  # default batch size of the reranker inference loop
INFERENCE_MAX_LENGTH = 256  # token cap handed to the tokenizer (truncation)

# --- Input locations ---
DATA_PATH = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"
RETRIEVER_PATH = "/kaggle/input/retriever-tfidf-reranker-deberta-1-trn-ret"
_RERANKER_DATASET = "/kaggle/input/retriever-tfidf-reranker-deberta-2-trn-rerank"
# The reranker weights live in the "trained_model" sub-directory of that dataset.
RERANKER_PATH = f"{_RERANKER_DATASET}/trained_model"

# Import

In [None]:
import os
import pickle

from tqdm.auto import tqdm

import numpy as np
import polars as pl

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import torch
from datasets import Dataset
from scipy.special import softmax
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [None]:
# Worker count passed as `num_proc` to `Dataset.map` below.
# NOTE: `os.cpu_count()` returns None when the CPU count cannot be determined.
NUM_PROC = os.cpu_count()

In [None]:
# Prefer the first GPU; fall back to CPU so the notebook still runs (slowly)
# on a machine where CUDA is unavailable.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
import transformers
import sklearn
import datasets

# Fail fast if the environment does not provide the exact library versions
# this notebook was written against. Each failure names the offending package
# and the version actually found.
_EXPECTED_VERSIONS = {
    "polars": (pl.__version__, "1.7.1"),
    "transformers": (transformers.__version__, "4.44.2"),
    "scikit-learn": (sklearn.__version__, "1.5.2"),
    "datasets": (datasets.__version__, "3.0.0"),
}
for _package, (_found, _wanted) in _EXPECTED_VERSIONS.items():
    assert _found == _wanted, f"{_package}: expected {_wanted}, found {_found}"

# Load

In [None]:
# Restore the retriever artifacts stored under RETRIEVER_PATH.
# NOTE: pickle.load can execute arbitrary code, so only point this at a
# vectorizer.pkl you produced yourself or otherwise trust.
vectorizer_file = f"{RETRIEVER_PATH}/vectorizer.pkl"
with open(vectorizer_file, "rb") as fh:
    vectorizer = pickle.load(fh)

# Precomputed vectors for the misconception mapping, one row per misconception.
mapping_vec_file = f"{RETRIEVER_PATH}/misconception_mapping_vec.npy"
misconception_mapping_vec = np.load(mapping_vec_file)

# Check Environment

In [None]:
!python --version

In [None]:
!nvidia-smi

# Preprocess Test

In [None]:
# Question-level columns that are repeated on every answer row.
common_col = [
    "QuestionId",
    "ConstructName",
    "SubjectName",
    "QuestionText",
    "CorrectAnswer",
]
answer_text_cols = [f"Answer{alpha}Text" for alpha in ["A", "B", "C", "D"]]

# Wide -> long: one row per (question, answer option).
wide_test = pl.read_csv(f"{DATA_PATH}/test.csv").select(
    pl.col(common_col + answer_text_cols)
)
unpivoted_test = wide_test.unpivot(
    index=common_col,
    variable_name="AnswerType",
    value_name="AnswerText",
)

# Text fed to the retriever: construct, subject, question and answer joined by spaces.
all_text_expr = pl.concat_str(
    [
        pl.col(field)
        for field in ("ConstructName", "SubjectName", "QuestionText", "AnswerText")
    ],
    separator=" ",
)
# "AnswerAText" -> "A", "AnswerBText" -> "B", ...
answer_letter_expr = pl.col("AnswerType").str.extract(r"Answer([A-Z])Text$")
# Row key in the "<QuestionId>_<letter>" form used by the submission file.
row_key_expr = pl.concat_str(
    [pl.col("QuestionId"), pl.col("AnswerAlphabet")], separator="_"
)

test_long = (
    unpivoted_test
    .with_columns(
        all_text_expr.alias("AllText"),
        answer_letter_expr.alias("AnswerAlphabet"),
    )
    .with_columns(row_key_expr.alias("QuestionId_Answer"))
    .sort("QuestionId_Answer")
)
test_long.head()

# Retrieval

In [None]:
# Vectorize every question/answer text with the loaded vectorizer and score it
# against every misconception vector.
test_long_vec = vectorizer.transform(test_long["AllText"])
test_cos_sim_arr = cosine_similarity(test_long_vec, misconception_mapping_vec)

# Negating the similarities makes argsort rank from most to least similar.
test_sorted_indices = np.argsort(-test_cos_sim_arr, axis=1)

# Keep the RETRIEVE_NUM best column indices per row as candidate IDs.
# NOTE(review): this treats row i of misconception_mapping_vec as
# MisconceptionId i - confirm against the retriever-training notebook.
top_candidates = test_sorted_indices[:, :RETRIEVE_NUM].tolist()
test_long = test_long.with_columns(
    pl.Series("PredictMisconceptionId", top_candidates)
)
test_long.head()

In [None]:
# Misconception lookup table; every column is duplicated with a "Predict"
# prefix so it can be joined on the retrieved candidate ID.
misconception_mapping = pl.read_csv(
    f"{DATA_PATH}/misconception_mapping.csv"
).with_columns(pl.all().name.prefix("Predict"))

# Candidate table: one row per (question, wrong answer, retrieved misconception).
# Rows for the correct answer are dropped here because the submission step
# discards them anyway; filtering first avoids reranking candidates whose
# scores are never used, without changing the submission.
test = (
    test_long
    .filter(pl.col("CorrectAnswer") != pl.col("AnswerAlphabet"))
    .explode("PredictMisconceptionId")
    .join(
        misconception_mapping,
        on="PredictMisconceptionId",
    )
)
test.head(10)

# Rerank

In [None]:
# Load the reranker artifacts saved under RERANKER_PATH: the tokenizer and the
# sequence-classification model used to score candidates in `inference`.
tokenizer = AutoTokenizer.from_pretrained(RERANKER_PATH)
model = AutoModelForSequenceClassification.from_pretrained(RERANKER_PATH)
# Collator configured to pad to a multiple of 16 tokens.
# NOTE(review): nothing visible in this notebook references it; the inference
# loop pads its batches itself.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=16)

In [None]:
def tokenize(examples, max_token_length: int):
    """Tokenize one candidate row for the reranker.

    The question fields and the candidate misconception name are joined into
    a single string with the literal ``" [SEP] "`` between consecutive
    fields, then passed to the module-level ``tokenizer``.

    Args:
        examples: Mapping with string values for ``ConstructName``,
            ``SubjectName``, ``QuestionText``, ``AnswerText`` and
            ``PredictMisconceptionName``.
        max_token_length: Maximum number of tokens; longer inputs are truncated.

    Returns:
        The tokenizer output for the joined text (no padding applied).
    """
    field_order = (
        "ConstructName",
        "SubjectName",
        "QuestionText",
        "AnswerText",
        "PredictMisconceptionName",
    )
    # TODO: change special token
    joined_text = " [SEP] ".join(examples[field] for field in field_order)

    encoded = tokenizer(
        joined_text,
        max_length=max_token_length,
        truncation=True,
        padding=False,
    )
    return encoded


# Round-trip through a Hugging Face Dataset so `tokenize` is applied to each
# row (batched=False) across NUM_PROC worker processes, then return to polars.
hf_test = Dataset.from_polars(test)
hf_test = hf_test.map(
    tokenize,
    batched=False,
    fn_kwargs={"max_token_length": INFERENCE_MAX_LENGTH},
    num_proc=NUM_PROC,
)
test = hf_test.to_polars()

In [None]:
# Sort rows by token count so each inference batch holds sequences of similar
# length and needs little padding.
token_count = pl.col("input_ids").list.len()
test = test.with_columns(token_count.alias("length")).sort("length")

In [None]:
@torch.inference_mode()
@torch.amp.autocast("cuda")
def inference(
    test: pl.DataFrame,
    model,
    device,
    batch_size=EVAL_BS,
    max_length=INFERENCE_MAX_LENGTH
):
    """Score every candidate row with the reranker.

    Rows are processed in order in chunks of ``batch_size``; each chunk is
    padded to its longest sequence with the module-level ``tokenizer`` and run
    through ``model`` on ``device``.

    Args:
        test: Frame with pre-tokenized ``input_ids`` and ``attention_mask``
            list columns.
        model: Sequence-classification model; it is moved to ``device`` and put
            in eval mode (this mutates the caller's model).
        device: Target device (``torch.device`` or device string). A CUDA
            device silently falls back to CPU when CUDA is unavailable.
        batch_size: Number of rows per forward pass.
        max_length: Unused; rows are expected to be truncated already at
            tokenization time. Kept so existing callers keep working.

    Returns:
        ``test`` with an added ``pred_prob`` column holding the softmax
        probability of class index 1 for each row.
    """
    device = torch.device(device)
    if device.type == "cuda" and not torch.cuda.is_available():
        # Keep the CPU path usable instead of crashing in `model.to`.
        device = torch.device("cpu")

    # The model must live on the same device as its inputs; previously the
    # `device` argument was ignored and everything ran on CPU.
    model = model.to(device)
    model.eval()

    probabilities = []
    # Ceil division so the progress bar total also counts a final partial batch.
    num_batches = (len(test) + batch_size - 1) // batch_size

    for i in tqdm(range(0, len(test), batch_size), total=num_batches):
        batch = test[i:i + batch_size]
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {
                "input_ids": batch["input_ids"].to_list(),
                "attention_mask": batch["attention_mask"].to_list(),
            },
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        outputs = model(**inputs)
        # Autocast can yield half-precision logits on the GPU; upcast and move
        # to host memory before handing them to NumPy/SciPy.
        logits = outputs.logits.float().cpu().numpy()
        proba = softmax(logits, -1)
        probabilities.extend(proba[:, 1].tolist())

    return test.with_columns(
        pl.Series("pred_prob", probabilities, dtype=pl.Float64)
    )

In [None]:
results = inference(test, model, device)

In [None]:
results.head()

In [None]:
# Order candidates best-first within each QuestionId_Answer key ...
ranked = results.sort(
    by=["QuestionId_Answer", "pred_prob"],
    descending=[False, True],
)
# ... then collect the candidate IDs per key, keeping that order.
results = ranked.group_by(["QuestionId_Answer"], maintain_order=True).agg(
    pl.col("PredictMisconceptionId").alias("MisconceptionId")
)

# Make Submit File

In [None]:
# Build the submission: one row per wrong answer, with the reranked
# misconception IDs rendered as a space-separated string.
submission = (
    test_long
    # Only wrong answers are submitted.
    .filter(pl.col("CorrectAnswer") != pl.col("AnswerAlphabet"))
    .join(
        results,
        on=["QuestionId_Answer"],
    )
    .select(
        pl.col("QuestionId_Answer"),
        # Native list -> string join; avoids a per-row Python callback.
        pl.col("MisconceptionId").cast(pl.List(pl.String)).list.join(" "),
    )
    .sort("QuestionId_Answer")
)

In [None]:
submission.head(10)

In [None]:
submission.write_csv("submission.csv")