# Overview
I prepared 3 notebooks.

1. [Train TF-IDF Retriever](https://www.kaggle.com/code/sinchir0/retriever-tfidf-reranker-deberta-1-trn-ret) (Recall: 0.4530, CV:0.1378, LB:0.128)

2. Train DeBERTa Reranker (CV: 0.1740) <- this notebook

3. [Infer with TF-IDF Retriever and DeBERTa Reranker](https://www.kaggle.com/code/sinchir0/retriever-tfidf-reranker-deberta-inference) (LB:0.189)

Please let me know if there are any mistakes.

In [None]:
# --- names and paths ---
EXP_NAME = "rerank-deberta"
COMPETITION_NAME = "eedi-mining-misconceptions-in-mathematics"
MODEL_NAME = "microsoft/deberta-v3-xsmall"

DATA_PATH = f"/kaggle/input/{COMPETITION_NAME}"
RETRIEVED_DATA_PATH = (
    "/kaggle/input/retriever-tfidf-reranker-deberta-1-trn-ret/"
    "train_ret25_map0.1378_recall0.4531.csv"
)
MODEL_OUTPUT_PATH = "trained_model"

# --- experiment switches ---
DEBUG = False
WANDB = True

# --- model / optimisation hyper-parameters ---
MAX_LENGTH = 256
SEED = 42
EPOCH = 2
LR = 2e-5
TRAIN_BS = 8
EVAL_BS = 8
# Accumulate gradients so TRAIN_BS * GRAD_ACC_NUM reaches 128 samples per optimizer step.
GRAD_ACC_NUM = 128 // TRAIN_BS
NUM_LABELS = 2

In [None]:
# Show the GPU(s) visible to this session.
!nvidia-smi

In [None]:
# Record the Python interpreter version used for this run.
!python --version

# install

In [None]:
# Install exact, pinned releases (-qq keeps pip output quiet).
# The transformers / datasets / accelerate pins are re-checked with asserts after import.
%pip install -qq polars==1.7.1
%pip install -qq transformers==4.44.2
%pip install -qq sentencepiece==0.2.0
%pip install -qq datasets==3.0.0
%pip install -qq accelerate==0.34.2
%pip install -qq wandb==0.18.0

# import

In [None]:
import os
import random

import polars as pl
import numpy as np
import torch
import wandb
from datasets import (
    Dataset,
    DatasetDict
)
from tokenizers import AddedToken
from tqdm.auto import tqdm
from scipy.special import softmax
from sklearn.metrics import log_loss
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

In [None]:
# Number of worker processes passed as num_proc to the dataset map/filter calls below.
# Note: os.cpu_count() may return None when the count cannot be determined.
NUM_PROC = os.cpu_count()

In [None]:
import transformers
import datasets
import accelerate

# Fail fast when the runtime does not carry the exact releases this notebook expects.
for _library, _expected_version in (
    (transformers, "4.44.2"),
    (datasets, "3.0.0"),
    (accelerate, "0.34.2"),
):
    assert _library.__version__ == _expected_version

In [None]:
# Apply one seed to every random number generator used in this notebook
def seed_everything(seed: int):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN behaviour.

    Args:
        seed: Value applied to ``random``, ``numpy.random``, the PyTorch CPU
            generator and every CUDA device generator.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running process is already fixed.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN autotune and pick kernels per run, which can
    # select different (non-reproducible) algorithms; it must be off for
    # reproducible results.
    torch.backends.cudnn.benchmark = False


# Seed all RNGs before any data sampling or model initialisation happens.
seed_everything(SEED)

# Wandb

In [None]:
# Decide where the Trainer reports metrics; when W&B is enabled, log in and open a run first.
if not WANDB:
    REPORT_TO = "none"
else:
    # Settings -> add wandb api
    from kaggle_secrets import UserSecretsClient

    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=wandb_api_key)
    wandb.init(project=COMPETITION_NAME, name=EXP_NAME)
    REPORT_TO = "wandb"

REPORT_TO

# Data Import & Preprocess

In [None]:
# Load the candidate rows saved by the retriever notebook (path in RETRIEVED_DATA_PATH).
train = pl.read_csv(RETRIEVED_DATA_PATH)

In [None]:
if DEBUG:
    # Keep at most 50 rows of each label (0 first, then 1) and train for a single epoch.
    train = pl.concat(
        [
            train.filter(pl.col("label") == label_value).sample(fraction=1.0).head(50)
            for label_value in (0, 1)
        ]
    )
    EPOCH = 1

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Register the newline and the double space as dedicated, non-normalized tokens.
for extra_token_text in ("\n", " " * 2):
    tokenizer.add_tokens([AddedToken(extra_token_text, normalized=False)])

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)
# The vocabulary grew above, so the embedding matrix must grow with it.
model.resize_token_embeddings(len(tokenizer))

# Batches are padded at collation time, to a length that is a multiple of 16.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=16,
)

# Tokenize

In [None]:
def tokenize(examples, max_token_length: int):
    """Build the reranker input text for one row and tokenize it.

    The construct, subject, question, answer and candidate misconception are
    joined with " [SEP] " into a single string. The result is truncated to
    ``max_token_length`` tokens but NOT padded here: padding every row to the
    maximum length would make the ``DataCollatorWithPadding`` used by the
    Trainer a no-op and force every batch to the full length. Leaving rows
    unpadded lets the collator pad each batch only as far as it needs.

    Args:
        examples: A single dataset row (mapping) with the string fields
            ``ConstructName``, ``SubjectName``, ``QuestionText``,
            ``AnswerText`` and ``PredictMisconceptionName``.
        max_token_length: Upper bound on the number of tokens kept.

    Returns:
        The tokenizer output for the joined text.
    """
    separator = " [SEP] "  # TODO: use other special token

    text_fields = (
        "ConstructName",
        "SubjectName",
        "QuestionText",
        "AnswerText",
        "PredictMisconceptionName",
    )
    joined_text = separator.join(examples[field] for field in text_fields)

    return tokenizer(
        joined_text,
        max_length=max_token_length,
        truncation=True,
    )


# Convert the polars frame to a Hugging Face Dataset and tokenize it one row at a
# time (batched=False), spreading the work over NUM_PROC worker processes.
train_dataset = Dataset.from_polars(train).map(
    tokenize,
    batched=False,
    fn_kwargs={"max_token_length": MAX_LENGTH},
    num_proc=NUM_PROC,
)

In [None]:
# Sanity check: decode the first tokenized row back into text.
print(tokenizer.decode(train_dataset["input_ids"][0]))

# Train Valid Split

In [None]:
# Split by question: rows whose QuestionId is a multiple of 3 form the validation
# fold, so every row of a given question lands in the same split.
train_split = train_dataset.filter(
    lambda example: example["QuestionId"] % 3 != 0,
    num_proc=NUM_PROC,
)
valid_split = train_dataset.filter(
    lambda example: example["QuestionId"] % 3 == 0,
    num_proc=NUM_PROC,
)

train_valid_dataset = DatasetDict({"train": train_split, "valid": valid_split})

In [None]:
# Show the resulting train / valid splits.
print(train_valid_dataset)

In [None]:
def compute_metrics(eval_pred):
    """Compute the cross-entropy (log loss) of the evaluation predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            the raw logits with one column per class and ``labels`` the
            integer class ids.

    Returns:
        A dict with the single key ``"eval_loss"``.
    """
    predictions, labels = eval_pred
    # Logits may arrive in half precision when fp16 evaluation is enabled;
    # upcast so the softmax and the logarithm are computed accurately.
    logits = np.asarray(predictions, dtype=np.float64)
    preds_prob = softmax(logits, axis=-1)
    # Pass the class ids explicitly: otherwise log_loss infers them from
    # `labels` and raises when only one class happens to be present.
    class_ids = np.arange(preds_prob.shape[-1])
    return {"eval_loss": log_loss(labels, preds_prob, labels=class_ids)}

In [None]:
training_args = TrainingArguments(
    # run identity / bookkeeping
    output_dir=MODEL_OUTPUT_PATH,
    run_name=EXP_NAME,
    report_to=REPORT_TO,
    seed=SEED,
    logging_steps=2,
    # optimisation
    num_train_epochs=EPOCH,
    learning_rate=LR,
    weight_decay=0.01,
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.1,
    # batching
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    gradient_accumulation_steps=GRAD_ACC_NUM,
    eval_accumulation_steps=GRAD_ACC_NUM,
    # evaluation and checkpointing share the same step interval (0.1)
    eval_strategy="steps",
    eval_steps=0.1,
    save_strategy="steps",
    save_steps=0.1,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    # memory / speed
    fp16=True,
    fp16_full_eval=True,
    gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_valid_dataset["train"],
    eval_dataset=train_valid_dataset["valid"],
    compute_metrics=compute_metrics,
)

In [None]:
import shutil
from pathlib import Path

trainer.train()

# delete checkpoint
# Intermediate checkpoints are no longer needed once training has finished
# (the Trainer is configured with load_best_model_at_end=True). Remove them
# from Python instead of shelling out to `rm -rf`, so the cleanup does not
# depend on a shell or on how the path is quoted.
for checkpoint_path in Path(MODEL_OUTPUT_PATH).glob("checkpoint-*"):
    if checkpoint_path.is_dir():
        shutil.rmtree(checkpoint_path, ignore_errors=True)
    else:
        checkpoint_path.unlink(missing_ok=True)

trainer.save_model(MODEL_OUTPUT_PATH)

# Make And Save Valid_dataset

In [None]:
valid_dataset = train_valid_dataset["valid"]

# Run the trained model on the validation split and turn logits into class probabilities.
valid_logits = trainer.predict(valid_dataset).predictions
valid_pred = softmax(valid_logits, axis=-1)

np.save(f"{MODEL_OUTPUT_PATH}/valid_prediction.npy", valid_pred)

def add_valid_pred(example, idx, valid_pred):
    """Store ``valid_pred[idx]`` on ``example`` under the key ``"valid_pred"``.

    Args:
        example: Mutable mapping for one row; it is modified in place.
        idx: Position of the row, used to index ``valid_pred``.
        valid_pred: Indexable collection of per-row predictions.

    Returns:
        The same ``example`` object, now carrying its prediction.
    """
    row_prediction = valid_pred[idx]
    example["valid_pred"] = row_prediction
    return example

# Attach each row's predicted probabilities, matching rows to predictions by position.
valid_dataset = valid_dataset.map(
    add_valid_pred,
    fn_kwargs={"valid_pred": valid_pred},
    with_indices=True,
)

valid_dataset_dir = f"{MODEL_OUTPUT_PATH}/valid_dataset"
valid_dataset.save_to_disk(valid_dataset_dir)

# Calc CV

In [None]:
valid_frame = valid_dataset.to_polars()

# Per question-answer pair: candidate misconception ids ordered by the
# probability of class 1, highest first.
ranked_predictions = (
    valid_frame.with_columns(
        pl.col("valid_pred").map_elements(lambda probs: probs[1], return_dtype=pl.Float64)
    )
    .sort(by=["QuestionId_Answer", "valid_pred"], descending=[False, True])
    .group_by(["QuestionId_Answer"], maintain_order=True)
    .agg(pl.col("PredictMisconceptionId").alias("Predict"))
)

# Distinct (question-answer, MisconceptionId) pairs used as the ground truth.
ground_truth = valid_frame[["QuestionId_Answer", "MisconceptionId"]].unique()

valid_data_for_cv = ranked_predictions.join(
    ground_truth,
    on=["QuestionId_Answer"],
).sort(by=["QuestionId_Answer"])

valid_data_for_cv.head()

In [None]:
# https://www.kaggle.com/code/cdeotte/how-to-train-open-book-model-part-1#MAP@3-Metric
def map_at_25(predictions, labels, k=25):
    """Mean average precision at ``k`` with a single correct label per query.

    Each query contributes ``1 / rank`` of the first position (1-based, within
    the top ``k``) at which its label appears in the ranked predictions, or 0
    when the label is not among them. Only the first hit counts, so a ranking
    that repeats an id cannot score above 1.

    Args:
        predictions: Sequence of ranked candidate-id sequences, one per query.
        labels: Sequence of correct ids, aligned with ``predictions``.
        k: Number of top-ranked candidates considered per query (default 25).

    Returns:
        The mean score over all queries as a float; 0.0 when ``predictions``
        is empty.
    """
    query_count = len(predictions)
    if query_count == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    score_sum = 0.0
    for ranked_ids, true_id in zip(predictions, labels):
        for rank, predicted_id in enumerate(ranked_ids, start=1):
            if rank > k:
                break
            if predicted_id == true_id:
                score_sum += 1.0 / rank
                break
    return score_sum / query_count


# Score the reranked candidate lists against the true misconception ids.
map_at_25_score = map_at_25(
    predictions=valid_data_for_cv["Predict"],
    labels=valid_data_for_cv["MisconceptionId"],
)
print(f"MAP@25 Score: {map_at_25_score}")

In [None]:
# Persist the CV score as plain text in the working directory.
cv_score_text = str(map_at_25_score)
with open("cv_score.txt", mode="w") as score_file:
    score_file.write(cv_score_text)

In [None]:
# Close the W&B run when tracking is enabled.
if WANDB:
    wandb.finish()