In [None]:
# --- Experiment identity ---------------------------------------------------
COMPETITION_NAME = "eedi-mining-misconceptions-in-mathematics"  # also the W&B project name
EXP_NAME = "fine-tuning-bge-version2"  # W&B run name / trainer run_name
MODEL_NAME = "BAAI/bge-large-en-v1.5"  # checkpoint loaded by SentenceTransformer

# --- Filesystem locations --------------------------------------------------
DATA_PATH = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"
OUTPUT_PATH = "."
MODEL_OUTPUT_PATH = OUTPUT_PATH + "/trained_model"

# --- Negative mining -------------------------------------------------------
RETRIEVE_NUM = 25  # candidate misconceptions kept per training text

# --- Optimisation ----------------------------------------------------------
EPOCH, LR, BS = 2, 2e-05, 8
GRAD_ACC_STEP = 128 // BS  # so that BS * GRAD_ACC_STEP == 128 samples per optimizer step

# --- Switches --------------------------------------------------------------
# Only WANDB is read by the cells visible in this notebook.
TRAINING = True
DEBUG = False
WANDB = True

In [None]:
%pip install -qq polars==1.7.1
%pip install -qq datasets==3.0.0
%pip install -qq sentence_transformers==3.1.0

In [None]:
import os
import numpy as np

from datasets import load_dataset, Dataset

import wandb
import polars as pl
# 
from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

In [None]:
import datasets
import sentence_transformers

# Fail fast if the kernel did not pick up the versions pinned in the %pip cell.
# Checked in the same order as the installs: polars, datasets, sentence_transformers.
for _module, _pinned_version in (
    (pl, "1.7.1"),
    (datasets, "3.0.0"),
    (sentence_transformers, "3.1.0"),
):
    assert _module.__version__ == _pinned_version
del _module, _pinned_version

In [None]:
# Worker count handed to `Dataset.filter(num_proc=...)` when the training pairs are filtered.
# Note: `os.cpu_count()` may return None when the CPU count cannot be determined.
NUM_PROC = os.cpu_count()


In [None]:
if WANDB:
    # The API key is read from Kaggle Secrets (notebook Settings -> add a secret
    # named "wandb-key"), so it never appears in the notebook itself.
    from kaggle_secrets import UserSecretsClient

    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("wandb-key")
    wandb.login(key=wandb_api_key)
    wandb.init(project=COMPETITION_NAME, name=EXP_NAME)

# Value later passed as `report_to` in the training arguments.
REPORT_TO = "wandb" if WANDB else "none"

REPORT_TO

In [None]:
import re
import pandas as pd

# Read the competition training set through DATA_PATH so the dataset location
# is configured in exactly one place (the config cell).
df_train = pd.read_csv(f"{DATA_PATH}/train.csv")

# Question-level columns that are repeated on every (question, option) row.
select_column = ["QuestionId", "ConstructName", "SubjectName", "CorrectAnswer", "QuestionText"]


def _option_letter(column_name, prefix):
    """Return the option letter (A-D) that follows `prefix` in `column_name`.

    Returns None when the pattern is not present. The regex is evaluated
    once per value.
    """
    match = re.search(rf"{prefix}([A-D])", column_name)
    return match.group(1) if match else None


# NOTE: a later cell copies MisconceptionID into another frame by position,
# so the melt / sort / merge sequence below must keep producing the same row order.

# Long form of the four answer-text columns: one row per (question, option).
df_answer = pd.melt(df_train,
                    id_vars=select_column,
                    value_vars=[f"Answer{ans}Text" for ans in ["A", "B", "C", "D"]],
                    var_name="Option",
                    value_name="AnswerText").sort_values("QuestionId")

# Long form of the four misconception-id columns, shaped the same way.
df_misconception = pd.melt(df_train,
                           id_vars=select_column,
                           value_vars=[f"Misconception{ans}Id" for ans in ["A", "B", "C", "D"]],
                           var_name="Option",
                           value_name="MisconceptionID").sort_values("QuestionId")

# Reduce the melted column names ("AnswerAText", "MisconceptionAId", ...) to the bare letter
# so both frames share the same "Option" key.
df_answer["Option"] = df_answer["Option"].apply(lambda name: _option_letter(name, "Answer"))
df_misconception["Option"] = df_misconception["Option"].apply(lambda name: _option_letter(name, "Misconception"))

# Pair every answer text with the misconception id of the same question and option.
df_merged = pd.merge(df_answer, df_misconception,
                     on=["QuestionId", "Option", "ConstructName", "SubjectName", "CorrectAnswer", "QuestionText"],
                     how="inner",
                     suffixes=('', '_y'))

# Defensive: drop any duplicated right-hand column that received the '_y' suffix.
# A new frame is bound instead of mutating in place, so re-running the cell is safe.
df_merged = df_merged.drop(columns=df_merged.filter(regex='_y$').columns.tolist())




In [None]:
# Keep only rows with no missing value in any column (default `dropna()` behaviour).
# The later `.astype(int)` on MisconceptionID relies on that column being NaN-free here.
df = df_merged.dropna()

In [None]:
df

In [None]:
# `train`: parquet from the separate "eedi-data-synthesizing" input. Its schema is not
# visible in this notebook; later cells read ConstructName, QuestionText, AnswerText and
# Misconception from it.
# `misconception_mapping`: competition lookup table; the MisconceptionId and
# MisconceptionName columns are used below.
train = pd.read_parquet("/kaggle/input/eedi-data-synthesizing/output.parquet")
misconception_mapping = pd.read_csv(f"{DATA_PATH}/misconception_mapping.csv")

In [None]:
# Positional copy: `.values` discards df's index, so row i of `train` receives the
# MisconceptionID of row i of `df` (pandas raises if the lengths differ).
# NOTE(review): assumes output.parquet holds the same rows in the same order as `df` — confirm.
train["MisconceptionID"] = df["MisconceptionID"].values.astype(int)

In [None]:
# Lookup table: MisconceptionId -> MisconceptionName.
mapping = dict(
    zip(
        misconception_mapping["MisconceptionId"].values,
        misconception_mapping["MisconceptionName"].values,
    )
)

In [None]:
# Name of each row's labelled misconception; an id missing from `mapping` raises KeyError.
train["GroundTruthMisconception"] = train["MisconceptionID"].apply(mapping.__getitem__)

In [None]:
train

In [None]:
def create_training_text(row):
    """Render one training row as the multi-line text that gets embedded.

    `row` is any mapping-like object (e.g. a DataFrame row) exposing the keys
    "ConstructName", "QuestionText", "AnswerText" and "Misconception".

    The returned string starts with a newline, puts each field on its own
    line indented by four spaces, and ends with a newline followed by four
    spaces. This exact whitespace layout is part of the output.
    """
    indent = "    "
    field_lines = (
        f'{row["ConstructName"]}',
        f'{row["QuestionText"]}',
        f'Answer: {row["AnswerText"]}',
        f'Misconception: {row["Misconception"]}',
    )
    body = "".join(f"{indent}{line}\n" for line in field_lines)
    return "\n" + body + indent

In [None]:
# Build the text to embed for every row (axis=1 hands each row to the function).
train["FullText"] = train.apply(create_training_text, axis=1)

In [None]:
# Pretrained checkpoint used below to embed texts for candidate retrieval.
# The training cell loads MODEL_NAME again before fine-tuning.
model = SentenceTransformer(MODEL_NAME)


In [None]:
# Texts to embed: one per training row, and one per known misconception name.
query_texts = train["FullText"].values
misconception_names = misconception_mapping["MisconceptionName"].values

# normalize_embeddings=True is requested for both sets of vectors.
train_long_vec = model.encode(query_texts, normalize_embeddings=True)
misconception_mapping_vec = model.encode(misconception_names, normalize_embeddings=True)

for embedding_matrix in (train_long_vec, misconception_mapping_vec):
    print(embedding_matrix.shape)

In [None]:
# Pairwise similarity: row i is training text i, column j is the j-th row of
# `misconception_mapping`.
train_cos_sim_arr = cosine_similarity(train_long_vec, misconception_mapping_vec)
# np.argsort sorts ascending, so rank on the negated scores: column 0 then holds the
# MOST similar misconception. The `[:, :RETRIEVE_NUM]` slice taken afterwards therefore
# yields the closest candidates (hard negatives) rather than the least similar ones.
train_sorted_indices = np.argsort(-train_cos_sim_arr, axis=1)

In [None]:
train_sorted_indices.shape


In [None]:

# `train_sorted_indices` holds row positions within `misconception_mapping`, not ids.
# Translate positions to MisconceptionId so the later `mapping[...]` lookup and the
# comparison against MisconceptionID stay correct even when ids are not 0..n-1 in
# row order. When they are, the result is identical to using the positions directly.
candidate_positions = train_sorted_indices[:, :RETRIEVE_NUM]
misconception_ids = misconception_mapping["MisconceptionId"].values
train["PredictMisconceptionId"] = misconception_ids[candidate_positions].tolist()

In [None]:
train

In [None]:
# One row per (training text, candidate misconception): each list stored in
# PredictMisconceptionId is unpacked into separate rows, repeating the other columns.
train_exploded = train.explode("PredictMisconceptionId")

In [None]:
# Name of each retrieved candidate; an id missing from `mapping` raises KeyError.
train_exploded["PredictMisconception"] = train_exploded["PredictMisconceptionId"].apply(mapping.__getitem__)

In [None]:
train_exploded

In [None]:
# `pl` is already imported in the notebook's import cell; this repeat is a no-op.
import polars as pl

# Convert the exploded pandas frame to polars; the next cell feeds it to `Dataset.from_polars`.
final_train = pl.from_pandas(train_exploded)




In [None]:
def _candidate_is_negative(example):
    """Return True when the retrieved candidate differs from the labelled misconception."""
    return example["MisconceptionID"] != example["PredictMisconceptionId"]


# To create an anchor, positive, and negative structure, rows where the positive and
# the negative are identical are deleted.
candidate_dataset = Dataset.from_polars(final_train)
train = candidate_dataset.filter(_candidate_is_negative, num_proc=NUM_PROC)

In [None]:
# Keep exactly the three text columns, listed as (anchor, positive, negative):
# FullText, GroundTruthMisconception, PredictMisconception.
# NOTE(review): the trainer/loss is understood to consume columns by position — confirm
# against the sentence-transformers training docs before reordering.
train = train.select_columns(["FullText", "GroundTruthMisconception", "PredictMisconception"])

In [None]:
train

In [None]:
# Load MODEL_NAME again so fine-tuning starts from a newly loaded checkpoint.
model = SentenceTransformer(MODEL_NAME)
loss = MultipleNegativesRankingLoss(model)

# Training arguments grouped by concern, then merged into a single arguments object.
optimisation_kwargs = dict(
    num_train_epochs=EPOCH,
    per_device_train_batch_size=BS,
    gradient_accumulation_steps=GRAD_ACC_STEP,
    per_device_eval_batch_size=BS,
    eval_accumulation_steps=GRAD_ACC_STEP,
    learning_rate=LR,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine_with_restarts",
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    batch_sampler=BatchSamplers.NO_DUPLICATES,
)
tracking_kwargs = dict(
    save_strategy="steps",
    save_steps=0.1,  # NOTE(review): a float below 1 is understood as a fraction of total steps — confirm
    save_total_limit=2,
    logging_steps=100,
    report_to=REPORT_TO,  # Will be used in W&B if `wandb` is installed
    run_name=EXP_NAME,
    do_eval=False,
)
args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_PATH,  # the only required parameter
    **optimisation_kwargs,
    **tracking_kwargs,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train,
    loss=loss,
)

# Fine-tune, then write the final weights to MODEL_OUTPUT_PATH.
trainer.train()
model.save_pretrained(MODEL_OUTPUT_PATH)