In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os, shutil

BASE_MODEL = "microsoft/deberta-v3-base"

# Save the model and tokenizer locally
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=8)

os.makedirs("/kaggle/working/deberta_model", exist_ok=True)
tokenizer.save_pretrained("/kaggle/working/deberta_model")
model.save_pretrained("/kaggle/working/deberta_model")

print("‚úÖ Model and tokenizer saved at /kaggle/working/deberta_model")


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

2025-10-14 10:48:23.918805: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760438904.356175      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760438904.457592      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model and tokenizer saved at /kaggle/working/deberta_model


In [16]:
import pandas as pd
import numpy as np
import re
import os
import torch
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import average_precision_score
from tqdm.auto import tqdm
os.environ["WANDB_DISABLED"] = "true"

In [26]:
# --- Configuration ---
KAGGLE_DATA_PATH = "/kaggle/input/map-charting-student-math-misunderstandings/"
TRAIN_FILE = os.path.join(KAGGLE_DATA_PATH, "train.csv")
TEST_FILE = os.path.join(KAGGLE_DATA_PATH, "test.csv")
SUBMISSION_FILE = "submission.csv"

MODEL_NAME = 'microsoft/deberta-v3-base'
MAX_LEN = 512
NUM_EPOCHS = 3
LR = 2e-5
PER_DEVICE_BATCH_SIZE = 8
SEED = 42
N_FOLDS = 5

torch.manual_seed(SEED)
np.random.seed(SEED)

# --- Data Preparation ---
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    return ""

def format_input(row):
    return (
        f"Question: {clean_text(row['QuestionText'])} \n"
        f"Answer: {clean_text(row['MC_Answer'])} \n"
        f"Explanation: {clean_text(row['StudentExplanation'])}"
    )


def load_data(path, is_training=True):
    df = pd.read_csv(path)
    df['input_text'] = df.apply(format_input, axis=1)
    if is_training:
        df['target_label'] = df['Category'].astype(str) + ':' + df['Misconception'].fillna('NA').astype(str)
        df_responses = df.groupby('row_id')['target_label'].apply(list).reset_index(name='labels')
        df_responses = df_responses.merge(df[['row_id', 'input_text']].drop_duplicates(subset=['row_id']), on='row_id')
        return df_responses
    return df

In [27]:
# --- Dataset & Metrics ---
class MisconceptionDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
    def __len__(self):
        return len(self.encodings['input_ids'])

def map_at_k(y_true, y_pred_proba, k=3):
    avg_precisions = []
    sorted_pred_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1]
    for i in range(y_true.shape[0]):
        true_labels = np.where(y_true[i] == 1)[0]
        if len(true_labels) == 0: continue
        top_k_pred_indices = sorted_pred_indices[i, :k]
        running_correct = 0
        total_precision = 0
        remaining_true = set(true_labels)
        for rank, pred_idx in enumerate(top_k_pred_indices, 1):
            if pred_idx in remaining_true:
                running_correct += 1
                total_precision += (running_correct / rank)
                remaining_true.remove(pred_idx)
                if not remaining_true: break
        if running_correct > 0:
            avg_precisions.append(total_precision / len(true_labels))
    return np.mean(avg_precisions) if avg_precisions else 0.0

def compute_metrics(p, mlb_classes):
    logits = p.predictions
    probabilities = torch.sigmoid(torch.tensor(logits)).numpy()
    y_true = p.label_ids
    map3_score = map_at_k(y_true, probabilities, k=3)
    macro_ap = average_precision_score(y_true, probabilities, average='macro')
    return {'map3_score': map3_score, 'macro_ap': macro_ap}

In [None]:


# --- K-Fold Training with Resume + MAP Evaluation ---
def run_kfold_pipeline():
    print("--- Loading and preparing data ---")
    df_responses = load_data(TRAIN_FILE, is_training=True)
    if df_responses is None:
        return

    mlb = MultiLabelBinarizer()
    Y_labels = mlb.fit_transform(df_responses['labels'])
    num_labels = len(mlb.classes_)
    print(f"Total labels found: {num_labels}")

    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    all_test_preds = []
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

    for fold, (train_index, val_index) in enumerate(kf.split(df_responses)):
        fold_dir = f'./deberta_results_fold_{fold}'

        # ‚úÖ Skip already completed folds
        if os.path.exists(os.path.join(fold_dir, "completed.txt")):
            print(f"Skipping Fold {fold} ‚Äî already trained.")
            continue

        print(f"\n--- Starting Fold {fold+1}/{N_FOLDS} ---")
        fold_train_df = df_responses.iloc[train_index].reset_index(drop=True)
        fold_val_df = df_responses.iloc[val_index].reset_index(drop=True)
        
        X_train, Y_train = fold_train_df['input_text'].tolist(), Y_labels[train_index]
        X_val, Y_val = fold_val_df['input_text'].tolist(), Y_labels[val_index]

        train_encodings = tokenizer(X_train, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')
        val_encodings = tokenizer(X_val, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')

        train_dataset = MisconceptionDataset(train_encodings, Y_train)
        val_dataset = MisconceptionDataset(val_encodings, Y_val)
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, num_labels=num_labels, problem_type="multi_label_classification"
        )

        training_args = TrainingArguments(
            output_dir=fold_dir,
            num_train_epochs=NUM_EPOCHS,
            per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
            per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE * 2,
            warmup_ratio=0.1,
            weight_decay=0.01,
            learning_rate=LR,
            logging_steps=50,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=2,
            load_best_model_at_end=True,
            metric_for_best_model='map3_score',
            greater_is_better=True,
            fp16=torch.cuda.is_available(),
            report_to="none"
        )

        def wrapped_compute_metrics(p):
            return compute_metrics(p, mlb.classes_)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=wrapped_compute_metrics,
            data_collator=data_collator,
        )

        trainer.train()


if __name__ == "__main__":
    run_kfold_pipeline()


--- Loading and preparing data ---
Total labels found: 65





--- Starting Fold 1/5 ---


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Map3 Score,Macro Ap
1,0.0232,0.022769,0.877577,0.281498
2,0.0146,0.015685,0.911011,0.401557
3,0.0129,0.013676,0.92404,0.434606





--- Starting Fold 2/5 ---


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Map3 Score,Macro Ap
1,0.0205,0.020032,0.884773,0.281715
2,0.0171,0.014529,0.919136,0.414665
3,0.0116,0.013351,0.924073,0.441155





--- Starting Fold 3/5 ---


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Map3 Score,Macro Ap
1,0.0222,0.020782,0.886812,0.267555
2,0.0147,0.014797,0.92133,0.394427




In [None]:
# --- Final Prediction Ensemble ---
    print("\n--- Generating final submission ---")
    df_test = load_data(TEST_FILE, is_training=False)
    X_test = df_test['input_text'].tolist()
    test_encodings = tokenizer(X_test, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')
    test_dataset = MisconceptionDataset(test_encodings)

    for fold in range(N_FOLDS):
        fold_dir = f'./deberta_results_fold_{fold}'
        if not os.path.exists(os.path.join(fold_dir, "pytorch_model.bin")):
            print(f"Skipping fold {fold} ‚Äî model not found.")
            continue

        model = AutoModelForSequenceClassification.from_pretrained(fold_dir)
        trainer = Trainer(model=model, tokenizer=tokenizer)
        raw_predictions = trainer.predict(test_dataset).predictions
        probabilities = torch.sigmoid(torch.tensor(raw_predictions)).numpy()
        all_test_preds.append(probabilities)
        del model, trainer
        torch.cuda.empty_cache()


 avg_probabilities = np.mean(all_test_preds, axis=0)
    top_3_indices = np.argsort(avg_probabilities, axis=1)[:, ::-1][:, :3]
    predictions = [' '.join([mlb.classes_[i] for i in row]) for row in top_3_indices]

    submission_df = pd.DataFrame({
        'row_id': df_test['row_id'],
        'Category:Misconception': predictions
    })
    submission_df.to_csv(SUBMISSION_FILE, index=False)
    print(f"‚úÖ Submission saved to {SUBMISSION_FILE}")
    print(submission_df.head())

if __name__ == "__main__":
    run_kfold_pipeline()


In [24]:
# ================================================================
# ‚úÖ Final Cross-Validation + Ensemble + Submission Pipeline
# ================================================================
import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, Trainer
from torch.utils.data import Dataset
from sklearn.metrics import average_precision_score

# ================================================================
# 1Ô∏è‚É£ Configs
# ================================================================
N_FOLDS = 5
MAX_LEN = 512
BATCH_SIZE = 16
BASE_MODEL = "microsoft/deberta-v3-small"
TRAIN_FILE = "/kaggle/input/train.csv"   # update if different
TEST_FILE = "/kaggle/input/test.csv"     # update if different
SUBMISSION_FILE = "submission.csv"

# ================================================================
# 2Ô∏è‚É£ Dataset Class
# ================================================================
class MisconceptionDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx]).float()
        return item

# ================================================================
# 3Ô∏è‚É£ MAP@3 Metric Function
# ================================================================
def mapk(actual, predicted, k=3):
    return np.mean([
        np.isin(predicted[i][:k], actual[i]).sum() / k
        for i in range(len(actual))
    ])

# ================================================================
# 4Ô∏è‚É£ Main Pipeline
# ================================================================
def run_kfold_pipeline():
    from sklearn.model_selection import KFold
    from sklearn.preprocessing import MultiLabelBinarizer
    from transformers import AutoTokenizer, TrainingArguments

    print("\n--- Loading and preparing data ---")
    df = pd.read_csv(TRAIN_FILE)
    df['labels'] = df['Category:Misconception'].apply(lambda x: x.split())
    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df['labels'])

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    X = df['input_text'].tolist()

    kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
    fold = 0
    all_test_preds = []
    all_val_scores = []

    for train_index, val_index in kf.split(X):
        fold_dir = f'./deberta_results_fold_{fold}'
        if os.path.exists(os.path.join(fold_dir, "pytorch_model.bin")):
            print(f"Skipping Fold {fold} ‚Äî model already exists.")
            fold += 1
            continue

        print(f"\n--- Starting Fold {fold+1}/{N_FOLDS} ---")

        X_train, X_val = [X[i] for i in train_index], [X[i] for i in val_index]
        y_train, y_val = y[train_index], y[val_index]

        train_enc = tokenizer(X_train, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')
        val_enc = tokenizer(X_val, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')

        train_dataset = MisconceptionDataset(train_enc, y_train)
        val_dataset = MisconceptionDataset(val_enc, y_val)

        model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=y.shape[1])

        args = TrainingArguments(
            output_dir=fold_dir,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            num_train_epochs=3,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            save_total_limit=1,
            logging_dir=f"{fold_dir}/logs",
        )

        trainer = Trainer(model=model, args=args, train_dataset=train_dataset, eval_dataset=val_dataset)
        trainer.train()
        trainer.save_model(fold_dir)
        print(f"‚úÖ Fold {fold} model saved at {fold_dir}")

        # Local validation MAP@3
        val_preds = torch.sigmoid(torch.tensor(trainer.predict(val_dataset).predictions)).numpy()
        top_3 = np.argsort(val_preds, axis=1)[:, ::-1][:, :3]
        actual_indices = [np.where(row == 1)[0] for row in y_val]
        score = mapk(actual_indices, top_3, k=3)
        all_val_scores.append(score)
        print(f"üìä Local MAP@3 for fold {fold}: {score:.4f}")

        del model, trainer
        torch.cuda.empty_cache()
        fold += 1

    print(f"\nüìà Average Local MAP@3 across folds: {np.mean(all_val_scores):.4f}")

    # ================================================================
    # üß† Final Ensemble for Submission
    # ================================================================
    print("\n--- Generating final submission ---")
    df_test = pd.read_csv(TEST_FILE)
    X_test = df_test['input_text'].tolist()
    test_encodings = tokenizer(X_test, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt')
    test_dataset = MisconceptionDataset(test_encodings)

    for fold in range(N_FOLDS):
        fold_dir = f'./deberta_results_fold_{fold}'
        if not os.path.exists(os.path.join(fold_dir, "pytorch_model.bin")):
            print(f"Skipping fold {fold} ‚Äî model not found.")
            continue

        model = AutoModelForSequenceClassification.from_pretrained(fold_dir)
        trainer = Trainer(model=model, tokenizer=tokenizer)
        raw_preds = trainer.predict(test_dataset).predictions
        probs = torch.sigmoid(torch.tensor(raw_preds)).numpy()
        all_test_preds.append(probs)

        del model, trainer
        torch.cuda.empty_cache()

    avg_prob = np.mean(all_test_preds, axis=0)
    top_3_idx = np.argsort(avg_prob, axis=1)[:, ::-1][:, :3]
    predictions = [' '.join([mlb.classes_[i] for i in row]) for row in top_3_idx]

    submission_df = pd.DataFrame({
        'row_id': df_test['row_id'],
        'Category:Misconception': predictions
    })
    submission_df.to_csv(SUBMISSION_FILE, index=False)
    print(f"‚úÖ Submission saved to {SUBMISSION_FILE}")
    print(submission_df.head())

# ================================================================
# üöÄ Run
# ================================================================
if __name__ == "__main__":
    run_kfold_pipeline()



--- Loading and preparing data ---


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/train.csv'

In [2]:
# # ================================================================
# # üìò Math Misconception Multi-Label Prediction Notebook (Optimized)
# # ================================================================

# import os
# import pandas as pd
# import numpy as np
# import torch
# from torch.utils.data import Dataset
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)

# # ================================================================
# # 1Ô∏è‚É£ CONFIGURATION
# # ================================================================

# MODEL_PATH = "/kaggle/input/map-deberta-v2-trained-model/deberta_results/checkpoint-6195"
# DATA_PATH = "/kaggle/input/map-charting-student-math-misunderstandings/"
# TEST_FILE = os.path.join(DATA_PATH, "test.csv")
# TRAIN_FILE = os.path.join(DATA_PATH, "train.csv")

# MAX_LEN = 512
# TOP_K = 3
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # ================================================================
# # 2Ô∏è‚É£ TEXT PREPROCESSING
# # ================================================================

# def clean_text(text):
#     if isinstance(text, str):
#         text = text.lower().strip()
#         text = ' '.join(text.split())
#         return text
#     return ""

# def format_input(row):
#     return f"Question: {clean_text(row['QuestionText'])}\n" \
#            f"Answer: {clean_text(row['MC_Answer'])}\n" \
#            f"Explanation: {clean_text(row['StudentExplanation'])}"

# # ================================================================
# # 3Ô∏è‚É£ DATASET CLASS
# # ================================================================

# class MisconceptionDataset(Dataset):
#     def __init__(self, encodings):
#         self.encodings = encodings

#     def __len__(self):
#         return self.encodings['input_ids'].shape[0]

#     def __getitem__(self, idx):
#         return {key: val[idx] for key, val in self.encodings.items()}

# # ================================================================
# # 4Ô∏è‚É£ LOAD TRAIN & TEST DATA
# # ================================================================

# # Load train to rebuild mlb classes (or use saved mlb_classes.npy if available)
# train_df = pd.read_csv(TRAIN_FILE)
# train_df['Misconception'] = train_df['Misconception'].fillna('NA')
# train_df['Target'] = train_df['Category'] + ":" + train_df['Misconception']

# mlb_labels = [[label] for label in train_df['Target'].tolist()]
# from sklearn.preprocessing import MultiLabelBinarizer
# mlb = MultiLabelBinarizer()
# mlb.fit(mlb_labels)
# all_labels = mlb.classes_

# # Load and preprocess test data
# test_df = pd.read_csv(TEST_FILE)
# test_df['input_text'] = test_df.apply(format_input, axis=1)
# texts = test_df['input_text'].tolist()

# # ================================================================
# # CREATE INPUT TEXT FOR TRAINING DATA
# # ================================================================
# def clean_text(text):
#     if isinstance(text, str):
#         text = text.lower().strip()
#         text = ' '.join(text.split())
#         return text
#     return ""

# def format_input(row):
#     return (
#         f"Question: {clean_text(row['QuestionText'])}\n"
#         f"Answer: {clean_text(row['MC_Answer'])}\n"
#         f"Explanation: {clean_text(row['StudentExplanation'])}"
#     )

# train_df['input_text'] = train_df.apply(format_input, axis=1)

# # ================================================================
# # 7Ô∏è‚É£.1 Local MAP@3 Evaluation (Optional)
# # ================================================================

# from sklearn.model_selection import train_test_split

# # Split train data into pseudo-train and pseudo-val (10% val)
# train_texts = train_df['input_text'].tolist()
# train_labels = mlb.transform([[label] for label in train_df['Target'].tolist()])

# X_train, X_val, Y_train, Y_val = train_test_split(
#     train_texts, train_labels, test_size=0.1, random_state=42
# )

# # Tokenize validation
# val_encodings = tokenizer(X_val, truncation=True, padding="max_length",
#                           max_length=MAX_LEN, return_tensors="pt")
# val_dataset = MisconceptionDataset(val_encodings)

# # Predict on validation set
# model.eval()
# val_logits = []

# with torch.no_grad():
#     for i in range(0, len(val_dataset), batch_size):
#         batch = {k: v[i:i+batch_size].to(DEVICE) for k, v in val_encodings.items()}
#         outputs = model(**batch)
#         val_logits.append(outputs.logits.cpu())

# val_logits = torch.cat(val_logits, dim=0)
# val_probabilities = torch.sigmoid(val_logits).numpy()

# # Top-3 predictions
# val_top_indices = np.argsort(val_probabilities, axis=1)[:, ::-1][:, :TOP_K]

# # Compute MAP@3
# def map_at_k(y_true, y_pred, k=3):
#     score = 0.0
#     n = y_true.shape[0]
#     for i in range(n):
#         true_labels = np.where(y_true[i]==1)[0]
#         pred_labels = y_pred[i]
#         hits = 0
#         avg_prec = 0.0
#         for j, p in enumerate(pred_labels):
#             if p in true_labels:
#                 hits += 1
#                 avg_prec += hits / (j + 1)
#         if len(true_labels) > 0:
#             avg_prec /= min(len(true_labels), k)
#         score += avg_prec
#     return score / n

# val_map3 = map_at_k(Y_val, val_top_indices, k=TOP_K)
# print(f"üìä Local MAP@3 score on validation set: {val_map3:.4f}")


# # ================================================================
# # 5Ô∏è‚É£ TOKENIZATION
# # ================================================================

# tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
# encodings = tokenizer(texts,
#                       truncation=True,
#                       padding="max_length",
#                       max_length=MAX_LEN,
#                       return_tensors="pt")
# test_dataset = MisconceptionDataset(encodings)

# # ================================================================
# # 6Ô∏è‚É£ LOAD MODEL
# # ================================================================

# model = AutoModelForSequenceClassification.from_pretrained(
#     MODEL_PATH,
#     local_files_only=True
# )
# model.to(DEVICE)
# model.eval()

# # ================================================================
# # 7Ô∏è‚É£ PREDICTION
# # ================================================================

# batch_size = 32
# all_logits = []

# with torch.no_grad():
#     for i in range(0, len(test_dataset), batch_size):
#         batch = {k: v[i:i+batch_size].to(DEVICE) for k, v in encodings.items()}
#         outputs = model(**batch)
#         all_logits.append(outputs.logits.cpu())

# logits = torch.cat(all_logits, dim=0)
# probabilities = torch.sigmoid(logits).numpy()

# # ================================================================
# # 8Ô∏è‚É£ TOP-K PREDICTIONS & MAP@3 READY
# # ================================================================

# top_indices = np.argsort(probabilities, axis=1)[:, ::-1][:, :TOP_K]
# predictions = [" ".join([all_labels[i] for i in row]) for row in top_indices]

# # ================================================================
# # 9Ô∏è‚É£ SAVE SUBMISSION
# # ================================================================

# submission_df = pd.DataFrame({
#     "row_id": test_df["row_id"],
#     "Category:Misconception": predictions
# })

# submission_file = "submission.csv"
# submission_df.to_csv(submission_file, index=False)
# print(f"‚úÖ Submission saved to '{submission_file}'")
# print(submission_df.head())


In [3]:
# import pandas as pd
# import numpy as np
# import re
# import torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
# from sklearn.preprocessing import MultiLabelBinarizer

# # Load the training CSV
# train_file = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
# df = pd.read_csv(train_file)

# # Recreate the target labels
# df['target_label'] = df['Category'].astype(str) + ':' + df['Misconception'].fillna('NA').astype(str)

# # Group by row_id to get all labels per response
# df_responses = df.groupby('row_id')['target_label'].apply(list).reset_index(name='labels')

# # Fit MultiLabelBinarizer
# mlb = MultiLabelBinarizer()
# mlb.fit(df_responses['labels'])

# # Now you have mlb.classes_ regenerated
# mlb_classes = mlb.classes_


In [4]:
# from transformers import Trainer, TrainingArguments
# import os

# os.environ["WANDB_DISABLED"] = "true"  # Disable W&B


# # --- Config ---
# TEST_FILE = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'
# MODEL_NAME = '/kaggle/input/map-deberta-v2-trained-model/deberta_results/checkpoint-6195'
# MAX_LEN = 256

# # --- Text preprocessing ---
# def clean_text(text):
#     if isinstance(text, str):
#         text = text.lower()
#         text = re.sub(r'\s+', ' ', text).strip()
#         return text
#     return ""

# def format_input(row):
#     return (
#         f"Question: {clean_text(row['QuestionText'])} \n"
#         f"Answer: {clean_text(row['MC_Answer'])} \n"
#         f"Explanation: {clean_text(row['StudentExplanation'])}"
#     )

# def load_data(path, is_training=False):
#     df = pd.read_csv(path)
#     df['input_text'] = df.apply(format_input, axis=1)
#     return df

# # --- Dataset class ---
# class MisconceptionDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings):
#         self.encodings = encodings

#     def __getitem__(self, idx):
#         return {key: val[idx].clone().detach() for key, val in self.encodings.items()}

#     def __len__(self):
#         return len(self.encodings['input_ids'])

# # --- Generate submission ---
# def generate_submission(model_path, test_file, output_file='submission.csv'):
#     df_test = load_data(test_file)
#     tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
#     test_encodings = tokenizer(
#         df_test['input_text'].tolist(),
#         truncation=True,
#         padding='max_length',
#         max_length=MAX_LEN,
#         return_tensors='pt'
#     )
#     test_dataset = MisconceptionDataset(test_encodings)

#     # Load model
#     model = AutoModelForSequenceClassification.from_pretrained(model_path)
#     trainer = Trainer(model=model)

#     # Predict
#     raw_predictions = trainer.predict(test_dataset).predictions
#     probabilities = torch.sigmoid(torch.tensor(raw_predictions)).numpy()

#     # Top 3 predictions
#     top_3_indices = np.argsort(probabilities, axis=1)[:, ::-1][:, :3]

#     # Map indices to class names (use the same ML classes as Colab)
#     # You need to save mlb.classes_ from Colab as a .npy file and load it here
#     # mlb_classes = np.load('/kaggle/input/mlb-classes/mlb_classes.npy', allow_pickle=True)
#     predictions = [' '.join([mlb_classes[i] for i in row]) for row in top_3_indices]

#     submission_df = pd.DataFrame({
#         'row_id': df_test['row_id'],
#         'Category:Misconception': predictions
#     })
#     submission_df.to_csv(output_file, index=False)
#     print(f"Submission saved to {output_file}")

# # --- Run ---
# if __name__ == "__main__":
#     generate_submission(MODEL_NAME, TEST_FILE)


In [5]:
# import os
# import pandas as pd
# import numpy as np
# import torch
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# from torch.utils.data import Dataset

# # ================================================================
# # 1Ô∏è‚É£ CONFIGURATION
# # ================================================================
# MODEL_PATH = "/kaggle/input/map-deberta-v2-trained-model/deberta_results/checkpoint-6195"  # Your trained model checkpoint
# DATA_PATH = "/kaggle/input/map-charting-student-math-misunderstandings/"
# TEST_FILE = os.path.join(DATA_PATH, "test.csv")
# MAX_LEN = 512
# TOP_K = 3

# # ================================================================
# # 2Ô∏è‚É£ TEXT PREPROCESSING
# # ================================================================
# def clean_text(text):
#     if isinstance(text, str):
#         text = text.lower().strip()
#         text = ' '.join(text.split())
#         return text
#     return ""

# def format_input(row):
#     return f"Question: {clean_text(row['QuestionText'])}\n" \
#            f"Answer: {clean_text(row['MC_Answer'])}\n" \
#            f"Explanation: {clean_text(row['StudentExplanation'])}"

# # ================================================================
# # 3Ô∏è‚É£ DATASET CLASS
# # ================================================================
# class MisconceptionDataset(Dataset):
#     def __init__(self, encodings):
#         self.encodings = encodings

#     def __len__(self):
#         return self.encodings['input_ids'].shape[0]

#     def __getitem__(self, idx):
#         return {key: val[idx] for key, val in self.encodings.items()}

# # ================================================================
# # 4Ô∏è‚É£ LOAD TEST DATA & TOKENIZE
# # ================================================================
# df_test = pd.read_csv(TEST_FILE)
# df_test['input_text'] = df_test.apply(format_input, axis=1)
# texts = df_test['input_text'].tolist()

# tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
# encodings = tokenizer(texts, truncation=True, padding="max_length",
#                       max_length=MAX_LEN, return_tensors="pt")

# test_dataset = MisconceptionDataset(encodings)

# # ================================================================
# # 5Ô∏è‚É£ LOAD MODEL
# # ================================================================
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = AutoModelForSequenceClassification.from_pretrained(
#     MODEL_PATH,
#     local_files_only=True
# )
# model.to(device)
# model.eval()

# # ================================================================
# # 6Ô∏è‚É£ PREDICTION
# # ================================================================
# with torch.no_grad():
#     all_logits = []
#     batch_size = 32  # adjust based on GPU memory
#     for i in range(0, len(test_dataset), batch_size):
#         batch = {k: v[i:i+batch_size].to(device) for k, v in encodings.items()}
#         outputs = model(**batch)
#         all_logits.append(outputs.logits.cpu())
    
#     logits = torch.cat(all_logits, dim=0)
#     probabilities = torch.sigmoid(logits).numpy()

# # ================================================================
# # 7Ô∏è‚É£ MAP INDICES TO LABELS
# # ================================================================
# # Rebuild MultiLabelBinarizer to get same class order as training
# from sklearn.preprocessing import MultiLabelBinarizer
# train_df = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
# train_df['Misconception'] = train_df['Misconception'].fillna('NA')
# train_df['Target'] = train_df['Category'] + ":" + train_df['Misconception']
# mlb = MultiLabelBinarizer()
# mlb.fit([[label] for label in train_df['Target'].tolist()])
# all_labels = mlb.classes_

# top_indices = np.argsort(probabilities, axis=1)[:, ::-1][:, :TOP_K]
# predictions = [" ".join([all_labels[i] for i in row]) for row in top_indices]

# # ================================================================
# # 8Ô∏è‚É£ SAVE SUBMISSION
# # ================================================================
# submission_df = pd.DataFrame({
#     "row_id": df_test["row_id"],
#     "Category:Misconception": predictions
# })

# submission_file = "submission.csv"
# submission_df.to_csv(submission_file, index=False)
# print(f"‚úÖ Submission saved to '{submission_file}'")
# print(submission_df.head())


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/deberta-base-v3 and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Submission saved to 'submission.csv'
   row_id                             Category:Misconception
0   36696  False_Correct:NA False_Misconception:Adding_ac...
1   36697  False_Misconception:Adding_across False_Correc...
2   36698  False_Correct:NA False_Misconception:Adding_ac...


‚úÖ Submission saved to 'submission.csv'
   row_id                             Category:Misconception
0   36696  True_Correct:NA True_Neither:NA True_Misconcep...
1   36697  False_Misconception:WNB False_Misconception:In...
2   36698  True_Neither:NA True_Correct:NA True_Misconcep...


In [6]:
# import pandas as pd
# import numpy as np
# import re
# import os
# import torch
# from sklearn.preprocessing import MultiLabelBinarizer
# from sklearn.model_selection import train_test_split
# from torch.utils.data import Dataset
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# from torch.nn import BCEWithLogitsLoss
# from sklearn.metrics import average_precision_score # For standard AP score

In [7]:
# # --- Configuration ---
# TRAIN_FILE = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
# TEST_FILE = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'
# MODEL_NAME = '/kaggle/input/map-deberta-v2-trained-model/deberta_results/checkpoint-6195' # High-performance model
# MAX_LEN = 256  # Max length for the combined input text
# NUM_EPOCHS = 3
# LR = 2e-5

In [8]:


# # --- 1. Data Preparation and Feature Engineering ---

# def clean_text(text):
#     """Basic text cleaning: lowercase and remove non-alphanumeric characters."""
#     if isinstance(text, str):
#         # Decode common LaTeX/unicode, clean, and strip excess whitespace
#         text = text.lower()
#         text = re.sub(r'\s+', ' ', text).strip()
#         return text
#     return ""

# def format_input(row):
#     """
#     Combines all relevant context into a single input string for the model.
#     This mimics the successful strategy of providing full context.
#     """
#     # Using a clear separator for the model
#     return (
#         f"Question: {clean_text(row['QuestionText'])} \n"
#         f"Answer: {clean_text(row['MC_Answer'])} \n"
#         f"Explanation: {clean_text(row['StudentExplanation'])}"
#     )

# def load_data(path, is_training=True):
#     """Loads and prepares data, including feature engineering."""
#     try:
#         df = pd.read_csv(path)
#     except FileNotFoundError:
#         print(f"Error: File not found at {path}")
#         return None

#     # Rename columns for consistency
#     df.rename(columns={'StudentExplanation': 'StudentExplanation', 
#                       'Misconception': 'misconception_name'}, inplace=True)
    
#     # Generate the rich input feature
#     df['input_text'] = df.apply(format_input, axis=1)
    
#     if is_training:
#         # Create the combined target label
#         df['target_label'] = df['Category'].astype(str) + ':' + df['misconception_name'].fillna('NA').astype(str)
#         # Group to get a list of all targets per student response (Multi-Label)
#         df_responses = df.groupby('row_id')['target_label'].apply(list).reset_index(name='labels')
        
#         # Merge back the unique input text for each row_id
#         df_responses = df_responses.merge(df[['row_id', 'input_text']].drop_duplicates(subset=['row_id']), on='row_id')
        
#         return df_responses
    
#     return df

In [9]:


# # --- 2. Custom Dataset and Model Utils ---

# class MisconceptionDataset(Dataset):
#     """PyTorch Dataset compatible with Hugging Face Trainer."""
#     def __init__(self, encodings, labels=None):
#         self.encodings = encodings
#         self.labels = labels

#     def __getitem__(self, idx):
#         item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        
#         if self.labels is not None:
#             # Use float for BCEWithLogitsLoss
#             item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
            
#         return item

#     def __len__(self):
#         return len(self.encodings['input_ids'])

# # MAP@K is essential for local evaluation/monitoring
# def map_at_k(y_true, y_pred_proba, k=3):
#     """Calculates Mean Average Precision at K (MAP@K) for multi-label classification."""
#     avg_precisions = []
#     # Sort predictions (high to low) and get the indices
#     sorted_pred_indices = np.argsort(y_pred_proba, axis=1)[:, ::-1] 
    
#     for i in range(y_true.shape[0]):
#         # Get the indices of the true positive labels
#         true_labels = np.where(y_true[i] == 1)[0]
#         if len(true_labels) == 0: continue
        
#         top_k_pred_indices = sorted_pred_indices[i, :k]
#         running_correct = 0
#         total_precision = 0
#         remaining_true = set(true_labels)
        
#         for rank, pred_idx in enumerate(top_k_pred_indices, 1):
#             if pred_idx in remaining_true:
#                 running_correct += 1
#                 total_precision += (running_correct / rank)
#                 remaining_true.remove(pred_idx)
#                 if not remaining_true: break
        
#         if running_correct > 0:
#             avg_precisions.append(total_precision / len(true_labels))

#     return np.mean(avg_precisions) if avg_precisions else 0.0

# def compute_metrics(p, mlb_classes):
#     """Custom metric function for Trainer to calculate MAP@3."""
#     logits = p.predictions
#     # Sigmoid to convert logits to probabilities
#     probabilities = torch.sigmoid(torch.tensor(logits)).numpy()
#     y_true = p.label_ids
    
#     # Calculate MAP@3 for the competition
#     map3_score = map_at_k(y_true, probabilities, k=3)
    
#     # Calculate standard macro Average Precision for comparison
#     macro_ap = average_precision_score(y_true, probabilities, average='macro')
    
#     return {'map3_score': map3_score, 'macro_ap': macro_ap}

In [10]:
# # --- 3. Pipeline Execution ---

# def run_trainer_pipeline():
#     # --- Load and Prepare Data ---
#     df_responses = load_data(TRAIN_FILE, is_training=True)
#     if df_responses is None: return

#     # --- Multi-Label Encoding ---
#     mlb = MultiLabelBinarizer()
#     Y_labels = mlb.fit_transform(df_responses['labels'])
#     num_labels = len(mlb.classes_)

#     # --- Split Data (Training is easier without stratification here) ---
#     X_train, X_val, Y_train, Y_val = train_test_split(
#         df_responses['input_text'].tolist(), Y_labels, test_size=0.1, random_state=42
#     )

#     # --- Tokenization ---
#     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

#     train_encodings = tokenizer(
#         X_train, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt'
#     )
#     val_encodings = tokenizer(
#         X_val, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt'
#     )

#     # --- Dataset Creation ---
#     train_dataset = MisconceptionDataset(train_encodings, Y_train)
#     val_dataset = MisconceptionDataset(val_encodings, Y_val)

#     # --- Model Loading and Configuration ---
#     model = AutoModelForSequenceClassification.from_pretrained(
#         MODEL_NAME,
#         num_labels=num_labels,
#         # Set problem type for multi-label classification (uses Sigmoid)
#         problem_type="multi_label_classification",
#         # Custom loss function to handle multi-label (BCEWithLogitsLoss)
#         # loss_function=BCEWithLogitsLoss() # Removed as it's not a valid argument
#     )

#     # --- Training Arguments ---
#     training_args = TrainingArguments(
#         output_dir='./deberta_results',
#         num_train_epochs=NUM_EPOCHS,
#         per_device_train_batch_size=16, # Increased batch size for efficiency
#         per_device_eval_batch_size=16,
#         warmup_ratio=0.1, # Use 10% of steps for learning rate warmup
#         weight_decay=0.01,
#         learning_rate=LR,
#         logging_steps=50,
#         eval_strategy="epoch", # Evaluate at the end of each epoch
#         save_strategy="epoch",
#         load_best_model_at_end=True, # Load the model with the best validation score
#         metric_for_best_model='map3_score',
#         greater_is_better=True,
#         fp16=torch.cuda.is_available(), # Use mixed precision if GPU is available
#     )

#     # --- Trainer Initialization and Training ---
#     print(f"\n--- Starting Training DeBERTa-v3 on {torch.device('cuda' if torch.cuda.is_available() else 'cpu')} ---")

#     # We wrap compute_metrics to pass the classes object to the internal function
#     def wrapped_compute_metrics(p):
#         return compute_metrics(p, mlb.classes_)

#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=train_dataset,
#         eval_dataset=val_dataset,
#         tokenizer=tokenizer,
#         compute_metrics=wrapped_compute_metrics,
#     )

#     trainer.train()

#     # --- Prediction and Submission Generation ---
#     if os.path.exists(TEST_FILE):
#         print("\n--- Generating Submission on Test Data ---")
#         generate_submission(trainer, mlb, TEST_FILE, tokenizer)
#     else:
#         print(f"Test file {TEST_FILE} not found. Cannot generate submission.")


# def generate_submission(trainer, mlb, test_path, tokenizer, output_filename='submission_deberta_trainer.csv'):
#     """Generates the final submission file using the Hugging Face Trainer."""

#     df_test = load_data(test_path, is_training=False)

#     X_test = df_test['input_text'].tolist()
#     test_encodings = tokenizer(
#         X_test, truncation=True, padding='max_length', max_length=MAX_LEN, return_tensors='pt'
#     )
#     test_dataset = MisconceptionDataset(test_encodings)

#     # Predict logits using the best model loaded by the Trainer
#     raw_predictions = trainer.predict(test_dataset).predictions

#     # Convert logits to probabilities
#     probabilities = torch.sigmoid(torch.tensor(raw_predictions)).numpy()

#     # Get indices of top 3 probability predictions
#     # argsort[::-1] gives descending indices, [:3] takes the top 3
#     top_3_indices = np.argsort(probabilities, axis=1)[:, ::-1][:, :3]

#     predictions = []

#     # Map the indices back to the actual class names (Category:Misconception)
#     for row in top_3_indices:
#         labels = [mlb.classes_[i] for i in row]
#         predictions.append(' '.join(labels))

#     # Create the submission DataFrame
#     submission_df = pd.DataFrame({
#         'row_id': df_test['row_id'],
#         'Category:Misconception': predictions
#     })

#     submission_df.to_csv(output_filename, index=False)
#     print(f"Submission file saved successfully to {output_filename}")


# if __name__ == "__main__":
#     run_trainer_pipeline()