In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/spm.model
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/config.json
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/README (1).md
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/README.md
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/tokenizer_config.json
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/tokenizer_config (1).json
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/pytorch_model.bin
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/special_tokens_map.json
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/gitattributes.txt
/kaggle/input/huggingfacedebertav3variants/khalidalt-DeBERTa-v3-large/added_tokens.json
/kaggle/input/huggingfacedebertav3variants/deberta-v3-base-squad2/spm.model
/kaggle/input/huggingfacedebertav3variants/deberta-v

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification, DebertaV2Tokenizer, DebertaV2ForSequenceClassification
from transformers import AdamW, get_scheduler, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
import os
from sklearn.model_selection import train_test_split

In [3]:
import random

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

In [4]:
train_df_actual = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv")
real_test_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")
misconception_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")

In [5]:
# Split into train, test, and validation
train_df, test_df = train_test_split(train_df_actual, test_size=0.15, random_state=42)  # 15% for testing
train_df, val_df = train_test_split(train_df, test_size=0.15, random_state=42)  # 15% of remaining for validation

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")


Train size: 1349, Validation size: 239, Test size: 281


In [6]:
import re

# Function to preprocess LaTeX into plain text
def preprocess_latex_to_text(latex_str):
    # Handle common LaTeX functions and symbols
    latex_str = latex_str.replace(r'\frac', 'over')  # Convert fractions
    latex_str = latex_str.replace(r'\sum', 'sum')  # Convert summation symbol
    latex_str = latex_str.replace(r'\int', 'integral')  # Convert integral symbol
    latex_str = latex_str.replace(r'\sqrt', 'square root')  # Convert square root
    latex_str = latex_str.replace(r'\text', '')  # Remove text formatting in LaTeX
    
    # Handle superscripts and subscripts (e.g., x^2 or x_1)
    latex_str = re.sub(r'\^{(.*?)}', r' raised to \1', latex_str)  # e.g., x^{2} becomes x raised to 2
    latex_str = re.sub(r'_{(.*?)}', r' sub \1', latex_str)  # e.g., x_{1} becomes x sub 1
    
    # Remove other LaTeX math environments (e.g., dollar signs for inline math)
    latex_str = latex_str.replace('$', '')
    
    # Optionally, remove other LaTeX-specific symbols or escape characters
    latex_str = latex_str.replace(r'\\', '')  # Remove LaTeX newlines
    latex_str = latex_str.replace(r'{', ' ').replace(r'}', ' ')  # Remove curly braces
    latex_str = ' '.join(latex_str.split())  # Clean up extra spaces
    
    return latex_str

In [7]:
# Combine question and answer text
def preprocess_data(df):
    data = []
    for _, row in df.iterrows():
        for option in ["A", "B", "C", "D"]:  # Only incorrect answers
            # if row['CorrectAnswer'] != option:
            input_text = f"Question: {row['QuestionText']} | Answer: {row[f'Answer{option}Text']}"
            input_text = preprocess_latex_to_text(input_text)
            label = row[f"Misconception{option}Id"]
            questionid = f"{row['QuestionId']}"
            answer = f"{option}"
            data.append((questionid, answer, input_text, label))
    return pd.DataFrame(data, columns=["QuestionId","Answer", "text", "label"])

In [8]:
# class MisconceptionDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_length=128):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = self.texts[idx]
#         label = self.labels[idx]

#         # Assuming `text` has two parts: question and answer
#         question, answer = text.split(" | Answer: ")

#         # Tokenize with [CLS] and [SEP] tokens
#         tokens = self.tokenizer(
#             question,
#             answer,
#             padding="max_length",
#             truncation=True,
#             max_length=self.max_length,
#             return_tensors="pt",
#             return_special_tokens_mask=True,  # Helps ensure correct use of [SEP]
#         )
#         return {
#             "input_ids": tokens["input_ids"].squeeze(),
#             "attention_mask": tokens["attention_mask"].squeeze(),
#             "labels": torch.tensor(label, dtype=torch.float),
#         }

In [9]:
class MisconceptionDataset(Dataset):
    def __init__(self, question_ids, answer_labels, texts, labels, tokenizer, max_length=512):
        """
        Args:
            question_ids (list): List of Question IDs.
            answer_labels (list): List of Answer Labels (e.g., A, B, C, D).
            texts (list): List of question texts.
            labels (list): List of true Misconception IDs.
            tokenizer (transformers tokenizer): Tokenizer to encode the text.
            max_length (int, optional): Max sequence length for tokenization. Defaults to 512.
        """
        self.question_ids = question_ids
        self.answer_labels = answer_labels
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        question_id = self.question_ids[idx]
        answer_label = self.answer_labels[idx]
        text = self.texts[idx]
        label = self.labels[idx]

        # Assuming `text` has two parts: question and answer
        question, answer = text.split(" | Answer: ")
        question = question.split("Question: ")[1]

        # Tokenize with [CLS] and [SEP] tokens
        tokens = self.tokenizer(
            question,
            answer,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
            return_special_tokens_mask=True,  # Helps ensure correct use of [SEP]
        )
        
        # # Assuming the `text` is just the question text; answer is inferred from `answer_label`
        # question = text  # In case answer text is separate, modify as needed

        # # Tokenize the question text
        # tokens = self.tokenizer(
        #     question,
        #     padding="max_length",
        #     truncation=True,
        #     max_length=self.max_length,
        #     return_tensors="pt",
        #     return_special_tokens_mask=True,  # Ensures correct use of [SEP]
        # )

        return {
            "input_ids": tokens["input_ids"].squeeze(),
            "attention_mask": tokens["attention_mask"].squeeze(),
            "labels": torch.tensor(label, dtype=torch.float),
            "QuestionId": question_id,  # Include the Question ID
            "AnswerLabel": answer_label  # Include the Answer label (A, B, C, D)
        }

In [10]:
# Preprocess data
train_data = preprocess_data(train_df)
val_data = preprocess_data(val_df)
test_data = preprocess_data(test_df)

train_data.dropna(inplace = True)
val_data.dropna(inplace = True)
test_data.dropna(inplace = True)

# # Combine into DataFrames
# train_data = pd.DataFrame(train_data, columns=["text", "label"])
# val_data = pd.DataFrame(val_data, columns=["text", "label"])
# test_data = pd.DataFrame(test_data, columns=["text", "label"])

# Convert labels to multi-hot encoding
all_labels = sorted(misconception_df["MisconceptionId"].unique())  # Get all unique labels
mlb = MultiLabelBinarizer(classes=all_labels)

train_labels = mlb.fit_transform([[label] for label in train_data["label"]])
val_labels = mlb.transform([[label] for label in val_data["label"]])
test_labels = mlb.transform([[label] for label in test_data["label"]])

train_data.head(5)

Unnamed: 0,QuestionId,Answer,text,label
0,368,A,"Question: Without using a calculator, which tw...",734.0
7,1043,D,Question: Which signs belong in the boxes to m...,2030.0
8,1340,A,Question: A square has an area of \( 100 \math...,1678.0
10,1340,C,Question: A square has an area of \( 100 \math...,734.0
12,586,A,Question: Simplify \( square root 48 \) as muc...,2384.0


In [11]:
print(f"Train size: {len(train_data)}, Validation size: {len(val_data)}, Test size: {len(test_data)}")


Train size: 3160, Validation size: 555, Test size: 655


In [12]:
# Create datasets
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")
train_dataset = MisconceptionDataset(train_data["QuestionId"].tolist(), train_data["Answer"].tolist(), train_data["text"].tolist(), train_labels, tokenizer)
val_dataset = MisconceptionDataset(val_data["QuestionId"].tolist(), val_data["Answer"].tolist(), val_data["text"].tolist(), val_labels, tokenizer)
test_dataset = MisconceptionDataset(test_data["QuestionId"].tolist(), test_data["Answer"].tolist(), test_data["text"].tolist(), test_labels, tokenizer)

# DataLoader for batching
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_dataloader = DataLoader(val_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

In [13]:
import torch.optim.lr_scheduler as lr_scheduler
# Model
model = DebertaV2ForSequenceClassification.from_pretrained("microsoft/deberta-v3-base", num_labels=len(all_labels), ignore_mismatched_sizes=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
num_training_steps = len(train_dataloader) * 10  # 10 epochs
# scheduler = get_scheduler("linear", optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
# scheduler = get_scheduler('cosine_annealing_lr', optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
# scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_training_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=int(0.1 * num_training_steps), num_training_steps=num_training_steps
)

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from torch.nn import BCEWithLogitsLoss
from torch.nn import CrossEntropyLoss

# Change the loss calculation in training and evaluation loops
loss_fn = CrossEntropyLoss()

In [15]:
def calculate_precision_at_k(predictions, true_labels, k):
    """
    Calculate Precision at rank k.
    predictions: List of predicted MisconceptionId(s) for a particular QuestionId_Answer.
    true_labels: Set of correct MisconceptionId(s) for the same QuestionId_Answer.
    k: The rank at which precision is calculated.
    """
    relevant = sum([1 for pred in predictions[:k] if pred == true_labels])
    return relevant / k

def evaluate_map_at_25(predictions, true_labels, n=25):
    """
    Evaluate Mean Average Precision at 25 (MAP@25).
    predictions: Dictionary of predictions for each QuestionId_Answer.
    true_labels: Dictionary of true MisconceptionId(s) for each QuestionId_Answer.
    n: Number of predictions per observation (default is 25).
    """
    map_score = 0
    U = len(predictions)  # Number of observations (test set size)

    for question_answer, pred_list in predictions.items():
        true_set = true_labels.get(question_answer, [])
        
        if len(true_set) == 0:
            continue  # Skip this if there are no true labels
        true_set = np.argmax(true_set)
        # Store the correct labels found
        relevant_found = set()
        avg_precision = 0
        
        for k in range(1, min(len(pred_list), n) + 1):
            precision = 0
            # Check if the prediction at rank k is relevant and not already counted
            if pred_list[k - 1] == true_set and pred_list[k - 1] not in relevant_found:
                relevant_found.add(pred_list[k - 1])
                precision = calculate_precision_at_k(pred_list, true_set, k)
            
            # Precision at rank k: how many relevant items are in the top k
            # precision = len(relevant_found) / k
            avg_precision += precision
        
        map_score += avg_precision

    # Compute Mean Average Precision (MAP@25)
    return map_score / U

In [16]:
# Training loop with validation
best_val_loss = float("inf")
best_model_path = "best_model.pth"

for epoch in range(10):  # Number of epochs
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        predictions, true_labels = [], []
        # Store MAP@25 results
        all_predictions = {}
        all_true_labels = {}
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=-1)

        # Get top 25 predictions (this can be adjusted for the actual number of misconceptions)
        top_k_predictions = torch.topk(probabilities, k=25, dim=1).indices.cpu().numpy()
        
        # Store predictions and true labels
        for i, question_id in enumerate(batch["QuestionId"]):  # Ensure batch contains 'QuestionId'
            question_answer = f"{question_id}_{batch['AnswerLabel'][i]}"  # Format QuestionId_Answer for unique identifier
            
            all_predictions[question_answer] = top_k_predictions[i]  # Store top 25 predicted misconception ids
            all_true_labels[question_answer] = labels[i].cpu().numpy()  # Store true misconception ids

        map_at_25 = evaluate_map_at_25(all_predictions, all_true_labels)
        loss = loss_fn(logits, labels) + 0.5*(1- map_at_25)
        # loss = outputs.loss
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    train_loss /= len(train_dataloader)
    print(f"Epoch {epoch + 1} Training Loss: {train_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            predictions, true_labels = [], []
            # Store MAP@25 results
            all_predictions = {}
            all_true_labels = {}
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=-1)
            # Get top 25 predictions (this can be adjusted for the actual number of misconceptions)
            top_k_predictions = torch.topk(probabilities, k=25, dim=1).indices.cpu().numpy()
            
            # Store predictions and true labels
            for i, question_id in enumerate(batch["QuestionId"]):  # Ensure batch contains 'QuestionId'
                question_answer = f"{question_id}_{batch['AnswerLabel'][i]}"  # Format QuestionId_Answer for unique identifier
                
                all_predictions[question_answer] = top_k_predictions[i]  # Store top 25 predicted misconception ids
                all_true_labels[question_answer] = labels[i].cpu().numpy()  # Store true misconception ids

            val_loss += (loss_fn(logits, labels) + 0.5*(1- map_at_25)).item()
            # val_loss += outputs.loss.item()

    val_loss /= len(val_dataloader)
    print(f"Epoch {epoch + 1} Validation Loss: {val_loss:.4f}")

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), best_model_path)
        print(f"Saved best model at epoch {epoch + 1}.")

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch 1 Training Loss: 8.2957
Epoch 1 Validation Loss: 8.0293
Saved best model at epoch 1.
Epoch 2 Training Loss: 7.5896
Epoch 2 Validation Loss: 8.0074
Saved best model at epoch 2.
Epoch 3 Training Loss: 7.2414
Epoch 3 Validation Loss: 7.9878
Saved best model at epoch 3.
Epoch 4 Training Loss: 6.9016
Epoch 4 Validation Loss: 7.7952
Saved best model at epoch 4.
Epoch 5 Training Loss: 6.5544
Epoch 5 Validation Loss: 7.8265
Epoch 6 Training Loss: 6.2338
Epoch 6 Validation Loss: 7.6591
Saved best model at epoch 6.
Epoch 7 Training Loss: 5.9578
Epoch 7 Validation Loss: 7.4592
Saved best model at epoch 7.
Epoch 8 Training Loss: 5.7242
Epoch 8 Validation Loss: 7.6889
Epoch 9 Training Loss: 5.5776
Epoch 9 Validation Loss: 7.6189
Epoch 10 Training Loss: 5.4626
Epoch 10 Validation Loss: 7.7776


In [17]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score

# Assuming you have already loaded the model and test_dataloader
model.load_state_dict(torch.load(best_model_path, weights_only=True))
model.eval()

test_loss = 0
predictions, true_labels = [], []

# Store MAP@25 results
all_predictions = {}
all_true_labels = {}

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        test_loss += outputs.loss.item()

        logits = outputs.logits
        # print(logits.shape)
        probabilities = torch.softmax(logits, dim=-1)

        # Get top 25 predictions (this can be adjusted for the actual number of misconceptions)
        top_k_predictions = torch.topk(probabilities, k=25, dim=1).indices.cpu().numpy()
        
        # Store predictions and true labels
        for i, question_id in enumerate(batch["QuestionId"]):  # Ensure batch contains 'QuestionId'
            question_answer = f"{question_id}_{batch['AnswerLabel'][i]}"  # Format QuestionId_Answer for unique identifier
            
            all_predictions[question_answer] = top_k_predictions[i]  # Store top 25 predicted misconception ids
            all_true_labels[question_answer] = labels[i].cpu().numpy()  # Store true misconception ids

# # Calculate MAP@25
# def calculate_precision_at_k(predictions, true_labels, k):
#     """
#     Calculate Precision at rank k.
#     predictions: List of predicted MisconceptionId(s) for a particular QuestionId_Answer.
#     true_labels: Set of correct MisconceptionId(s) for the same QuestionId_Answer.
#     k: The rank at which precision is calculated.
#     """
#     relevant = sum([1 for pred in predictions[:k] if pred in true_labels])
#     return relevant / k

# def evaluate_map_at_25(predictions, true_labels, n=25):
#     """
#     Evaluate Mean Average Precision at 25 (MAP@25).
#     predictions: Dictionary of predictions for each QuestionId_Answer.
#     true_labels: Dictionary of true MisconceptionId(s) for each QuestionId_Answer.
#     n: Number of predictions per observation (default is 25).
#     """
#     map_score = 0
#     U = len(predictions)  # Number of observations (test set size)

#     for question_answer, pred_list in predictions.items():
#         true_set = true_labels.get(question_answer, [])
        
#         if len(true_set) == 0:
#             continue  # Skip this if there are no true labels
        
#         avg_precision = 0
#         for k in range(1, min(len(pred_list), n) + 1):
#             precision = calculate_precision_at_k(pred_list, true_set, k)
#             avg_precision += precision
        
#         map_score += avg_precision / min(len(pred_list), n)

#     # Compute Mean Average Precision (MAP@25)
#     return map_score / U

def calculate_precision_at_k(predictions, true_labels, k):
    """
    Calculate Precision at rank k.
    predictions: List of predicted MisconceptionId(s) for a particular QuestionId_Answer.
    true_labels: Set of correct MisconceptionId(s) for the same QuestionId_Answer.
    k: The rank at which precision is calculated.
    """
    relevant = sum([1 for pred in predictions[:k] if pred == true_labels])
    return relevant / k

def evaluate_map_at_25(predictions, true_labels, n=25):
    """
    Evaluate Mean Average Precision at 25 (MAP@25).
    predictions: Dictionary of predictions for each QuestionId_Answer.
    true_labels: Dictionary of true MisconceptionId(s) for each QuestionId_Answer.
    n: Number of predictions per observation (default is 25).
    """
    map_score = 0
    U = len(predictions)  # Number of observations (test set size)

    for question_answer, pred_list in predictions.items():
        true_set = true_labels.get(question_answer, [])
        
        if len(true_set) == 0:
            continue  # Skip this if there are no true labels
        true_set = np.argmax(true_set)
        # Store the correct labels found
        relevant_found = set()
        avg_precision = 0
        
        for k in range(1, min(len(pred_list), n) + 1):
            precision = 0
            # Check if the prediction at rank k is relevant and not already counted
            if pred_list[k - 1] == true_set and pred_list[k - 1] not in relevant_found:
                relevant_found.add(pred_list[k - 1])
                precision = calculate_precision_at_k(pred_list, true_set, k)
            
            # Precision at rank k: how many relevant items are in the top k
            # precision = len(relevant_found) / k
            avg_precision += precision
        
        map_score += avg_precision

    # Compute Mean Average Precision (MAP@25)
    return map_score / U



# Evaluate MAP@25
map_at_25 = evaluate_map_at_25(all_predictions, all_true_labels)
test_loss /= len(test_dataloader)
# print(f"Test Loss: {test_loss:.4f}")
print(f"MAP@25: {map_at_25:.4f}")

MAP@25: 0.1069


In [18]:
# https://www.kaggle.com/code/cdeotte/how-to-train-open-book-model-part-1#MAP@3-Metric
def map_at_25(predictions, labels):
    map_sum = 0
    # U = len(predictions)  # Number of observations (test set size)

    for question_answer, pred_list in predictions.items():
        true_set1 = labels.get(question_answer, [])
        true_set = np.argmax(true_set1)
        # print(true_set1)
        # print(pred_list)
    # for x, y in zip(predictions, labels):
    #     z = [1 / i if y == j else 0 for i, j in zip(range(1, 26), x)]
        z = [1 / i if true_set == j else 0 for i, j in zip(range(1, 26), pred_list)]
        map_sum += np.sum(z)
    return map_sum / len(predictions)

In [19]:
map_at_25_score = map_at_25(all_predictions, all_true_labels)
map_at_25_score

0.10692927401654044

In [20]:
# all_true_labels

In [21]:
# all_predictions

In [22]:
# class PredictionDataset(Dataset):
#     def __init__(self, question_ids, answer_labels, texts, tokenizer, max_length=128):
#         """
#         Args:
#             question_ids (list): List of Question IDs.
#             answer_labels (list): List of Answer Labels (e.g., A, B, C, D).
#             texts (list): List of question texts.
#             labels (list): List of true Misconception IDs.
#             tokenizer (transformers tokenizer): Tokenizer to encode the text.
#             max_length (int, optional): Max sequence length for tokenization. Defaults to 128.
#         """
#         self.question_ids = question_ids
#         self.answer_labels = answer_labels
#         self.texts = texts
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         question_id = self.question_ids[idx]
#         answer_label = self.answer_labels[idx]
#         text = self.texts[idx]

#         # Assuming the `text` is just the question text; answer is inferred from `answer_label`
#         question = text  # In case answer text is separate, modify as needed

#         # Tokenize the question text
#         tokens = self.tokenizer(
#             question,
#             padding="max_length",
#             truncation=True,
#             max_length=self.max_length,
#             return_tensors="pt",
#             return_special_tokens_mask=True,  # Ensures correct use of [SEP]
#         )

#         return {
#             "input_ids": tokens["input_ids"].squeeze(),
#             "attention_mask": tokens["attention_mask"].squeeze(),
#             "QuestionId": question_id,  # Include the Question ID
#             "AnswerLabel": answer_label  # Include the Answer label (A, B, C, D)
#         }

In [23]:
# # Preprocess data
# def preprocess_testdata(df):
#     data = []
#     for _, row in df.iterrows():
#         for option in ["A", "B", "C", "D"]:  # Only incorrect answers
#             # if row['CorrectAnswer'] != option:
#             input_text = f"Question: {row['QuestionText']} | Answer: {row[f'Answer{option}Text']}"
#             # label = row[f"Misconception{option}Id"]
#             questionid = f"{row['QuestionId']}"
#             answer = f"{option}"
#             data.append((questionid, answer, input_text))
#     return pd.DataFrame(data, columns=["QuestionId","Answer", "text"])

# real_test_data = preprocess_testdata(real_test_df)
# real_test_data.dropna(inplace = True)

# # # Combine into DataFrames
# # train_data = pd.DataFrame(train_data, columns=["text", "label"])
# # val_data = pd.DataFrame(val_data, columns=["text", "label"])
# # test_data = pd.DataFrame(test_data, columns=["text", "label"])

# # Convert labels to multi-hot encoding
# # all_labels = sorted(misconception_df["MisconceptionId"].unique())  # Get all unique labels
# # mlb = MultiLabelBinarizer(classes=all_labels)

# # real_test_labels = mlb.transform([[label] for label in real_test_data["label"]])

In [24]:
# # Create datasets
# real_test_dataset = PredictionDataset(real_test_data["QuestionId"].tolist(), real_test_data["Answer"].tolist(), real_test_data["text"].tolist(), tokenizer)

# # DataLoader for batching
# real_test_dataloader = DataLoader(real_test_dataset, batch_size=16)

In [25]:
# # Assuming you have already loaded the model and test_dataloader
# model.load_state_dict(torch.load(best_model_path))
# model.eval()

# test_loss = 0
# predictions, true_labels = [], []

# # Store MAP@25 results
# all_predictions = {}
# all_true_labels = {}

# with torch.no_grad():
#     for batch in real_test_dataloader:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         # labels = batch["labels"].to(device)
        
#         # Forward pass
#         outputs = model(input_ids, attention_mask=attention_mask)
#         # test_loss += outputs.loss.item()

#         logits = outputs.logits
#         probabilities = torch.softmax(logits, dim=-1)

#         # Get top 25 predictions (this can be adjusted for the actual number of misconceptions)
#         top_k_predictions = torch.topk(probabilities, k=25, dim=1).indices.cpu().numpy()
        
#         # Store predictions and true labels
#         for i, question_id in enumerate(batch["QuestionId"]):  # Ensure batch contains 'QuestionId'
#             question_answer = f"{question_id}_{batch['AnswerLabel'][i]}"  # Format QuestionId_Answer for unique identifier
            
#             all_predictions[question_answer] = top_k_predictions[i]  # Store top 25 predicted misconception ids
#             all_true_labels[question_answer] = labels[i].cpu().numpy()  # Store true misconception ids

In [26]:
# # Convert each NumPy array to a string
# data_str = [(key, np.array2string(value)) for key, value in all_predictions.items()]
# df = pd.DataFrame(data_str, columns=["QuestionId_Answer", "MisconceptionId"])
# df.to_csv("submission.csv", columns=["QuestionId_Answer", "MisconceptionId"], index=False)

In [27]:
# pd.read_csv("submission.csv")