In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/roberta-base/rust_model.ot
/kaggle/input/roberta-base/config.json
/kaggle/input/roberta-base/merges.txt
/kaggle/input/roberta-base/README.md
/kaggle/input/roberta-base/tokenizer.json
/kaggle/input/roberta-base/vocab.json
/kaggle/input/roberta-base/dict.txt
/kaggle/input/roberta-base/gitattributes
/kaggle/input/roberta-base/pytorch_model.bin
/kaggle/input/roberta-base/model.safetensors
/kaggle/input/roberta-base/flax_model.msgpack
/kaggle/input/eedi-mining-misconceptions-in-mathematics/sample_submission.csv
/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv
/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv
/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv
/kaggle/input/bert-base-uncased/pytorch/default/1/bert-base-uncased/config.json
/kaggle/input/bert-base-uncased/pytorch/default/1/bert-base-uncased/README.md
/kaggle/input/bert-base-uncased/pytorch/default/1/bert-base-uncased/tf_model.h5
/kaggle/input/bert-base

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification, DebertaTokenizer, DebertaForSequenceClassification
from transformers import AdamW, get_scheduler
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
import os
from sklearn.model_selection import train_test_split

In [3]:
import random

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

In [4]:
train_df_actual = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv")
real_test_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")
misconception_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")

In [5]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# Function to preprocess text (handling mathematical symbols)
# def preprocess_text(text):
#     # Replace common mathematical symbols with their names or remove them
#     text = re.sub(r"∑", "sum", text)  # Replace summation symbol with 'sum'
#     text = re.sub(r"×", "*", text)    # Replace multiplication symbol with '*'
#     text = re.sub(r"÷", "/", text)    # Replace division symbol with '/'
#     text = re.sub(r"±", "plus_minus", text)  # Replace ± symbol with 'plus_minus'
    
#     # Remove extra spaces or unwanted characters (you can expand this list)
#     text = re.sub(r"[^\w\s]", "", text)  # Remove non-alphanumeric characters
    
#     # Normalize mathematical expressions or numbers
#     text = re.sub(r"\d+", "number", text)  # Replace digits with the word 'number'
    
#     # Optionally handle LaTeX or other math-related formats
#     # For LaTeX, consider using a LaTeX parser to clean up complex equations
    
#     return text.strip().lower()

# Function to preprocess LaTeX into plain text
def preprocess_latex_to_text(latex_str):
    # Handle common LaTeX functions and symbols
    latex_str = latex_str.replace(r'\frac', 'over')  # Convert fractions
    latex_str = latex_str.replace(r'\sum', 'sum')  # Convert summation symbol
    latex_str = latex_str.replace(r'\int', 'integral')  # Convert integral symbol
    latex_str = latex_str.replace(r'\sqrt', 'square root')  # Convert square root
    latex_str = latex_str.replace(r'\text', '')  # Remove text formatting in LaTeX
    
    # Handle superscripts and subscripts (e.g., x^2 or x_1)
    latex_str = re.sub(r'\^{(.*?)}', r' raised to \1', latex_str)  # e.g., x^{2} becomes x raised to 2
    latex_str = re.sub(r'_{(.*?)}', r' sub \1', latex_str)  # e.g., x_{1} becomes x sub 1
    
    # Remove other LaTeX math environments (e.g., dollar signs for inline math)
    latex_str = latex_str.replace('$', '')
    
    # Optionally, remove other LaTeX-specific symbols or escape characters
    latex_str = latex_str.replace(r'\\', '')  # Remove LaTeX newlines
    latex_str = latex_str.replace(r'{', ' ').replace(r'}', ' ')  # Remove curly braces
    latex_str = ' '.join(latex_str.split())  # Clean up extra spaces
    
    return latex_str.strip().lower()

# Load data
# train_df = pd.read_csv("train.csv")

# Combine question and answer text
def preprocess_data(df):
    data = []
    for _, row in df.iterrows():
        for option in ["A", "B", "C", "D"]:  # Only incorrect answers
            input_text = f"Question: {row['QuestionText']} | Answer: {row[f'Answer{option}Text']}"
            input_text = preprocess_latex_to_text(input_text)
            label = row[f"Misconception{option}Id"]
            processed_text = preprocess_latex_to_text(input_text)  # Apply text preprocessing
            data.append((processed_text, label))
    return pd.DataFrame(data, columns=["text", "label"])

train_data = preprocess_data(train_df_actual)
train_data.dropna(inplace = True)

# Convert labels to multi-hot encoding
all_labels = sorted(train_data["label"].unique())
mlb = MultiLabelBinarizer(classes=all_labels)
train_labels = mlb.fit_transform([[label] for label in train_data["label"]])

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_data["text"], train_labels, test_size=0.2, random_state=42)

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)  # Use top 5000 features
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200, random_state=42)

# Train the classifier
clf.fit(X_train_tfidf, np.array(y_train))

# Predict on validation set
y_val_pred = clf.predict_proba(X_val_tfidf)


In [6]:
print(len(clf.classes_))

1604


In [7]:
print(X_val_tfidf.shape)

(874, 2588)


In [8]:
len(y_train)

3496

In [9]:
y_train[0].shape

(1604,)

In [10]:
np.array(y_train).shape

(3496, 1604)

In [11]:
# y_val_pred[5]

In [12]:
len(y_val_pred)

1604

In [13]:
# for i in range(1604):
#     if y_val_pred[i].shape != (874, 2):
#         print(i)

In [14]:
y_val_pred[3].shape

(874, 1)

In [15]:
# np.array(y_val_pred).shape

In [16]:
# y_val.shape

In [17]:
# # Predict on validation set
# y_val_prob = clf.predict_proba(X_val_tfidf)
# y_val_prob = np.array(y_val_prob)
# print(y_val_prob.shape)

# # Get top-k predictions
# k = 25  # Set the number of top predictions to return

# # Initialize an array to store the top-k predictions for each sample
# top_k_predictions = np.zeros((y_val_prob.shape[0], k), dtype=int)

# # Iterate over each sample
# for i, probs in enumerate(y_val_prob):
#     # Get the indices of the top-k probabilities
#     top_k_indices = np.argsort(probs)[::-1][:k]
#     top_k_predictions[i] = top_k_indices


In [18]:
# # Predict on validation set
# y_val_prob = clf.predict_proba(X_val_tfidf)  # List of arrays (one per class)

# # Combine the probabilities for all classes into a 2D array
# # Each row corresponds to a sample, and each column corresponds to a class
# y_val_prob_combined = np.array([probs[:, 1] for probs in y_val_prob]).T  # Use probs[:, 1] to get positive class probabilities

# print(f"Shape of y_val_prob_combined: {y_val_prob_combined.shape}")  # (n_samples, n_classes)

# # Get top-k predictions
# k = 25  # Set the number of top predictions to return

# # Initialize an array to store the top-k predictions for each sample
# top_k_predictions = np.zeros((y_val_prob_combined.shape[0], k), dtype=int)

# # Iterate over each sample
# for i, probs in enumerate(y_val_prob_combined):
#     # Get the indices of the top-k probabilities
#     top_k_indices = np.argsort(probs)[::-1][:k]
#     top_k_predictions[i] = top_k_indices

# # Output the top-k predictions
# print("Top-k predictions for the first sample:", top_k_predictions[0])

In [19]:
# Predict on validation set
y_val_prob = clf.predict_proba(X_val_tfidf)
def normalize_probabilities(y_val_prob):
    max_cols = max(probs.shape[1] if len(probs.shape) > 1 else 1 for probs in y_val_prob)
    normalized_probs = []
    
    for probs in y_val_prob:
        if len(probs.shape) == 1 or probs.shape[1] == 1:
            # Single-column probabilities
            normalized_probs.append(probs.flatten())
        else:
            # Multi-column probabilities
            normalized_probs.append(probs[:, 1])  # Positive class probabilities
    
    return np.array(normalized_probs).T

y_val_prob_combined = normalize_probabilities(y_val_prob)
print(f"Shape of y_val_prob_combined: {y_val_prob_combined.shape}")

Shape of y_val_prob_combined: (874, 1604)


In [20]:
k = 25
top_k_predictions = np.zeros((y_val_prob_combined.shape[0], k), dtype=int)

for i, probs in enumerate(y_val_prob_combined):
    top_k_indices = np.argsort(probs)[::-1][:k]
    top_k_predictions[i] = top_k_indices

In [21]:
# Output the top-k predictions
print("Top-k predictions for the first sample:", top_k_predictions[0])

Top-k predictions for the first sample: [1066 1463 1145 1149 1152 1154 1159 1165  369  365 1182  129 1141 1461
 1184  360  132  359 1192  346 1451 1450 1475  116  491]


In [22]:
# # Evaluate using accuracy or other metrics
# accuracy = accuracy_score(y_val, y_val_pred)
# print(f"Validation Accuracy: {accuracy:.4f}")

In [23]:
# Optionally, predict on the test set (if you have one)
# test_data = preprocess_data(test_df)  # Assuming you have a test.csv file
# X_test_tfidf = vectorizer.transform(test_data["text"])
# y_test_pred = clf.predict(X_test_tfidf)
# test_accuracy = accuracy_score(test_data["label"], y_test_pred)
# print(f"Test Accuracy: {test_accuracy:.4f}")

In [24]:
# https://www.kaggle.com/code/cdeotte/how-to-train-open-book-model-part-1#MAP@3-Metric
def map_at_25(predictions, labels):
    map_sum = 0
    for x, y in zip(predictions, labels):
        y = np.argmax(y)
        z = [10 / i if y == j else 0 for i, j in zip(range(1, 26), x)]
        # z = [1 / i if true_set == j else 0 for i, j in zip(range(1, 26), pred_list)]
        map_sum += np.sum(z)
    return map_sum / len(predictions)

In [25]:
map_at_25_score = map_at_25(top_k_predictions, y_val)
map_at_25_score

0.005852990229995291

In [26]:
# all_true_labels

In [27]:
# all_predictions

In [28]:
# class PredictionDataset(Dataset):
#     def __init__(self, question_ids, answer_labels, texts, tokenizer, max_length=128):
#         """
#         Args:
#             question_ids (list): List of Question IDs.
#             answer_labels (list): List of Answer Labels (e.g., A, B, C, D).
#             texts (list): List of question texts.
#             labels (list): List of true Misconception IDs.
#             tokenizer (transformers tokenizer): Tokenizer to encode the text.
#             max_length (int, optional): Max sequence length for tokenization. Defaults to 128.
#         """
#         self.question_ids = question_ids
#         self.answer_labels = answer_labels
#         self.texts = texts
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         question_id = self.question_ids[idx]
#         answer_label = self.answer_labels[idx]
#         text = self.texts[idx]

#         # Assuming the `text` is just the question text; answer is inferred from `answer_label`
#         question = text  # In case answer text is separate, modify as needed

#         # Tokenize the question text
#         tokens = self.tokenizer(
#             question,
#             padding="max_length",
#             truncation=True,
#             max_length=self.max_length,
#             return_tensors="pt",
#             return_special_tokens_mask=True,  # Ensures correct use of [SEP]
#         )

#         return {
#             "input_ids": tokens["input_ids"].squeeze(),
#             "attention_mask": tokens["attention_mask"].squeeze(),
#             "QuestionId": question_id,  # Include the Question ID
#             "AnswerLabel": answer_label  # Include the Answer label (A, B, C, D)
#         }

In [29]:
# # Preprocess data
# def preprocess_testdata(df):
#     data = []
#     for _, row in df.iterrows():
#         for option in ["A", "B", "C", "D"]:  # Only incorrect answers
#             # if row['CorrectAnswer'] != option:
#             input_text = f"Question: {row['QuestionText']} | Answer: {row[f'Answer{option}Text']}"
#             # label = row[f"Misconception{option}Id"]
#             questionid = f"{row['QuestionId']}"
#             answer = f"{option}"
#             data.append((questionid, answer, input_text, label))
#     return pd.DataFrame(data, columns=["QuestionId","Answer", "text",])

# real_test_data = preprocess_testdata(real_test_df)
# real_test_data.dropna(inplace = True)

# # # Combine into DataFrames
# # train_data = pd.DataFrame(train_data, columns=["text", "label"])
# # val_data = pd.DataFrame(val_data, columns=["text", "label"])
# # test_data = pd.DataFrame(test_data, columns=["text", "label"])

# # Convert labels to multi-hot encoding
# # all_labels = sorted(misconception_df["MisconceptionId"].unique())  # Get all unique labels
# # mlb = MultiLabelBinarizer(classes=all_labels)

# # real_test_labels = mlb.transform([[label] for label in real_test_data["label"]])

In [30]:
# # Create datasets
# real_test_dataset = PredictionDataset(real_test_data["QuestionId"].tolist(), real_test_data["Answer"].tolist(), real_test_data["text"].tolist(), tokenizer)

# # DataLoader for batching
# real_test_dataloader = DataLoader(real_test_dataset, batch_size=16)

In [31]:
# # Assuming you have already loaded the model and test_dataloader
# model.load_state_dict(torch.load(best_model_path))
# model.eval()

# test_loss = 0
# predictions, true_labels = [], []

# # Store MAP@25 results
# all_predictions = {}
# all_true_labels = {}

# with torch.no_grad():
#     for batch in real_test_dataloader:
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         # labels = batch["labels"].to(device)
        
#         # Forward pass
#         outputs = model(input_ids, attention_mask=attention_mask)
#         # test_loss += outputs.loss.item()

#         logits = outputs.logits
#         probabilities = torch.softmax(logits, dim=-1)

#         # Get top 25 predictions (this can be adjusted for the actual number of misconceptions)
#         top_k_predictions = torch.topk(probabilities, k=25, dim=1).indices.cpu().numpy()
        
#         # Store predictions and true labels
#         for i, question_id in enumerate(batch["QuestionId"]):  # Ensure batch contains 'QuestionId'
#             question_answer = f"{question_id}_{batch['AnswerLabel'][i]}"  # Format QuestionId_Answer for unique identifier
            
#             all_predictions[question_answer] = top_k_predictions[i]  # Store top 25 predicted misconception ids
#             all_true_labels[question_answer] = labels[i].cpu().numpy()  # Store true misconception ids

In [32]:
# # Convert each NumPy array to a string
# data_str = [(key, np.array2string(value)) for key, value in all_predictions.items()]
# df = pd.DataFrame(data_str, columns=["QuestionId_Answer", "MisconceptionId"])
# df.to_csv("submission.csv", columns=["QuestionId_Answer", "MisconceptionId"], index=False)

In [33]:
# pd.read_csv("submission.csv")