In [2]:
# Step 1: Install necessary packages (skip if on Kaggle)
# !pip install transformers openpyxl tqdm scikit-learn --quiet

# Step 2: Load dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch

# Load dataset
dataset_path = '/kaggle/input/trainingdataset/training new 1.xlsx'
df = pd.read_excel(dataset_path)

print("✅ Columns:", df.columns)

texts = df['input'].astype(str).tolist()
labels = df['Class'].tolist()

# Encode class labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
num_labels = len(set(encoded_labels))

# Train-validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, encoded_labels, test_size=0.1, stratify=encoded_labels, random_state=42
)

# Step 3: Tokenize and Dataset class
from transformers import AutoTokenizer

# Load tokenizer (using local path for mathbert model if valid)
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/mathbert/other/model/1")

class MathBERTDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_len)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }

train_dataset = MathBERTDataset(train_texts, train_labels, tokenizer)
val_dataset = MathBERTDataset(val_texts, val_labels, tokenizer)

# Step 4: Fine-Tune MathBERT for Classification
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(
    "/kaggle/input/mathbert/other/model/1", num_labels=num_labels,
    output_hidden_states=True  # We need hidden states for embeddings
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./mathbert_output",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    report_to="none"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Step 5: Extract Fine-Tuned MathBERT Embeddings
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

finetuned_embeddings = []

with torch.no_grad():
    for text in tqdm(texts, desc="Extracting Fine-Tuned MathBERT Embeddings"):
        # Tokenizing the input text
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Forward pass through the model
        output = model(**inputs)

        # Extract the embeddings (using the last hidden state)
        mean_emb = output.hidden_states[-1].mean(dim=1)  # Average across all tokens
        finetuned_embeddings.append(mean_emb.squeeze(0).cpu())  # Remove batch dimension

# Saving the fine-tuned embeddings in a DataFrame
finetuned_df = pd.DataFrame([emb.numpy() for emb in finetuned_embeddings])
finetuned_df['Class'] = labels
finetuned_df.to_excel('mathbert_fine_tuned_embeddings.xlsx', index=False)

print("✅ Fine-tuned MathBERT embeddings saved!")

# Step 6: Extract Pretrained MathBERT Embeddings
from transformers import AutoModel

# Load the pretrained model (using the local path for MathBERT if valid)
pretrained_model = AutoModel.from_pretrained(
    "/kaggle/input/mathbert/other/model/1", output_hidden_states=True
).to(device)
pretrained_model.eval()

pretrained_embeddings = []

with torch.no_grad():
    for text in tqdm(texts, desc="Extracting Pretrained MathBERT Embeddings"):
        # Tokenizing the input text
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Forward pass through the pretrained model
        output = pretrained_model(**inputs)

        # Extract the embeddings (using the last hidden state)
        mean_emb = output.hidden_states[-1].mean(dim=1)  # Average across all tokens
        pretrained_embeddings.append(mean_emb.squeeze(0).cpu())  # Remove batch dimension

# Saving the pretrained embeddings in a DataFrame
pretrained_df = pd.DataFrame([emb.numpy() for emb in pretrained_embeddings])
pretrained_df['Class'] = labels
pretrained_df.to_excel('mathbert_pre_trained_embeddings.xlsx', index=False)

print("✅ Pretrained MathBERT embeddings saved!")


✅ Columns: Index(['input', 'Class'], dtype='object')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/mathbert/other/model/1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,1.0561
200,1.0027
300,0.8495
400,0.7891
500,0.7122
600,0.6976
700,0.5887
800,0.5345
900,0.5296


Extracting Fine-Tuned MathBERT Embeddings: 100%|██████████| 1680/1680 [01:01<00:00, 27.19it/s]


✅ Fine-tuned MathBERT embeddings saved!


Extracting Pretrained MathBERT Embeddings: 100%|██████████| 1680/1680 [01:02<00:00, 26.76it/s]


✅ Pretrained MathBERT embeddings saved!
