In [None]:
# Step 1: Install packages (skip this on Kaggle, pre-installed)
# !pip install transformers openpyxl tqdm scikit-learn --quiet

# Step 2: Load dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

# Read the labelled spreadsheet; only its 'input' and 'Class' columns are used below.
dataset_path = '/kaggle/input/trainingdataset/training new 1.xlsx'
df = pd.read_excel(dataset_path)

print(" Columns:", df.columns)

# Plain Python lists: every input is coerced to str, labels keep their raw values.
texts = df['input'].astype(str).to_list()
labels = df['Class'].to_list()

# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
num_labels = len(set(encoded_labels))

# Train-validation split: 10% held out, stratified on the encoded labels,
# with a fixed random_state so the split is repeatable.
split_kwargs = dict(test_size=0.1, stratify=encoded_labels, random_state=42)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, encoded_labels, **split_kwargs
)

# Step 3: Tokenize and Dataset
from transformers import BartTokenizer

# Checkpoint name the tokenizer is loaded from.
tokenizer_checkpoint = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(tokenizer_checkpoint)

class BARTClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenized texts with their labels.

    All texts are tokenized once, up front, in ``__init__``; items are then
    assembled from the stored encodings on demand.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        """Tokenize ``texts`` eagerly and keep ``labels`` alongside.

        Args:
            texts: Texts handed to ``tokenizer`` in a single call.
            labels: Indexable collection of labels; its length defines the
                dataset length.
            tokenizer: Callable invoked with ``truncation=True``,
                ``padding='max_length'`` and ``max_length=max_len``. Its result
                must be indexable by ``'input_ids'`` and ``'attention_mask'``.
            max_len: Value forwarded to the tokenizer as ``max_length``.
        """
        tokenizer_options = {
            'truncation': True,
            'padding': 'max_length',
            'max_length': max_len,
        }
        self.encodings = tokenizer(texts, **tokenizer_options)
        self.labels = labels

    def __len__(self):
        """Return the number of stored labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds ``'input_ids'``, ``'attention_mask'`` and ``'labels'``,
        in that order.
        """
        sample = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split in a dataset; tokenization happens inside the constructor.
train_dataset = BARTClassificationDataset(train_texts, train_labels, tokenizer)
val_dataset = BARTClassificationDataset(val_texts, val_labels, tokenizer)

# Step 4: Fine-Tune BART for Classification
from transformers import BartForSequenceClassification, TrainingArguments, Trainer

# Load the base checkpoint with a classification head sized for `num_labels` classes.
model_finetuned = BartForSequenceClassification.from_pretrained('facebook/bart-base', num_labels=num_labels)

# Checkpoints are written every 500 steps (at most 2 kept), the training loss is
# logged every 100 steps, and external experiment trackers are disabled.
training_args = TrainingArguments(
    output_dir="./bart_classification_output",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    report_to="none"
)

trainer = Trainer(
    model=model_finetuned,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

# No evaluation schedule is configured in `training_args`, so without this call
# the held-out split passed as `eval_dataset` would never be scored. Evaluate it
# once after training so the run reports a validation figure next to the
# training loss.
eval_metrics = trainer.evaluate()
print("Validation metrics:", eval_metrics)

# Step 5: Extract Fine-Tuned BART Encoder Embeddings
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Only the encoder stack of the fine-tuned classifier is used for embeddings.
encoder_model = model_finetuned.model.encoder.to(device)
# eval() on the parent model also switches its encoder submodule to inference mode.
model_finetuned.eval()

# Embeddings are computed for the full dataset (train and validation rows alike),
# in the original row order.
all_texts = texts
finetuned_embeddings_list = []

with torch.no_grad():
    for text in tqdm(all_texts, desc="Extracting Fine-Tuned Embeddings"):
        # One text per forward pass, so padding is unnecessary. Padding to 512
        # would make the mean below average over pad positions as well, which
        # drowns out the real tokens of short texts. Truncation still caps the
        # length at 512 tokens.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        output = encoder_model(**inputs)
        # Mean-pool over the sequence dimension; every position is a real token here.
        mean_embedding = output.last_hidden_state.mean(dim=1)
        finetuned_embeddings_list.append(mean_embedding.squeeze(0).cpu())

# Step 6: Extract Pretrained BART Encoder Embeddings
from transformers import BartModel

model_pretrained = BartModel.from_pretrained('facebook/bart-base').to(device)
model_pretrained.eval()

# Calling the full BartModel returns the *decoder's* last hidden state as
# `last_hidden_state`. Use the encoder stack directly so these vectors are
# encoder embeddings, comparable with the fine-tuned ones taken from
# `model_finetuned.model.encoder`.
pretrained_encoder = model_pretrained.encoder

pretrained_embeddings_list = []

with torch.no_grad():
    for text in tqdm(all_texts, desc="Extracting Pretrained Embeddings"):
        # One text per forward pass, so padding is unnecessary. Padding to 512
        # would make the mean below average over pad positions as well.
        # Truncation still caps the length at 512 tokens.
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        output = pretrained_encoder(**inputs)
        # Mean-pool over the sequence dimension; every position is a real token here.
        mean_embedding = output.last_hidden_state.mean(dim=1)
        pretrained_embeddings_list.append(mean_embedding.squeeze(0).cpu())

# Step 7: Save to Excel
# One row per text, one column per embedding dimension, plus the raw label
# in a trailing 'Class' column.
pretrained_df = pd.DataFrame(
    [vector.numpy() for vector in pretrained_embeddings_list]
).assign(Class=labels)
finetuned_df = pd.DataFrame(
    [vector.numpy() for vector in finetuned_embeddings_list]
).assign(Class=labels)

# Write both tables without the index column, pretrained first.
export_targets = (
    (pretrained_df, 'pre_trained_bart_embeddings.xlsx'),
    (finetuned_df, 'fine_tuned_bart_embeddings.xlsx'),
)
for frame, workbook_name in export_targets:
    frame.to_excel(workbook_name, index=False)

print("All embeddings extracted and saved successfully!")


✅ Columns: Index(['input', 'Class'], dtype='object')


Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,1.0871
200,1.0029
300,0.944
400,0.8475
500,0.8404
600,0.7763
700,0.6942
800,0.6534
900,0.6581


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Extracting Fine-Tuned Embeddings: 100%|██████████| 1680/1680 [00:30<00:00, 55.61it/s]
Extracting Pretrained Embeddings: 100%|██████████| 1680/1680 [01:07<00:00, 24.74it/s]


✅ All embeddings extracted and saved successfully!


In [None]:
import pandas as pd

# Preview the first rows of a saved embeddings workbook.
# NOTE(review): this file name is not one written by the cell above, which saves
# 'pre_trained_bart_embeddings.xlsx' and 'fine_tuned_bart_embeddings.xlsx'.
# Confirm the workbook exists in the working directory before running.
embeddings_workbook = "mathbert_fine_tuned_embeddings.xlsx"
df = pd.read_excel(embeddings_workbook)
print(df.head())
