In [None]:
# Step 1: Install necessary libraries
!pip install transformers openpyxl tqdm

# Step 2: Mount Google Drive
# The dataset is read from, and the checkpoints and embedding spreadsheets are
# written to, paths under /content/drive/MyDrive, so the mount has to succeed
# before any later step runs.
from google.colab import drive
drive.mount('/content/drive')

# Step 3: Load the dataset
import pandas as pd

# Use the uploaded file (modify if needed)
dataset_path = '/content/drive/MyDrive/training new 1.xlsx'
df = pd.read_excel(dataset_path)

# Check columns
print(df.columns)

# Fail fast with a readable message if the spreadsheet does not have the
# two columns every later step depends on.
required_columns = ('input', 'Class')
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"{dataset_path!r} is missing required column(s) {missing_columns}; "
        f"found {list(df.columns)}"
    )

# astype(str) below turns an empty cell (NaN) into the literal text "nan",
# which would then be embedded and trained on like real input. Rows are kept
# so texts and labels stay aligned with the spreadsheet, but the user is told.
missing_input_count = int(df['input'].isna().sum())
if missing_input_count:
    print(
        f"Warning: {missing_input_count} row(s) have an empty 'input' cell "
        "and will be processed as the literal text 'nan'."
    )

# Step 4: Prepare text and labels
texts = df['input'].astype(str).tolist()
labels = df['Class'].tolist()

# Step 5: Load GPT-2 tokenizer and model
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from tqdm import tqdm  # <- ADDED tqdm

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Important because GPT2 has no pad token
model_pretrained = GPT2Model.from_pretrained('gpt2')

# Step 6: Tokenize texts
# NOTE: this batch encoding is not consumed by any later step in this cell
# (embeddings below are computed one text at a time); it is kept so the
# `inputs` name stays available to anything else that relies on it.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Step 7: Extract pretrained embeddings (with PROGRESS BAR)
# Run on the GPU when one is available; on CPU this loop is far slower than
# the equivalent fine-tuned extraction pass.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_pretrained = model_pretrained.to(device)
model_pretrained.eval()  # make inference mode explicit (dropout disabled)

pretrained_embeddings_list = []
with torch.no_grad():
    for text in tqdm(texts, desc="Extracting Pretrained Embeddings"):
        # One text per forward pass, so no padding is introduced and the plain
        # mean over the sequence axis only averages real tokens.
        single_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
        output = model_pretrained(**single_input)
        mean_embedding = output.last_hidden_state.mean(dim=1)
        # Move each vector back to the CPU so the stacked result lives on the
        # CPU, as it did before this loop used the GPU.
        pretrained_embeddings_list.append(mean_embedding.squeeze(0).cpu())

pretrained_embeddings = torch.stack(pretrained_embeddings_list)

# Step 8: Prepare Fine-Tuning dataset
class GPT2Dataset(Dataset):
    """Causal-LM fine-tuning dataset built from raw text strings.

    Every text is tokenized, truncated and padded to ``max_length`` tokens.
    Each item is a dict with:

    * ``input_ids``      - the padded token ids,
    * ``attention_mask`` - 1 for real tokens, 0 for padding,
    * ``labels``         - a copy of ``input_ids`` with padded positions set
      to -100 so they are excluded from the language-modeling loss.

    Args:
        texts: Iterable of strings to tokenize.
        tokenizer: Callable tokenizer returning a mapping that contains
            ``'input_ids'`` and ``'attention_mask'``.
        max_length: Fixed sequence length every example is padded/truncated to.
    """

    # Label value the loss ignores; used to mask out padding positions.
    IGNORE_INDEX = -100

    def __init__(self, texts, tokenizer, max_length=512):
        self.input_ids = []
        self.attention_masks = []
        self.labels = []
        for txt in texts:
            encodings = tokenizer(txt, truncation=True, max_length=max_length, padding="max_length")
            ids = torch.tensor(encodings['input_ids'])
            mask = torch.tensor(encodings['attention_mask'])

            # The pad token is the same id as EOS, so padding cannot be
            # recognised from the ids alone; the attention mask is used.
            target = ids.clone()
            target[mask == 0] = self.IGNORE_INDEX

            self.input_ids.append(ids)
            self.attention_masks.append(mask)
            self.labels.append(target)

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the model inputs and masked labels for example ``idx``."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx],
        }

train_dataset = GPT2Dataset(texts, tokenizer)

# Step 9: Fine-tune GPT-2 (Better logging)
model_finetuned = GPT2LMHeadModel.from_pretrained('gpt2')
# Keeps the embedding matrix in sync with the tokenizer's vocabulary size.
model_finetuned.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/gpt2_finetuned_output",
    per_device_train_batch_size=2,
    num_train_epochs=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=100,  # <- log more frequently
    report_to="none",   # <- clean output
    logging_dir="./logs",
)

trainer = Trainer(
    model=model_finetuned,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

# Checkpoints are only written every `save_steps` steps, so the weights
# produced after the last checkpoint would otherwise exist only in memory and
# be lost when the Colab runtime ends. Persist the final model and tokenizer.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

# Step 10: Extract embeddings after fine-tuning (with PROGRESS BAR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use only the transformer body (no LM head) so the outputs are hidden states.
model_finetuned_model = model_finetuned.transformer.to(device)  # Move model to GPU
# Training leaves the model in train mode, and torch.no_grad() does not turn
# dropout off. Without eval() the embeddings would be randomly perturbed and
# would differ from run to run.
model_finetuned_model.eval()

finetuned_embeddings_list = []
with torch.no_grad():
    for text in tqdm(texts, desc="Extracting Fine-tuned Embeddings"):
        # One text per forward pass, so no padding is introduced and the plain
        # mean over the sequence axis only averages real tokens.
        single_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
        output = model_finetuned_model(**single_input)
        mean_embedding = output.last_hidden_state.mean(dim=1)
        finetuned_embeddings_list.append(mean_embedding.squeeze(0))

finetuned_embeddings = torch.stack(finetuned_embeddings_list)


# Step 11: Save embeddings + class to Excel
import numpy as np

# Destination spreadsheets on Google Drive.
pretrained_path = '/content/drive/MyDrive/pretrained_embeddings.xlsx'
finetuned_path = '/content/drive/MyDrive/finetuned_embeddings.xlsx'

# One row per text, one column per embedding dimension, with the class label
# appended as the last column. Tensors are moved to the CPU before conversion
# because a tensor on the GPU cannot be turned into a NumPy array directly.
pretrained_df = pd.DataFrame(pretrained_embeddings.cpu().numpy()).assign(Class=labels)
finetuned_df = pd.DataFrame(finetuned_embeddings.cpu().numpy()).assign(Class=labels)

# Write both tables without the DataFrame index column.
pretrained_df.to_excel(pretrained_path, index=False)
finetuned_df.to_excel(finetuned_path, index=False)

print(" Saved pretrained and fine-tuned embeddings to Google Drive!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Index(['input', 'Class'], dtype='object')


Extracting Pretrained Embeddings: 100%|██████████| 1680/1680 [10:16<00:00,  2.73it/s]


Step,Training Loss
100,0.79
200,0.5259
300,0.5107
400,0.4586
500,0.4566
600,0.4434
700,0.4177
800,0.4019
900,0.35
1000,0.355


Extracting Fine-tuned Embeddings: 100%|██████████| 1680/1680 [00:23<00:00, 71.87it/s]


✅ Saved pretrained and fine-tuned embeddings to Google Drive!
