In [1]:
!pip install datasets transformers evaluate optuna
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [7]:
from datasets import load_dataset
import torch
import evaluate
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
from transformers import GPT2Config
import numpy as np
from tqdm import tqdm

In [8]:
# Single source of truth for every seed used in this cell.
SEED = 42

# Seed torch and NumPy for reproducibility.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Load the IMDB dataset and shuffle both official splits with the fixed seed.
imdb = load_dataset("imdb")
full_train_dataset = imdb['train'].shuffle(seed=SEED)
test_dataset = imdb['test'].shuffle(seed=SEED)

print(f"Train dataset size: {len(full_train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# For hyperparameter tuning, hold out the first 10% of the shuffled training
# data as a validation split; the remainder is what the model trains on.
val_split_ratio = 0.1
val_size = int(len(full_train_dataset) * val_split_ratio)

# A `range` already is the sequence of indices to keep, so no intermediate
# list needs to be built. The full shuffled split keeps its own name so that
# `train_dataset` only ever refers to the post-split training data.
val_dataset = full_train_dataset.select(range(val_size))
train_dataset = full_train_dataset.select(range(val_size, len(full_train_dataset)))

print(f"Train dataset size after split: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")


Train dataset size: 25000
Test dataset size: 25000
Train dataset size after split: 22500
Validation dataset size: 2500


In [4]:
# Load the pretrained GPT-2 tokenizer.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# tokenize_function pads every example to a fixed length, which needs a pad
# token; when the tokenizer has none, reuse the end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Register the pad token as a special token and report how many new vocabulary
# entries that created (the recorded run printed "Added 0 tokens").
num_added_tokens = tokenizer.add_special_tokens({'pad_token': tokenizer.pad_token})
print(f"Added {num_added_tokens} tokens to the tokenizer")

# Build a two-label classification config and give it the tokenizer's pad id
# before the model is instantiated from that config.
config = GPT2Config.from_pretrained(model_name, num_labels=2)
config.pad_token_id = tokenizer.pad_token_id  # keep the model's pad id in sync with the tokenizer
model = GPT2ForSequenceClassification.from_pretrained(model_name, config=config)

# Size the embedding matrix to the tokenizer's vocabulary.
# NOTE(review): presumably a no-op when zero tokens were added; confirm.
model.resize_token_embeddings(len(tokenizer))

# Define tokenization function
def tokenize_function(examples):
    """Encode a batch of reviews as fixed-length tokenizer output.

    Args:
        examples: Mapping whose "text" entry holds the raw strings to encode.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on that text
        with max-length padding and truncation at 256 tokens.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,  # shorter sequences for faster training
    }
    texts = examples["text"]
    return tokenizer(texts, **encoding_options)

# Tokenize each split in batches, in the order train -> validation -> test.
tokenized_train, tokenized_val, tokenized_test = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Expose only the model inputs and the label, returned as torch tensors.
for tokenized_split in (tokenized_train, tokenized_val, tokenized_test):
    tokenized_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


Added 0 tokens to the tokenizer


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the pretrained GPT-2 tokenizer.
# NOTE(review): this cell repeats the tokenizer/model setup of the earlier
# cell and rebinds `model_name`, `tokenizer`, `config` and `model`; whichever
# of the two cells ran last decides what the Trainer receives.
model_name = "gpt2"  # You can also try "gpt2-medium" if you have enough compute
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# The GPT-2 tokenizer has no padding token by default, so reuse the
# end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Register the end-of-sequence token as the pad special token.
special_tokens_dict = {'pad_token': tokenizer.eos_token}
tokenizer.add_special_tokens(special_tokens_dict)

# Build a two-label sequence-classification config.
config = GPT2Config.from_pretrained(model_name, num_labels=2)
# The pad id is the end-of-sequence id, matching the tokenizer set up above.
config.pad_token_id = tokenizer.eos_token_id
model = GPT2ForSequenceClassification.from_pretrained(model_name, config=config)

# Size the embedding matrix to the tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))

def tokenize_function(examples):
    """Encode a batch of reviews as fixed-length tokenizer output.

    ``return_tensors="pt"`` is deliberately not requested here because it
    causes issues with the Trainer; the datasets are switched to torch format
    after mapping instead.

    Args:
        examples: Mapping whose "text" entry holds the raw strings to encode.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on that text
        with max-length padding and truncation at 512 tokens.
    """
    # `padding="max_length"` already requests fixed-length padding, so the
    # deprecated `pad_to_max_length` flag is not passed as well.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

# Re-tokenize every split with the tokenize_function defined in this cell,
# in the order train -> validation -> test.
tokenized_train, tokenized_val, tokenized_test = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Expose only the model inputs and the label, returned as torch tensors.
for tokenized_split in (tokenized_train, tokenized_val, tokenized_test):
    tokenized_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define evaluation metrics
def compute_metrics(eval_pred):
    """Turn the Trainer's evaluation output into a dict of named scores.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the argmax of ``logits`` over its last axis.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall", in that
        order. The last three are computed with ``average="binary"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Accuracy takes no averaging argument.
    scores = {
        "accuracy": evaluate.load("accuracy").compute(
            predictions=predictions, references=labels
        )["accuracy"]
    }

    # The remaining metrics share one call shape and are stored under their
    # own name.
    for metric_name in ("f1", "precision", "recall"):
        metric = evaluate.load(metric_name)
        scores[metric_name] = metric.compute(
            predictions=predictions, references=labels, average="binary"
        )[metric_name]

    # Return the dict itself: a bare `return` with the dict literal on the
    # following line hands back None and leaves the dict unreachable.
    return scores



Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/22500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [10]:
# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # Output locations and logging.
    output_dir="./gpt2_imdb_results",
    logging_dir="./gpt2_imdb_logs",
    report_to="tensorboard",
    logging_steps=500,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Small per-device batches for better memory management; gradients are
    # accumulated over 4 steps to compensate for the smaller batch size.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the highest accuracy when training finishes.
    eval_strategy="epoch",  # used instead of `evaluation_strategy` to avoid a warning
    save_strategy="epoch",  # matches eval_strategy
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

# Wire the model, the arguments, the train/validation splits and the metric
# function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

In [11]:
def _evaluate_and_report(banner, split):
    """Print ``banner``, evaluate the trainer on ``split``, then print and return the metrics."""
    print(banner)
    metrics = trainer.evaluate(eval_dataset=split)
    print(metrics)
    return metrics


# Fine-tune and show the summary object the trainer hands back.
print("Starting training...")
train_results = trainer.train()
print(train_results)

# Score the trained model on the held-out validation split, then on the test split.
val_results = _evaluate_and_report("Evaluating on validation set...", tokenized_val)
test_results = _evaluate_and_report("Evaluating on test set...", tokenized_test)

# Write the model and its tokenizer to the same directory.
model_path = "./gpt2_imdb_sentiment"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.243,0.225797,0.926,0.925373,0.938625,0.91249
2,0.0814,0.319357,0.9312,0.931255,0.935743,0.92681


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

TrainOutput(global_step=4218, training_loss=0.1760493622622031, metrics={'train_runtime': 1644.8427, 'train_samples_per_second': 41.037, 'train_steps_per_second': 2.564, 'total_flos': 1.7628123981938688e+16, 'train_loss': 0.1760493622622031, 'epoch': 2.9984})
Evaluating on validation set...


{'eval_loss': 0.3193569779396057, 'eval_accuracy': 0.9312, 'eval_f1': 0.9312549960031974, 'eval_precision': 0.9357429718875502, 'eval_recall': 0.9268098647573588, 'eval_runtime': 22.4424, 'eval_samples_per_second': 111.396, 'eval_steps_per_second': 27.849, 'epoch': 2.9984}
Evaluating on test set...
{'eval_loss': 0.27056363224983215, 'eval_accuracy': 0.9392, 'eval_f1': 0.9395241505530357, 'eval_precision': 0.9345417128383726, 'eval_recall': 0.94456, 'eval_runtime': 203.7759, 'eval_samples_per_second': 122.684, 'eval_steps_per_second': 30.671, 'epoch': 2.9984}
Model saved to ./gpt2_imdb_sentiment
