<a href="https://colab.research.google.com/github/zodbot/llm_finetuning/blob/main/evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Evaluating Fine-tuned GPT-2 Sentiment Classifier

1. Loads a previously fine-tuned GPT-2 model
2. Evaluates its performance on the test set
3. Provides a function for classifying new reviews
4. Demonstrates usage with example reviews

Baseline accuracy before fine-tuning:

* Training accuracy: 55.00%
* Validation accuracy: 62.50%
* Test accuracy: 61.25%

In [None]:
import sys
import torch
from google.colab import drive
drive.mount('/content/drive')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loaded_state = torch.load('/content/drive/MyDrive/gpt2_finetuned_sst.pt',
                         map_location=device)

!git clone https://github.com/zodbot/llm_finetuning.git

# Change into repo directory
%cd llm_finetuning

from src.model import GPTModel
from src.config import GPT_CONFIGS
from src.utils import load_weights_into_gpt
import torch

# Get configuration
config = GPT_CONFIGS["gpt2-small (124M)"]

model = GPTModel(config)

# 2. Modify model architecture for classification (just like during training)
num_classes = 2
model.out_head = torch.nn.Linear(config["emb_dim"], num_classes)

# load the state dict
model.load_state_dict(loaded_state['model_state_dict'])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  loaded_state = torch.load('/content/drive/MyDrive/gpt2_finetuned_sst.pt',


Cloning into 'llm_finetuning'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (54/54), done.[K
remote: Total 64 (delta 24), reused 38 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (64/64), 72.43 KiB | 927.00 KiB/s, done.
Resolving deltas: 100% (24/24), done.
/content/llm_finetuning/llm_finetuning


<All keys matched successfully>

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
from datasets import load_dataset

# Training split of SST-2 (Stanford Sentiment Treebank), the binary
# sentiment-classification task in the GLUE collection.
ds = load_dataset(path='nyu-mll/glue', name='sst2', split='train')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [None]:
import pandas as pd
import tiktoken

import os
from google.colab import drive
# Mount Google Drive so the cached CSV splits under MyDrive are reachable.
drive.mount('/content/drive')

# Define file paths in Drive
drive_path = "/content/drive/MyDrive/sst2_data/"
train_path = os.path.join(drive_path, "train.csv")
val_path = os.path.join(drive_path, "validation.csv")
test_path = os.path.join(drive_path, "test.csv")

# Create directory if it doesn't exist
if not os.path.exists(drive_path):
    os.makedirs(drive_path)

# Check if files already exist; the three splits are only read when ALL of
# them are present.
# NOTE(review): no fallback branch is visible here, so if any CSV is missing
# train_df / validation_df / test_df stay undefined after this snippet.
# NOTE(review): tiktoken is imported in this cell, but the `tokenizer` used by
# later cells is not created in the code shown -- confirm where it is defined.
if all(os.path.exists(f) for f in [train_path, val_path, test_path]):
    print("Loading existing data from Drive...")
    train_df = pd.read_csv(train_path)
    validation_df = pd.read_csv(val_path)
    test_df = pd.read_csv(test_path)

67349
Label
1    37569
0    29780
Name: count, dtype: int64
Label distribution in sample:
Label
1    2833
0    2167
Name: count, dtype: int64


In [None]:
import torch
from torch.utils.data import Dataset
class Sst2DataSet(Dataset):
    """Tokenised text/label pairs read from a CSV, all forced to one length.

    The CSV must provide a "Text" column (encoded with ``tokenizer.encode``)
    and a "Label" column. Every encoded sequence is cut to ``max_length`` and
    then right-padded with ``pad_token_id`` up to ``max_length``. When
    ``max_length`` is None, the longest encoded sequence in the file sets it.
    """

    def __init__(self, csv_file, tokenizer, max_length=None,
                 pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        self.encoded_texts = list(map(tokenizer.encode, self.data["Text"]))

        # Fall back to the longest sequence in this file when no length is given.
        self.max_length = (
            self._longest_length() if max_length is None else max_length
        )

        # Truncate first, then pad whatever is still shorter than the target.
        fitted = []
        for token_ids in self.encoded_texts:
            clipped = token_ids[:self.max_length]
            filler = [pad_token_id] * (self.max_length - len(clipped))
            fitted.append(clipped + filler)
        self.encoded_texts = fitted

    def __getitem__(self, index):
        """Return ``(token_ids, label)`` for one row as two long tensors."""
        token_tensor = torch.tensor(self.encoded_texts[index], dtype=torch.long)
        label_tensor = torch.tensor(self.data.iloc[index]["Label"],
                                    dtype=torch.long)
        return token_tensor, label_tensor

    def __len__(self):
        """Number of rows read from the CSV."""
        return len(self.data)

    def _longest_length(self):
        """Length of the longest entry in ``encoded_texts`` (0 when empty)."""
        return max((len(token_ids) for token_ids in self.encoded_texts),
                   default=0)


# The training split decides the common sequence length (max_length=None means
# "use the longest encoded training text").
train_dataset = Sst2DataSet(csv_file="train.csv", tokenizer=tokenizer, max_length=None)

# Validation and test reuse the training length so all three splits share one shape.
val_dataset = Sst2DataSet(csv_file="validation.csv", tokenizer=tokenizer,
                          max_length=train_dataset.max_length)
test_dataset = Sst2DataSet(csv_file="test.csv", tokenizer=tokenizer,
                           max_length=train_dataset.max_length)

print(train_dataset.max_length)

65


In [None]:
import torch


from torch.utils.data import DataLoader
num_workers = 0
batch_size = 8
torch.manual_seed(123)

# Only the training loader shuffles and drops a trailing partial batch.
train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True,
    num_workers=num_workers, drop_last=True,
)

# The evaluation loaders keep DataLoader's defaults: no shuffling and the
# final partial batch is retained.
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=num_workers)
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers)

print("# of batches: ", len(train_loader))

# Peek at a single batch to confirm the tensor shapes.
for train, target in train_loader:
    print(train.shape, target.shape)
    break


# of batches:  5893
torch.Size([8, 65]) torch.Size([8])


In [None]:
def calc_accuracy(data_loader, model, device, num_batches=None):
    """Fraction of correctly classified examples over (part of) a loader.

    The predicted class of each example is the argmax of the model output at
    the last sequence position.

    Args:
        data_loader: sized iterable yielding ``(input_batch, target_batch)``.
        model: callable module; switched to eval mode by this function.
        device: device the batches are moved to before the forward pass.
        num_batches: number of leading batches to score; None scores all.

    Returns:
        ``float("nan")`` for an empty loader, ``0`` when no example was
        scored, otherwise ``correct / total`` as a float.
    """
    if len(data_loader) == 0:
        return float("nan")

    batch_limit = len(data_loader) if num_batches is None else num_batches

    model.eval()  # inference behaviour only
    n_correct, n_seen = 0, 0

    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(data_loader):
            if batch_idx >= batch_limit:
                break

            inputs, targets = inputs.to(device), targets.to(device)

            # Keep only the logits at the final position of every sequence.
            last_logits = model(inputs)[:, -1, :]
            predicted = torch.max(last_logits, 1).indices

            n_seen += targets.size(0)
            n_correct += (predicted == targets).sum().item()

    if n_seen > 0:
        return n_correct / n_seen
    return 0

# Quick spot-check: each split is scored on 4 batches only, not the whole split.
train_accuracy = calc_accuracy(data_loader=train_loader, model=model,
                               device=device, num_batches=4)
val_accuracy = calc_accuracy(data_loader=val_loader, model=model,
                             device=device, num_batches=4)
test_accuracy = calc_accuracy(data_loader=test_loader, model=model,
                              device=device, num_batches=4)

print(
    f"Training accuracy: {train_accuracy*100:.2f}%",
    f"Validation accuracy: {val_accuracy*100:.2f}%",
    f"Test accuracy: {test_accuracy*100:.2f}%",
    sep="\n",
)

Training accuracy: 96.88%
Validation accuracy: 87.50%
Test accuracy: 93.75%


In [None]:
def text_to_token_ids(text, tokenizer, max_length=None, pad_token_id=50256):
    """Encode ``text`` into a ``(1, seq_len)`` tensor of token ids.

    Args:
        text: string to encode.
        tokenizer: object whose ``encode(text)`` returns a list of int ids.
        max_length: when given, the sequence is truncated to this length or
            right-padded with ``pad_token_id`` up to it; None leaves the
            length as produced by the tokenizer.
        pad_token_id: id used for right-padding.

    Returns:
        A ``torch.long`` tensor with a leading batch dimension of size 1.
    """
    # Pin the dtype: without it an empty encoding becomes a float32 tensor,
    # which cannot be concatenated cleanly with the long padding below nor
    # used as indices by an embedding layer.
    tokens = torch.tensor(tokenizer.encode(text), dtype=torch.long).unsqueeze(0)

    # Handle max length if specified
    if max_length is not None:
        current_length = tokens.size(1)
        if current_length > max_length:
            # Truncate if too long
            tokens = tokens[:, :max_length]
        elif current_length < max_length:
            # Pad on the right if too short
            padding = torch.full((1, max_length - current_length),
                                 pad_token_id,
                                 dtype=torch.long)
            tokens = torch.cat([tokens, padding], dim=1)

    return tokens

In [None]:
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
    """Classify one review as positive or negative.

    Args:
        text: review text.
        model: classifier whose output at the last sequence position holds
            the two class logits (index 0 = negative, index 1 = positive).
        tokenizer: object with ``encode(text)`` returning token ids.
        device: device to run the forward pass on.
        max_length: forwarded to ``text_to_token_ids``; when given, the input
            is truncated/right-padded to this length. The datasets in this
            notebook are padded to a common length and scored at the last
            position, so passing that same length reproduces that input
            format. None (default) feeds the text unpadded.
        pad_token_id: forwarded to ``text_to_token_ids`` as the padding id.

    Returns:
        dict with "sentiment" ("Positive"/"Negative"), "confidence"
        (probability of the predicted class) and "probabilities"
        (``{"negative": float, "positive": float}``).
    """
    model.eval()
    with torch.no_grad():
        # Tokenize the input text; previously max_length / pad_token_id were
        # accepted by this function but silently ignored.
        tokens = text_to_token_ids(
            text,
            tokenizer,
            max_length=max_length,
            pad_token_id=pad_token_id,
        ).to(device)

        # Get model prediction
        logits = model(tokens)
        logits = logits[:, -1, :]  # Get last token's logits

        # Convert to probabilities
        probs = torch.softmax(logits, dim=-1)
        prediction = torch.argmax(probs, dim=-1).item()
        confidence = probs[0, prediction].item()

        # Convert to sentiment
        sentiment = "Positive" if prediction == 1 else "Negative"

        return {
            "sentiment": sentiment,
            "confidence": confidence,
            "probabilities": {
                "negative": probs[0, 0].item(),
                "positive": probs[0, 1].item()
            }
        }


In [None]:
# Demo: classify a few hand-written reviews and print the model's verdict.
test_reviews = [
    "This movie was absolutely fantastic!",
    "What a terrible waste of time.",
    "The acting was okay, but the plot was confusing."
]

for review in test_reviews:
    result = classify_review(review, model, tokenizer, device)
    # One print call, one line per field; the output is unchanged.
    print(
        f"\nReview: {review}",
        f"Sentiment: {result['sentiment']}",
        f"Confidence: {result['confidence']:.2%}",
        f"Probabilities: Positive: {result['probabilities']['positive']:.2%}, "
        f"Negative: {result['probabilities']['negative']:.2%}",
        sep="\n",
    )


Review: This movie was absolutely fantastic!
Sentiment: Positive
Confidence: 99.49%
Probabilities: Positive: 99.49%, Negative: 0.51%

Review: What a terrible waste of time.
Sentiment: Negative
Confidence: 86.95%
Probabilities: Positive: 13.05%, Negative: 86.95%

Review: The acting was okay, but the plot was confusing.
Sentiment: Negative
Confidence: 61.85%
Probabilities: Positive: 38.15%, Negative: 61.85%
