In [1]:
import time

import torch
import torch.nn as nn

import gpt.classification
import gpt.configs
import gpt.mygpt
import gpt.spam

from gpt.text import Generator
from gpt.utils import (
    count_trainable_parameters,
    gpt_from_pretrained_url,
)

In [2]:
# Load the pretrained GPT-2 small (124M) model using the matching config.
gpt2 = gpt_from_pretrained_url(
    gpt.configs.GPT_CONFIG_124M,
    "gpt2-small (124M)",
)

# Sanity check: if the weights were loaded correctly, continuing a short
# prompt should produce coherent text.
# NOTE(review): `encode` appears to store the prompt on the generator so that
# `generate(25)` can continue it -- confirm against gpt.text.Generator.
gen = Generator(gpt2)
gen.encode("Every effort moves you")
print(gen.generate(25))
# Nothing has been frozen yet, so this counts every parameter that requires
# gradients in the freshly loaded model.
print("Trainable parameters:", count_trainable_parameters(gpt2))

Every effort moves you forward.

The first step is to understand the importance of your work.

The second step is to understand the
Trainable parameters: 163037184


In [3]:
# Prepare the spam dataset; later cells read its "train" and "validation"
# entries.
# NOTE(review): this reaches into the generator's private `_tokenizer`
# attribute -- consider exposing the tokenizer through a public accessor.
spam_data = gpt.spam.prepare_spam_data(gen._tokenizer)

.cache/sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction


In [4]:
# Start from a fully frozen network: no pretrained weight receives gradients.
for param in gpt2.parameters():
    param.requires_grad = False

# Swap the output head for a freshly initialized two-class classifier.
# The seed is set immediately before construction so the head's
# initialization is reproducible.
torch.manual_seed(123)
num_classes = 2
emb_dim = gpt.configs.GPT_CONFIG_124M["emb_dim"]
gpt2.out_head = nn.Linear(emb_dim, num_classes)

# Also unfreeze the last transformer block and the final layer norm;
# training them alongside the new head leads to better results in practice.
for module in (gpt2.trf_blocks[-1], gpt2.final_norm):
    for param in module.parameters():
        param.requires_grad = True

In [5]:
# Fine-tune the partially unfrozen GPT-2 as a classifier on CPU and report
# the wall-clock training time.
# `time` is imported in the notebook's top import cell together with the
# other dependencies, rather than in the middle of the notebook.
start_time = time.time()

# Seed before training so the run is reproducible.
torch.manual_seed(123)
optimizer = torch.optim.AdamW(gpt2.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 5

# The per-evaluation metrics are kept as notebook globals for later
# inspection.
train_losses, val_losses, train_accs, val_accs, examples_seen = (
    gpt.classification.train_classifier_simple(
        gpt2,
        spam_data["train"],
        spam_data["validation"],
        optimizer,
        torch.device('cpu'),
        num_epochs=num_epochs,
        eval_freq=50,
        eval_iter=5,
    )
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 2.155, Val loss 2.861
Ep 1 (Step 000050): Train loss 0.631, Val loss 0.618
Ep 1 (Step 000100): Train loss 0.498, Val loss 0.561
Training accuracy: 75.00% | Validation accuracy: 77.500000%
Ep 2 (Step 000150): Train loss 0.494, Val loss 0.507
Ep 2 (Step 000200): Train loss 0.363, Val loss 0.478
Ep 2 (Step 000250): Train loss 0.455, Val loss 0.359
Training accuracy: 70.00% | Validation accuracy: 92.500000%
Ep 3 (Step 000300): Train loss 0.414, Val loss 0.442
Ep 3 (Step 000350): Train loss 0.491, Val loss 0.350
Training accuracy: 87.50% | Validation accuracy: 87.500000%
Ep 4 (Step 000400): Train loss 0.234, Val loss 0.223
Ep 4 (Step 000450): Train loss 0.212, Val loss 0.223
Ep 4 (Step 000500): Train loss 0.153, Val loss 0.094
Training accuracy: 97.50% | Validation accuracy: 97.500000%
Ep 5 (Step 000550): Train loss 0.139, Val loss 0.099
Ep 5 (Step 000600): Train loss 0.049, Val loss 0.097
Training accuracy: 95.00% | Validation accuracy: 95.000000%
Training co

In [6]:
def count_trainable_parameters(model):
    """Return the total element count of all parameters that require gradients.

    NOTE(review): this definition shadows the ``count_trainable_parameters``
    helper imported from ``gpt.utils`` at the top of the notebook; every later
    cell uses this version.

    Args:
        model: Object exposing ``parameters()`` whose items provide
            ``requires_grad`` and ``numel()``.

    Returns:
        int: Sum of ``numel()`` over parameters with ``requires_grad`` set.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [7]:
# Trainable parameter count of the fine-tuned model: only the parameters left
# with requires_grad=True (last transformer block, final norm, new head).
count_trainable_parameters(gpt2)

7090946

In [8]:
class LoRALayer(nn.Module):
    """Low-rank adapter computing ``alpha * (x @ A @ B)``.

    Args:
        in_dim: Size of the input's last dimension (rows of ``A``).
        out_dim: Size of the output's last dimension (columns of ``B``).
        rank: Inner dimension shared by ``A`` and ``B``.
        alpha: Scalar multiplier applied to the adapter output.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        # A is drawn from a normal distribution scaled by 1/sqrt(rank).
        init_scale = 1 / torch.sqrt(torch.tensor(rank).float())
        down_proj = torch.randn(in_dim, rank) * init_scale
        # B starts at zero, so a new adapter contributes nothing until trained.
        up_proj = torch.zeros(rank, out_dim)
        self.A = nn.Parameter(down_proj)
        self.B = nn.Parameter(up_proj)
        self.alpha = alpha

    def forward(self, x):
        """Project ``x`` through ``A`` then ``B`` and scale by ``alpha``."""
        low_rank = x @ self.A
        return self.alpha * (low_rank @ self.B)

class LinearWithLoRA(nn.Module):
    """Wrap a linear layer and add a LoRA adapter's output to its result.

    Args:
        linear: Layer to wrap; its ``in_features`` and ``out_features`` size
            the adapter.
        rank: Rank passed to the ``LoRALayer``.
        alpha: Scale passed to the ``LoRALayer``.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        # The adapter is registered before the wrapped layer, which fixes the
        # order of child modules and parameters.
        self.lora = LoRALayer(
            in_dim=linear.in_features,
            out_dim=linear.out_features,
            rank=rank,
            alpha=alpha,
        )
        self.linear = linear

    def forward(self, x):
        """Return the wrapped layer's output plus the adapter's output."""
        base_out = self.linear(x)
        adapter_out = self.lora(x)
        return base_out + adapter_out

In [9]:
# Load the pretrained GPT-2 small (124M) weights again under a separate name
# for the LoRA experiment, instead of reusing the `gpt2` instance that was
# fine-tuned above.
gpt_lora = gpt_from_pretrained_url(
    gpt.configs.GPT_CONFIG_124M,
    "gpt2-small (124M)",
)
# Baseline count before freezing and adding adapters.
print("Trainable parameters:", count_trainable_parameters(gpt_lora))

Trainable parameters: 163037184


In [10]:
def lora_linear_layers(model, rank, alpha):
    """Replace every ``nn.Linear`` under ``model`` with a ``LinearWithLoRA``.

    The module tree is modified in place. Direct children that are
    ``nn.Linear`` are wrapped; all other children are searched recursively.

    Args:
        model: Root module whose children are inspected.
        rank: LoRA rank given to each wrapper.
        alpha: LoRA scale given to each wrapper.
    """
    # Snapshot the children first, because attributes are reassigned while
    # walking them.
    for child_name, submodule in list(model.named_children()):
        if not isinstance(submodule, nn.Linear):
            # Not a linear layer: look for linear layers nested inside it.
            lora_linear_layers(submodule, rank, alpha)
            continue
        setattr(model, child_name, LinearWithLoRA(submodule, rank, alpha))

# Freeze every pretrained weight; only what is added below will train.
for param in gpt_lora.parameters():
    param.requires_grad = False

# Install a fresh two-class classification head in place of the output head.
# The seed is set immediately before construction for reproducibility.
torch.manual_seed(123)
num_classes = 2
emb_dim = gpt.configs.GPT_CONFIG_124M["emb_dim"]
gpt_lora.out_head = nn.Linear(emb_dim, num_classes)

# Wrap the linear layers of the model (the new head included) with LoRA
# adapters.
lora_linear_layers(gpt_lora, rank=8, alpha=0.1)

print("Trainable parameters after LoRA:", count_trainable_parameters(gpt_lora))

Trainable parameters after LoRA: 1334802


In [11]:
def train(gpt_model, num_epochs=5):
    """Fine-tune ``gpt_model`` on the spam data and print the elapsed time.

    Args:
        gpt_model: Model to optimize; it is also the model handed to
            ``gpt.classification.train_classifier_simple``.
        num_epochs: Number of epochs forwarded to the training routine.

    Returns:
        None. The metrics returned by the training routine are discarded.
    """
    start_time = time.time()
    torch.manual_seed(123)
    # Build the optimizer from the model that was passed in. Reading the
    # notebook-global `gpt_lora` here would train the wrong parameters
    # whenever `train` is called with any other model.
    optimizer = torch.optim.AdamW(
        gpt_model.parameters(), lr=5e-5, weight_decay=0.1
    )

    gpt.classification.train_classifier_simple(
        gpt_model,
        spam_data["train"],
        spam_data["validation"],
        optimizer,
        torch.device('cpu'),
        num_epochs=num_epochs,
        eval_freq=50,
        eval_iter=5,
    )

    execution_time_minutes = (time.time() - start_time) / 60
    print(f"Training completed in {execution_time_minutes:.2f} minutes.")

In [12]:
!pip install line_profiler



In [13]:
%lprun -f gpt.classification.train_classifier_simple train(gpt_lora)

UsageError: Line magic function `%lprun` not found.


In [14]:
%%prun -s cumulative -T prun0
# Profile a single training epoch of the LoRA model. The report is sorted by
# cumulative time (-s cumulative) and also saved to the text file 'prun0' (-T).
train(gpt_lora, num_epochs=1)

Ep 1 (Step 000000): Train loss 2.252, Val loss 2.991
Ep 1 (Step 000050): Train loss 0.646, Val loss 0.580
Ep 1 (Step 000100): Train loss 0.416, Val loss 0.517
Training accuracy: 77.50% | Validation accuracy: 90.000000%
Training completed in 1.96 minutes.
 
*** Profile printout saved to text file 'prun0'.


         1271349 function calls (1134340 primitive calls) in 117.669 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       35    0.525    0.015  118.187    3.377 {method 'control' of 'select.kqueue' objects}
       34    0.026    0.001   91.586    2.694 base_events.py:1960(_run_once)
58650/170    0.067    0.000   64.581    0.380 module.py:1747(_wrapped_call_impl)
58650/170    1.222    0.000   64.580    0.380 module.py:1755(_call_impl)
      170    0.046    0.000   64.572    0.380 mygpt.py:142(forward)
 2210/170    0.151    0.000   64.274    0.378 container.py:238(forward)
     2040    0.907    0.000   64.033    0.031 mygpt.py:112(forward)
      160    0.003    0.000   60.876    0.380 classification.py:34(_calc_loss_batch)
      130    0.001    0.000   51.762    0.398 _tensor.py:592(backward)
      130    0.001    0.000   51.253    0.394 __init__.py:243(backward)
      130    0.175    0.001   51.248    0.394 graph.py:815(

In [None]:
# List the working directory.
# NOTE(review): presumably a scratch check that the 'prun0' profile report was
# written -- confirm, or remove this cell from the finished notebook.
!ls