<a href="https://colab.research.google.com/github/zodbot/llm_finetuning/blob/main/notebooks/finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install tensorflow>=2.15.0  tqdm>=4.66

In [6]:
import urllib.request
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch05/"
    "01_main-chapter-code/gpt_download.py"
)
filename = url.split('/')[-1]
urllib.request.urlretrieve(url, filename)

('gpt_download.py', <http.client.HTTPMessage at 0x7d40e5cbbe10>)

In [7]:
# from gpt_download import download_and_load_gpt2
# settings, params = download_and_load_gpt2(
#     model_size="124M", models_dir="gpt2"
# )

In [8]:
from gpt_download import download_and_load_gpt2
from google.colab import drive
drive.mount('/content/drive')

# Change your download path to Google Drive
settings, params = download_and_load_gpt2(
    model_size="124M",
    models_dir="/content/drive/MyDrive/gpt2"  # Save to Drive instead
)

Mounted at /content/drive
File already exists and is up-to-date: /content/drive/MyDrive/gpt2/124M/checkpoint
File already exists and is up-to-date: /content/drive/MyDrive/gpt2/124M/encoder.json
File already exists and is up-to-date: /content/drive/MyDrive/gpt2/124M/hparams.json
File already exists and is up-to-date: /content/drive/MyDrive/gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: /content/drive/MyDrive/gpt2/124M/model.ckpt.index
File already exists and is up-to-date: /content/drive/MyDrive/gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: /content/drive/MyDrive/gpt2/124M/vocab.bpe


In [9]:
print("Settings:", settings)
print("Parameter dictionary keys:", params.keys())
print("Token embedding weight tensor dimensions:", params["wte"].shape)
print("Position embedding weight tensor dimensions:", params["wte"].shape)

Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])
Token embedding weight tensor dimensions: (50257, 768)
Position embedding weight tensor dimensions: (50257, 768)


In [10]:

GPT_CONFIGS = {
    "gpt2-small (124M)": {
        "vocab_size": 50257,    # Vocabulary size
        "context_length": 1024, # Context length
        "emb_dim": 768,        # Embedding dimension
        "n_heads": 12,         # Number of attention heads
        "n_layers": 12,        # Number of transformer layers
        "drop_rate": 0.1,      # Dropout rate
        "qkv_bias": True       # Whether to use bias in query, key, value projections
    }
}

# You can then get a config like this:
model_name = "gpt2-small (124M)"
config = GPT_CONFIGS[model_name]

In [11]:
import sys


!git clone https://github.com/zodbot/llm_finetuning.git

# Change into repo directory
%cd llm_finetuning


from src.model import GPTModel
from src.model import GPTModel
from src.transformer import TransformerBlock

# Create model instance
gpt = GPTModel(config)
gpt.eval()


Cloning into 'llm_finetuning'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 34 (delta 6), reused 28 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (34/34), 10.27 KiB | 1.71 MiB/s, done.
Resolving deltas: 100% (6/6), done.
/content/llm_finetuning


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Li

In [12]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, "
                          "Right: {right.shape}"
        )
    return torch.nn.Parameter(torch.tensor(right))


## Loading pretrained weights from OpenAI

In [13]:
import numpy as np
import torch
# Sets the model’s positional and token embedding weights to those specified in params.
def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    # The np.split function is used to divide the attention and bias weights into three equal parts for the query, key, and value components.
    # Iterates over each transformer block in the model


    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T)
        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b)
        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight,
            params["blocks"][b]["attn"]["c_proj"]["w"].T)

        gpt.trf_blocks[b].att.out_proj.bias = assign(
          gpt.trf_blocks[b].att.out_proj.bias,
          params["blocks"][b]["attn"]["c_proj"]["b"])
        gpt.trf_blocks[b].ff.layers[0].weight = assign(
          gpt.trf_blocks[b].ff.layers[0].weight,
          params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
          gpt.trf_blocks[b].ff.layers[0].bias,
          params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
          gpt.trf_blocks[b].ff.layers[2].weight,
          params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
          gpt.trf_blocks[b].ff.layers[2].bias,
          params["blocks"][b]["mlp"]["c_proj"]["b"])
        gpt.trf_blocks[b].norm1.scale = assign(
          gpt.trf_blocks[b].norm1.scale,
          params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(
          gpt.trf_blocks[b].norm1.shift,
          params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(
          gpt.trf_blocks[b].norm2.scale,
          params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(
          gpt.trf_blocks[b].norm2.shift,
          params["blocks"][b]["ln_2"]["b"])

        # The original GPT-2 model byOpenAI reused the token embedding weights in the output layer
        # to reduce the total number of parameters, which is a concept known as weight tying.
        gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
        gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
        gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])



In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

load_weights_into_gpt(gpt, params)
gpt.to(device)

Using device: cuda


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Li

In [15]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [16]:
def generate(model, idx, max_new_tokens, context_size,
             temperature=0.0, top_k=None, eos_id=None):
    for _ in range(max_new_tokens):
        # idx is a (batch, n_tokens) array of indices in the current context.
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        #  we only need the last position
        logits = logits[:, -1, :]

        if top_k is not None:
          top_logits, _ = torch.topk(logits, top_k)
          min_val = top_logits[:, -1]
          logits = torch.where(
              logits < min_val,
              torch.tensor(float('-inf')).to(logits.device),
              logits
          )
        #  adds a probabilistic selection process; higher tempreture, More random/creative/diverse outputs
        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            # probabilistic sampling
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        if idx_next == eos_id:
          break

        idx = torch.cat((idx, idx_next), dim=1)

    return idx

In [17]:
import tiktoken

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

tokenizer = tiktoken.get_encoding("gpt2")
torch.manual_seed(123)
token_ids = generate(
    model=gpt,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device), max_new_tokens=25,
    context_size=config["context_length"],
    top_k=30,
    temperature=1
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you as far as the eye can see. It is not a perfect game. This is simply a very small number of things I


* Small k (1-10): Very focused, conservative outputs
* Medium k (20-50): Balanced between focus and variety
* Large k (50-1000): More diverse, creative outputs

In [18]:
# greedy decoding
# Completely deterministic - same input always produces same output
def generate_simple(model, idx, max_new_tokens, context_size):
  model.eval()

  for _ in range(max_new_tokens):

    idx = idx[: , -context_size:]

    with torch.no_grad():
      logits = model(idx) # batch, context_size, vector

    logits = logits[: , -1,: ]

    next_word = torch.argmax(logits, -1, keepdim=True)

    idx = torch.cat((idx, next_word), dim=1)
  return idx

token_ids = generate_simple(
    model=gpt,
    idx=text_to_token_ids("Every effort moves you", tokenizer).to(device), max_new_tokens=25,
    context_size=config["context_length"]
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))


Output text:
 Every effort moves you forward.

The first step is to understand the importance of your work.

The second step is to understand the


In [19]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [20]:
# SST-2 (Stanford Sentiment Treebank) is a great dataset for binary sentiment classification
from datasets import load_dataset
ds = load_dataset('nyu-mll/glue', 'sst2', split='train')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [54]:
# Convert to pandas DataFrame
import pandas as pd
df = pd.DataFrame({
    'Label': ds['label'],
    'Text': ds['sentence']
})

print(len(df))
print(df["Label"].value_counts())

# Take random sample of 1000 rows
# df_sample = df.sample(n=2000, random_state=42)  # random_state for reproducibility

# Optional: Look at the distribution of labels to ensure it's balanced
# print("Label distribution in sample:")
# print(df_sample['Label'].value_counts())

def random_split(df, train_frac, validation_frac):
    df = df.sample(
        frac=1, random_state=123
    ).reset_index(drop=True)
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]
    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(df, 0.7, 0.1)

train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

f = open("train.csv")
inputs = []
max_length = 0
for line in f.read():
  ids = tokenizer.encode(line)
  max_length = max(max_length, len(ids))
  inputs.append(ids)


for input in inputs:
  for _ in range(max_length - len(input)):
    input.append(5027)



67349
Label
1    37569
0    29780
Name: count, dtype: int64


In [55]:
import torch
from torch.utils.data import Dataset
# it identifies the longest sequence in the training dataset, encodes the text messages,
# and ensures that all other sequences are padded with a padding token to match the length of the longest sequence.
class Sst2DataSet(Dataset):
  def __init__(self, csv_file, tokenizer, max_length=None,
                 pad_token_id=50256):
      self.data = pd.read_csv(csv_file)
      self.encoded_texts = [tokenizer.encode(data) for data in self.data["Text"]]
      if max_length is None:
          self.max_length = self._longest_length()
      else:
          self.max_length = max_length
      # Truncates sequences if they are longer than max_length
      self.encoded_texts = [
                  encoded_text[:self.max_length]
                  for encoded_text in self.encoded_texts
      ]
      # add padding
      self.encoded_texts = [
        encoded_text + [pad_token_id] *
        (self.max_length - len(encoded_text))
        for encoded_text in self.encoded_texts
      ]


  def __getitem__(self, index):
      encoded = self.encoded_texts[index]
      label = self.data.iloc[index]["Label"]
      return (
          torch.tensor(encoded, dtype=torch.long),
          torch.tensor(label, dtype=torch.long)
      )
  def __len__(self):
      return len(self.data)

  def _longest_length(self):
      max_length = 0
      for encoded_text in self.encoded_texts:
          encoded_length = len(encoded_text)
          if encoded_length > max_length:
              max_length = encoded_length
      return max_length


train_dataset = Sst2DataSet(
    csv_file="train.csv",
    max_length=None,
    tokenizer=tokenizer
)

val_dataset = Sst2DataSet(
    csv_file="validation.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)
test_dataset = Sst2DataSet(
    csv_file="test.csv",
    max_length=train_dataset.max_length,
    tokenizer=tokenizer
)
print(train_dataset.max_length)

65


In [56]:
import torch
print(f"GPU Memory Available: {torch.cuda.get_device_properties(0).total_memory/1e9:.2f} GB")

from torch.utils.data import DataLoader
num_workers = 0
batch_size = 8
torch.manual_seed(123)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    drop_last=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=False,
)
print("# of batches: ", len(train_loader))
for train, target in train_loader:
  print(train.shape, target.shape)
  break


GPU Memory Available: 15.83 GB
# of batches:  5893
torch.Size([8, 65]) torch.Size([8])


In [57]:
test_queries = [
    "Is this review positive or negative? Answer with 'positive' or 'negative': This movie was absolutely fantastic with great acting",
    "Is this review positive or negative? Answer with 'positive' or 'negative': The plot was boring and the characters were flat",
    "Is this review positive or negative? Answer with 'positive' or 'negative': A masterpiece of modern cinema",
    "Is this review positive or negative? Answer with 'positive' or 'negative': I fell asleep during the first hour"
]

for query in test_queries:
  token_ids = generate_simple(
    model=gpt,
    idx=text_to_token_ids(query, tokenizer).to(device), max_new_tokens=25,
    context_size=config["context_length"]
  )
  print(token_ids_to_text(token_ids, tokenizer))


# the model is struggling to follow instructions.
#  This result is expected, as it has only undergone pretraining and lacks instruction fine-tuning.


Is this review positive or negative? Answer with 'positive' or 'negative': This movie was absolutely fantastic with great acting, great music, great acting, great acting, great acting, great acting, great acting, great acting, great acting,
Is this review positive or negative? Answer with 'positive' or 'negative': The plot was boring and the characters were flat. The plot was not interesting. The plot was not interesting. The plot was not interesting. The plot was not interesting.
Is this review positive or negative? Answer with 'positive' or 'negative': A masterpiece of modern cinema.

A masterpiece of modern cinema. A masterpiece of modern cinema. A masterpiece of modern cinema. A masterpiece of modern
Is this review positive or negative? Answer with 'positive' or 'negative': I fell asleep during the first hour of the review. I was not able to sleep for more than an hour. I was able to sleep for about an hour


## Adding a classification head
we replace the original output layer, which maps the hidden representation to a vocabulary of 50,257, with a smaller output layer that maps to two classes: 0 and 1

In [58]:
# 1) change the last layer and fine-tune just the final transformer block

model = GPTModel(config)
load_weights_into_gpt(model, params)
model.eval()

# freeze the model
for param in model.parameters():
  param.requires_grad = False

# add a classification layer
num_classes = 2
model.out_head = torch.nn.Linear(config["emb_dim"], num_classes)

for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True
for param in model.final_norm.parameters():
    param.requires_grad = True

In [59]:
def calc_accuracy(data_loader, model, device, num_batches=None):
    model.eval()
    correct_predictions, num_examples = 0, 0

    if not num_batches:
      return
    model.eval()
    for i, (input_batch, target_batch) in enumerate(data_loader):
      if i < num_batches:
        input_batch = input_batch.to(device)
        target_batch = target_batch.to(device)
        with torch.no_grad():
          output = model(input_batch)
        output = output[:, -1, :]
        result = torch.argmax(output, -1)
        correct_predictions += sum(predicted == target_batch[i] for i, predicted in enumerate(result))
        num_examples += output.shape[0]

    return correct_predictions / num_examples


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
torch.manual_seed(123)
train_accuracy = calc_accuracy(
    train_loader, model, device, num_batches=10
)
val_accuracy = calc_accuracy(
    val_loader, model, device, num_batches=10
)
test_accuracy = calc_accuracy(
    test_loader, model, device, num_batches=10
)
print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")


Training accuracy: 55.00%
Validation accuracy: 62.50%
Test accuracy: 61.25%


In [60]:
def calc_loss(data_loader, model, device, num_batches=None):
  if len(data_loader) == 0:
    return float("nan")

  if num_batches is None:
    num_batches = len(data_loader)

  model.eval()

  all_loss = 0

  for i, (input_batch, target_batch) in enumerate(data_loader):
    if i < num_batches:
      input_batch = input_batch.to(device)
      target_batch = target_batch.to(device)

      with torch.no_grad():
        predicted = model(input_batch)
      # we only care about the last token
      predicted = predicted[:, -1, :]
      loss = torch.nn.functional.cross_entropy(predicted, target_batch)
      all_loss += loss.item()
    else:
      break

  return all_loss / num_batches



In [61]:
def calc_loss_batch(input_batch, target_batch, model, device):
  input_batch = input_batch.to(device)
  target_batch = target_batch.to(device)
  logits = model(input_batch)[:, -1, :]
  loss = torch.nn.functional.cross_entropy(logits, target_batch)
  return loss

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
      train_loss = calc_loss(
            train_loader, model, device, num_batches=eval_iter
        )
      val_loss = calc_loss(
            val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss

def train_classifier(model, data_loader, val_loader, optimizer, device,
               epoch_num, eval_freq, eval_iter):
  model.train()
  examples_seen = 0
  steps = -1
  train_losses, val_losses, train_accs, val_accs = [], [], [], []

  for epoch in range(epoch_num):
    model.train()
    for i, (input_batch, target_batch) in enumerate(data_loader):
      # reset optimizer
      optimizer.zero_grad()

      loss = calc_loss_batch(input_batch, target_batch, model, device)

      # cal loss gradient
      loss.backward()

      # update weight
      optimizer.step()
      # we care about number of smples not tokens here
      examples_seen += input_batch.shape[0]
      steps += 1

      if steps % eval_freq == 0:
          train_loss, val_loss = evaluate_model(
            model, data_loader, val_loader, device, eval_iter)
          train_losses.append(train_loss)
          val_losses.append(val_loss)
          print(f"Ep {epoch+1} (Step {steps:06d}): "
                f"Train loss {train_loss:.3f}, "
                f"Val loss {val_loss:.3f}")

    train_accuracy = calc_accuracy(
      data_loader, model, device, num_batches=eval_iter
    )
    val_accuracy = calc_accuracy(
      val_loader, model, device, num_batches=eval_iter
    )
    print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
    print(f"Validation accuracy: {val_accuracy*100:.2f}%")
    train_accs.append(train_accuracy)
    val_accs.append(val_accuracy)

  return train_losses, val_losses, train_accs, val_accs, examples_seen






In [None]:
import time
start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.1)
num_epochs = 5
train_losses, val_losses, train_accs, val_accs, examples_seen = \
    train_classifier(
        model, train_loader, val_loader, optimizer, device,
        num_epochs, eval_freq=2000,
        eval_iter=5
)
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 0.239, Val loss 0.576
Ep 1 (Step 002000): Train loss 0.343, Val loss 0.505
Ep 1 (Step 004000): Train loss 0.228, Val loss 0.480
Training accuracy: 90.00% | Validation accuracy: 75.00%
Ep 2 (Step 006000): Train loss 0.257, Val loss 0.595
Ep 2 (Step 008000): Train loss 0.155, Val loss 0.542
Ep 2 (Step 010000): Train loss 0.307, Val loss 0.536
Training accuracy: 87.50% | Validation accuracy: 72.50%
Ep 3 (Step 012000): Train loss 0.243, Val loss 0.482
Ep 3 (Step 014000): Train loss 0.356, Val loss 0.488
Ep 3 (Step 016000): Train loss 0.458, Val loss 0.458
Training accuracy: 85.00% | Validation accuracy: 77.50%
Ep 4 (Step 018000): Train loss 0.202, Val loss 0.508
Ep 4 (Step 020000): Train loss 0.332, Val loss 0.539
Ep 4 (Step 022000): Train loss 0.112, Val loss 0.442
Training accuracy: 92.50% | Validation accuracy: 80.00%
Ep 5 (Step 024000): Train loss 0.347, Val loss 0.421
Ep 5 (Step 026000): Train loss 0.156, Val loss 0.424
