# Setting up the model

Initial project setup includes importing the required packages and downloading the training dataset.

In [2]:
# Imports and setup
import os
import sys
import torch
import matplotlib.pyplot as plt
import pandas as pd
import torch.nn as nn
from torch.nn import functional as F

# Optional import for data visualization
# import matplotlib.pyplot as plt

In [4]:
# Location of training data
ZIP_URL = 'https://github.com/tpchikumbu/GPTiny/archive/main.zip'
PROJECT_DIR = os.getcwd() + '/LMDatasets'

# Check if the data is already downloaded
print('Searching for data in ', PROJECT_DIR)
if not os.path.isdir(PROJECT_DIR):
  # Download the compressed project files
  !wget -O "GPTiny-main.zip" "$ZIP_URL"

  # Extract only the specific folder from the ZIP file
  !unzip -q "GPTiny-main.zip" "GPTiny-main/LMDatasets/*" -d "."
  !mv "GPTiny-main/LMDatasets" "."

  # Remove temporary files
  !rm -rf "GPTiny-main.zip"
  !rm -rf "GPTiny-main"

else:
  print('Data already downloaded')

Searching for data in  /home/peter/Documents/Hons/NLP/GPTiny/LMDatasets
Data already downloaded


The following code selects which language from the dataset is used for training. Available languages are:

- isiXhosa (xh)
- isiZulu (zu)
- siSwati (ss)
- isiNdebele (nr)

In [None]:
language = "nr" # Options are nr, ss, xh, zu
file_name = f'{PROJECT_DIR}/nchlt_text.{language}'


def _read_split(suffix):
    """Return the whole text of the corpus file ``file_name + suffix``, read as UTF-8."""
    with open(file_name + suffix, 'r', encoding='utf-8') as handle:
        return handle.read()


# Each split is held as one plain string (the *_df names notwithstanding)
train_df = _read_split(".train")
dev_df = _read_split(".valid")
test_df = _read_split(".test")

# Report the size of every split in characters
for label, text in (('Train', train_df), ('Dev', dev_df), ('Test', test_df)):
    print(f'{label} shape: ', len(text))

Train shape:  6382803
Dev shape:  441906
Test shape:  444199


## Generating Vocabulary
In natural language processing, a vocabulary refers to the collection of valid tokens within a dataset. The predictive nature of GPT prevents it from generating any new tokens.

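For ease of operation, these tokens are encoded as integers. Each character is assigned its position in the sorted list of distinct characters found in the training data, so characters are numbered in order of their Unicode code points (which coincides with ASCII order for ASCII characters).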

In [4]:
# Vocabulary: every distinct character of the training text, in sorted order
used_chars = sorted(set(train_df))
vocab_size = len(used_chars)
print("Tokens: ", ''.join(used_chars))
print("Token count: ", vocab_size)

# Lookup tables between characters and integer ids (id = position in used_chars)
int_to_char = dict(enumerate(used_chars))
char_to_int = {ch: i for i, ch in int_to_char.items()}


def encode(s):
    """Encoder: take a string, output the list of integer ids of its characters."""
    return [char_to_int[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, output the string they spell."""
    return ''.join(int_to_char[i] for i in l)

Tokens:  
 !"$%&')*+,-./0123456789:;<>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_abcdefghijklmnopqrstuvwxyz{|}~ ¡£©«­°±²³´¸¹ºÂÃÅž
Token count:  110


In [5]:
# Use the GPU when torch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Torch device: ", device)


def _encode_to_tensor(text):
    """Encode ``text`` with ``encode`` and return the ids as a tensor on ``device``."""
    return torch.tensor(encode(text), device=device)


# One tensor of token ids per dataset split
train_encoded = _encode_to_tensor(train_df)
dev_encoded = _encode_to_tensor(dev_df)
test_encoded = _encode_to_tensor(test_df)

Torch device:  cpu


## Neural network definition

Specifies the self-attention, multi-head attention and feed-forward modules contained in each decoder block.

In [11]:
# Self attention head

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: Output width of the key, query and value projections.
        n_embd: Width of the incoming embeddings (last dimension of ``x``).
        block_size: Side length of the lower-triangular causal mask, i.e. the
            longest sequence the head is expected to receive.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, head_size, n_embd, block_size, dropout=0.1):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Registered as a buffer: stored with the module but not a trainable parameter
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: Tensor of shape (B, T, n_embd); T is expected to be at most
                ``block_size``.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        # Scale by 1/sqrt(d_k), where d_k is the key width (head_size) and not
        # the embedding width of x; the two differ whenever there is more than one head.
        scale = k.shape[-1] ** -0.5
        wei = q @ k.transpose(-2, -1) * scale # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Causal mask: position t may only attend to positions <= t
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # Weighted aggregation of the values
        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

In [12]:
# Multi-head to find parallel attention

class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, then projected back to n_embd.

    Args:
        num_heads: Number of ``Head`` modules to run side by side.
        head_size: Output width of each head.
        n_embd: Width of the incoming embeddings and of the returned tensor.
        block_size: Maximum sequence length, forwarded to every head.
        head_dropout: Dropout probability inside each head.
        multi_dropout: Dropout probability applied after the output projection.
    """

    def __init__(self, num_heads, head_size, n_embd, block_size, head_dropout=0.1, multi_dropout=0.1):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, n_embd, block_size, head_dropout) for _ in range(num_heads)])
        # The concatenated heads are num_heads * head_size wide. That equals
        # n_embd only when n_embd is divisible by num_heads, so size the
        # projection from the actual concatenation width.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(multi_dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last axis, project and apply dropout."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [13]:
# Feed-forward network
class FeedFoward(nn.Module):
    """Two-layer perceptron: widen, apply ReLU, narrow back to n_embd, then dropout.

    Args:
        n_embd: Width of the input and of the output.
        wide: Multiplier giving the hidden width (``wide * n_embd``).
        dropout: Dropout probability applied to the output.
    """
    # Class-level default; each instance replaces it with ``wide`` in __init__
    widening : int = 4

    def __init__(self, n_embd, wide = 4, dropout=0.1):
        super().__init__()
        self.widening = wide
        hidden = self.widening * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the result."""
        out = self.net(x)
        return out

In [14]:
# Compute + communicate
class Block(nn.Module):
    """Transformer block: layer-normed self-attention, then a layer-normed
    feed-forward network, each added back onto its input (residual connection).

    Args:
        n_embd: Embedding dimension.
        n_head: Number of attention heads; each head gets ``n_embd // n_head`` channels.
        block_size: Maximum sequence length, forwarded to the attention module.
        widen: Hidden-width multiplier of the feed-forward network.
        head_dropout: Dropout probability inside each attention head.
        multi_dropout: Dropout probability after the attention projection.
        ff_dropout: Dropout probability at the end of the feed-forward network.
        block_dropout: Probability for ``self.dropout``, which ``forward``
            does not currently apply.
    """

    def __init__(self, n_embd, n_head, block_size, widen = 4, head_dropout=0.1, multi_dropout=0.1, ff_dropout=0.1, block_dropout=0.1):
        super().__init__()
        # Integer share of the embedding width handed to every head
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head, n_embd, block_size, head_dropout, multi_dropout)
        self.ffwd = FeedFoward(n_embd, widen, ff_dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.dropout = nn.Dropout(block_dropout)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Communication: residual around normalised self-attention
        attended = self.sa(self.ln1(x))
        x = x + attended
        # Computation: residual around the normalised feed-forward network
        transformed = self.ffwd(self.ln2(x))
        return x + transformed



## Training Hyperparameters
The following block calculates the average sentence length in the corpus. This value can be used as the context size when training the model, so that in-sentence context is prioritised over cross-sentence context. Other model hyperparameters, like the batch size, the amount of dropout applied and the connections between layers, are also specified here.

In [15]:
# Average length of the pieces obtained by splitting the training text on newlines
piece_lengths = [len(piece) for piece in train_df.split('\n')]
sentence_count = len(piece_lengths)
avg_length = round(sum(piece_lengths) / sentence_count)

print("Total splits: ", sentence_count)
print("Average length of split elements: ", avg_length)

# ------------
# Hyperparameters
torch.manual_seed(1337)
batch_size = 256      # sequences processed in parallel per batch
block_size = 64       # maximum context length for predictions (avg_length is an alternative)
max_iters = 1000      # number of training iterations
eval_interval = 100   # evaluate the losses every this many iterations
learning_rate = 4e-2
eval_iters = 100      # batches averaged per loss estimate
n_embd = 64           # embedding dimension
n_head = 16           # attention heads per block
n_layer = 8           # number of transformer blocks
dropout = 0.2
# ------------

Total splits:  57099
Average length of split elements:  111


The helper functions required during training are defined below: one samples a batch from the desired dataset split and the other estimates the loss of the model.

In [16]:
# Generate dataloader with different block and batch sizes

def get_batch(split: str):
  """Sample a random batch of input/target windows from one dataset split.

  Args:
    split: One of "train", "dev" or "test".

  Returns:
    A pair ``(x, y)`` of stacked windows, ``batch_size`` windows of
    ``block_size`` tokens each; ``y`` is the same window as ``x`` taken one
    position later in the data, i.e. the next-token targets.

  Raises:
    ValueError: If ``split`` is not a known split name.
  """
  if split == "train":
    data = train_encoded
  elif split == "dev":
    data = dev_encoded
  elif split == "test":
    data = test_encoded
  else:
    # Fail with a clear message instead of an UnboundLocalError on `data` below
    raise ValueError(f"Unknown split {split!r}; expected 'train', 'dev' or 'test'")

  # Start offsets are capped so the one-step-shifted target window still fits in the data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x, y

# Estimate loss
@torch.no_grad()
def estimate_loss(mode: str, model: nn.Module):
    """Estimate the mean loss of ``model`` over ``eval_iters`` random batches per split.

    Args:
        mode: "train" evaluates the 'train' and 'dev' splits; "test" evaluates
            the 'test' split only.
        model: Module called as ``model(X, Y)`` that returns ``(logits, loss)``.

    Returns:
        Dict mapping each evaluated split name to its mean loss (a 0-dim tensor).

    Raises:
        ValueError: If ``mode`` is neither "train" nor "test".
    """
    # Determine datasets to be used
    if mode == "train":
        splits = ['train', 'dev']
    elif mode == "test":
        splits = ['test']
    else:
        # An empty result would only surface later as a KeyError in the caller
        raise ValueError(f"Unknown mode {mode!r}; expected 'train' or 'test'")

    out = {}
    # Remember the current mode so evaluation does not silently flip it
    was_training = model.training
    model.eval()
    try:
        # Calculate losses for chosen datasets
        for split in splits:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the original mode even if a batch raised
        model.train(was_training)
    return out


## Language Model

In [None]:
# Character-level, decoder-only transformer language model
class LanguageModel(nn.Module):
    """Token + position embeddings, a stack of transformer blocks, and a linear head.

    The sizes are read from the notebook-level hyperparameters ``vocab_size``,
    ``n_embd``, ``block_size``, ``n_head``, ``n_layer`` and ``dropout`` at
    construction time.
    """

    def __init__(self):
        super().__init__()
        # Learned embeddings for token identity and for position within the context
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # Block requires block_size (for the causal mask); the configured
        # dropout rate is passed on instead of Block's built-in defaults.
        self.blocks = nn.Sequential(*[
            Block(
                n_embd,
                n_head,
                block_size,
                head_dropout=dropout,
                multi_dropout=dropout,
                ff_dropout=dropout,
                block_dropout=dropout,
            )
            for _ in range(n_layer)
        ])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: Optional (B, T) tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits are (B, T, vocab_size)
            and loss is None. With targets, logits are flattened to
            (B*T, vocab_size) and loss is the cross-entropy against the targets.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the positions on the input's device rather than the global one
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens and append them to ``idx``.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: Number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.

        Note:
            The module's train/eval mode is left untouched, so dropout stays
            active unless the caller switches to ``eval()`` first.
        """
        # Context length this model was built with; unaffected by later edits to the global
        context_len = self.position_embedding_table.num_embeddings
        for _ in range(max_new_tokens):
            # crop idx to the last context_len tokens
            idx_cond = idx[:, -context_len:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

## Train Model
This section starts with an optional bit of code to mount your Google Drive when running this notebook on Google Colab. This allows training outputs to be kept after training completes. Otherwise, output is placed in the 'GPTiny_losses' directory within the current working directory.

In [None]:
# OPTIONAL: Mount drive and create directory to store results
# from google.colab import drive
# drive.mount('/content/drive')
# os.makedirs('/content/drive/MyDrive/GPTiny_loss', exist_ok=True)

In [19]:
# Specify output directory for results (always ends with a path separator)
if 'google.colab' in sys.modules and os.path.isdir('/content/drive/MyDrive/GPTiny_loss'):
  # Store on mounted drive if available
  output = '/content/drive/MyDrive/GPTiny_loss/'
else:
  # Local run, or Colab without a mounted drive: fall back to the working
  # directory so `output` is always defined.
  os.makedirs('./GPTiny_losses', exist_ok=True)
  # Trailing slash matters: the loss-file path is built by appending to this string
  output = './GPTiny_losses/'

In [20]:
# Specify output file for results; the name records the hyperparameters of the run
param_str = f"lr{learning_rate}_bth{batch_size}_blk{block_size}_drop{dropout}_hed{n_head}_lay{n_layer}"
# os.path.join places the file inside `output` whether or not that path ends
# with a separator (plain concatenation produced './GPTiny_losseslosses_...').
loss_file = os.path.join(output, f"losses_{param_str}.csv")

# Create the file with its header only once; later runs append rows to it
if not os.path.isfile(loss_file):
    with open(loss_file, 'w') as f:
        # Add header to file
        f.write("iter,train_loss,val_loss,BPC\n")

Train model for specified iterations

In [None]:
import time

model = LanguageModel().to(device)

# create a PyTorch optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# The loop below takes exactly max_iters optimizer steps, so the one-cycle
# schedule must span max_iters steps for the learning rate to complete its cycle.
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=learning_rate, total_steps=max_iters)

start_time = time.time()

# `step` rather than `iter`, so the builtin iter() is not shadowed for later cells
for step in range(max_iters):
    # Calculate validation loss after fixed interval, and on the final iteration
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss("train", model)
        # Convert the loss from nats to bits: bits-per-character = loss / ln(2)
        bpc = losses['dev'] / torch.log(torch.tensor(2.0))
        print(f"Epoch {step}: | Train loss {losses['train']:.4f} | Validation loss {losses['dev']:.4f} | BPC: {bpc:.4f}")

        # Write the losses to a csv file
        with open(loss_file, 'a') as f:
            f.write(f"{step},{losses['train']},{losses['dev']},{bpc:.4f}\n")

    # Sample batch of training data
    xb, yb = get_batch('train')

    # Feed data to the model and take one optimisation step
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Update the learning rate
    scheduler.step()

# Calculate elapsed time in minutes
elapsed_time = (time.time() - start_time) / 60
print(f"Training time: {elapsed_time} minutes")

Visualize training performance

In [None]:
# Generate line plots of training/validation loss and validation BPC

# Rows logged by one training run: one per iteration that satisfies the
# evaluation condition used in the training loop (eval_interval, not eval_iters).
n_logged = sum(1 for i in range(max_iters) if i % eval_interval == 0 or i == max_iters - 1)

# The loss file is appended to across runs with the same hyperparameters, so
# take the last n_logged rows to plot the most recent run.
df = pd.read_csv(loss_file).tail(n_logged)

# Extract the iteration, training loss, and validation loss columns
iters = df['iter']
train_losses = df['train_loss']
val_losses = df['val_loss']
val_bpc = df['BPC']

# One panel for the losses, one for bits-per-character
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Plot the losses
ax1.plot(iters, train_losses, label='Training Loss')
ax1.plot(iters, val_losses, label='Validation Loss')
ax1.set_xlabel('Iteration')
ax1.set_ylabel('Loss')
ax1.set_title('Training and Validation Loss')
ax1.legend()

# Plot BPC
ax2.plot(iters, val_bpc, label='BPC')
ax2.set_xlabel('Iteration')
ax2.set_ylabel('BPC')
ax2.set_title('Validation BPC')
ax2.legend()

plt.show()
