# Setting up the model

This section imports the required packages and downloads the data used for training.

In [1]:
# Imports and setup
import os
import torch
import torch.nn as nn
from torch.nn import functional as F

# import matplotlib.pyplot as plt

Download a copy of the required datasets from the author's GitHub repository if they are not already present in the current directory.

In [2]:
# Fetch the LMDatasets folder once: download the repository ZIP with wget,
# extract only that folder, then delete the temporary files.
# ZIP_URL points at the archive of the repository's `main` branch.
ZIP_URL = 'https://github.com/tpchikumbu/GPTiny/archive/main.zip'
# The datasets are expected in (or moved to) ./LMDatasets under the current
# working directory.
PROJECT_DIR = os.getcwd() + '/LMDatasets'
print('Searching for data in ', PROJECT_DIR)

# Only download when the dataset directory does not exist yet, so re-running
# the cell is cheap.
if not os.path.isdir(PROJECT_DIR):

  # IPython expands $ZIP_URL from the Python variable before running the shell command.
  !wget -O "GPTiny-main.zip" "$ZIP_URL"
  # Extract only the specific folder from the ZIP file
  !unzip -q "GPTiny-main.zip" "GPTiny-main/LMDatasets/*" -d "."
  # Move the extracted folder into the current directory, i.e. to PROJECT_DIR.
  !mv "GPTiny-main/LMDatasets" "."

  # Remove temporary files
  !rm -rf "GPTiny-main.zip"
  !rm -rf "GPTiny-main"

else:
  print('Data already downloaded')

Searching for data in  /home/peter/Documents/Hons/NLP/GPTiny/LMDatasets
Data already downloaded


Run this cell to choose the dataset language and read the training, validation and test splits into the environment.

In [3]:
# Language code of the corpus to load. Options include nr, ss, xh, zu
language = "nr"
file_name = f'{PROJECT_DIR}/nchlt_text.{language}'

# Read every split fully into memory as one string per split.
corpus = {}
for suffix in (".train", ".valid", ".test"):
    with open(file_name + suffix, 'r', encoding='utf-8') as f:
        corpus[suffix] = f.read()

train_df = corpus[".train"]
dev_df = corpus[".valid"]
test_df = corpus[".test"]

# The reported "shape" is the number of characters in each split.
print('Train shape: ', len(train_df))
print('Dev shape: ', len(dev_df))
print('Test shape: ', len(test_df))

Train shape:  6382803
Dev shape:  441906
Test shape:  444199


## Generating Vocabulary
The next block must be run to generate a vocabulary and encode the datasets accordingly.

In [4]:
# Character-level vocabulary: every distinct character of the training text,
# in sorted order so that token ids are stable between runs.
used_chars = sorted(set(train_df))
vocab_size = len(used_chars)
print("Tokens: ", ''.join(used_chars))
print("Token count: ", vocab_size)

# Lookup tables in both directions (token id <-> character).
int_to_char = dict(enumerate(used_chars))
char_to_int = {ch: idx for idx, ch in int_to_char.items()}


def encode(s):
    """Encoder: take a string, output the list of integer token ids."""
    return [char_to_int[c] for c in s]


def decode(l):
    """Decoder: take a list of integer token ids, output the string."""
    return ''.join(int_to_char[i] for i in l)

Tokens:  
 !"$%&')*+,-./0123456789:;<>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_abcdefghijklmnopqrstuvwxyz{|}~ ¡£©«­°±²³´¸¹ºÂÃÅž
Token count:  110


In [5]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Torch device: ", device)

# Encode each split to token ids and create the tensor directly on `device`.
train_encoded, dev_encoded, test_encoded = (
    torch.tensor(encode(text), device=device)
    for text in (train_df, dev_df, test_df)
)

Torch device:  cpu


## Neural network definition
Contains the code required to define the neural network

In [6]:
# Self attention head

class Head(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Args:
        head_size: Output width of the key, query and value projections.
        n_embd: Width of the incoming embeddings (last dimension of ``x``).
        block_size: Side length of the causal mask; inputs passed to
            ``forward`` must not be longer than this.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, head_size, n_embd, block_size, dropout=0.1):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask stored as a buffer: it follows the module
        # across devices but is not a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, n_embd); returns (B, T, head_size)."""
        B, T, C = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        # Compute attention scores, scaled by 1/sqrt(d_k) where d_k is the
        # key width (head_size), not the embedding width C.
        scale = k.shape[-1] ** -0.5
        wei = q @ k.transpose(-2, -1) * scale # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Position t may only attend to positions <= t.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # Perform the weighted aggregation of the values
        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

In [7]:
# Multi-head to find parallel attention

class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, then projected to n_embd.

    Args:
        num_heads: Number of ``Head`` modules to run side by side.
        head_size: Output width of each individual head.
        n_embd: Width of the input embeddings and of the returned tensor.
        block_size: Maximum sequence length, forwarded to every head.
        head_dropout: Dropout probability used inside each head.
        multi_dropout: Dropout probability applied after the output projection.
    """

    def __init__(self, num_heads, head_size, n_embd, block_size, head_dropout=0.1, multi_dropout=0.1):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, n_embd, block_size, head_dropout) for _ in range(num_heads)])
        # The concatenated heads are num_heads * head_size wide. That equals
        # n_embd only when n_embd is divisible by num_heads, so size the
        # projection from the actual concatenated width.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(multi_dropout)

    def forward(self, x):
        """Map ``x`` of shape (B, T, n_embd) to a tensor of the same shape."""
        # Concatenate the head outputs along the feature dimension.
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [8]:
# Feed-forward network
class FeedFoward(nn.Module):
    """Position-wise MLP: expand, apply ReLU, project back, then dropout.

    Args:
        n_embd: Width of the input and output features.
        wide: Expansion factor; the hidden layer is ``wide * n_embd`` wide.
        dropout: Dropout probability applied to the output.
    """
    # Default expansion factor; each instance overrides it with `wide`.
    widening : int = 4

    def __init__(self, n_embd, wide = 4, dropout=0.1):
        super().__init__()
        self.widening = wide
        hidden_width = self.widening * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP over the last dimension of ``x``."""
        return self.net(x)

In [9]:
# Compute + communicate
class Block(nn.Module):
    """Transformer block: self-attention followed by a feed-forward network.

    Layer normalisation is applied before each sub-layer and each sub-layer's
    output is added back onto its input (residual connection).

    Args:
        n_embd: Embedding dimension.
        n_head: Number of attention heads; each head is ``n_embd // n_head`` wide.
        block_size: Maximum sequence length, passed on to the attention layer.
        widen: Expansion factor of the feed-forward hidden layer.
        head_dropout: Dropout probability inside each attention head.
        multi_dropout: Dropout probability after the attention projection.
        ff_dropout: Dropout probability at the end of the feed-forward network.
        block_dropout: Probability of ``self.dropout``, which is constructed
            here but not applied in ``forward``.
    """

    def __init__(
        self,
        n_embd,
        n_head,
        block_size,
        widen=4,
        head_dropout=0.1,
        multi_dropout=0.1,
        ff_dropout=0.1,
        block_dropout=0.1,
    ):
        super().__init__()
        # Split the embedding width across the heads (integer division).
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(
            n_head, per_head, n_embd, block_size, head_dropout, multi_dropout
        )
        self.ffwd = FeedFoward(n_embd, widen, ff_dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.dropout = nn.Dropout(block_dropout)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual steps."""
        # Residual connection around the attention sub-layer.
        attended = x + self.sa(self.ln1(x))
        # Residual connection around the feed-forward sub-layer.
        return attended + self.ffwd(self.ln2(attended))



## Training Hyperparameters
The following block calculates the average sentence length in the corpus. This value can be used as the context size (`block_size`) when training the model, so that in-sentence context is prioritised over cross-sentence context. Other hyperparameters, such as the batch size, the amount of dropout applied, the embedding width and the number of layers and attention heads, are also specified here.

In [None]:
# Treat every newline-separated piece of the training text as one "sentence"
# and compute the mean piece length, rounded to the nearest integer.
sentences = train_df.split('\n')
sentence_count = len(sentences)
avg_length = round(sum(map(len, sentences)) / sentence_count)

print("Total splits: ", sentence_count)
print("Average length of split elements: ", avg_length)

# ------------
# Hyperparameters
torch.manual_seed(1337)  # fixed seed for reproducible runs
batch_size = 256      # number of independent sequences processed in parallel
block_size = 64       # maximum context length for predictions (avg_length is an alternative)
max_iters = 1000      # iteration budget
eval_interval = 100   # evaluation interval, in iterations
learning_rate = 4e-2
eval_iters = 100      # number of batches averaged per split when estimating the loss
n_embd = 64           # embedding width
n_head = 16           # number of attention heads
n_layer = 8           # number of layers
dropout = 0.2         # dropout probability
# ------------

The helper functions used during training are defined next: one samples a batch from the chosen dataset split, and the other estimates the loss of the model.

In [None]:
# Generate dataloader with different block and batch sizes

def get_batch(split: str):
  """Sample a random batch of inputs ``x`` and next-token targets ``y``.

  Args:
    split: Which encoded dataset to sample from: "train", "dev" or "test".

  Returns:
    A tuple ``(x, y)`` of tensors shaped (batch_size, block_size), where
    ``y`` is ``x`` shifted one position to the right in the data.

  Raises:
    ValueError: If ``split`` is not a known split, or if the selected data
      is not longer than ``block_size``.
  """
  if split == "train":
    data = train_encoded
  elif split == "dev":
    data = dev_encoded
  elif split == "test":
    data = test_encoded
  else:
    # Without this branch `data` would be unbound and fail with an
    # UnboundLocalError further down.
    raise ValueError(f"Unknown split {split!r}; expected 'train', 'dev' or 'test'")

  # Exclusive upper bound for start offsets: each window needs block_size
  # inputs plus one extra element for the final target.
  n_starts = len(data) - block_size
  if n_starts <= 0:
    raise ValueError(
      f"Split {split!r} has {len(data)} tokens; need more than block_size={block_size}"
    )

  ix = torch.randint(n_starts, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x, y

# Estimate loss
@torch.no_grad()
def estimate_loss(mode: str, model: nn.Module):
    """Estimate the mean loss of ``model`` over ``eval_iters`` random batches.

    Args:
        mode: "train" evaluates the 'train' and 'dev' splits; "test"
            evaluates only the 'test' split.
        model: Module called as ``model(X, Y)`` and returning ``(logits, loss)``.

    Returns:
        A dict mapping each evaluated split name to its mean loss, as a
        0-dimensional tensor.

    Raises:
        ValueError: If ``mode`` is neither "train" nor "test".
    """
    # Determine datasets to be used
    if mode == "train":
        splits = ['train', 'dev']
    elif mode == "test":
        splits = ['test']
    else:
        # Previously an unknown mode silently produced an empty result.
        raise ValueError(f"Unknown mode {mode!r}; expected 'train' or 'test'")

    out = {}
    # Remember the caller's mode so it can be restored afterwards, rather
    # than always forcing the model into training mode.
    was_training = model.training
    model.eval()
    try:
        # Calculate losses for chosen datasets
        for split in splits:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the original mode even if evaluation raised.
        model.train(was_training)
    return out
