# Assignment 3 Top-Level Code/Notebook
### Training a language model based on Karpathy's minGPT codebase


In [None]:
# The code below is needed for using Google Colab, so uncomment this if that is what you're using
"""
import nltk
nltk.download('punkt')
"""

"\nimport nltk\nnltk.download('punkt')\n"

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# The code below is also needed for using Google Colab
# BEFORE executing this, you must place the mingpt folder supplied in the assignment
# in your Google Drive, within the folder "Colab Notebooks".
#
# It mounts and changes into the folder that contains mingpt, which you must upload to Google Drive.
# So uncomment it if you've uploaded mingpt to your Google Drive, into the "Colab Notebooks" folder.
"""
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab\ Notebooks/
"""

"\nfrom google.colab import drive\ndrive.mount('/content/drive')\n%cd /content/drive/MyDrive/Colab\\ Notebooks/\n"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/Colab\ Notebooks/

/content/drive/MyDrive/Colab Notebooks


In [None]:
import torch
import numpy as np

from nltk.tokenize import sent_tokenize

from pathlib import Path
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.bpe import BPETokenizer
from mingpt.utils import set_seed
set_seed(1234)

In [None]:
"""
Prepare the dataset to train the Language Model (LM)
This implementation splits the sentences and so doesn't create training
examples that cross sentences.

This code is set so that it uses one of two possible datasets, which were also used in Assignment 1:
SmallSimpleCorpus.txt or LargerCorpus.txt

Arguments:
            ds_choice: str. "small" or "large". (i.e. selects which of the two datasets)
            split: str. "train" selects the training split; any other value (e.g. "validation" or "test") selects the held-out split.
            truncation: int. If -1: no truncation on sentences. Otherwise: truncate to this specific length.
"""

class LanguageModelingDataset(Dataset):
    """Sentence-level language-modelling dataset built from one of two corpora.

    The chosen corpus file is read from the current directory, split into
    sentences with ``sent_tokenize``, divided 80/20 without shuffling into a
    training part and a held-out part, and every sentence of the requested
    part is BPE-tokenized into a 1-d tensor. Examples never cross sentence
    boundaries.

    Args:
        ds_choice: "small" (SmallSimpleCorpus.txt) or "large"
            (LargerCorpus.txt). Any other value raises ``KeyError``.
        split: "train" selects the training part; any other value
            (e.g. "validation" or "test") selects the held-out part.
        truncation: If negative, sentences are kept whole. Otherwise each
            tokenized sentence is cut to at most this many tokens.
    """

    def __init__(self, ds_choice="small", split="train", truncation=-1):
        base_path = "./"
        fn = {"small": "SmallSimpleCorpus.txt", "large": "LargerCorpus.txt"}
        self.ds_choice = ds_choice
        self.truncation = truncation  # int; a negative value means "do not truncate"

        # Decode explicitly instead of relying on the platform's locale default,
        # so the corpus reads the same on every machine.
        # NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm.
        text = Path(base_path, fn[ds_choice]).read_text(encoding="utf-8")
        if ds_choice == "large":
            # Remove the newline char in the middle of sentences.
            # The "paragraph splitting" newlines appear to be \n\n, so those are
            # protected with a temporary marker and collapsed to a single \n.
            text = text.replace("\n\n", "$$^^$$").replace("\n", " ").replace("$$^^$$", "\n")
        sentences = sent_tokenize(text)

        # Train / held-out split (unshuffled, so the order of the corpus is kept)
        train, val = train_test_split(sentences, test_size=0.2, shuffle=False)
        raw_data = train if split == "train" else val

        # Tokenize
        self.tokenizer = BPETokenizer()
        self.data = []  # List of 1-d pytorch tensors, one per sentence
        for sent in raw_data:
            tokenized = self.tokenizer(sent).view(-1)  # flatten to a 1-d tensor
            if truncation >= 0:
                tokenized = tokenized[:truncation]
            self.data.append(tokenized)

        # Longest stored sentence, in tokens. Kept as a plain Python int
        # (0 when there are no sentences) rather than a NumPy scalar.
        self.max_sentence_length = max((len(d) for d in self.data), default=0)

    def __len__(self):
        """Return the number of sentences held by this split."""
        return len(self.data)

    def get_vocab_size(self):
        """
        We have to set this to the max vocab size (i.e., that decided by the BPE tokenizer),
        but actually, only a small number of vocab is used, especially for the small text.
        """
        return 50257

    def __getitem__(self, idx):
        """
        Return the tuple (x, y) for sentence `idx`, both as pytorch tensors:
        x is the sentence without its last token and y is the sentence without
        its first token, i.e. y is x shifted left by one position.
        Please refer to the `run()` method in the mingpt/trainer.py script for
        how the x and y are going to be used.
        """
        tokens = self.data[idx]
        x = tokens[:-1]
        y = tokens[1:]
        return (x, y)

    def get_block_size(self):
        """
        Return the length, in tokens, of the longest sentence stored in this split
        (measured after any truncation).
        """
        return self.max_sentence_length

# Instantiate the Training Dataset
#train_dataset = LanguageModelingDataset(ds_choice="small", split="train")  # use this for the short corpus
train_dataset = LanguageModelingDataset(ds_choice="large", split="train", truncation=512) #use this for long

# Instantiate a Validation Dataset (this is only really needed for the fine-tune task, not the LM task)
# Note: the dataset class treats every split value other than "train" as the held-out part,
# which is why "validation" works here.
#val_dataset = LanguageModelingDataset(ds_choice="small", split="validation")
val_dataset = LanguageModelingDataset(ds_choice="large", split="validation", truncation=512)

downloading https://openaipublic.blob.core.windows.net/gpt-2/models/124M/encoder.json to /root/.cache/mingpt/encoder.json
downloading https://openaipublic.blob.core.windows.net/gpt-2/models/124M/vocab.bpe to /root/.cache/mingpt/vocab.bpe


In [None]:
def lm_collate_fn(batch, device, pad_token_id=1):
    """Pad a batch of (x, y) token sequences to a common length and stack them.

    Args:
        batch: Sequence of ``(x, y)`` pairs of 1-d integer tensors, where each
            ``y`` has the same length as its ``x``.
        device: Device the two stacked tensors are moved to.
        pad_token_id: Value appended to sequences shorter than the longest
            ``x`` in the batch. Defaults to 1, the value this function has
            always padded with.

    Returns:
        Tuple ``(padded_x, padded_y)`` of ``torch.long`` tensors, each with one
        row per batch item and as many columns as the longest ``x``.

    Raises:
        ValueError: If ``batch`` is empty (there is no longest sequence).
    """
    xs = [item[0] for item in batch]  # List (len B) of varying lengths
    ys = [item[1] for item in batch]  # List (len B) of the same lengths as xs
    maxlen = max(len(s) for s in xs)

    def _pad(seq):
        # new_full creates the filler with the sequence's own dtype and device,
        # so token ids are never routed through float32 (which cannot represent
        # every integer above 2**24 exactly).
        filler = seq.new_full((maxlen - len(seq),), pad_token_id)
        return torch.cat([seq, filler])

    # NOTE(review): padded target positions hold an ordinary token id, so whether
    # they contribute to the loss depends on how the model computes it -- confirm.
    padded_x = torch.stack([_pad(s) for s in xs]).long().to(device)
    padded_y = torch.stack([_pad(s) for s in ys]).long().to(device)
    return padded_x, padded_y


In [None]:
# Print out an example of the data - this is processed more once it reaches lm_collate_fn (above)
x,y = train_dataset[5]
print(x, y)
print("X: ",train_dataset.tokenizer.decode(x))
print("Y: ",train_dataset.tokenizer.decode(y))

tensor([  40, 6437,  262, 3290]) tensor([6437,  262, 3290,   13])
X:  I rub the dog
Y:   rub the dog.


In [None]:
# Scratch experiment with torch.multinomial: draw 3 indices using `weights`
# as the (unnormalised) sampling weights.
# NOTE(review): only two entries of `weights` are non-zero, yet three samples are
# requested; the recorded output includes index 0, whose weight is 0. Presumably
# sampling is without replacement here -- confirm this call is intentional.
weights = torch.tensor([0, 10, 3, 0], dtype=torch.float)
torch.multinomial(weights, num_samples=3)

tensor([1, 2, 0])

In [None]:
from mingpt.model import GPT

# Start from the default GPT configuration and override what this notebook needs.
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
# Vocabulary size and block size are taken from the training dataset.
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
# NOTE(review): presumably the number of classes for a classification head in the
# assignment's version of GPT; nothing else in this notebook uses it -- confirm.
model_config.n_classification_class = 2
model = GPT(model_config)

number of parameters: 2.52M


In [None]:
# Create a Trainer object and set the core hyper-parameters
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 100000  # For small corpus: 3000 iterations is plenty. For large corpus: 100000 iterations is needed
train_config.num_workers = 0
train_config.batch_size = 16    # For small corpus, batch size of 4 is fine.  For large corpus use 16
# lm_collate_fn is handed to the trainer so that the variable-length sentences of a
# batch are padded to a common length before being stacked.
trainer = Trainer(train_config, model, train_dataset, val_dataset, collate_fn=lm_collate_fn)

running on device cuda


In [None]:
# Progress reporter, registered with the trainer to run at the end of every batch.
# On every 100th iteration it prints trainer.iter_dt (scaled to milliseconds),
# the iteration number and the current training loss; otherwise it does nothing.

def batch_end_callback(trainer):
    """Print a one-line progress report when the iteration count is a multiple of 100."""
    if trainer.iter_num % 100 != 0:
        return
    elapsed_ms = trainer.iter_dt * 1000
    current_loss = trainer.loss.item()
    print(f"iter_dt {elapsed_ms:.2f}ms; iter {trainer.iter_num}: train loss {current_loss:.5f}")
# Register the reporter under the trainer's 'on_batch_end' event.
trainer.set_callback('on_batch_end', batch_end_callback)

# Train!
trainer.run()

iter_dt 0.00ms; iter 0: train loss 1.44767
iter_dt 21.43ms; iter 100: train loss 0.52188
iter_dt 27.08ms; iter 200: train loss 0.43902
iter_dt 29.23ms; iter 300: train loss 0.93636
iter_dt 20.58ms; iter 400: train loss 0.50063
iter_dt 22.77ms; iter 500: train loss 1.00063
iter_dt 18.90ms; iter 600: train loss 0.79821
iter_dt 31.31ms; iter 700: train loss 0.60460
iter_dt 17.54ms; iter 800: train loss 0.95457
iter_dt 20.28ms; iter 900: train loss 1.20016
iter_dt 17.60ms; iter 1000: train loss 1.05357
iter_dt 24.21ms; iter 1100: train loss 0.57932
iter_dt 16.88ms; iter 1200: train loss 1.25023
iter_dt 19.54ms; iter 1300: train loss 0.64882
iter_dt 160.96ms; iter 1400: train loss 0.26326
iter_dt 26.06ms; iter 1500: train loss 0.31194
iter_dt 25.53ms; iter 1600: train loss 0.83218
iter_dt 15.57ms; iter 1700: train loss 0.69789
iter_dt 17.50ms; iter 1800: train loss 1.25245
iter_dt 22.42ms; iter 1900: train loss 0.89088
iter_dt 32.73ms; iter 2000: train loss 0.85548
iter_dt 24.60ms; iter 210

In [None]:
# Put the model on the trainer's device, then write the trained weights to disk
# so they can be restored later without retraining.
model.to(trainer.device)
modelsavename = "model_filename.pt"  # change the name here to save in a specific file (and restore below)
with open(modelsavename, "wb") as checkpoint_file:
    torch.save(trainer.model.state_dict(), checkpoint_file)

In [None]:
# Use the trained language model to predict a sequence of words following a few words
encoded_prompt = train_dataset.tokenizer("He and I").to(trainer.device)
generated_sequence = trainer.model.generate(encoded_prompt, trainer.device, temperature=0.8, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'He and I hold the dog.. dog. dog and cat'

In [None]:
# Another example
encoded_prompt = train_dataset.tokenizer("She rubs").to(trainer.device)
generated_sequence = trainer.model.generate(encoded_prompt, trainer.device, temperature=0.6, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'She rubs and holds the cat. dog. dog. dog'

**2.3**

In [None]:
encoded_prompt = train_dataset.tokenizer("He and I").to(trainer.device)
generated_sequence,probability = trainer.model.generate0(encoded_prompt, trainer.device, temperature=0.8, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'He and I hold the dog.. dog. dog and cat'

In [None]:
print(generated_sequence)

tensor([[1544,  290,  314, 1745,  262, 3290,   13,   13, 3290,   13, 3290,  290,
         3797]], device='cuda:0')


In [None]:
print(probability)

tensor([[1.0000, 1.0000, 1.0000, 0.6165, 0.5366, 0.5386, 0.9104, 0.9458, 0.5335,
         0.9421, 0.6213, 0.6957, 0.9843]], device='cuda:0',
       grad_fn=<CatBackward0>)


In [None]:
# Another example
encoded_prompt = train_dataset.tokenizer("She rubs").to(trainer.device)
generated_sequence,probability = trainer.model.generate0(encoded_prompt, trainer.device, temperature=0.6, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'She rubs a dog and cat. cat. dog. cat'

In [None]:
print(generated_sequence)

tensor([[3347, 6437,   82,  257, 3290,  290, 3797,   13, 3797,   13, 3290,   13,
         3797]], device='cuda:0')


In [None]:
print(probability)

tensor([[1.0000, 1.0000, 1.0000, 0.4260, 0.5635, 0.5800, 0.9999, 0.9960, 0.7251,
         0.9741, 0.9095, 0.9940, 0.6573]], device='cuda:0',
       grad_fn=<CatBackward0>)


In [None]:
# own choosing example
encoded_prompt = train_dataset.tokenizer("He rubs").to(trainer.device)
generated_sequence,probability = trainer.model.generate0(encoded_prompt, trainer.device, temperature=0.6, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'He rubs a dog and cat. cat. dog. dog'

In [None]:
print(generated_sequence)

tensor([[1544, 6437,   82,  257, 3290,  290, 3797,   13, 3797,   13, 3290,   13,
         3290]], device='cuda:0')


In [None]:
print(probability)

tensor([[1.0000, 1.0000, 1.0000, 0.3608, 0.6248, 0.6104, 0.9994, 0.9983, 0.7570,
         0.9853, 0.8682, 0.9914, 0.6459]], device='cuda:0',
       grad_fn=<CatBackward0>)


**2.4**

In [None]:
encoded_prompt = train_dataset.tokenizer("He and I").to(trainer.device)
generated_sequence,worldList,probability = trainer.model.generate1(encoded_prompt, trainer.device, temperature=0.8, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'He and I hold the cat.. dog. dog and dog'

In [None]:
print((worldList[1:]))

tensor([[1745., 6437.,  460., 6622., 3290., 3797.],
        [ 262.,  257.,  290., 1745., 3290., 6437.],
        [3797., 3290., 6437.,  262.,  257.,  290.],
        [  13.,  764.,  290., 3797., 6437., 1745.],
        [  13.,  764., 3290., 3797., 1745., 6437.],
        [3290., 3797.,  290.,   13.,  262.,  257.],
        [  13.,  764.,  290., 6622., 1745., 6437.],
        [3290., 3797.,   13.,  290.,  257.,  262.],
        [ 290.,   13.,  257.,  262., 6622.,  460.],
        [3290., 3797.,  290.,   13., 6622.,  262.]], device='cuda:0')


In [None]:
train_dataset.tokenizer.decode(worldList[1])

' hold rub can holds dog cat'

In [None]:
print(torch.round(probability,decimals=3))

tensor([[0.0000, 0.0000, -0.0000, 0.0000, -0.0000, 0.0000],
        [0.6670, 0.2200, 0.1130, 0.0010, 0.0000, 0.0000],
        [0.5420, 0.4560, 0.0010, 0.0000, 0.0000, 0.0000],
        [0.5030, 0.4970, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.9930, 0.0060, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.9320, 0.0440, 0.0160, 0.0030, 0.0020, 0.0010],
        [0.8590, 0.1400, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.8420, 0.1500, 0.0030, 0.0020, 0.0010, 0.0010],
        [0.7500, 0.2360, 0.0060, 0.0060, 0.0010, 0.0010],
        [0.6150, 0.3830, 0.0010, 0.0010, 0.0000, 0.0000],
        [0.9840, 0.0150, 0.0000, 0.0000, 0.0000, 0.0000]], device='cuda:0')


In [None]:
import pandas as pd

In [None]:
print(worldList.shape[1])

6


In [None]:
# Skip the first row of both tensors (presumably an initial placeholder row -- the
# printed probability tensor starts with a row of zeros), then build one table row
# per remaining generation step. Each cell reads "<decoded token> / <probability>".
worldList0 = worldList[1:]
probability0 = probability[1:]

table = [
    [
        train_dataset.tokenizer.decode(token.reshape(1)) + f" / {prob:.3f}"
        for token, prob in zip(step_tokens, step_probs)
    ]
    for step_tokens, step_probs in zip(worldList0, probability0)
]

df = pd.DataFrame(data=table)

In [None]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,hold / 0.667,the / 0.542,cat / 0.503,. / 0.993,. / 0.932,dog / 0.859,. / 0.842,dog / 0.750,and / 0.615,dog / 0.984
1,rub / 0.220,a / 0.456,dog / 0.497,. / 0.006,. / 0.044,cat / 0.140,. / 0.150,cat / 0.236,. / 0.383,cat / 0.015
2,can / 0.113,and / 0.001,rub / 0.000,and / 0.000,dog / 0.016,and / 0.000,and / 0.003,. / 0.006,a / 0.001,and / 0.000
3,holds / 0.001,hold / 0.000,the / 0.000,cat / 0.000,cat / 0.003,. / 0.000,holds / 0.002,and / 0.006,the / 0.001,. / 0.000
4,dog / 0.000,dog / 0.000,a / 0.000,rub / 0.000,hold / 0.002,the / 0.000,hold / 0.001,a / 0.001,holds / 0.000,holds / 0.000
5,cat / 0.000,rub / 0.000,and / 0.000,hold / 0.000,rub / 0.001,a / 0.000,rub / 0.001,the / 0.001,can / 0.000,the / 0.000


In [None]:
encoded_prompt = train_dataset.tokenizer("She rubs").to(trainer.device)
generated_sequence,worldList,probability = trainer.model.generate1(encoded_prompt, trainer.device, temperature=0.8, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'She rubs and holds the cat. dog . and dog.'

In [None]:
# Skip the first row of both tensors (presumably an initial placeholder row), then
# build one table row per remaining generation step for the second prompt.
# Each cell reads "<decoded token> / <probability>".
worldList0 = worldList[1:]
probability0 = probability[1:]

table = [
    [
        train_dataset.tokenizer.decode(token.reshape(1)) + f" / {prob:.3f}"
        for token, prob in zip(step_tokens, step_probs)
    ]
    for step_tokens, step_probs in zip(worldList0, probability0)
]

df1 = pd.DataFrame(data=table)

In [None]:
df1.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,and / 0.346,holds / 0.994,the / 0.544,cat / 0.633,. / 0.931,dog / 0.582,. / 0.551,and / 0.775,dog / 0.950,. / 0.613
1,a / 0.341,can / 0.002,a / 0.453,dog / 0.367,. / 0.067,cat / 0.417,. / 0.438,. / 0.197,I / 0.038,and / 0.386
2,the / 0.312,and / 0.001,and / 0.001,rub / 0.000,and / 0.001,s / 0.000,hold / 0.004,dog / 0.016,hold / 0.006,a / 0.001
3,. / 0.000,. / 0.001,s / 0.001,. / 0.000,rub / 0.000,. / 0.000,s / 0.002,the / 0.005,cat / 0.004,the / 0.000
4,holds / 0.000,s / 0.001,dog / 0.000,hold / 0.000,hold / 0.000,and / 0.000,and / 0.002,a / 0.005,rub / 0.001,holds / 0.000
5,can / 0.000,dog / 0.000,cat / 0.000,. / 0.000,cat / 0.000,rub / 0.000,holds / 0.001,holds / 0.001,holds / 0.001,can / 0.000


section 3

In [None]:
# The code below shows how to reload the model from a saved file; this is useful for
# models that take a long time to train.
# map_location remaps the stored tensors onto the trainer's device, so a checkpoint
# saved on a GPU can still be loaded in a CPU-only session.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
model.load_state_dict(torch.load('model_large100K.pt', map_location=trainer.device))

<All keys matched successfully>

In [None]:
# Example showing how the reloaded model still works
encoded_prompt = train_dataset.tokenizer("he believe").to(trainer.device)
generated_sequence = trainer.model.generate(encoded_prompt, trainer.device, temperature=0.6, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'he believe of Mr. Wheeler a point of so intense that'

In [None]:
# Example showing how the reloaded model still works
encoded_prompt = train_dataset.tokenizer("in the end").to(trainer.device)
generated_sequence = trainer.model.generate(encoded_prompt, trainer.device, temperature=0.6, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'in the end of the same5 had taken at a lever above'

In [None]:
# Example showing how the reloaded model still works
encoded_prompt = train_dataset.tokenizer("coin").to(trainer.device)
generated_sequence = trainer.model.generate(encoded_prompt, trainer.device, temperature=0.6, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'coin there is a room of the one thousand of copper'

In [None]:
# Example showing how the reloaded model still works
encoded_prompt = train_dataset.tokenizer("United States").to(trainer.device)
generated_sequence = trainer.model.generate(encoded_prompt, trainer.device, temperature=0.6, max_new_tokens=10)
train_dataset.tokenizer.decode(generated_sequence[0])

'United States Mint    was shield by the  '

In [None]:
# Put the model on the trainer's device, then write the trained weights to disk
# so they can be restored later without retraining.
model.to(trainer.device)
modelsavename = "model_Large.pt"  # change the name here to save in a specific file (and restore below)
with open(modelsavename, "wb") as checkpoint_file:
    torch.save(trainer.model.state_dict(), checkpoint_file)