<a href="https://colab.research.google.com/github/vggls/language_models/blob/main/notebooks/Pre_trained_transformer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Imports

In [None]:
import pickle
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import math
#import string #string.punctuation contains punctuation symbols

In [None]:
# Needed on Google Colab (or any fresh environment): download the NLTK 'treebank' corpus package.
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [None]:
from nltk.corpus import treebank

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import GPT2LMHeadModel

In [None]:
#custom written code
from preprocessing import lower, add_unk_tokens_for_training, unk_for_reduced_vocab, replace_with_unk_for_testing, tokens_to_indices
from training import Train
from perplexity import perplexity_network_model

#### Dataset and Preprocessing

In [None]:
# Penn Treebank

def load_treebank(left_limit, right_limit):
    """Load and clean the sentences of a contiguous range of treebank files.

    Files are addressed by their position in ``treebank.fileids()``.

    Args:
        left_limit: index of the first file to read (inclusive).
        right_limit: index one past the last file to read (exclusive).

    Returns:
        A list with one token list per sentence, in corpus order. In every
        sentence the following tokens are dropped: tokens containing ``*``,
        tokens containing a backslash immediately followed by a slash, and
        the bracket placeholders listed in ``symbols_to_remove``. Each
        sentence is terminated with an ``<eos>`` token.

    Raises:
        IndexError: if an index in the requested range is outside the list
            of corpus file ids.
    """
    symbols_to_remove = {'-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-'}  # parentheses
    # Backslash + slash, spelled with a valid escape: the former '\/' literal is an
    # invalid escape sequence and triggers a SyntaxWarning on recent Python versions.
    escaped_slash = '\\/'

    #sos_token = ['<bos>']
    eos_token = ['<eos>']

    # The list of file ids does not change between iterations, so fetch it once.
    file_ids = treebank.fileids()

    tokenized_sentences = []
    for file_index in range(left_limit, right_limit):
        for sentence in treebank.sents(file_ids[file_index]):
            kept_tokens = [
                token for token in sentence
                if '*' not in token
                and escaped_slash not in token
                and token not in symbols_to_remove
            ]
            tokenized_sentences.append(kept_tokens + eos_token)

    return tokenized_sentences

In [None]:
# Split the corpus by file position: [0, 150) train, [150, 175) validation, [175, 199) test.
train_treebank, val_treebank, test_treebank = (
    load_treebank(first_file, end_file)
    for first_file, end_file in ((0, 150), (150, 175), (175, 199))
)

# Number of sentences in each split.
len(train_treebank), len(val_treebank), len(test_treebank)

(3262, 314, 338)

In [None]:
# Normalise token casing in each split with the project's `lower` helper.
# NOTE(review): the original note reads "lower first letter of each token"; the helper's
# code is not visible here - confirm whether it lowers only the first letter or the whole token.
lower_train_treebank = lower(train_treebank)
lower_val_treebank = lower(val_treebank)
lower_test_treebank = lower(test_treebank)

In [None]:
# insert <unk> token to training data for case I model
# Per the original note, every token that appears fewer than 3 times is replaced with <unk>.
# NOTE(review): the threshold is not passed here, so it presumably lives inside the helper - confirm.
train_sentences = add_unk_tokens_for_training(lower_train_treebank)

In [None]:
# case I vocabulary: every distinct token occurring in the (unk-substituted) training sentences
vocabulary = {token for sentence in train_sentences for token in sentence}

# vocabulary size, and a check that both special tokens are present
len(vocabulary), '<unk>' in vocabulary, '<eos>' in vocabulary

(3259, True, True)

In [None]:
# Build the token <-> integer-id lookup tables.
# `vocabulary` is a set of strings, whose iteration order changes between interpreter
# runs, so enumerating it directly gave different ids on every kernel restart.
# Enumerating a sorted copy makes the assignment reproducible: the same vocabulary
# always yields the same ids, which keeps a saved model and a rebuilt mapping in sync.
word_to_index = {word: idx for idx, word in enumerate(sorted(vocabulary))}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Persist both mappings (as a two-element list) next to the notebook.
with open('transformer_word_index_mappings.pickle', 'wb') as f:
    pickle.dump([word_to_index, index_to_word], f)

# ids of the two special tokens
word_to_index['<eos>'], word_to_index['<unk>']

(3002, 302)

In [None]:
# training sequence of indices
# (the printed sample further down shows a single flat list of ids spanning consecutive sentences)
train_int_sequence = tokens_to_indices(word_to_index, train_sentences)

# validation sequence of indices
# NOTE(review): the helper presumably maps tokens missing from `vocabulary` to <unk> - confirm.
val_sentences = replace_with_unk_for_testing(vocabulary, lower_val_treebank)
val_int_sequence = tokens_to_indices(word_to_index, val_sentences)

# testing sequence of indices (same two steps as for the validation split)
test_sentences = replace_with_unk_for_testing(vocabulary, lower_test_treebank)
test_int_sequence = tokens_to_indices(word_to_index, test_sentences)

# number of token ids in each split
len(train_int_sequence), len(val_int_sequence), len(test_int_sequence)

(82372, 8003, 8319)

In [None]:
# number of distinct token ids that actually occur in each split
tuple(
    len(set(id_sequence))
    for id_sequence in (train_int_sequence, val_int_sequence, test_int_sequence)
)

(3259, 1165, 1272)

The following cell briefly illustrates how the token sequence is fed to the network. For simplicity, we consider only the first two sentences.

Recall that, in the run shown here, `<eos>` is represented by the integer 3002 and `<unk>` by 302.

So if we process the data in sequences of length = 5, the model will learn as follows:

- map [302, 302, 3075, 1564, 365] to [302, 3075, 1564, 365, 1986]
- map [302, 3075, 1564, 365, 1986] to [3075, 1564, 365, 1986, 3075]
- i.e. shift by 1-step to the future and continue like this

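During training and validation, sequences of length (sequence_length + 1) are fed to the model in batches: the first sequence_length tokens form the input and the last sequence_length tokens form the target, i.e. the input shifted by one step.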

In [None]:
# Show the first two training sentences as tokens ...
print(train_sentences[0], train_sentences[1], '\n')
# ... and the first 28 ids of the training id sequence, for comparison with the tokens above.
print(train_int_sequence[:28])

['<unk>', '<unk>', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.', '<eos>'] ['mr.', '<unk>', 'is', 'chairman', 'of', '<unk>', 'n.v.', ',', 'the', 'dutch', 'publishing', 'group', '.', '<eos>'] 

[302, 302, 3075, 1564, 365, 1986, 3075, 2920, 348, 1471, 2255, 1690, 2778, 1890, 2824, 69, 18, 2901, 3002, 1342, 302, 2578, 1226, 2067, 302, 1445, 3075, 1471]


#### Model

In [None]:
# Load the pre-trained "gpt2" checkpoint together with its language-modelling head.
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

In [None]:
# Display the architecture. 768 = embeddings size (hyperparam) of this checkpoint.
gpt2_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Count the elements of every parameter that currently has requires_grad=True
# (this cell runs before any freezing, so it covers the whole pre-trained model).
print('No. of trainable params', sum(p.numel() for p in gpt2_model.parameters() if p.requires_grad))

No. of trainable params 124439808


**run ONE of the following two cells**

In [None]:
# Freeze ALL the transformer parameters: switch off gradient tracking for every
# tensor in the pre-trained transformer body (gpt2_model.transformer).
for weight in gpt2_model.transformer.parameters():
    weight.requires_grad_(False)

In [None]:
# Alternative to the full freeze: freeze only the token/position embeddings and the
# first 10 of the 12 transformer blocks, leaving blocks 10-11 and the final layer norm
# trainable.
#
# This variant is guarded by a flag rather than wrapped in a ''' string: the string was
# never closed, so running the cell raised a SyntaxError. With the flag at its default
# (False) the cell is a no-op and the notebook runs top to bottom with the full-freeze
# setup. To use this variant instead, set the flag to True and skip the full-freeze cell.
USE_PARTIAL_FREEZE = False

if USE_PARTIAL_FREEZE:
    # Freeze the transformer parameters
    for layer in [gpt2_model.transformer.wte, gpt2_model.transformer.wpe]:
        for param in layer.parameters():
            param.requires_grad = False

    for block_index in range(10):
        for param in gpt2_model.transformer.h[block_index].parameters():
            param.requires_grad = False

In [None]:
# Sanity check after freezing: count the trainable parameters of the transformer body only
# (gpt2_model.transformer, i.e. without lm_head). After the full-freeze cell the result should be 0.
# NOTE(review): the pre-freeze total printed earlier equals the size of the transformer body alone,
# which suggests the original lm_head shares its weight with the token embedding - confirm.
print('No. of trainable params', sum(p.numel() for p in gpt2_model.transformer.parameters() if p.requires_grad))

No. of trainable params 0


In [None]:
# create new head
# Replace the pre-trained 50257-way output layer with a fresh linear layer that scores
# only this notebook's vocabulary. The input width is read from the model config
# (768 for "gpt2") instead of being hard-coded, so other GPT-2 sizes work unchanged.
gpt2_model.lm_head = nn.Linear(in_features=gpt2_model.config.n_embd,
                               out_features=len(vocabulary),
                               bias=False)

In [None]:
# Display the model again to verify that the new nn.Linear head was successfully inserted
# (lm_head should now report out_features equal to the vocabulary size).
gpt2_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=3259, bias=False)
)

In [None]:
# Trainable parameters including the new ones inserted by the nn.Linear layer.
# With the transformer fully frozen this is just the head: 768 * len(vocabulary) weights, no bias.
print('No. of trainable params', sum(p.numel() for p in gpt2_model.parameters() if p.requires_grad))

No. of trainable params 2502912


In [None]:
# Optimisation setup.
learning_rate = 0.001  # initial Adam learning rate
criterion = nn.CrossEntropyLoss()
# Every model parameter is handed to Adam, including those with requires_grad=False.
# NOTE(review): frozen parameters receive no gradient, so presumably only the trainable ones are updated - confirm.
optimizer = optim.Adam(gpt2_model.parameters(), lr=learning_rate)
# Halve the learning rate every 3 scheduler steps (the training log shows the halving every 3 epochs).
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

In [None]:
# Configure the first training run with the project's Train class: train on the training
# sequence, validate on the validation sequence, windows of 50 tokens, batches of 128,
# at most 20 epochs.
# NOTE(review): `patience` presumably drives early stopping (the log marks "E.S. checkpoint"
# epochs) and `name` presumably labels saved checkpoints - confirm in training.py.
instance = Train(model=gpt2_model,
                model_type = 'transformer',
                loss_fct=criterion,
                optimizer=optimizer,
                scheduler=scheduler,
                train_sequence=train_int_sequence,
                val_sequence=val_int_sequence,
                sequence_length=50,
                batch_size=128,
                epochs=20,
                patience=5,
                name='gpt2_with_trainable_head')

Device: cuda:0


In [None]:
# Run the training loop; with validation data the call's result is unpacked into three values.
# NOTE(review): presumably per-epoch training losses, validation losses and the checkpoint epochs - confirm.
train_loss, val_loss, checkpoints = instance.training()

Starting training..
  lr value 0.001
Epoch: 1/20 - Perplexity: training 867.027, validation 201.092
  lr value 0.001
Epoch: 2/20 - Perplexity: training 214.392, validation 181.622 - E.S. checkpoint
  lr value 0.001
Epoch: 3/20 - Perplexity: training 144.452, validation 177.499 - E.S. checkpoint
  lr value 0.0005
Epoch: 4/20 - Perplexity: training 87.742, validation 127.403 - E.S. checkpoint
  lr value 0.0005
Epoch: 5/20 - Perplexity: training 74.805, validation 127.506
  lr value 0.0005
Epoch: 6/20 - Perplexity: training 67.073, validation 128.349
  lr value 0.00025
Epoch: 7/20 - Perplexity: training 54.771, validation 116.973 - E.S. checkpoint
  lr value 0.00025
Epoch: 8/20 - Perplexity: training 51.541, validation 117.038
  lr value 0.00025
Epoch: 9/20 - Perplexity: training 49.338, validation 117.294
  lr value 0.000125
Epoch: 10/20 - Perplexity: training 44.673, validation 113.627 - E.S. checkpoint
  lr value 0.000125
Epoch: 11/20 - Perplexity: training 43.324, validation 113.727
 

In [None]:
# Second run: keep fitting the SAME model object, now on the validation sequence, with no
# held-out data, no scheduler and no early stopping, for a fixed 19 epochs.
# NOTE(review): the optimizer object from the first run is reused, so it presumably carries over
# the learning rate the scheduler left it with - confirm this is intended.
# NOTE(review): `instance` is rebound here, so the first run's Train object is no longer reachable.
# TODO: document why 19 epochs were chosen.
instance = Train(model=gpt2_model,
                 model_type = 'transformer',
                 loss_fct=criterion,
                 optimizer=optimizer,
                 scheduler=None,
                 train_sequence=val_int_sequence,
                 val_sequence=None,
                 sequence_length=50,
                 batch_size=128,
                 epochs=19,
                 patience=None,
                 name=None)

# Without validation data the result is kept as a single value (cf. the three-way unpacking in the first run).
train_loss_of_val_data = instance.training()

Device: cuda:0
Starting training..
No validation data is used.
Epoch: 1/19 - Perplexity: training 115.754
Epoch: 2/19 - Perplexity: training 110.533
Epoch: 3/19 - Perplexity: training 106.786
Epoch: 4/19 - Perplexity: training 103.651
Epoch: 5/19 - Perplexity: training 100.855
Epoch: 6/19 - Perplexity: training 98.745
Epoch: 7/19 - Perplexity: training 96.999
Epoch: 8/19 - Perplexity: training 94.992
Epoch: 9/19 - Perplexity: training 93.484
Epoch: 10/19 - Perplexity: training 92.109
Epoch: 11/19 - Perplexity: training 90.636
Epoch: 12/19 - Perplexity: training 89.499
Epoch: 13/19 - Perplexity: training 88.240
Epoch: 14/19 - Perplexity: training 87.096
Epoch: 15/19 - Perplexity: training 86.105
Epoch: 16/19 - Perplexity: training 85.066
Epoch: 17/19 - Perplexity: training 84.311
Epoch: 18/19 - Perplexity: training 83.439
Epoch: 19/19 - Perplexity: training 82.464
Training complete !


In [None]:
# Persist the whole model object (not just its state_dict) to disk.
# NOTE(review): saving a full module pickles references to its class definitions, so loading
# needs a compatible code/library environment; consider saving gpt2_model.state_dict() instead
# if the file has to be portable.
torch.save(gpt2_model, 'model_epoch19_gpt2_with_trainable_head.pth')

#### Perplexity

In [None]:
# Test-set perplexity via the project helper, using the same window length (50) as in training
# and a fresh CrossEntropyLoss; the returned value is displayed as the cell output.
perplexity_network_model(test_sequence_of_integers = test_int_sequence,
                        sequence_length = 50,
                        model = gpt2_model,
                        model_type = 'transformer',
                        loss_fct = nn.CrossEntropyLoss(),
                        vocab_size = len(vocabulary))

139.0744322586039