<a href="https://colab.research.google.com/github/vggls/language_models/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### General imports for all models

In [1]:
import pickle
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import math
#import string #string.punctuation contains punctuation symbols

In [2]:
# for google colab import run this cell as well
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [3]:
from nltk.corpus import treebank

### A. 3-gram language model with Laplace smoothing

In [None]:
# custom written code
from preprocessing import lower, add_unk_tokens_for_training, replace_with_unk_for_testing, create_ngrams
from laplace_model import count_n_grams, laplace_model, perplexity_ngram_model

In [None]:
# Penn Treebank
symbols_to_remove = set(['-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-']) # parentheses

train_treebank = []
for j in range(175):
    for i in treebank.sents(treebank.fileids()[j]):
        l = [token for token in i if ('*' not in token) and ('\/' not in token) and (token not in symbols_to_remove)] # Remove tokens that contain '*', '\/' or symbols_to_remove
        train_treebank.append(l) # Append the sentence to the training data

test_treebank = []
for j in range(175, 199):
    for i in treebank.sents(treebank.fileids()[j]):
        l = [token for token in i if '*' not in token and token not in symbols_to_remove]
        test_treebank.append(l)

len(train_treebank), len(test_treebank)

(3576, 338)

In [None]:
#lower first letter of each token
train_tokenized_sentences = lower(train_treebank)
test_tokenized_sentences = lower(test_treebank)

In [None]:
# insert <unk> token to training data
train_tokenized_sentences = add_unk_tokens_for_training(train_tokenized_sentences) #replace all tokens that appear less than 3 times with <unk>

In [None]:
vocabulary = set([item for sublist in train_tokenized_sentences for item in sublist])
len(vocabulary)

3466

In [None]:
'<unk>' in vocabulary, '<bos>' in vocabulary, '<eos>' in vocabulary

(True, False, False)

In [None]:
# insert <unk> token to test data
test_tokenized_sentences = replace_with_unk_for_testing(vocabulary, test_tokenized_sentences)

In [None]:
#add <bos> and <eos> tokens and compute ngrams
train_bigrams = create_ngrams(2, train_tokenized_sentences)
train_trigrams = create_ngrams(3, train_tokenized_sentences)
test_trigrams = create_ngrams(3, test_tokenized_sentences)

len(train_bigrams), len(train_trigrams), len(test_trigrams)

(90375, 93951, 8663)

In [None]:
#example of 2-grams and 3-grams extracted from the first training sentence
print(train_treebank[0], '\n')
print(train_tokenized_sentences[0], '\n')
print(train_bigrams[:19], '\n')
print(train_trigrams[:20])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 

['<unk>', '<unk>', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.'] 

[['<bos>', '<unk>'], ['<unk>', '<unk>'], ['<unk>', ','], [',', '61'], ['61', 'years'], ['years', 'old'], ['old', ','], [',', 'will'], ['will', 'join'], ['join', 'the'], ['the', 'board'], ['board', 'as'], ['as', 'a'], ['a', 'nonexecutive'], ['nonexecutive', 'director'], ['director', 'nov.'], ['nov.', '29'], ['29', '.'], ['.', '<eos>']] 

[['<bos>', '<bos>', '<unk>'], ['<bos>', '<unk>', '<unk>'], ['<unk>', '<unk>', ','], ['<unk>', ',', '61'], [',', '61', 'years'], ['61', 'years', 'old'], ['years', 'old', ','], ['old', ',', 'will'], [',', 'will', 'join'], ['will', 'join', 'the'], ['join', 'the', 'board'], ['the', 'board', 'as'], ['board', 'as', 'a'], ['as', 'a', 'nonexecutive'], ['a', 'nonexecutive', 'dire

In [None]:
#example of 3-grams extracted from the first test sentence
print(test_treebank[0], '\n')
print(test_tokenized_sentences[0], '\n')
print(test_trigrams[:31])

['Xerox', 'Corp.', 'has', 'told', 'employees', 'in', 'its', 'Crum', '&', 'Forster', 'personal', 'insurance', 'operations', 'that', 'it', 'is', 'laying', 'off', 'about', '300', 'people', ',', 'or', '25', '%', 'of', 'the', 'staff', '.'] 

['<unk>', 'corp.', 'has', 'told', 'employees', 'in', 'its', '<unk>', '&', '<unk>', 'personal', 'insurance', 'operations', 'that', 'it', 'is', '<unk>', 'off', 'about', '300', 'people', ',', 'or', '25', '%', 'of', 'the', 'staff', '.'] 

[['<bos>', '<bos>', '<unk>'], ['<bos>', '<unk>', 'corp.'], ['<unk>', 'corp.', 'has'], ['corp.', 'has', 'told'], ['has', 'told', 'employees'], ['told', 'employees', 'in'], ['employees', 'in', 'its'], ['in', 'its', '<unk>'], ['its', '<unk>', '&'], ['<unk>', '&', '<unk>'], ['&', '<unk>', 'personal'], ['<unk>', 'personal', 'insurance'], ['personal', 'insurance', 'operations'], ['insurance', 'operations', 'that'], ['operations', 'that', 'it'], ['that', 'it', 'is'], ['it', 'is', '<unk>'], ['is', '<unk>', 'off'], ['<unk>', 'off',

In [None]:
#2-grams and 3-grams frequencies
bigrams_counts = count_n_grams(train_bigrams)
trigrams_counts = count_n_grams(train_trigrams)

In [None]:
with open('ngrams_counts.pickle', 'wb') as f:
        pickle.dump([bigrams_counts, trigrams_counts], f)

In [None]:
perplexity_ngram_model(nminus1_grams_counts=bigrams_counts,
                       n_grams_counts=trigrams_counts,
                       test_n_grams=test_trigrams,
                       vocab_size=len(vocabulary))

1082.933692249023

### B. LSTM language model

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

In [5]:
#custom written code
from preprocessing import lower, add_unk_tokens_for_training, unk_for_reduced_vocab, replace_with_unk_for_testing, tokens_to_indices
from training import Train
from lstm_model import LSTMModel
from perplexity_neural import perplexity_neural_model

In [6]:
# Penn Treebank

def load_treebank(left_limit, right_limit):

    symbols_to_remove = set(['-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-']) # parentheses

    #sos_token = ['<bos>']
    eos_token = ['<eos>']

    tokenized_sentences = []
    for j in range(left_limit, right_limit):
        for i in treebank.sents(treebank.fileids()[j]):
            l = [token for token in i if ('*' not in token) and ('\/' not in token) and (token not in symbols_to_remove)]
            l = l + eos_token
            tokenized_sentences.append(l)

    return tokenized_sentences

train_treebank = load_treebank(0, 150)
val_treebank = load_treebank(150, 175)
test_treebank = load_treebank(175, 199)

len(train_treebank), len(val_treebank), len(test_treebank)

(3262, 314, 338)

In [7]:
#lower first letter of each token
lower_train_treebank = lower(train_treebank)
lower_val_treebank = lower(val_treebank)
lower_test_treebank = lower(test_treebank)

In [8]:
# insert <unk> token to training data for case I model
train_sentences_I = add_unk_tokens_for_training(lower_train_treebank) #replace all tokens that appear less than 3 times with <unk>

In [9]:
# case I vocabulary
vocabulary_I = set([item for sublist in train_sentences_I for item in sublist])
len(vocabulary_I), '<unk>' in vocabulary_I, '<eos>' in vocabulary_I

(3259, True, True)

In [10]:
#model architecture hyperparams - for both cases
embedding_dim = 300
num_layers = 2
hidden_dim = 300
dropout_rate = 0.3

#### **case I: model with learnable embeddings** (all variable names end in 'I')

In [11]:
# not the same index assignments every time i run the cell
word_to_index_I = {word: idx for idx, word in enumerate(vocabulary_I)}
index_to_word_I = {idx: word for word, idx in word_to_index_I.items()}

with open('caseI_word_index_mappings.pickle', 'wb') as f:
        pickle.dump([word_to_index_I, index_to_word_I], f)

word_to_index_I['<eos>'], word_to_index_I['<unk>']

(1204, 2714)

In [12]:
# training sequence of indices
train_int_sequence_I = tokens_to_indices(word_to_index_I, train_sentences_I)

# validation sequence of indices
val_sentences_I = replace_with_unk_for_testing(vocabulary_I, lower_val_treebank)
val_int_sequence_I = tokens_to_indices(word_to_index_I, val_sentences_I)

# testing sequence of indices
test_sentences_I = replace_with_unk_for_testing(vocabulary_I, lower_test_treebank)
test_int_sequence_I = tokens_to_indices(word_to_index_I, test_sentences_I)

len(train_int_sequence_I), len(val_int_sequence_I), len(test_int_sequence_I)

(82372, 8003, 8319)

In [13]:
len(set(train_int_sequence_I)), len(set(val_int_sequence_I)), len(set(test_int_sequence_I))

(3259, 1165, 1272)

In [14]:
# brief explanation how to feed a recurrent neural net
# for simplicity, consider the case of the first two sentences
print(train_sentences_I[0], train_sentences_I[1], '\n')
print(train_int_sequence_I[:28])

['<unk>', '<unk>', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.', '<eos>'] ['mr.', '<unk>', 'is', 'chairman', 'of', '<unk>', 'n.v.', ',', 'the', 'dutch', 'publishing', 'group', '.', '<eos>'] 

[2714, 2714, 1753, 2492, 219, 1659, 1753, 2974, 3240, 2363, 1853, 1248, 1416, 2739, 1137, 2638, 349, 2057, 1204, 3050, 2714, 2197, 1594, 2872, 2714, 2562, 1753, 2363]


In the above representation recall that '< eos>' is represented by 1204 and '< unk>' by 2714.

So if we process the data in sequences of length = 5, the model will learn as follows:

- map [2714, 2714, 1753, 2492, 219] to 1659
- map [2714, 1753, 2492, 219, 1659] to 1753
- i.e. shift input by 1-step to the future and continue like this

The sequences of (sequence_length+1) length are fed in batches during training.

In [60]:
# Create the model, loss function, and optimizer
vocabI_size = len(vocabulary_I)
model = LSTMModel(vocabI_size, embedding_dim, hidden_dim, num_layers, dropout_rate, True, None)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'No. of trainable parameters: {num_params}')

#model training hyperparams
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

No. of trainable parameters: 2425759


In [61]:
instance = Train(model=model,
                loss_fct=criterion,
                optimizer=optimizer,
                train_sequence=train_int_sequence_I,
                val_sequence=val_int_sequence_I,
                sequence_length=50,
                batch_size=128,
                epochs=50,
                patience=10,
                name='lstm_with_learnable_embeddings')

Device: cuda:0


In [62]:
train_loss, val_loss, checkpoints = instance.training()

Starting training..
Epoch: 1/50 - Perplexity: training 354.656, validation 154.659
Epoch: 2/50 - Perplexity: training 203.227, validation 119.999 - E.S. checkpoint
Epoch: 3/50 - Perplexity: training 162.077, validation 104.285 - E.S. checkpoint
Epoch: 4/50 - Perplexity: training 135.357, validation 96.030 - E.S. checkpoint
Epoch: 5/50 - Perplexity: training 115.963, validation 89.816 - E.S. checkpoint
Epoch: 6/50 - Perplexity: training 100.713, validation 84.925 - E.S. checkpoint
Epoch: 7/50 - Perplexity: training 88.539, validation 82.535 - E.S. checkpoint
Epoch: 8/50 - Perplexity: training 78.624, validation 80.014 - E.S. checkpoint
Epoch: 9/50 - Perplexity: training 69.730, validation 78.391 - E.S. checkpoint
Epoch: 10/50 - Perplexity: training 62.042, validation 78.362 - E.S. checkpoint
Epoch: 11/50 - Perplexity: training 55.981, validation 76.301 - E.S. checkpoint
Epoch: 12/50 - Perplexity: training 50.432, validation 76.587
Epoch: 13/50 - Perplexity: training 45.421, validation 7

In [63]:
instance = Train(model=model,
                loss_fct=criterion,
                optimizer=optimizer,
                train_sequence=val_int_sequence_I,
                val_sequence=None,
                sequence_length=50,
                batch_size=128,
                epochs=13,
                patience=None,
                name=None)

train_loss_of_val_data = instance.training()

Device: cuda:0
Starting training..
No validation data is used.
Epoch: 1/13 - Perplexity: training 99.061
Epoch: 2/13 - Perplexity: training 54.576
Epoch: 3/13 - Perplexity: training 36.917
Epoch: 4/13 - Perplexity: training 27.575
Epoch: 5/13 - Perplexity: training 21.252
Epoch: 6/13 - Perplexity: training 17.244
Epoch: 7/13 - Perplexity: training 13.990
Epoch: 8/13 - Perplexity: training 11.945
Epoch: 9/13 - Perplexity: training 10.361
Epoch: 10/13 - Perplexity: training 8.694
Epoch: 11/13 - Perplexity: training 7.461
Epoch: 12/13 - Perplexity: training 6.429
Epoch: 13/13 - Perplexity: training 5.570
Training complete !


In [65]:
torch.save(model, 'model_epoch13_lstm_with_learnable_embeddings.pth')

In [64]:
perplexity_neural_model(test_sequence_of_integers = test_int_sequence_I,
                        sequence_length = 50,
                        model = model,
                        loss_fct = nn.CrossEntropyLoss(),
                        vocab_size = len(vocabulary_I))

165.01979374359573

#### **case II: model with pre-trained GloVe embeddings** (all variable names end in 'II')

In [21]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [22]:
# Load GloVe 300-dim embeddings into word_embeddings dictionary of (word,vector) pairs
import os

glove_dir ='/content/drive/MyDrive/Colab_Notebooks/language_models'

word_embeddings = {} # dictionary with (word, embedding) items

with open(os.path.join(glove_dir, 'glove.6B.300d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        embedding = torch.tensor([float(val) for val in values[1:]])
        word_embeddings[word] = embedding

In [23]:
# Do GloVe embeddings have representations for all tokens in the vocabulary_I?
intersection = set(word_embeddings.keys()) & vocabulary_I
words_not_in_glove = vocabulary_I - intersection
len(words_not_in_glove) #'<unk>', '<eos>' included

36

In [24]:
# How to deal with words_not_in_glove ?
# One approach, is to assign them to the '<unk>' token. This results in a reduced vocabulary.

vocabulary_II = vocabulary_I - words_not_in_glove
vocabulary_II.add('<unk>')
vocabulary_II.add('<eos>')

assert len(vocabulary_II) == len(vocabulary_I) - len(words_not_in_glove) + 2 # 3225 = 3259 - 36 + 2

In [25]:
# we replace with <unk> tokens that are not included in the vocabulary_II as well
train_sentences_II = unk_for_reduced_vocab(train_sentences_I, vocabulary_II)

In [26]:
# not the same index assignments every time i run the cell
word_to_index_II = {word: idx for idx, word in enumerate(vocabulary_II)}
index_to_word_II = {idx: word for word, idx in word_to_index_II.items()}

with open('caseII_word_index_mappings.pickle', 'wb') as f:
        pickle.dump([word_to_index_II, index_to_word_II], f)

In [27]:
# training sequence of indices
train_int_sequence_II = tokens_to_indices(word_to_index_II, train_sentences_II)

# validation sequence of indices
val_sentences_II = replace_with_unk_for_testing(vocabulary_II, lower_val_treebank)
val_int_sequence_II = tokens_to_indices(word_to_index_II, val_sentences_II)

# testing sequence of indices
test_sentences_II = replace_with_unk_for_testing(vocabulary_II, lower_test_treebank)
test_int_sequence_II = tokens_to_indices(word_to_index_II, test_sentences_II)

len(train_int_sequence_II), len(val_int_sequence_II), len(test_int_sequence_II)

(82372, 8003, 8319)

In [29]:
len(set(train_int_sequence_II)), len(set(val_int_sequence_II)), len(set(test_int_sequence_II))

(3225, 1160, 1269)

In [30]:
#create embedding layer weights

vocabII_size = len(vocabulary_II)
embeddings = torch.zeros(vocabII_size, embedding_dim)

# put the glove embeddings in the embeddings matrix
for (word, index) in word_to_index_II.items():
    if word not in ['<unk>', '<eos>']:
        embeddings[index] = word_embeddings[word]

eos_index = word_to_index_II['<eos>']
all_vectors = list(word_embeddings.values())
embeddings[eos_index] = torch.mean(torch.stack(all_vectors), dim=0)

unk_index = word_to_index_II['<unk>']
embeddings[unk_index] = torch.rand(embedding_dim)

In [80]:
# Create the model, loss function, and optimizer
model = LSTMModel(vocabII_size, embedding_dim, hidden_dim, num_layers, dropout_rate, True, embeddings)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'No. of trainable parameters: {num_params}')

#model training hyperparams
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

No. of trainable parameters: 2415525


In [81]:
instance = Train(model=model,
                loss_fct=criterion,
                optimizer=optimizer,
                train_sequence=train_int_sequence_II,
                val_sequence=val_int_sequence_II,
                sequence_length=50,
                batch_size=128,
                epochs=50,
                patience=10,
                name='lstm_with_glove_embeddings')

Device: cuda:0


In [82]:
train_loss, val_loss, checkpoints = instance.training()

Starting training..
Epoch: 1/50 - Perplexity: training 355.058, validation 156.605
Epoch: 2/50 - Perplexity: training 204.421, validation 122.148 - E.S. checkpoint
Epoch: 3/50 - Perplexity: training 165.307, validation 106.202 - E.S. checkpoint
Epoch: 4/50 - Perplexity: training 137.588, validation 93.847 - E.S. checkpoint
Epoch: 5/50 - Perplexity: training 116.386, validation 87.447 - E.S. checkpoint
Epoch: 6/50 - Perplexity: training 100.704, validation 82.809 - E.S. checkpoint
Epoch: 7/50 - Perplexity: training 88.354, validation 80.285 - E.S. checkpoint
Epoch: 8/50 - Perplexity: training 78.662, validation 77.775 - E.S. checkpoint
Epoch: 9/50 - Perplexity: training 70.019, validation 75.650 - E.S. checkpoint
Epoch: 10/50 - Perplexity: training 62.629, validation 74.579 - E.S. checkpoint
Epoch: 11/50 - Perplexity: training 56.141, validation 73.851 - E.S. checkpoint
Epoch: 12/50 - Perplexity: training 50.232, validation 73.903
Epoch: 13/50 - Perplexity: training 45.204, validation 7

In [83]:
instance = Train(model=model,
                loss_fct=criterion,
                optimizer=optimizer,
                train_sequence=val_int_sequence_II,
                val_sequence=None,
                sequence_length=50,
                batch_size=128,
                epochs=11,
                patience=None,
                name=None)

train_loss_of_val_data = instance.training()

Device: cuda:0
Starting training..
No validation data is used.
Epoch: 1/11 - Perplexity: training 94.845
Epoch: 2/11 - Perplexity: training 53.007
Epoch: 3/11 - Perplexity: training 36.071
Epoch: 4/11 - Perplexity: training 26.743
Epoch: 5/11 - Perplexity: training 20.727
Epoch: 6/11 - Perplexity: training 16.667
Epoch: 7/11 - Perplexity: training 14.127
Epoch: 8/11 - Perplexity: training 11.869
Epoch: 9/11 - Perplexity: training 10.076
Epoch: 10/11 - Perplexity: training 8.703
Epoch: 11/11 - Perplexity: training 7.504
Training complete !


In [85]:
torch.save(model, 'model_epoch11_lstm_with_glove_embeddings.pth')

In [84]:
perplexity_neural_model(test_sequence_of_integers = test_int_sequence_II,
                        sequence_length = 50,
                        model = model,
                        loss_fct = nn.CrossEntropyLoss(),
                        vocab_size = len(vocabulary_II))

148.1001568025409

### C. Transformer

### D. Discussion (Results analysis, Comparisons & Text generation)