<a href="https://colab.research.google.com/github/vggls/language_models/blob/main/main_this_is_ok.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### General imports for all models

In [None]:
import pickle
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import math
#import string #string.punctuation contains punctuation symbols

In [None]:
# When running on Google Colab, run this cell as well: it downloads the
# 'treebank' package through NLTK so the corpus can be imported below.
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [None]:
from nltk.corpus import treebank

### A. 3-gram language model with Laplace smoothing

In [None]:
# custom written code
from preprocessing import lower, add_unk_tokens_for_training, replace_with_unk_for_testing, replace_doubleslash_token_with_unk, create_ngrams
from laplace_model import count_n_grams, laplace_model, perplexity_ngram_model

In [None]:
# Penn Treebank: files 0-174 are used for training, files 175-198 for testing.
# Bracket placeholder tokens that are stripped from every sentence.
symbols_to_remove = set(['-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-'])


def _clean_treebank_sentences(file_ids):
    """Return the tokenized sentences of the given Treebank files, cleaned.

    A token is dropped when it contains '*' or is one of the bracket
    placeholders in `symbols_to_remove`. Every other token, ordinary
    punctuation such as ',' and '.' included, is kept.
    """
    return [
        [token for token in sentence
         if '*' not in token and token not in symbols_to_remove]
        for file_id in file_ids
        for sentence in treebank.sents(file_id)
    ]


# fetch the list of file ids once instead of once per file
treebank_file_ids = treebank.fileids()

train_treebank = _clean_treebank_sentences(treebank_file_ids[:175])
test_treebank = _clean_treebank_sentences(treebank_file_ids[175:199])

len(train_treebank), len(test_treebank)

(3576, 338)

In [None]:
# lower the first letter of each token, for the training and the test split alike
train_tokenized_sentences, test_tokenized_sentences = map(
    lower, (train_treebank, test_treebank)
)

In [None]:
# training data: first replace every token that appears less than 3 times with <unk>,
# then do the same for the 'slash' tokens
train_tokenized_sentences = replace_doubleslash_token_with_unk(
    add_unk_tokens_for_training(train_tokenized_sentences)
)

After inspecting the resulting tokens, we observe that the Penn Treebank contains some malformed tokens such as '1\\/4', '7\\/8', 'macmillan\/mcgraw-hill', 'macmillan\/mcgraw', '1\\/2', etc. — that is, tokens that include the sequence '\/', which do not form meaningful words. We therefore replace every token that includes this sequence with `<unk>` as well (this is the `replace_doubleslash_token_with_unk` call in the cell above). From now on, we refer to these tokens as 'slash' tokens.

In [None]:
# distinct tokens of the processed training sentences; needed in the testing phase
vocabulary = {token for sentence in train_tokenized_sentences for token in sentence}
len(vocabulary)

3466

In [None]:
# which special tokens are part of the vocabulary at this point?
tuple(special in vocabulary for special in ('<unk>', '<bos>', '<eos>'))

(True, False, False)

In [None]:
# test data: tokens outside the training vocabulary become <unk>, then the 'slash' tokens do too
test_tokenized_sentences = replace_doubleslash_token_with_unk(
    replace_with_unk_for_testing(vocabulary, test_tokenized_sentences)
)

In [None]:
# Build the n-gram lists; create_ngrams also adds the <bos> and <eos> tokens.
# The training bigrams and trigrams are counted below for the 3-gram model,
# and the test trigrams are what the model's perplexity is computed on.
train_bigrams = create_ngrams(2, train_tokenized_sentences)
train_trigrams = create_ngrams(3, train_tokenized_sentences)
test_trigrams = create_ngrams(3, test_tokenized_sentences)

# number of n-grams in each list
len(train_bigrams), len(train_trigrams), len(test_trigrams)

(90526, 94102, 8663)

In [None]:
# Example built around the first training sentence: the raw sentence, the same
# sentence after preprocessing, then the first 16 training bigrams and the
# first 17 training trigrams.
print(train_treebank[0], '\n')
print(train_tokenized_sentences[0], '\n')
print(train_bigrams[:16], '\n')
print(train_trigrams[:17])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 

['<unk>', '<unk>', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.'] 

[['<bos>', '<unk>'], ['<unk>', '<unk>'], ['<unk>', ','], [',', '61'], ['61', 'years'], ['years', 'old'], ['old', ','], [',', 'will'], ['will', 'join'], ['join', 'the'], ['the', 'board'], ['board', 'as'], ['as', 'a'], ['a', 'nonexecutive'], ['nonexecutive', 'director'], ['director', 'nov.']] 

[['<bos>', '<bos>', '<unk>'], ['<bos>', '<unk>', '<unk>'], ['<unk>', '<unk>', ','], ['<unk>', ',', '61'], [',', '61', 'years'], ['61', 'years', 'old'], ['years', 'old', ','], ['old', ',', 'will'], [',', 'will', 'join'], ['will', 'join', 'the'], ['join', 'the', 'board'], ['the', 'board', 'as'], ['board', 'as', 'a'], ['as', 'a', 'nonexecutive'], ['a', 'nonexecutive', 'director'], ['nonexecutive', 'director', 'nov.'],

In [None]:
# Example built around the first test sentence: the raw sentence, the same
# sentence after preprocessing, then the first 27 test trigrams.
print(test_treebank[0], '\n')
print(test_tokenized_sentences[0], '\n')
print(test_trigrams[:27])

['Xerox', 'Corp.', 'has', 'told', 'employees', 'in', 'its', 'Crum', '&', 'Forster', 'personal', 'insurance', 'operations', 'that', 'it', 'is', 'laying', 'off', 'about', '300', 'people', ',', 'or', '25', '%', 'of', 'the', 'staff', '.'] 

['<unk>', 'corp.', 'has', 'told', 'employees', 'in', 'its', '<unk>', '&', '<unk>', 'personal', 'insurance', 'operations', 'that', 'it', 'is', '<unk>', 'off', 'about', '300', 'people', ',', 'or', '25', '%', 'of', 'the', 'staff', '.'] 

[['<bos>', '<bos>', '<unk>'], ['<bos>', '<unk>', 'corp.'], ['<unk>', 'corp.', 'has'], ['corp.', 'has', 'told'], ['has', 'told', 'employees'], ['told', 'employees', 'in'], ['employees', 'in', 'its'], ['in', 'its', '<unk>'], ['its', '<unk>', '&'], ['<unk>', '&', '<unk>'], ['&', '<unk>', 'personal'], ['<unk>', 'personal', 'insurance'], ['personal', 'insurance', 'operations'], ['insurance', 'operations', 'that'], ['operations', 'that', 'it'], ['that', 'it', 'is'], ['it', 'is', '<unk>'], ['is', '<unk>', 'off'], ['<unk>', 'off',

In [None]:
# Frequencies of the training 2-grams and 3-grams, computed with the project's
# count_n_grams helper; both are passed to perplexity_ngram_model further down.
bigrams_counts = count_n_grams(train_bigrams)
trigrams_counts = count_n_grams(train_trigrams)

In [None]:
# write the bigram and trigram counts to disk as one pickled two-element list
with open('ngrams_counts.pickle', 'wb') as counts_file:
    pickle.dump([bigrams_counts, trigrams_counts], counts_file)

In [None]:
# Perplexity of the Laplace-smoothed 3-gram model on the test trigrams.
# NOTE(review): `vocabulary` contains neither '<bos>' nor '<eos>' (see the
# membership check above) although create_ngrams adds both - confirm that the
# helper expects this vocab_size in its smoothing.
perplexity_ngram_model(nminus1_grams_counts=bigrams_counts,
                       n_grams_counts=trigrams_counts,
                       test_n_grams=test_trigrams,
                       vocab_size=len(vocabulary))

1081.5835351523908

### B. LSTM language model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
#custom written code
from preprocessing import lower, add_unk_tokens_for_training, unk_for_reduced_vocab, replace_doubleslash_token_with_unk, replace_with_unk_for_testing, tokens_to_indices
from training import Train
from lstm_model import LSTMModel
from perplexity_neural import perplexity_neural_model

In [None]:
# Penn Treebank

def load_treebank(left_limit, right_limit):
    """Load and clean the sentences of a contiguous range of Treebank files.

    Args:
        left_limit: index into `treebank.fileids()` of the first file to read.
        right_limit: index one past the last file to read (exclusive).

    Returns:
        A list of sentences, each a list of tokens. Tokens containing '*' and
        the bracket placeholders ('-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-',
        '-RCB-') are removed, and every sentence ends with the '<eos>' token.

    Raises:
        IndexError: if the range reaches past the available file ids.
    """
    symbols_to_remove = {'-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-'}
    eos_token = ['<eos>']

    # the list of file ids does not change inside the loop: fetch it only once
    file_ids = treebank.fileids()

    tokenized_sentences = []
    for file_index in range(left_limit, right_limit):
        for sentence in treebank.sents(file_ids[file_index]):
            kept_tokens = [token for token in sentence
                           if '*' not in token and token not in symbols_to_remove]
            tokenized_sentences.append(kept_tokens + eos_token)

    return tokenized_sentences

# Split by file index: files 0-149 for training, 150-174 for validation and
# 175-198 for testing. Note that train_treebank and test_treebank are rebound
# here; unlike in section A, every sentence now ends with '<eos>'.
train_treebank = load_treebank(0, 150)
val_treebank = load_treebank(150, 175)
test_treebank = load_treebank(175, 199)

# number of sentences per split
len(train_treebank), len(val_treebank), len(test_treebank)

(3262, 314, 338)

In [None]:
# lower the first letter of each token in all three splits - common to case I and case II
lower_train_treebank, lower_val_treebank, lower_test_treebank = map(
    lower, (train_treebank, val_treebank, test_treebank)
)

In [None]:
# training data, common to both cases (case II applies one more transformation later):
# tokens that appear less than 3 times become <unk>, then the 'slash' tokens do too
train_sentences = replace_doubleslash_token_with_unk(
    add_unk_tokens_for_training(lower_train_treebank)
)

In [None]:
# distinct tokens of the processed training sentences: used as-is by case I,
# and as the starting point of the reduced vocabulary in case II
vocabulary = {token for sentence in train_sentences for token in sentence}
len(vocabulary), '<unk>' in vocabulary, '<eos>' in vocabulary

(3259, True, True)

In [None]:
#model architecture hyperparams - common cell for both cases
# (all four are passed positionally to LSTMModel in the model-creation cells)
embedding_dim = 300  # also the width of the GloVe embedding matrix built in case II
num_layers = 2
hidden_dim = 300
dropout_rate = 0.3

#### case I: model with learnable embeddings

In [None]:
# Enumerate the vocabulary in sorted order. Iterating over the set directly
# gives a different order in every Python session (string hashing is randomized),
# so the indices would not match a model saved in an earlier session.
word_to_index = {word: idx for idx, word in enumerate(sorted(vocabulary))}
index_to_word = {idx: word for word, idx in word_to_index.items()}
word_to_index['<eos>'], word_to_index['<unk>']

(1667, 2118)

In [None]:
# Convert every split into a sequence of vocabulary indices.

# training data is already fully preprocessed
train_int_sequence = tokens_to_indices(word_to_index, train_sentences)

# validation data: out-of-vocabulary tokens, then 'slash' tokens, become <unk>
val_sentences = replace_doubleslash_token_with_unk(
    replace_with_unk_for_testing(vocabulary, lower_val_treebank)
)
val_int_sequence = tokens_to_indices(word_to_index, val_sentences)

# test data: same treatment as the validation data
test_sentences = replace_doubleslash_token_with_unk(
    replace_with_unk_for_testing(vocabulary, lower_test_treebank)
)
test_int_sequence = tokens_to_indices(word_to_index, test_sentences)

tuple(map(len, (train_int_sequence, val_int_sequence, test_int_sequence)))

(82479, 8047, 8325)

In [None]:
# number of distinct indices that occur in each split
tuple(
    len(set(int_sequence))
    for int_sequence in (train_int_sequence, val_int_sequence, test_int_sequence)
)

(3259, 1165, 1272)

In [None]:
# brief explanation how to feed a recurrent neural net
# for simplicity, consider the case of the first two sentences:
# print their tokens, then the first 28 entries of the training index sequence
# (the output shows the sentences following one another, each closed by <eos>)
print(train_sentences[0], train_sentences[1], '\n')
print(train_int_sequence[:28])

['<unk>', '<unk>', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.', '<eos>'] ['mr.', '<unk>', 'is', 'chairman', 'of', '<unk>', 'n.v.', ',', 'the', 'dutch', 'publishing', 'group', '.', '<eos>'] 

[2118, 2118, 19, 1371, 3158, 397, 19, 1701, 948, 2323, 174, 1843, 957, 1390, 879, 1496, 785, 3162, 1667, 2015, 2118, 262, 2345, 1861, 2118, 1365, 19, 2323]


In the representation above, recall that '.' is represented by 3162 and the unknown token `<unk>` by 2118 (these are the indices of the run shown above; they may differ when the notebook is re-executed).

So if we process the data in sequences of length 5, the model learns as follows:

- map [2118, 2118, 19, 1371] to 3158
- map [2118, 19, 1371, 3158] to 397
- i.e. shift the input window one step forward and continue in the same way

In [None]:
# vocabulary size handed to LSTMModel in the next cell
vocab_size = len(vocabulary)
vocab_size

3259

In [None]:
# Fix torch's global RNG seed before the model is built, so that the random
# weight initialisation is the same on every run of the notebook.
torch.manual_seed(0)

# Create the model, loss function, and optimizer
# NOTE(review): the last two positional arguments (True, None) are presumably the
# "learnable embeddings" flag and the pre-trained embedding matrix (none in
# case I) - confirm against LSTMModel's signature.
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, True, None)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'No. of trainable parameters: {num_params}')

#model training hyperparams
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

No. of trainable parameters: 2425759


In [None]:
# Set up training of the case I model on the training sequence, with the
# validation sequence supplied as well.
# NOTE(review): `patience` is presumably the early-stopping patience and `name`
# the label used for checkpoints - confirm against the Train class.
instance = Train(model=model,
                loss_fct=criterion,
                optimizer=optimizer,
                train_sequence=train_int_sequence,
                val_sequence=val_int_sequence,
                sequence_length=50,
                batch_size=128,
                epochs=50,
                patience=10,
                name='lstm_with_learnable_embeddings')

Device: cuda:0


In [None]:
# run the training; three results are unpacked
# (presumably the training losses, the validation losses and the early-stopping checkpoints - confirm)
train_loss, val_loss, checkpoints = instance.training()

Starting training..
Epoch: 1/50 - Perplexity: training 351.068, validation 147.867
Epoch: 2/50 - Perplexity: training 215.300, validation 124.748 - E.S. checkpoint
Epoch: 3/50 - Perplexity: training 170.749, validation 108.157 - E.S. checkpoint
Epoch: 4/50 - Perplexity: training 139.524, validation 94.884 - E.S. checkpoint
Epoch: 5/50 - Perplexity: training 120.880, validation 89.324 - E.S. checkpoint
Epoch: 6/50 - Perplexity: training 101.748, validation 81.887 - E.S. checkpoint
Epoch: 7/50 - Perplexity: training 85.870, validation 75.808 - E.S. checkpoint
Epoch: 8/50 - Perplexity: training 73.522, validation 72.348 - E.S. checkpoint
Epoch: 9/50 - Perplexity: training 62.855, validation 71.383 - E.S. checkpoint
Epoch: 10/50 - Perplexity: training 54.470, validation 70.134 - E.S. checkpoint
Epoch: 11/50 - Perplexity: training 47.239, validation 70.779
Epoch: 12/50 - Perplexity: training 41.416, validation 70.907
Epoch: 13/50 - Perplexity: training 36.625, validation 72.729
Epoch: 14/50

In [None]:
# Continue training the same `model` (same criterion and optimizer objects) on
# the validation sequence only: no validation data, no patience, no name.
# NOTE(review): epochs=10 presumably mirrors the epoch of the last early-stopping
# checkpoint in the run above - confirm.
# NOTE(review): `model` is in whatever state the previous training left it;
# confirm it holds the best-checkpoint weights rather than the last-epoch ones.
instance = Train(model=model,
                loss_fct=criterion,
                optimizer=optimizer,
                train_sequence=val_int_sequence,
                val_sequence=None,
                sequence_length=50,
                batch_size=128,
                epochs=10,
                patience=None,
                name=None)

# the result is bound to a single name here (the earlier call unpacked three values)
train_loss_of_val_data = instance.training()

Device: cuda:0
Starting training..
No validation data is used.
Epoch: 1/10 - Perplexity: training 96.871
Epoch: 2/10 - Perplexity: training 48.482
Epoch: 3/10 - Perplexity: training 29.794
Epoch: 4/10 - Perplexity: training 20.691
Epoch: 5/10 - Perplexity: training 15.023
Epoch: 6/10 - Perplexity: training 11.435
Epoch: 7/10 - Perplexity: training 8.997
Epoch: 8/10 - Perplexity: training 7.273
Epoch: 9/10 - Perplexity: training 5.953
Epoch: 10/10 - Perplexity: training 4.994
Training complete !


In [None]:
# save the case I model to disk: the whole model object is saved, not only its state_dict
torch.save(model, 'model_epoch10_lstm_with_learnable_embeddings.pth')

In [None]:
# test-set perplexity of the case I model, with the same sequence length (50)
# and vocabulary size that were used to build and train it
perplexity_neural_model(test_sequence_of_integers = test_int_sequence,
                        sequence_length = 50,
                        model = model,
                        loss_fct = nn.CrossEntropyLoss(),
                        vocab_size = len(vocabulary))

163.55122742221675

#### case II: model with pre-trained GloVe embeddings

In [None]:
# Colab only: mount Google Drive under /content/drive; the GloVe file is read from there below
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load GloVe 300-dim embeddings into word_embeddings dictionary
import os

glove_dir ='/content/drive/MyDrive/Colab_Notebooks/language_models'

word_embeddings = {} # dictionary with (word, embedding) items

# The GloVe text file is UTF-8 encoded; state the encoding explicitly so that
# decoding does not depend on the platform's default locale.
with open(os.path.join(glove_dir, 'glove.6B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        # each line holds a word followed by the components of its vector
        values = line.split()
        word = values[0]
        embedding = torch.tensor([float(val) for val in values[1:]])
        word_embeddings[word] = embedding

In [None]:
# Which vocabulary tokens have no GloVe representation?
glove_words = set(word_embeddings)
intersection = vocabulary & glove_words
words_not_in_glove = vocabulary.difference(intersection)
len(words_not_in_glove)  # the count includes '<unk>' and '<eos>'

36

In [None]:
# Tokens without a GloVe representation are assigned to '<unk>', which gives a
# reduced vocabulary; the two special tokens are put back explicitly.
reduced_vocabulary = (vocabulary - words_not_in_glove) | {'<unk>', '<eos>'}

assert len(reduced_vocabulary) == len(vocabulary) - len(words_not_in_glove) + 2 # 3225 = 3259 - 36 + 2

In [None]:
# we also replace with <unk> the train_sentences tokens that are not included in the reduced_vocabulary
# NOTE: this rebinds train_sentences, so the case I cells above would see the
# reduced data if they were re-run after this cell without recomputing
# train_sentences first.
train_sentences = unk_for_reduced_vocab(train_sentences, reduced_vocabulary)

In [None]:
# Enumerate the reduced vocabulary in sorted order. Iterating over the set
# directly gives a different order in every Python session (string hashing is
# randomized), so the indices would not match a model or embedding matrix
# saved in an earlier session.
word_to_index = {word: idx for idx, word in enumerate(sorted(reduced_vocabulary))}
index_to_word = {idx: word for word, idx in word_to_index.items()}

In [None]:
# indices of the two special tokens (their rows of the embedding matrix are filled separately below)
unk_index, eos_index = word_to_index['<unk>'], word_to_index['<eos>']

In [None]:
# Convert every split into a sequence of indices of the reduced vocabulary.

# training data is already fully preprocessed
train_int_sequence = tokens_to_indices(word_to_index, train_sentences)

# validation data: tokens outside the reduced vocabulary, then 'slash' tokens, become <unk>
val_sentences = replace_doubleslash_token_with_unk(
    replace_with_unk_for_testing(reduced_vocabulary, lower_val_treebank)
)
val_int_sequence = tokens_to_indices(word_to_index, val_sentences)

# test data: same treatment as the validation data
test_sentences = replace_doubleslash_token_with_unk(
    replace_with_unk_for_testing(reduced_vocabulary, lower_test_treebank)
)
test_int_sequence = tokens_to_indices(word_to_index, test_sentences)

tuple(map(len, (train_int_sequence, val_int_sequence, test_int_sequence)))

(82479, 8047, 8325)

In [None]:
# number of distinct indices that occur in each split
tuple(
    len(set(int_sequence))
    for int_sequence in (train_int_sequence, val_int_sequence, test_int_sequence)
)

(3225, 1160, 1269)

In [None]:
#create embedding layer weights

reduced_vocab_size = len(reduced_vocabulary)
embeddings = torch.zeros(reduced_vocab_size, embedding_dim)

# the two special tokens get their rows filled separately below
special_tokens = {'<unk>', '<eos>'}

# put the glove embeddings in the embeddings matrix
for (word, index) in word_to_index.items():
    if word not in special_tokens:
        embeddings[index] = word_embeddings[word]

# <eos>: the mean of all GloVe vectors
all_vectors = list(word_embeddings.values())
embeddings[eos_index] = torch.mean(torch.stack(all_vectors), dim=0)

# <unk>: a random vector drawn from a dedicated, seeded generator, so that the
# matrix is the same on every run and the global RNG state is left untouched
unk_generator = torch.Generator().manual_seed(0)
embeddings[unk_index] = torch.rand(embedding_dim, generator=unk_generator)

In [None]:
# size of the reduced vocabulary (already set in the embeddings cell above; recomputed here unchanged)
reduced_vocab_size = len(reduced_vocabulary)

In [None]:
# Fix torch's global RNG seed before the model is built, so that the random
# weight initialisation is the same on every run of the notebook.
torch.manual_seed(0)

# Create the model, loss function, and optimizer
# NOTE(review): the last two positional arguments (True, embeddings) are presumably
# the "learnable embeddings" flag and the pre-trained GloVe matrix built above -
# confirm against LSTMModel's signature.
model = LSTMModel(reduced_vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, True, embeddings)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'No. of trainable parameters: {num_params}')

#model training hyperparams
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

No. of trainable parameters: 2415525


In [None]:
# Set up training of the case II (GloVe) model on the training sequence, with
# the validation sequence supplied as well; same settings as in case I.
# NOTE(review): `patience` is presumably the early-stopping patience and `name`
# the label used for checkpoints - confirm against the Train class.
instance = Train(model=model,
                loss_fct=criterion,
                optimizer=optimizer,
                train_sequence=train_int_sequence,
                val_sequence=val_int_sequence,
                sequence_length=50,
                batch_size=128,
                epochs=50,
                patience=10,
                name='lstm_with_glove_embeddings')

Device: cuda:0


In [None]:
# run the training; three results are unpacked
# (presumably the training losses, the validation losses and the early-stopping checkpoints - confirm)
train_loss, val_loss, checkpoints = instance.training()

Starting training..
Epoch: 1/50 - Perplexity: training 339.894, validation 148.184
Epoch: 2/50 - Perplexity: training 196.740, validation 117.196 - E.S. checkpoint
Epoch: 3/50 - Perplexity: training 158.067, validation 102.477 - E.S. checkpoint
Epoch: 4/50 - Perplexity: training 132.385, validation 92.065 - E.S. checkpoint
Epoch: 5/50 - Perplexity: training 114.328, validation 86.670 - E.S. checkpoint
Epoch: 6/50 - Perplexity: training 99.596, validation 82.106 - E.S. checkpoint
Epoch: 7/50 - Perplexity: training 88.072, validation 80.030 - E.S. checkpoint
Epoch: 8/50 - Perplexity: training 78.478, validation 77.840 - E.S. checkpoint
Epoch: 9/50 - Perplexity: training 70.121, validation 76.481 - E.S. checkpoint
Epoch: 10/50 - Perplexity: training 63.404, validation 75.352 - E.S. checkpoint
Epoch: 11/50 - Perplexity: training 56.756, validation 74.600 - E.S. checkpoint
Epoch: 12/50 - Perplexity: training 51.132, validation 74.997
Epoch: 13/50 - Perplexity: training 46.583, validation 75

In [None]:
# Continue training the same `model` (same criterion and optimizer objects) on
# the validation sequence only: no validation data, no patience, no name.
# NOTE(review): epochs=11 presumably mirrors the epoch of the last early-stopping
# checkpoint in the run above - confirm.
# NOTE(review): `model` is in whatever state the previous training left it;
# confirm it holds the best-checkpoint weights rather than the last-epoch ones.
instance = Train(model=model,
                loss_fct=criterion,
                optimizer=optimizer,
                train_sequence=val_int_sequence,
                val_sequence=None,
                sequence_length=50,
                batch_size=128,
                epochs=11,
                patience=None,
                name=None)

# the result is bound to a single name here (the earlier call unpacked three values)
train_loss_of_val_data = instance.training()

Device: cuda:0
Starting training..
No validation data is used.
Epoch: 1/11 - Perplexity: training 92.324
Epoch: 2/11 - Perplexity: training 52.651
Epoch: 3/11 - Perplexity: training 35.740
Epoch: 4/11 - Perplexity: training 26.891
Epoch: 5/11 - Perplexity: training 21.224
Epoch: 6/11 - Perplexity: training 16.992
Epoch: 7/11 - Perplexity: training 14.254
Epoch: 8/11 - Perplexity: training 11.996
Epoch: 9/11 - Perplexity: training 10.256
Epoch: 10/11 - Perplexity: training 9.004
Epoch: 11/11 - Perplexity: training 7.762
Training complete !


In [None]:
# save the case II model to disk: the whole model object is saved, not only its state_dict
torch.save(model, 'model_epoch11_lstm_with_glove_embeddings.pth')

In [None]:
# test-set perplexity of the case II model, with the same sequence length (50)
# as in training and the size of the reduced vocabulary
perplexity_neural_model(test_sequence_of_integers = test_int_sequence,
                        sequence_length = 50,
                        model = model,
                        loss_fct = nn.CrossEntropyLoss(),
                        vocab_size = len(reduced_vocabulary))

149.90886196455799

### C. Transformer

### D. Comparisons & Text generation