<a href="https://colab.research.google.com/github/vggls/language_models/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### General imports for all models

In [None]:
import pickle
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import math

In [None]:
# Fetch the Penn Treebank sample that `nltk.corpus.treebank` reads from.
# Needed on a fresh environment such as Google Colab, where the corpus is not yet on disk.
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [None]:
from nltk.corpus import treebank

### 3-gram language model with Laplace smoothing

In [None]:
# custom written code
from preprocessing import lower, add_unk_tokens_for_training, replace_with_unk_for_testing, create_ngrams
from laplace_model import count_n_grams, laplace_model, perplexity_ngram_model

In [None]:
# Penn Treebank, split by file index (len(treebank.fileids()) = 199):
#   files 0..174   -> training sentences
#   files 175..198 -> test sentences
# Tokens that contain '*' are removed from every sentence.
file_ids = treebank.fileids()

train_treebank = [
    [token for token in sentence if '*' not in token]
    for file_idx in range(175)
    for sentence in treebank.sents(file_ids[file_idx])
]

test_treebank = [
    [token for token in sentence if '*' not in token]
    for file_idx in range(175, 199)
    for sentence in treebank.sents(file_ids[file_idx])
]

len(train_treebank), len(test_treebank)

(3576, 338)

In [None]:
# Normalise token casing in both splits with the project helper `lower`
# (described by the author as lowering the first letter of each token).
train_tokenized_sentences = lower(train_treebank)
test_tokenized_sentences = lower(test_treebank)

In [None]:
# Replace every training token that appears fewer than 3 times with <unk>.
# NOTE(review): this cell overwrites its own input, so re-running it feeds already-processed
# sentences back into the helper; re-run the lowercasing cell first to start from clean data.
train_tokenized_sentences = add_unk_tokens_for_training(train_tokenized_sentences)

In [None]:
# Vocabulary = distinct tokens that remain in the training sentences after <unk> replacement.
# It is used in the testing phase to decide which test tokens are out-of-vocabulary.
vocabulary = {token for sentence in train_tokenized_sentences for token in sentence}
len(vocabulary)

3481

In [None]:
# Membership of the special tokens <unk>, <bos>, <eos> (in that order) in the vocabulary.
tuple(special in vocabulary for special in ('<unk>', '<bos>', '<eos>'))

(True, False, False)

In [None]:
# Map test tokens that are missing from the training vocabulary to <unk>.
# NOTE(review): overwrites its own input; re-run the lowercasing cell before re-running this one.
test_tokenized_sentences = replace_with_unk_for_testing(vocabulary, test_tokenized_sentences)

In [None]:
#compute ngrams
# Bigrams are built from the training data only: their counts serve as the (n-1)-gram
# counts of the trigram model. Trigrams are built for both the training and the test split.
train_bigrams = create_ngrams(2, train_tokenized_sentences)
train_trigrams = create_ngrams(3, train_tokenized_sentences)
test_trigrams = create_ngrams(3, test_tokenized_sentences)

len(train_bigrams), len(train_trigrams), len(test_trigrams)

(90748, 94324, 8687)

In [None]:
# Example of the 2-grams and 3-grams extracted from the first training sentence.
# That sentence has 18 tokens; the slices below cover its 19 bigrams and 20 trigrams
# (padding tokens included).
for shown in (train_treebank[0], train_tokenized_sentences[0], train_bigrams[:19]):
    print(shown, '\n')
print(train_trigrams[:20])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 

['<unk>', '<unk>', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.'] 

[['<bos>', '<unk>'], ['<unk>', '<unk>'], ['<unk>', ','], [',', '61'], ['61', 'years'], ['years', 'old'], ['old', ','], [',', 'will'], ['will', 'join'], ['join', 'the'], ['the', 'board'], ['board', 'as'], ['as', 'a'], ['a', 'nonexecutive'], ['nonexecutive', 'director'], ['director', 'nov.'], ['nov.', '29'], ['29', '.'], ['.', '<eos>']] 

[['<bos>', '<bos>', '<unk>'], ['<bos>', '<unk>', '<unk>'], ['<unk>', '<unk>', ','], ['<unk>', ',', '61'], [',', '61', 'years'], ['61', 'years', 'old'], ['years', 'old', ','], ['old', ',', 'will'], [',', 'will', 'join'], ['will', 'join', 'the'], ['join', 'the', 'board'], ['the', 'board', 'as'], ['board', 'as', 'a'], ['as', 'a', 'nonexecutive'], ['a', 'nonexecutive', 'dire

In [None]:
# Example of the 3-grams extracted from the first test sentence.
# That sentence has 29 tokens; the slice below covers its 31 trigrams (padding tokens included).
for shown in (test_treebank[0], test_tokenized_sentences[0]):
    print(shown, '\n')
print(test_trigrams[:31])

['Xerox', 'Corp.', 'has', 'told', 'employees', 'in', 'its', 'Crum', '&', 'Forster', 'personal', 'insurance', 'operations', 'that', 'it', 'is', 'laying', 'off', 'about', '300', 'people', ',', 'or', '25', '%', 'of', 'the', 'staff', '.'] 

['<unk>', 'corp.', 'has', 'told', 'employees', 'in', 'its', '<unk>', '&', '<unk>', 'personal', 'insurance', 'operations', 'that', 'it', 'is', '<unk>', 'off', 'about', '300', 'people', ',', 'or', '25', '%', 'of', 'the', 'staff', '.'] 

[['<bos>', '<bos>', '<unk>'], ['<bos>', '<unk>', 'corp.'], ['<unk>', 'corp.', 'has'], ['corp.', 'has', 'told'], ['has', 'told', 'employees'], ['told', 'employees', 'in'], ['employees', 'in', 'its'], ['in', 'its', '<unk>'], ['its', '<unk>', '&'], ['<unk>', '&', '<unk>'], ['&', '<unk>', 'personal'], ['<unk>', 'personal', 'insurance'], ['personal', 'insurance', 'operations'], ['insurance', 'operations', 'that'], ['operations', 'that', 'it'], ['that', 'it', 'is'], ['it', 'is', '<unk>'], ['is', '<unk>', 'off'], ['<unk>', 'off',

In [None]:
#2-grams and 3-grams frequencies
# Both count tables come from the training n-grams only; they are the inputs of the
# perplexity computation in the next cell.
bigrams_counts = count_n_grams(train_bigrams)
trigrams_counts = count_n_grams(train_trigrams)

In [None]:
# Test-set perplexity of the 3-gram model: trigram counts over bigram (history) counts,
# evaluated on the test trigrams with the training vocabulary size.
trigram_test_perplexity = perplexity_ngram_model(
    nminus1_grams_counts=bigrams_counts,
    n_grams_counts=trigrams_counts,
    test_n_grams=test_trigrams,
    vocab_size=len(vocabulary),
)
trigram_test_perplexity

1091.699679451341

### LSTM language model

In [None]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

In [None]:
#custom written code
from preprocessing import lower, add_unk_tokens_for_training, replace_with_unk_for_testing, tokens_to_indices
from training import Train
from lstm_model import LSTMModel
from perplexity_neural import perplexity_neural_model

In [None]:
# Penn Treebank, three-way split by file index (len(treebank.fileids()) = 199):
#   files 0..149   -> training sentences
#   files 150..174 -> validation sentences
#   files 175..198 -> test sentences
# Tokens that contain '*' are removed from every sentence.
file_ids = treebank.fileids()

train_treebank = [
    [token for token in sentence if '*' not in token]
    for file_idx in range(150)
    for sentence in treebank.sents(file_ids[file_idx])
]

val_treebank = [
    [token for token in sentence if '*' not in token]
    for file_idx in range(150, 175)
    for sentence in treebank.sents(file_ids[file_idx])
]

test_treebank = [
    [token for token in sentence if '*' not in token]
    for file_idx in range(175, 199)
    for sentence in treebank.sents(file_ids[file_idx])
]

len(train_treebank), len(val_treebank), len(test_treebank)

(3262, 314, 338)

In [None]:
# Normalise token casing in all three splits with the project helper `lower`
# (described by the author as lowering the first letter of each token).
train_tokenized_sentences = lower(train_treebank)
val_tokenized_sentences = lower(val_treebank)
test_tokenized_sentences = lower(test_treebank)

In [None]:
# Replace every training token that appears fewer than 3 times with <unk>.
# NOTE(review): this cell overwrites its own input, so re-running it feeds already-processed
# sentences back into the helper; re-run the lowercasing cell first to start from clean data.
train_tokenized_sentences = add_unk_tokens_for_training(train_tokenized_sentences)

The vocabulary is constructed from the training data only. Note that the training data differs between the 3-gram and the LSTM model, because the latter also needs a validation set (for hyperparameter tuning; the 3-gram model, in contrast, is a single fixed model with nothing to tune). Since the test set is the same for all models, for the LSTM model we use the largest part of the 3-gram training set for training and the remaining small part for validation.

In [None]:
# Vocabulary = distinct tokens that remain in the training sentences after <unk> replacement.
# It is used below to index tokens and to decide which validation/test tokens are out-of-vocabulary.
vocabulary = {token for sentence in train_tokenized_sentences for token in sentence}
len(vocabulary)

3273

In [None]:
# TODO: decide whether to insert an <EOS> token into the vocabulary (see the source's argument in its "Vocabulary" section).
# Open question: here "." is kept as a token, whereas the source does not keep it. Can "." therefore play the role of <EOS>?

In [None]:
# Assign indices in sorted token order so the mapping is identical on every run.
# Enumerating the set directly yields a different order (and therefore different indices)
# from one kernel session to the next, because string hashing is randomised per process;
# indices stored alongside a trained model would then no longer match a rebuilt mapping.
word_to_index = {word: idx for idx, word in enumerate(sorted(vocabulary))}
# Inverse lookup: index -> token.
index_to_word = {idx: word for word, idx in word_to_index.items()}

In [None]:
# Indices assigned to the sentence-final '.' and to the unknown-word token '<unk>'.
word_to_index['.'], word_to_index['<unk>']

(2097, 616)

In [None]:
# validation / testing: tokens outside the training vocabulary become <unk> first
val_tokenized_sentences = replace_with_unk_for_testing(vocabulary, val_tokenized_sentences)
test_tokenized_sentences = replace_with_unk_for_testing(vocabulary, test_tokenized_sentences)

# training / validation / testing: encode each split's tokens as vocabulary indices
train_sequence, val_sequence, test_sequence = (
    tokens_to_indices(word_to_index, split_sentences)
    for split_sentences in (train_tokenized_sentences,
                            val_tokenized_sentences,
                            test_tokenized_sentences)
)

len(train_sequence), len(val_sequence), len(test_sequence)

(79427, 7745, 8011)

In [None]:
# Brief illustration of how the data is fed to a recurrent neural net.
# For simplicity only the first two sentences are shown: together they hold 18 + 13 = 31
# tokens, which is why the first 31 indices of the encoded sequence are printed.
print(*train_tokenized_sentences[:2], '\n')
print(train_sequence[:31])

['<unk>', '<unk>', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.'] ['mr.', '<unk>', 'is', 'chairman', 'of', '<unk>', 'n.v.', ',', 'the', 'dutch', 'publishing', 'group', '.'] 

[616, 616, 1495, 2981, 1118, 2085, 1495, 1742, 551, 1285, 704, 330, 2939, 680, 391, 1910, 3254, 2097, 1282, 616, 1288, 998, 2384, 616, 2878, 1495, 1285, 803, 3229, 2616, 2097]


In the output above, '.' is represented by 2097 and the unknown word by 616.

So if we process the data in sequences of length = 5, the model will learn as follows:

- map [616, 616, 1495, 2981] to 1118
- map [616, 1495, 2981, 1118] to 2085
- i.e. shift input by 1-step to the future and continue like this

In [None]:
# Fix torch's RNG seed before the model is built so that weight initialisation (and any
# other torch-driven randomness during training) is repeatable from run to run.
SEED = 42
torch.manual_seed(SEED)

#model architecture hyperparams
vocab_size = len(vocabulary)
embedding_dim = 256
num_layers = 2
hidden_dim = 256
output_dim = vocab_size  # same size as the vocabulary; not passed to LSTMModel in this cell
dropout_rate = 0.3

#model training hyperparams
learning_rate = 0.001

# Create the model, loss function, and optimizer
# NOTE(review): the trailing positional `True` is an LSTMModel option whose meaning is not
# visible in this notebook — confirm in lstm_model.py and consider passing it by keyword.
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, True)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'No. of trainable parameters: {num_params}')

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

No. of trainable parameters: 1893833


In [None]:
# Configure the first training run: fit on the training sequence and monitor the validation
# sequence, with sequence_length=50, batch_size=128 and at most 50 epochs.
# patience=10 presumably is the early-stopping patience (the log marks "E.S. checkpoint"
# epochs) — confirm in training.Train.
instance = Train(model=model,
                loss_fct=criterion,
                optimizer=optimizer,
                train_sequence=train_sequence,
                val_sequence=val_sequence,
                sequence_length=50,
                batch_size=128,
                epochs=50,
                patience=10)

Device: cuda:0


In [None]:
# Run the training loop; it returns the training loss, the validation loss and the
# checkpoints (exact contents are defined in training.Train).
train_loss, val_loss, checkpoints = instance.training()

Starting training..
Epoch: 1/50 - Perplexity: training 436.930, validation 199.241
Epoch: 2/50 - Perplexity: training 273.212, validation 160.250 - E.S. checkpoint
Epoch: 3/50 - Perplexity: training 222.196, validation 141.705 - E.S. checkpoint
Epoch: 4/50 - Perplexity: training 184.109, validation 125.460 - E.S. checkpoint
Epoch: 5/50 - Perplexity: training 154.708, validation 114.497 - E.S. checkpoint
Epoch: 6/50 - Perplexity: training 133.885, validation 107.699 - E.S. checkpoint
Epoch: 7/50 - Perplexity: training 117.756, validation 101.582 - E.S. checkpoint
Epoch: 8/50 - Perplexity: training 104.071, validation 99.292 - E.S. checkpoint
Epoch: 9/50 - Perplexity: training 92.353, validation 96.590 - E.S. checkpoint
Epoch: 10/50 - Perplexity: training 82.571, validation 94.993 - E.S. checkpoint
Epoch: 11/50 - Perplexity: training 74.300, validation 94.294 - E.S. checkpoint
Epoch: 12/50 - Perplexity: training 67.332, validation 93.491 - E.S. checkpoint
Epoch: 13/50 - Perplexity: train

In [None]:
# Final fit before testing: re-initialise the network and train it on train + validation data.
# Continuing to train the already-fitted model on the small validation split alone makes it
# memorise that split (its training perplexity falls to single digits) and degrades the
# held-out score.
# NOTE(review): epochs=13 is kept from the original cell — presumably the epoch count chosen
# from the early-stopping run above; confirm.
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, True)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

instance = Train(model=model,
                loss_fct=criterion,
                optimizer=optimizer,
                train_sequence=train_sequence + val_sequence,  # both are flat lists of token indices
                val_sequence=None,
                sequence_length=50,
                batch_size=128,
                epochs=13,
                patience=None)

Device: cuda:0


In [None]:
# Run the second training phase configured in the previous cell
# (no validation set is passed, so there is no early stopping).
train_loss_of_val_data = instance.training()

Starting training..
No validation data is used.
Epoch: 1/13 - Perplexity: training 115.307
Epoch: 2/13 - Perplexity: training 63.275
Epoch: 3/13 - Perplexity: training 44.590
Epoch: 4/13 - Perplexity: training 34.307
Epoch: 5/13 - Perplexity: training 27.522
Epoch: 6/13 - Perplexity: training 22.898
Epoch: 7/13 - Perplexity: training 19.010
Epoch: 8/13 - Perplexity: training 16.374
Epoch: 9/13 - Perplexity: training 14.272
Epoch: 10/13 - Perplexity: training 12.182
Epoch: 11/13 - Perplexity: training 10.556
Epoch: 12/13 - Perplexity: training 9.340
Epoch: 13/13 - Perplexity: training 8.250
Training complete !


In [None]:
# Test-set perplexity of the LSTM, evaluated with the same sequence_length (50) that was
# used for training and a fresh cross-entropy loss.
lstm_test_perplexity = perplexity_neural_model(
    test_sequence_of_integers=test_sequence,
    sequence_length=50,
    model=model,
    loss_fct=nn.CrossEntropyLoss(),
    vocab_size=len(vocabulary),
)
lstm_test_perplexity

191.17134531847913

### Pre-Trained Neural Network

### Comparisons & Text generation