<a href="https://colab.research.google.com/github/vggls/language_models/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### General imports for all models

In [1]:
import pickle
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import math

In [2]:
# for google colab import run this cell as well
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [3]:
from nltk.corpus import treebank

In [5]:
#custom written code
from preprocessing import lower, add_unk_tokens_for_training, replace_with_unk_for_testing, create_ngrams

### 3-gram language model with Laplace smoothing

In [6]:
# custom written code
from laplace_model import count_n_grams, laplace_model, perplexity_ngram_model

In [7]:
# Penn Treebank
train_treebank = []
for j in range(175): # len(treebank.fileids()) = 199
    for i in treebank.sents(treebank.fileids()[j]):
        l = [j for j in i if '*' not in j] # remove tokens that contain '*'
        train_treebank.append(l)

test_treebank = []
for j in range(175, 199):
    for i in treebank.sents(treebank.fileids()[j]):
        l = [j for j in i if '*' not in j]
        test_treebank.append(l)

len(train_treebank), len(test_treebank)

(3576, 338)

In [8]:
#lower first letter of each token
train_tokenized_sentences = lower(train_treebank)
test_tokenized_sentences = lower(test_treebank)

In [9]:
# replace all tokens that appear less than 3 times with <unk>
train_tokenized_sentences = add_unk_tokens_for_training(train_tokenized_sentences)

In [10]:
#the vocabulary is useful for the testing phase
vocabulary = set([item for sublist in train_tokenized_sentences for item in sublist])
len(vocabulary)

3481

In [11]:
'<unk>' in vocabulary, '<bos>' in vocabulary, '<eos>' in vocabulary

(True, False, False)

In [12]:
test_tokenized_sentences = replace_with_unk_for_testing(vocabulary, test_tokenized_sentences)

In [14]:
#compute ngrams
train_bigrams = create_ngrams(2, train_tokenized_sentences)
train_trigrams = create_ngrams(3, train_tokenized_sentences)
test_trigrams = create_ngrams(3, test_tokenized_sentences)

len(train_bigrams), len(train_trigrams), len(test_trigrams)

(90748, 94324, 8687)

In [15]:
#example of 2-grams and 3-grams extracted from the first training sentence
print(train_treebank[0], '\n')
print(train_tokenized_sentences[0], '\n')
print(train_bigrams[:19], '\n')
print(train_trigrams[:20])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 

['<unk>', '<unk>', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.'] 

[['<bos>', '<unk>'], ['<unk>', '<unk>'], ['<unk>', ','], [',', '61'], ['61', 'years'], ['years', 'old'], ['old', ','], [',', 'will'], ['will', 'join'], ['join', 'the'], ['the', 'board'], ['board', 'as'], ['as', 'a'], ['a', 'nonexecutive'], ['nonexecutive', 'director'], ['director', 'nov.'], ['nov.', '29'], ['29', '.'], ['.', '<eos>']] 

[['<bos>', '<bos>', '<unk>'], ['<bos>', '<unk>', '<unk>'], ['<unk>', '<unk>', ','], ['<unk>', ',', '61'], [',', '61', 'years'], ['61', 'years', 'old'], ['years', 'old', ','], ['old', ',', 'will'], [',', 'will', 'join'], ['will', 'join', 'the'], ['join', 'the', 'board'], ['the', 'board', 'as'], ['board', 'as', 'a'], ['as', 'a', 'nonexecutive'], ['a', 'nonexecutive', 'dire

In [16]:
#example of 3-grams extracted from the first test sentence
print(test_treebank[0], '\n')
print(test_tokenized_sentences[0], '\n')
print(test_trigrams[:31])

['Xerox', 'Corp.', 'has', 'told', 'employees', 'in', 'its', 'Crum', '&', 'Forster', 'personal', 'insurance', 'operations', 'that', 'it', 'is', 'laying', 'off', 'about', '300', 'people', ',', 'or', '25', '%', 'of', 'the', 'staff', '.'] 

['<unk>', 'corp.', 'has', 'told', 'employees', 'in', 'its', '<unk>', '&', '<unk>', 'personal', 'insurance', 'operations', 'that', 'it', 'is', '<unk>', 'off', 'about', '300', 'people', ',', 'or', '25', '%', 'of', 'the', 'staff', '.'] 

[['<bos>', '<bos>', '<unk>'], ['<bos>', '<unk>', 'corp.'], ['<unk>', 'corp.', 'has'], ['corp.', 'has', 'told'], ['has', 'told', 'employees'], ['told', 'employees', 'in'], ['employees', 'in', 'its'], ['in', 'its', '<unk>'], ['its', '<unk>', '&'], ['<unk>', '&', '<unk>'], ['&', '<unk>', 'personal'], ['<unk>', 'personal', 'insurance'], ['personal', 'insurance', 'operations'], ['insurance', 'operations', 'that'], ['operations', 'that', 'it'], ['that', 'it', 'is'], ['it', 'is', '<unk>'], ['is', '<unk>', 'off'], ['<unk>', 'off',

In [17]:
#2-grams and 3-grams frequencies
bigrams_counts = count_n_grams(train_bigrams)
trigrams_counts = count_n_grams(train_trigrams)

In [18]:
perplexity_ngram_model(bigrams_counts,
                       trigrams_counts,
                       test_trigrams,
                       len(vocabulary))

0.8622973986362099

### LSTM language model

TO BE TRANSFERED INTO README.md

https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/

*In this tutorial, you will split the book text up into subsequences with a fixed length of 100 characters, an arbitrary length. You could just as easily split the data by sentences, padding the shorter sequences and truncating the longer ones.* (This is why we say that recurrent nets can handle arbitrary size inputs; because we either put all sentences together or handle them one by one even if the have different length.)

*Each training pattern of the network comprises 100 time steps of one character (X) followed by one character output (y). When creating these sequences, you slide this window along the whole book one character at a time, allowing each character a chance to be learned from the 100 characters that preceded it (except the first 100 characters, of course).*

In [None]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

In [None]:
#custom written code

In [None]:
# Penn Treebank
train_treebank = []
for j in range(150): # len(treebank.fileids()) = 199
    for i in treebank.sents(treebank.fileids()[j]):
        l = [j for j in i if '*' not in j] # remove tokens that contain '*'
        train_treebank.append(l)

val_treebank = []
for j in range(150, 175): # len(treebank.fileids()) = 199
    for i in treebank.sents(treebank.fileids()[j]):
        l = [j for j in i if '*' not in j] # remove tokens that contain '*'
        val_treebank.append(l)

test_treebank = []
for j in range(175, 199):
    for i in treebank.sents(treebank.fileids()[j]):
        l = [j for j in i if '*' not in j]
        test_treebank.append(l)

len(train_treebank), len(val_treebank), len(test_treebank)

(3262, 314, 338)

In [None]:
#lower first letter of each token
train_tokenized_sentences = lower(train_treebank)
val_tokenized_sentences = lower(val_treebank)
test_tokenized_sentences = lower(test_treebank)

In [None]:
# replace all tokens that appear less than 3 times with <UNK>
train_tokenized_sentences = add_unk_tokens(train_tokenized_sentences)

The vocabulary is constructed by the training data only. Note that the training data is different between the 3-gram and the lstm model, because the later one needs validation as well (in order to hyper-tune; note that the 3-gram model is unique). Since the test set will be the same for all models, for the lstm model we use as training set the largest part of the 3-gram model training set and the remaining small part as validation.

In [None]:
#the vocabulary is useful for the testing phase
vocabulary = set([item for sublist in train_tokenized_sentences for item in sublist])
len(vocabulary)

3273

In [None]:
#Insert <EOS> token in the vocabulary? see source argument at the respective "Vocabulary" section
# BUT in my case I have included ".", whereas they do not. So, can i say that "." plays the role of "<EOS>"? Why not?

In [None]:
# not the same index assignments every time i run the cell
word_to_index = {word: idx for idx, word in enumerate(vocabulary)}
index_to_word = {idx: word for word, idx in word_to_index.items()}

In [None]:
word_to_index['.'], word_to_index['<unk>']

(857, 1965)

In [None]:
# duplicate code - write function

# training
train_sequences = [[word_to_index[word] for word in sentence] for sentence in train_tokenized_sentences]
train_sequence = []
for seq in train_sequences:
    train_sequence.extend(seq)

# validation
for sent in val_tokenized_sentences:
    for i, token in enumerate(sent):
        if token not in vocabulary:
            sent[i] = '<unk>'
val_sequences = [[word_to_index[word] for word in sentence] for sentence in val_tokenized_sentences]

val_sequence = []
for seq in val_sequences:
    val_sequence.extend(seq)

# testing
for sent in test_tokenized_sentences:
    for i, token in enumerate(sent):
        if token not in vocabulary:
            sent[i] = '<unk>'
test_sequences = [[word_to_index[word] for word in sentence] for sentence in test_tokenized_sentences]

test_sequence = []
for seq in test_sequences:
    test_sequence.extend(seq)

In [None]:
len(train_sequence), len(val_sequence), len(test_sequence)

(79427, 7745, 8011)

In [None]:
# brief explanation how to feed a recurrent neural net
# for simplicity, consider the case of the first two sentences
print(train_tokenized_sentences[0], train_tokenized_sentences[1], '\n')
print(train_sequences[0], train_sequences[1], '\n')
print(train_sequence[:31])

['<unk>', '<unk>', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.'] ['mr.', '<unk>', 'is', 'chairman', 'of', '<unk>', 'n.v.', ',', 'the', 'dutch', 'publishing', 'group', '.'] 

[1965, 1965, 1388, 1999, 1051, 758, 1388, 2209, 2410, 955, 3191, 2587, 2882, 3178, 2374, 2203, 63, 857] [816, 1965, 820, 3238, 2864, 1965, 1707, 1388, 955, 3170, 20, 1034, 857] 

[1965, 1965, 1388, 1999, 1051, 758, 1388, 2209, 2410, 955, 3191, 2587, 2882, 3178, 2374, 2203, 63, 857, 816, 1965, 820, 3238, 2864, 1965, 1707, 1388, 955, 3170, 20, 1034, 857]


In the above representation recall that '.' is represented by 2262 and the unknown word by 1855.

So if we process the data in sequences of length = 5, the model will learn as follows:

- map [1855, 1855, 1062, 419] to 1620
- map [1855, 1062, 419, 1620] to 885
- i.e. shift input by 1-step to the future and continue like this

In [None]:
# @title LSTM model

# https://stackoverflow.com/questions/49224413/difference-between-1-lstm-with-num-layers-2-and-2-lstms-in-pytorch

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights):
        super(LSTMModel, self).__init__()

        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers = num_layers,
                            batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)          #regulazirer
        self.fc = nn.Linear(hidden_dim, vocab_size)

        if tie_weights:
            assert embedding_dim == hidden_dim
            self.embedding.weight = self.fc.weight

        self.init_weights()

    def forward(self, x, h, c):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded, (h, c))
        dropout_out = self.dropout(lstm_out)
        output = self.fc(dropout_out[:, -1, :])
        return output, (hidden, cell)

    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hidden_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.embedding_dim,
                    self.hidden_dim).uniform_(-init_range_other, init_range_other)
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hidden_dim,
                    self.hidden_dim).uniform_(-init_range_other, init_range_other)

In [None]:
#model architecture hyperparams
vocab_size = len(vocabulary)
embedding_dim = 256
num_layers = 2
hidden_dim = 256
output_dim = vocab_size
dropout_rate = 0.3

#model training hyperparams
learning_rate = 0.001

# Create the model, loss function, and optimizer
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, True)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'No. of trainable parameters: {num_params}')

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

No. of trainable parameters: 1893833


In [None]:
# @title this is an important remark

#num_train_batches = len(train_sequence)//batch_size #+ 1*(len(train_sequence)%batch_size != 0)
#num_val_batches = len(val_sequence)//batch_size #+ 1*(len(val_sequence)%batch_size != 0)

# the last batch will consist of less than 128 30-word sequences
# and it is ok to compute them as well.. but there is a problem with the shape of the (hidden, cell) dimensions..
# as hidden.shape[1] and cell.shape[1] is no longer batch_size (i.e. 128)
# i have not found any way to overcome this, so for now I will not use these last points
# Note that I cannot reset the (hidden,cell) values to zero for the last smaller batch,
#because i will lose the computed ones, which is wrong

#num_train_batches, num_val_batches

In [None]:
from training import Train

In [None]:
instance = Train(model=model,
                loss_fct=criterion,
                optimizer=optimizer,
                train_sequence=train_sequence,
                val_sequence=val_sequence,
                sequence_length=50,
                batch_size=128,
                epochs=50,
                patience=20)

Device: cuda:0


In [None]:
# embedding_dim = 256, num_layers = 2, hidden_dim = 256, dropout_rate = 0.3, learning_rate = 0.001
# sequence_length=50, batch_size=128, epochs=50, patience=5
train_loss, val_loss, checkpoints = instance.training()

Starting training..
Epoch: 1/50 - Training: loss 6.075- Validation: loss 5.304, perplexity 201.209
Epoch: 2/50 - Training: loss 5.601 - Validation: loss 5.059, perplexity 157.506 - E.S. checkpoint
Epoch: 3/50 - Training: loss 5.384 - Validation: loss 4.910, perplexity 135.678 - E.S. checkpoint
Epoch: 4/50 - Training: loss 5.197 - Validation: loss 4.814, perplexity 123.175 - E.S. checkpoint
Epoch: 5/50 - Training: loss 5.034 - Validation: loss 4.730, perplexity 113.351 - E.S. checkpoint
Epoch: 6/50 - Training: loss 4.894 - Validation: loss 4.668, perplexity 106.465 - E.S. checkpoint
Epoch: 7/50 - Training: loss 4.765 - Validation: loss 4.635, perplexity 103.077 - E.S. checkpoint
Epoch: 8/50 - Training: loss 4.644 - Validation: loss 4.595, perplexity 98.949 - E.S. checkpoint
Epoch: 9/50 - Training: loss 4.536 - Validation: loss 4.576, perplexity 97.147 - E.S. checkpoint
Epoch: 10/50 - Training: loss 4.424 - Validation: loss 4.548, perplexity 94.402 - E.S. checkpoint
Epoch: 11/50 - Traini

In [None]:
# embedding_dim = 512, num_layers = 2, hidden_dim = 512, dropout_rate = 0.4, learning_rate = 0.001
# sequence_length=30, batch_size=256, epochs=50, patience=5
# good configuration as well

In [None]:
# @title notebook training loop

# Training loop
for epoch in range(num_epochs):

    model.train()
    train_loss = 0

    (hidden, cell) = (torch.zeros(num_layers, batch_size, hidden_dim).to(device),
                      torch.zeros(num_layers, batch_size, hidden_dim).to(device))

    for n in range(num_train_batches):

        batch = train_sequences[n*batch_size:(n+1)*batch_size]
        batch = torch.tensor(batch)
        # batch.shape is (batch_size, sequence_length+1)

        batch = batch.to(device)

        optimizer.zero_grad()                             #set gradients to zero before back prop

        hidden.detach_()
        cell.detach_()

        output, (hidden, cell) = model(batch[:,:-1], hidden, cell)      #forward

        target = batch[:,-1]                              #set target

        loss = criterion(output, target)                  #compute loss

        loss.backward()                                   #backprop
        optimizer.step()                                  #update weights

        train_loss += loss.item()                         #accumulate batch loss

    train_loss = train_loss / num_train_batches           #avg epoch loss

    # Validation loop
    model.eval()
    val_loss = 0

    with torch.no_grad(): #since in validation phase there is no backprop and weight updates

        (hidden, cell) = (torch.zeros(num_layers, batch_size, hidden_dim).to(device),
                          torch.zeros(num_layers, batch_size, hidden_dim).to(device))

        for n in range(num_val_batches):

            batch = val_sequences[n*batch_size:(n+1)*batch_size]
            batch = torch.tensor(batch)

            batch = batch.to(device)

            hidden.detach_()
            cell.detach_()

            output, (hidden, cell) = model(batch[:,:-1], hidden, cell)
            target = batch[:,-1]
            loss = criterion(output, target)
            val_loss += loss.item()

        val_loss /= num_val_batches

        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.3f}, Val Loss: {val_loss:.3f}, Val Perplexity: {math.exp(val_loss):.3f}')

In [None]:
# to replace the one in github
import math
import torch
import torch.optim as optim

def perplexity_neural_model(model, test_sequence, sequence_length, criterion):

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()
    loss = 0

    num_layers = model.lstm.num_layers
    hidden_dim = model.lstm.hidden_size
    test_sequences = [test_sequence[i:i+sequence_length+1] for i in range(0, len(test_sequence)-sequence_length)]

    with torch.no_grad(): #since in validation phase there is no backprop and weight updates

        (hidden, cell) = (torch.zeros(num_layers, 1, hidden_dim).to(device),
                          torch.zeros(num_layers, 1, hidden_dim).to(device))

        for seq in test_sequences:

            seq = torch.tensor(seq)
            seq = seq.view(1, -1)
            seq = seq.to(device)
            output, (hidden, cell) = model(seq[:,:-1], hidden, cell)
            target = seq[:,-1]
            #print(output.shape, target.shape)
            loss = criterion(output, target)
            loss += loss.item()

        loss /= len(test_sequences)

    perplexity = math.exp(loss)

    return perplexity

In [None]:
perplexity_neural_model(model, test_sequence, 30, criterion)

### Pre-Trained Neural Network

### Comparisons & Text generation