<a href="https://colab.research.google.com/github/vggls/language_models/blob/main/notebooks/LSTM_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Imports

In [None]:
import pickle
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import math
#import string #string.punctuation contains punctuation symbols

In [None]:
# Run this cell on Google Colab as well: it downloads the NLTK 'treebank'
# corpus package that the following `from nltk.corpus import treebank` cell uses.
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [None]:
from nltk.corpus import treebank

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
#custom written code
from preprocessing import lower, add_unk_tokens_for_training, unk_for_reduced_vocab, replace_with_unk_for_testing, tokens_to_indices
from training import Train
from lstm_model import LSTMModel
from perplexity_neural import perplexity_neural_model

#### Dataset and some preprocessing

In [None]:
# Penn Treebank

def load_treebank(left_limit, right_limit):
    """Load and lightly clean Penn Treebank sentences from a range of corpus files.

    Files are addressed by their position in ``treebank.fileids()``.

    Args:
        left_limit: position of the first file to read (inclusive).
        right_limit: position one past the last file to read (exclusive).

    Returns:
        A list of sentences, each a list of token strings that ends with
        '<eos>'. Tokens containing '*', tokens containing a backslash
        immediately followed by '/', and the bracket placeholders
        (-LRB-, -RRB-, -LSB-, -RSB-, -LCB-, -RCB-) are dropped.

    Raises:
        IndexError: if a position in the range is outside ``treebank.fileids()``.
    """
    symbols_to_remove = {'-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-'}  # bracket placeholders
    eos_token = ['<eos>']

    # Backslash followed by '/'. Written with an explicit escaped backslash:
    # the former literal '\/' had the same value but relied on an invalid escape
    # sequence, which newer Python versions warn about.
    escaped_slash = '\\/'

    tokenized_sentences = []
    for file_position in range(left_limit, right_limit):
        for sentence in treebank.sents(treebank.fileids()[file_position]):
            kept_tokens = [
                token
                for token in sentence
                if '*' not in token
                and escaped_slash not in token
                and token not in symbols_to_remove
            ]
            tokenized_sentences.append(kept_tokens + eos_token)

    return tokenized_sentences

In [None]:
# Train / validation / test splits, given as (first file, stop file) positions
# passed to load_treebank.
train_treebank, val_treebank, test_treebank = (
    load_treebank(first_file, stop_file)
    for first_file, stop_file in ((0, 150), (150, 175), (175, 199))
)

# number of sentences in each split
tuple(len(split) for split in (train_treebank, val_treebank, test_treebank))

(3262, 314, 338)

In [None]:
# Apply `lower` (from preprocessing.py) to each split in turn; per the original
# note it lowers the first letter of each token.
lower_train_treebank, lower_val_treebank, lower_test_treebank = map(
    lower, (train_treebank, val_treebank, test_treebank)
)

In [None]:
# insert <unk> token to training data for case I model
# (only the training split is processed here; validation and test data are mapped
# against the resulting vocabulary in a later cell)
train_sentences_I = add_unk_tokens_for_training(lower_train_treebank) #replace all tokens that appear less than 3 times with <unk>

In [None]:
# case I vocabulary: the distinct tokens of the <unk>-processed training sentences
vocabulary_I = {token for sentence in train_sentences_I for token in sentence}

# size, plus a check that both special tokens are present
len(vocabulary_I), '<unk>' in vocabulary_I, '<eos>' in vocabulary_I

(3259, True, True)

In [None]:
#model architecture hyperparams - for both cases
# All four values are passed positionally to LSTMModel in case I and case II;
# how each one is used is defined in lstm_model.py.
embedding_dim = 300  # case II fills the embedding matrix from glove.6B.300d.txt, so this must match that file
num_layers = 2
hidden_dim = 256
dropout_rate = 0.3

#### case I - model with learnable embeddings: further preprocessing, model training and perplexity (all variable names end in 'I')

In [None]:
# Build the token <-> index mappings for case I.
# The vocabulary is sorted before enumerating: iterating a set of strings directly
# yields a different order in each Python process (string hash randomization), so
# the indices would not match mappings or models saved by an earlier run.
word_to_index_I = {word: idx for idx, word in enumerate(sorted(vocabulary_I))}
index_to_word_I = {idx: word for word, idx in word_to_index_I.items()}

# Persist both mappings as a two-element list: [word_to_index, index_to_word].
with open('lstm_caseI_word_index_mappings.pickle', 'wb') as f:
    pickle.dump([word_to_index_I, index_to_word_I], f)

In [None]:
# Convert each split to indices using the case I mapping.
# Judging by the lengths displayed below, each result is one flat sequence
# covering the whole split rather than one list per sentence.

# training sequence of indices
train_int_sequence_I = tokens_to_indices(word_to_index_I, train_sentences_I)

# validation sequence of indices
# (tokens are first mapped against vocabulary_I by replace_with_unk_for_testing;
# presumably out-of-vocabulary tokens become '<unk>' - see preprocessing.py)
val_sentences_I = replace_with_unk_for_testing(vocabulary_I, lower_val_treebank)
val_int_sequence_I = tokens_to_indices(word_to_index_I, val_sentences_I)

# testing sequence of indices
test_sentences_I = replace_with_unk_for_testing(vocabulary_I, lower_test_treebank)
test_int_sequence_I = tokens_to_indices(word_to_index_I, test_sentences_I)

len(train_int_sequence_I), len(val_int_sequence_I), len(test_int_sequence_I)

(82372, 8003, 8319)

In [None]:
# number of distinct indices that occur in each split
tuple(
    len(set(sequence))
    for sequence in (train_int_sequence_I, val_int_sequence_I, test_int_sequence_I)
)

(3259, 1165, 1272)

In [None]:
# Create the model, loss function, and optimizer
vocabI_size = len(vocabulary_I)
# The last argument is None here; case II passes a pre-built GloVe embedding matrix
# in that position instead.
model = LSTMModel(vocabI_size, embedding_dim, hidden_dim, num_layers, dropout_rate, None)
# count only parameters that receive gradients
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'No. of trainable parameters: {num_params}')

#model training hyperparams
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# (a StepLR scheduler was tried and is left disabled; Train is given scheduler=None)
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

No. of trainable parameters: 2912991


In [None]:
# Set up the case I training run (Train comes from training.py).
# Trains on the training sequence and monitors the validation sequence.
# NOTE(review): the saved output below reports "/50" epochs while epochs=30 here,
# so that output comes from an earlier configuration - re-run to refresh it.
# NOTE(review): patience is presumably the early-stopping patience (the log marks
# "E.S. checkpoint" epochs) - confirm in training.py.
instance = Train(model=model,
                model_type = 'lstm',
                loss_fct=criterion,
                optimizer=optimizer,
                scheduler=None,
                train_sequence=train_int_sequence_I,
                val_sequence=val_int_sequence_I,
                sequence_length=50,
                batch_size=128,
                epochs=30,
                patience=10,
                name='lstm_with_learnable_embeddings')

Device: cuda:0


In [None]:
# Run training. With validation data the call returns three values, unpacked here;
# their exact contents are defined by Train.training in training.py.
train_loss, val_loss, checkpoints = instance.training()

Starting training..
Epoch: 1/50 - Perplexity: training 310.144, validation 133.036
Epoch: 2/50 - Perplexity: training 178.727, validation 105.156 - E.S. checkpoint
Epoch: 3/50 - Perplexity: training 135.101, validation 93.555 - E.S. checkpoint
Epoch: 4/50 - Perplexity: training 106.588, validation 86.646 - E.S. checkpoint
Epoch: 5/50 - Perplexity: training 85.351, validation 82.556 - E.S. checkpoint
Epoch: 6/50 - Perplexity: training 68.566, validation 81.623 - E.S. checkpoint
Epoch: 7/50 - Perplexity: training 55.260, validation 83.485
Epoch: 8/50 - Perplexity: training 45.303, validation 87.138
Epoch: 9/50 - Perplexity: training 37.053, validation 93.132
Epoch: 10/50 - Perplexity: training 30.764, validation 100.262
Epoch: 11/50 - Perplexity: training 25.695, validation 111.074
Epoch: 12/50 - Perplexity: training 21.778, validation 123.306
Epoch: 13/50 - Perplexity: training 18.626, validation 140.248
Epoch: 14/50 - Perplexity: training 15.982, validation 155.976
Epoch: 15/50 - Perpl

In [None]:
# Continue training the same `model` with the same `optimizer`, now on the
# validation sequence only: no validation set, no patience, no checkpoint name.
# epochs=6 matches the last "E.S. checkpoint" epoch in the saved log of the run above.
# NOTE(review): `model` holds whatever weights the previous training() call left in
# it. Unless Train restores the best checkpoint, these are the last-epoch weights,
# not the epoch-6 ones - confirm in training.py.
instance = Train(model=model,
                model_type = 'lstm',
                loss_fct=criterion,
                optimizer=optimizer,
                scheduler=None,
                train_sequence=val_int_sequence_I,
                val_sequence=None,
                sequence_length=50,
                batch_size=128,
                epochs=6,
                patience=None,
                name=None)

# without validation data the result is bound to a single name
train_loss_of_val_data = instance.training()

Device: cuda:0
Starting training..
No validation data is used.
Epoch: 1/6 - Perplexity: training 159.130
Epoch: 2/6 - Perplexity: training 52.802
Epoch: 3/6 - Perplexity: training 30.531
Epoch: 4/6 - Perplexity: training 20.326
Epoch: 5/6 - Perplexity: training 14.521
Epoch: 6/6 - Perplexity: training 10.756
Training complete !


In [None]:
# Save the whole model object (not only its state_dict); loading this file later
# requires the LSTMModel class to be importable.
torch.save(model, 'model_epoch6_lstm_with_learnable_embeddings.pth')

In [None]:
# Test-set perplexity of the case I model.
# The function is called by the name the import cell binds
# (`from perplexity_neural import perplexity_neural_model`); the earlier spelling
# `perplexity_network_model` is not defined in this notebook and raises NameError
# on a fresh kernel.
# NOTE(review): the keyword names are carried over unchanged - confirm they match
# the current signature in perplexity_neural.py.
perplexity_neural_model(test_sequence_of_integers = test_int_sequence_I,
                        sequence_length = 50,
                        model = model,
                        model_type = 'lstm',
                        loss_fct = nn.CrossEntropyLoss(),
                        vocab_size = len(vocabulary_I))

248.95954826843968

#### case II - model with pre-trained GloVe embeddings: further preprocessing, model training and perplexity (all variable names end in 'II')

In [None]:
# Colab only: mount Google Drive at /content/drive. The GloVe file is read from a
# folder under this mount point in the next code cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load GloVe 300-dim embeddings into word_embeddings dictionary of (word,vector) pairs
import os

glove_dir ='/content/drive/MyDrive/Colab_Notebooks/GitHub_language_models_repo'

word_embeddings = {} # dictionary with (word, embedding) items

# Each line holds a token followed by its vector components, whitespace-separated.
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale encoding (the GloVe text files are UTF-8).
with open(os.path.join(glove_dir, 'glove.6B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        embedding = torch.tensor([float(val) for val in values[1:]])
        word_embeddings[word] = embedding

In [None]:
# Which tokens of vocabulary_I have no GloVe vector?
intersection = vocabulary_I.intersection(word_embeddings)  # iterating the dict yields its keys
words_not_in_glove = vocabulary_I.difference(intersection)
len(words_not_in_glove) #'<unk>', '<eos>' included

36

In [None]:
# Handling of words_not_in_glove: they are folded into the '<unk>' token, which
# yields a reduced vocabulary. The two special tokens are put back explicitly.
vocabulary_II = (vocabulary_I - words_not_in_glove) | {'<unk>', '<eos>'}

# sanity check on the reduced size: 3225 = 3259 - 36 + 2
assert len(vocabulary_II) + len(words_not_in_glove) == len(vocabulary_I) + 2

In [None]:
# Starting from the case I training sentences (already <unk>-processed), also
# replace with <unk> the tokens that are not included in vocabulary_II.
train_sentences_II = unk_for_reduced_vocab(train_sentences_I, vocabulary_II)

In [None]:
# Build the token <-> index mappings for case II.
# The vocabulary is sorted before enumerating: iterating a set of strings directly
# yields a different order in each Python process (string hash randomization), so
# the indices would not match mappings or models saved by an earlier run.
word_to_index_II = {word: idx for idx, word in enumerate(sorted(vocabulary_II))}
index_to_word_II = {idx: word for word, idx in word_to_index_II.items()}

# Persist both mappings as a two-element list: [word_to_index, index_to_word].
with open('lstm_caseII_word_index_mappings.pickle', 'wb') as f:
    pickle.dump([word_to_index_II, index_to_word_II], f)

In [None]:
# Convert each split to indices using the case II (reduced vocabulary) mapping.

# training sequence of indices
train_int_sequence_II = tokens_to_indices(word_to_index_II, train_sentences_II)

# validation sequence of indices
# (validation/test tokens are mapped against vocabulary_II, starting again from the
# lowercased raw splits rather than from the case I sentences)
val_sentences_II = replace_with_unk_for_testing(vocabulary_II, lower_val_treebank)
val_int_sequence_II = tokens_to_indices(word_to_index_II, val_sentences_II)

# testing sequence of indices
test_sentences_II = replace_with_unk_for_testing(vocabulary_II, lower_test_treebank)
test_int_sequence_II = tokens_to_indices(word_to_index_II, test_sentences_II)

len(train_int_sequence_II), len(val_int_sequence_II), len(test_int_sequence_II)

(82372, 8003, 8319)

In [None]:
# number of distinct indices that occur in each split
tuple(
    len(set(sequence))
    for sequence in (train_int_sequence_II, val_int_sequence_II, test_int_sequence_II)
)

(3225, 1160, 1269)

In [None]:
# Largest and smallest single component over all GloVe vectors
# (used later as the sampling range for the '<unk>' embedding).

tensors = [*word_embeddings.values()]

# one row per GloVe word
tensors_tensor = torch.stack(tensors)

max_value = tensors_tensor.max()
min_value = tensors_tensor.min()

max_value.item(), min_value.item()

(3.25819993019104, -3.0638999938964844)

In [None]:
# Create the embedding layer weights: one row per vocabulary_II token, in
# word_to_index_II order.

vocabII_size = len(vocabulary_II)
embeddings = torch.zeros(vocabII_size, embedding_dim)

# Copy the GloVe vector of every regular token into its row. The two special
# tokens have no GloVe entry and are filled in separately below.
special_tokens = {'<unk>', '<eos>'}
for word, index in word_to_index_II.items():
    if word not in special_tokens:
        embeddings[index] = word_embeddings[word]

# '<eos>': the mean of all GloVe vectors. `tensors_tensor` (built in the cell that
# computes max_value / min_value) already holds every GloVe vector stacked row-wise,
# so it is reused here instead of stacking the whole table a second time.
eos_index = word_to_index_II['<eos>']
embeddings[eos_index] = torch.mean(tensors_tensor, dim=0)

# '<unk>': a random vector drawn uniformly from [min_value, max_value), the range
# of components observed in GloVe.
unk_index = word_to_index_II['<unk>']
embeddings[unk_index] = (max_value.item() - min_value.item()) * torch.rand(embedding_dim) + min_value.item()

In [None]:
# Create the model, loss function, and optimizer
# Unlike case I, the last argument is the pre-built GloVe embedding matrix.
# NOTE(review): the trainable-parameter count printed below is lower than in
# case I, which suggests the embedding layer is frozen - confirm in lstm_model.py.
model = LSTMModel(vocabII_size, embedding_dim, hidden_dim, num_layers, dropout_rate, embeddings)
# count only parameters that receive gradients
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'No. of trainable parameters: {num_params}')

#model training hyperparams
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

No. of trainable parameters: 1926553


In [None]:
# Set up the case II training run with the same settings as case I, but on the
# case II index sequences and under a different checkpoint name.
# NOTE(review): the saved output below reports "/50" epochs while epochs=30 here,
# so that output comes from an earlier configuration - re-run to refresh it.
instance = Train(model=model,
                model_type = 'lstm',
                loss_fct=criterion,
                optimizer=optimizer,
                scheduler=None,
                train_sequence=train_int_sequence_II,
                val_sequence=val_int_sequence_II,
                sequence_length=50,
                batch_size=128,
                epochs=30,
                patience=10,
                name='lstm_with_glove_embeddings')

Device: cuda:0


In [None]:
# Run training. With validation data the call returns three values, unpacked here;
# this rebinds the names used for the case I results.
train_loss, val_loss, checkpoints = instance.training()

Starting training..
Epoch: 1/50 - Perplexity: training 347.218, validation 153.147
Epoch: 2/50 - Perplexity: training 205.620, validation 119.665 - E.S. checkpoint
Epoch: 3/50 - Perplexity: training 158.176, validation 99.479 - E.S. checkpoint
Epoch: 4/50 - Perplexity: training 128.392, validation 88.739 - E.S. checkpoint
Epoch: 5/50 - Perplexity: training 107.411, validation 82.375 - E.S. checkpoint
Epoch: 6/50 - Perplexity: training 90.900, validation 77.508 - E.S. checkpoint
Epoch: 7/50 - Perplexity: training 78.735, validation 74.539 - E.S. checkpoint
Epoch: 8/50 - Perplexity: training 67.797, validation 73.111 - E.S. checkpoint
Epoch: 9/50 - Perplexity: training 59.600, validation 73.099 - E.S. checkpoint
Epoch: 10/50 - Perplexity: training 52.220, validation 72.949 - E.S. checkpoint
Epoch: 11/50 - Perplexity: training 46.165, validation 73.987
Epoch: 12/50 - Perplexity: training 40.645, validation 75.250
Epoch: 13/50 - Perplexity: training 36.464, validation 76.762
Epoch: 14/50 -

In [None]:
# Continue training the same `model` with the same `optimizer`, now on the
# validation sequence only: no validation set, no patience, no checkpoint name.
# epochs=10 matches the last "E.S. checkpoint" epoch in the saved log of the run above.
# NOTE(review): `model` holds whatever weights the previous training() call left in
# it. Unless Train restores the best checkpoint, these are the last-epoch weights,
# not the epoch-10 ones - confirm in training.py.
instance = Train(model=model,
                model_type = 'lstm',
                loss_fct=criterion,
                optimizer=optimizer,
                scheduler=None,
                train_sequence=val_int_sequence_II,
                val_sequence=None,
                sequence_length=50,
                batch_size=128,
                epochs=10,
                patience=None,
                name=None)

# without validation data the result is bound to a single name
train_loss_of_val_data = instance.training()

Device: cuda:0
Starting training..
No validation data is used.
Epoch: 1/10 - Perplexity: training 104.723
Epoch: 2/10 - Perplexity: training 55.702
Epoch: 3/10 - Perplexity: training 35.913
Epoch: 4/10 - Perplexity: training 25.857
Epoch: 5/10 - Perplexity: training 20.086
Epoch: 6/10 - Perplexity: training 15.860
Epoch: 7/10 - Perplexity: training 12.778
Epoch: 8/10 - Perplexity: training 10.504
Epoch: 9/10 - Perplexity: training 8.705
Epoch: 10/10 - Perplexity: training 7.483
Training complete !


In [None]:
# Save the whole model object (not only its state_dict); loading this file later
# requires the LSTMModel class to be importable.
torch.save(model, 'model_epoch10_lstm_with_glove_embeddings.pth')

In [None]:
# Test-set perplexity of the case II model.
# The function is called by the name the import cell binds
# (`from perplexity_neural import perplexity_neural_model`); the earlier spelling
# `perplexity_network_model` is not defined in this notebook and raises NameError
# on a fresh kernel.
# NOTE(review): the keyword names are carried over unchanged - confirm they match
# the current signature in perplexity_neural.py.
perplexity_neural_model(test_sequence_of_integers = test_int_sequence_II,
                        sequence_length = 50,
                        model = model,
                        model_type = 'lstm',
                        loss_fct = nn.CrossEntropyLoss(),
                        vocab_size = len(vocabulary_II))

195.7221341660012