Imports needed to run the cells below

In [1]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.nn.init as init
import json
import torch.optim as optim
import torch.nn as nn
import math
from functools import reduce
import sys
import torch.nn.functional as F

# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
!pip3 install https://download.pytorch.org/whl/cu100/torch-1.1.0-cp36-cp36m-linux_x86_64.whl

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth, drive
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive.mount('/content/gdrive')
drive = GoogleDrive(gauth)


[?25l[K     |▎                               | 10kB 19.8MB/s eta 0:00:01[K     |▋                               | 20kB 3.3MB/s eta 0:00:01[K     |█                               | 30kB 4.8MB/s eta 0:00:01[K     |█▎                              | 40kB 3.1MB/s eta 0:00:01[K     |█▋                              | 51kB 3.8MB/s eta 0:00:01[K     |██                              | 61kB 4.5MB/s eta 0:00:01[K     |██▎                             | 71kB 5.2MB/s eta 0:00:01[K     |██▋                             | 81kB 5.8MB/s eta 0:00:01[K     |███                             | 92kB 6.4MB/s eta 0:00:01[K     |███▎                            | 102kB 5.0MB/s eta 0:00:01[K     |███▋                            | 112kB 5.0MB/s eta 0:00:01[K     |████                            | 122kB 5.0MB/s eta 0:00:01[K     |████▎                           | 133kB 5.0MB/s eta 0:00:01[K     |████▋                           | 143kB 5.0MB/s eta 0:00:01[K     |█████                     

The deterministic model implemented for the assignment

In [0]:
class DeterministicLSTM(nn.Module):
    def __init__(self, vocab, nb_layers=1, nb_lstm_units=100, embedding_dim=10, batch_size=64, on_gpu=False, pad_token="<PAD>", unk_token="<UNK>", reduced_vocab=False, dropout=0):
        super(DeterministicLSTM, self).__init__()

        self.vocab = vocab
        self.nb_layers = nb_layers
        self.nb_lstm_units = nb_lstm_units
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.on_gpu = on_gpu
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.reduced_vocab = reduced_vocab
        self.len_vocab = len(self.vocab) 
        self.dropout = dropout

        self.__build_model()
        self.init_weights()

    def __build_model(self):        
        # encoder to word embeddings
        self.encoder = nn.Embedding(
            num_embeddings=self.len_vocab,
            embedding_dim=self.embedding_dim,
            padding_idx=self.vocab[self.pad_token]
        )

        # LSTM
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            num_layers=self.nb_layers,
            batch_first=True,
        )

        # decoder to output space
        self.decoder = nn.Linear(self.nb_lstm_units, self.len_vocab)
        
        self.dropout_layer = nn.Dropout(self.dropout)
        
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)


    def init_hidden(self):
        hidden_a = torch.zeros(self.nb_layers,
                               self.batch_size, self.nb_lstm_units)
        hidden_b = torch.zeros(self.nb_layers,
                               self.batch_size, self.nb_lstm_units)

        if self.on_gpu:
            hidden_a = hidden_a.cuda()
            hidden_b = hidden_b.cuda()

        hidden_a = Variable(hidden_a)
        hidden_b = Variable(hidden_b)

        return (hidden_a, hidden_b)
      
    
    def step(self, x, hidden):
        x = self.encoder(x)
        x = self.dropout_layer(x)
        x, hidden = self.lstm(x, hidden)
        x = self.dropout_layer(x)
        x = self.decoder(x)
        
        return x, hidden
        
    def forward(self, x):
        hidden = self.init_hidden()
        outputs = []
        
        for t in range(x.size(1)):
            previous_x = x[:, t].unsqueeze(-1)
            y_hat, hidden = self.step(previous_x, hidden)
             
            #print(y_hat.size())
            outputs.append(y_hat)
            
        return torch.cat(outputs, dim=1)
        
    
    def loss(self, Y_hat, Y, batch_size, seq_len, vocab_size, eval=False):
        '''tag_pad_token = self.vocab[self.pad_token]
        mask = (Y > tag_pad_token).float()
        nb_tokens = int(torch.sum(mask).data)
        
        Y_hat = Y_hat.view(-1, nb_tokens)'''
        
        Y_hat = Y_hat.permute(0,2,1)
        
        loss = F.cross_entropy(
            Y_hat,
            Y, 
            ignore_index=self.vocab[self.pad_token], 
            reduction="none"
        )
        
        #print(loss)
        
        loss = loss.sum()
        
        
        if eval:
            loss = loss.sum()
        else:
            loss = loss.mean()

        return loss
    '''

    def loss(self, Y_hat, Y):
        # TRICK 3 ********************************
        # before we calculate the negative log likelihood, we need to mask out the activations
        # this means we don't want to take into account padded items in the output vector
        # simplest way to think about this is to flatten ALL sequences into a REALLY long sequence
        # and calculate the loss on that.

        # flatten all the labels
        Y = Y.view(-1)
        print(Y)
        # flatten all predictions
        Y_hat = Y_hat.view(-1, self.len_vocab)
        
        print(Y_hat)
        print("--")

        # create a mask by filtering out all tokens that ARE NOT the padding token NOR the UNK token for reduced_vocab
        mask = None
        
        #if not self.reduced_vocab:
        tag_pad_token = self.vocab[self.pad_token]
        mask = (Y > tag_pad_token).float()
        
        #else:
        #    tag_unk_token = self.vocab[self.unk_token]
        #    mask = (Y > tag_unk_token).float()

        # count how many tokens we have
        # nb_tokens = int(torch.sum(mask).data[0])
        nb_tokens = int(torch.sum(mask).data)

        
        
        # pick the values for the label and zero out the rest with the mask
        Y_hat = Y_hat[range(Y_hat.shape[0]), Y] * mask


        # compute cross entropy loss which ignores all <PAD> tokens
        ce_loss = -torch.sum(Y_hat) / nb_tokens

        return ce_loss
    '''


Functions implemented to load and manipulate data, as well as wrapper functions for operations such as training, evaluating, and other things

In [0]:
def load_data():
    global ONLY_ALLOW_FREQUENT_WORDS
    global UNKNOWN
    global PAD
    global SOS
    global EOS
    global DROP_LONG_SENTENCES
    global USE_HALF_SETS
    
    #file_ids are in the order train, valid, test
    #file_ids = ["1jGgW9oyEWuKgMt32_s9BnSJt3A5CR7Z8", "1zwVW6-HA3KxyDuJIXuTK1OvKeoXiPHqb", "1f-rz6KNWUPO5ToHeLewlHezDNUQdNnQg"]
    file_ids = ["1tCivrO7xa9PzroVUw8s92nI7LtW6TOB5", "1zwVW6-HA3KxyDuJIXuTK1OvKeoXiPHqb"]
    
    data = []
    vocab = [PAD, UNKNOWN, SOS, EOS]
    word_frequencies = {}


    for file_id in file_ids:
        sentences = json.loads(drive.CreateFile({'id': file_id}).GetContentString())
        
        if USE_HALF_SETS:
            sentences = sentences[:len(sentences)//2]

        if DROP_LONG_SENTENCES:
            sentences = list(filter(lambda s: len(s) < 50, sentences))
            
        for sentence in sentences:
            for word in sentence:
                if word not in word_frequencies:
                    word_frequencies[word] = 0

                word_frequencies[word] += 1

        data.append(sentences)
        
    if not ONLY_ALLOW_FREQUENT_WORDS:
        vocab += word_frequencies.keys()
    
    else:
        updated_data = []
        
        for dataset in data:
            new_sentences = []

            for sentence in dataset:
                new_sentences.append([w if word_frequencies[w] > 1 else UNKNOWN for w in sentence])

            updated_data.append(new_sentences)
        
        data = updated_data
        
        vocab += list(filter(lambda w: word_frequencies[w] > 1, word_frequencies.keys()))
    
    dict_vocab = {}
    
    for i in range(len(vocab)):
        dict_vocab[vocab[i]] = i
    
    data.append(dict_vocab)
    data.append(vocab)

    return data

          
def get_indexed_vocab(vocab):
    indexed_vocab = {}
    counter = 0

    for w in vocab:
        indexed_vocab[w] = counter
        counter += 1

    return indexed_vocab


def get_longest_sentence(*datasets):
    longest_sentence = 0

    for ds in datasets:
        candidate = len(max(ds, key=len))

        if candidate > longest_sentence:
            longest_sentence = candidate

    return longest_sentence


def get_minibatches(dataset):
    global MINIBATCH_SIZE
    len_d = len(dataset)

    cutoff = len_d % MINIBATCH_SIZE
    cut_dataset = dataset[: len_d - cutoff]

    return [cut_dataset[i * MINIBATCH_SIZE: (i + 1) * MINIBATCH_SIZE] for i in range(len_d//MINIBATCH_SIZE)]


def format_minibatches(minibatches, vocab):
    global PAD
    global SOS
    global EOS
    global DEVICE

    pad_token = vocab[PAD]
    formatted_minibatches = []

    for minibatch in minibatches:
        x = []
        y = []
        x_lengths = []
        longest_sentence = len(max(minibatch, key=len))

        for sentence in minibatch:
            s_len = len(sentence)
            x_lengths.append(s_len)
            v_s = [vocab[w] for w in sentence]

            ind_s_x = [vocab[SOS]] + v_s
            pad_s_x = np.ones((longest_sentence)) * pad_token
            pad_s_x[0:s_len] = np.array(ind_s_x[:s_len])

            x.append(pad_s_x)

            ind_s_y = v_s + [vocab[SOS]]
            pad_s_y = np.ones((longest_sentence)) * pad_token
            pad_s_y[0:s_len] = np.array(ind_s_y[:s_len])

            y.append(pad_s_y)

        # have to sort for later use by padded_sequence
        sorted_indexes = sorted(range(len(x_lengths)),
                                key=x_lengths.__getitem__)
        sorted_indexes.reverse()

        x = torch.LongTensor([x[i] for i in sorted_indexes]).to(DEVICE)
        y = torch.LongTensor([y[i] for i in sorted_indexes]).to(DEVICE)
        x_lengths = torch.LongTensor([x_lengths[i] for i in sorted_indexes]).to(DEVICE)

        formatted_minibatches.append([x, y, x_lengths, longest_sentence])

    return formatted_minibatches

  
def create_categorical_distribution(*datasets):
    counter = 0
    cat_dist = {}

    for dataset in datasets:
        for sentence in dataset:
            for word in sentence:
                if word not in cat_dist:
                    cat_dist[word] = 0

                cat_dist[word] += 1
                counter += 1

    for key in cat_dist:
        cat_dist[key] /= counter

    return cat_dist
  

def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
      
      
def half_learning_rate(optimizer):
    global LEARNING_RATE

    LEARNING_RATE /= 2

    for param_group in optimizer.param_groups:
        param_group['lr'] = LEARNING_RATE
        
        
def save_model(model, name):
    global GOOGLE_SCHOLAR
    path = ""

    if GOOGLE_SCHOLAR:
        path = F"/content/gdrive/My Drive/nlp2/{name}.pt" 
    
    else:
        path = F"./data/{name}.pt"

    torch.save(model.state_dict(), path)

def load_model(model, name):
    global GOOGLE_SCHOLAR
    path = ""

    if GOOGLE_SCHOLAR:
        path = F"/content/gdrive/My Drive/nlp2/{name}.pt" 
    
    else:
        path = F"./data/{name}.pt"

    model.load_state_dict(torch.load(path))
  
    
    
def train_model(model, optimizer, train_data, vocab_size):
    global LEARNING_RATE
    global CLIP_VALUE
    global MINIBATCH_SIZE
    
    model.train()
    total_train_loss = 0

    for train_batch in train_data:
        model.zero_grad()

        x, y, x_lengths, seq_len = train_batch
        y_hat = model(x)
        
        #print(x.size())
        #print(y_hat.size())
        #print(y.size())
        loss = model.loss(y_hat, y, MINIBATCH_SIZE, seq_len, vocab_size)
        loss.backward(retain_graph=True)
        optimizer.step()
        
        total_train_loss += MINIBATCH_SIZE * loss.item()
        
        # not sure about good clip values, using value from word language model
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_VALUE)
        
        for p in model.parameters():
            p.data.add_(-LEARNING_RATE, p.grad.data)
    
    print(F"training loss: {total_train_loss}")
    
    return total_train_loss
    
def evaluate_model(model, valid_data, vocab_size):
    global MINIBATCH_SIZE
    
    model.eval()

    total_valid_loss = 0
    total_sentences_lengths = 0
    
    with torch.no_grad():
      for valid_batch in valid_data:
          model.zero_grad()

          x, y, x_lengths, seq_len = valid_batch
          y_hat = model(x)

          batch_loss = model.loss(y_hat, y, MINIBATCH_SIZE, seq_len, vocab_size)
          total_valid_loss += batch_loss.item()
          
          total_sentences_lengths += reduce(lambda a,b: a+b, x_lengths.tolist())
    
    total_valid_loss = total_valid_loss / total_sentences_lengths
    
    print(F"validation loss: {total_valid_loss}")

    
    ppl = 0
    
    try:
      ppl = math.exp(total_valid_loss)
    except OverflowError:
      ppl = math.inf
      
    print(F"PPL: {ppl}")
      

    return total_valid_loss, ppl
  
  
def should_keep_training(valid_loss):
    global BEST_VALID_LOSS
    global VALID_LOSS_DECREASED_LAST_EPOCH
    
    save_model = True
    keep_training = True
    
    
    if not BEST_VALID_LOSS:
        print("nulth")
        BEST_VALID_LOSS = valid_loss
        
    else:
        if valid_loss < BEST_VALID_LOSS:
            VALID_LOSS_DECREASED_LAST_EPOCH = True
            BEST_VALID_LOSS = valid_loss
            
        else:
            save_model = False 
         
            if VALID_LOSS_DECREASED_LAST_EPOCH:
                print("first")
                half_learning_rate(optimizer)
                VALID_LOSS_DECREASED_LAST_EPOCH = False
            
            else:
                print("second")
                keep_training = False
            
    return save_model, keep_training
  
  
def sample(model, vocab, ids2words, sentence_len=25):
    global MINIBATCH_SIZE
    global DEVICE
    
    predictions = []
    
    model.eval()
    
    with torch.no_grad():
        previous_x = torch.zeros(MINIBATCH_SIZE, 1, dtype=torch.long).fill_(vocab[SOS]).to(DEVICE)
        hidden = model.init_hidden()
        
        for t in range(sentence_len):
            scores, hidden = model.step(previous_x, hidden)
            
            p = torch.argmax(scores, dim=-1)
            predictions.append(p)
            
            prev_x = p.view(MINIBATCH_SIZE, 1)
    
    
    predictions = torch.cat(predictions, dim=1).tolist()
    
    for i in range(10):
        prediction = predictions[i]
        prediction_str = ' '.join([ids2words[i] for i in prediction])

        print(prediction_str)

Initialisation of globals and constants, loading and manipulating the data, instantiating the model and training it

In [10]:
#globals and constants
SOS = "<SOS>"
EOS = "<EOS>"
PAD = "<PAD>"
UNKNOWN = "<UNK>"
MINIBATCH_SIZE = 64
NUM_EPOCHS = 20
LEARNING_RATE = 0.001
EMBEDDING_DIM = 256
NUM_LSTM_UNITS = 256
NUM_LSTM_LAYERS = 1
DROPOUT_FACTOR = 0.75
CLIP_VALUE = 0.25
ON_GPU = True
GOOGLE_SCHOLAR = True
BEST_VALID_LOSS = False
VALID_LOSS_DECREASED_LAST_EPOCH = True
KEEP_TRAINING = True
DEVICE = torch.device("cuda" if ON_GPU else "cpu")
ONLY_ALLOW_FREQUENT_WORDS = True
DROP_LONG_SENTENCES = True
USE_HALF_SETS = False
SAMPLE = False


#loading data and vocab as words
#train_data, valid_data, test_data, vocab = load_data() 
train_data, valid_data, vocab, ids2words = load_data()
len_vocab = len(vocab)

train_minibatches = format_minibatches(get_minibatches(train_data), vocab)
valid_minibatches = format_minibatches(get_minibatches(valid_data), vocab)

model = DeterministicLSTM(
    vocab,
    NUM_LSTM_LAYERS,
    NUM_LSTM_UNITS,
    EMBEDDING_DIM,
    MINIBATCH_SIZE,
    ON_GPU,
    PAD,
    UNKNOWN,
    ONLY_ALLOW_FREQUENT_WORDS,
    DROPOUT_FACTOR
).to(DEVICE)

optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)


training_scores = []
validation_scores = []
ppl_scores = []
do_keep_training = True


for epoch in range(NUM_EPOCHS):
    print(F"epoch: {epoch}")
    
    total_train_loss = train_model(model, optimizer, train_minibatches, len_vocab)
    total_valid_loss, ppl = evaluate_model(model, valid_minibatches, len_vocab)
    
    training_scores.append(total_train_loss)
    validation_scores.append(total_valid_loss)
    ppl_scores.append(ppl)
    
    
    if SAMPLE:
        sample(model, vocab, ids2words, 10)
    
    do_save_model, do_keep_training = should_keep_training(total_valid_loss)
    
    print("\n")
    
    if do_save_model:
        save_model(model, "model_full")
    
    if not do_keep_training:
        break
        

print("Training done..")
print("Training scores")
print(training_scores)
print("Validation scores")
print(validation_scores)
print("Ppl scores")
print(ppl_scores)



epoch: 0
training loss: 423535989.1875
validation loss: 6.921768787842443
PPL: 1014.1121583454421
The Mr. , , , , , , , ,
The Mr. , , , , , , , ,
The Mr. , , , , , , , ,
The Mr. , , , , , , , ,
The Mr. , , , , , , , ,
The Mr. , , , , , , , ,
The Mr. , , , , , , , ,
The Mr. , , , , , , , ,
The Mr. , , , , , , , ,
The Mr. , , , , , , , ,
nulth


epoch: 1
training loss: 395124812.125
validation loss: 6.624879571077621
PPL: 753.6134502337645
The The , , , , , , , ,
The The , , , , , , , ,
The The , , , , , , , ,
The The , , , , , , , ,
The The , , , , , , , ,
The The , , , , , , , ,
The The , , , , , , , ,
The The , , , , , , , ,
The The , , , , , , , ,
The The , , , , , , , ,


epoch: 2
training loss: 382958098.65625
validation loss: 6.419578914935151
PPL: 613.744621012037
The The Mr. said said said said said said said
The The Mr. said said said said said said said
The The Mr. said said said said said said said
The The Mr. said said said said said said said
The The Mr. said said said said

KeyboardInterrupt: ignored