In [1]:
import numpy as np
from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from nltk import word_tokenize

#from model_embeddings import ModelEmbeddings
from evaluator import Evaluator
from vocab import Vocab, VocabEntry
from utils import read_corpus, pad_sents, batch_iter

[nltk_data] Downloading package punkt to /Users/yuloucn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Empty accumulators; the next cell extends them with the raw training data.
definitions, unparsed_definition, words = [], [], []
# Training definitions as returned by the project helper read_corpus.
# NOTE(review): src_sents is assigned again from the same file in a later cell
# before it is used — confirm whether this read is still needed.
src_sents = read_corpus('../data/data_train_definitions.txt')

In [3]:
# Load the training pairs: line i of the words file is the target word for
# line i of the definitions file.
# Plain assignment (rather than `+=`) keeps this cell idempotent: re-running it
# no longer appends a second copy of the data to the lists.
with open('../data/data_train_words.txt') as f:
    words = f.read().splitlines()

with open('../data/data_train_definitions.txt') as f:
    unparsed_definition = f.read().splitlines()

# Tokenise after the file has been closed; only the in-memory lines are needed.
definitions = [word_tokenize(a) for a in unparsed_definition]

# Fail with a clear message when the two files are misaligned, instead of an
# opaque IndexError while pairing them up.
if len(words) != len(definitions):
    raise ValueError(
        'words/definitions line count mismatch: %d words vs %d definitions'
        % (len(words), len(definitions))
    )

# Each training example is a (tokenised definition, target word) tuple.
training_data = list(zip(definitions, words))

In [4]:
# Sanity check: there must be exactly one definition per target word.
assert len(definitions) == len(words)

In [5]:
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest
# of the notebook.
eval = Evaluator()
# Presumably loads at most 100000 lines of a GloVe file into a word -> vector
# mapping — confirm in Evaluator.load_glove_embeddings.
glove_dict = eval.load_glove_embeddings(max_line = 100000)

# Build the vocabulary from the training definitions.
# NOTE(review): the positional arguments 1000000 and 0 look like a size cap and
# a frequency cutoff — confirm against VocabEntry.from_corpus.
src_sents = read_corpus('../data/data_train_definitions.txt')
vocab = VocabEntry.from_corpus(src_sents, 1000000, 0)

# Also add every target word to the vocabulary (presumably so that each target
# has an index and therefore an embedding row — verify VocabEntry.add).
for word in words:
    vocab.add(word)
    
print(len(vocab))

number of word types: 23452, number of word types w/ frequency >= 0: 23452
28572


In [6]:
def create_emb_layer(weights_matrix, src_pad_token_idx, non_trainable=False):
    """
    Build an nn.Embedding whose weights are copied from a numpy matrix.

    @param weights_matrix (np.ndarray): 2-D array, one row per token, of shape
        (num_embeddings, embedding_dim)
    @param src_pad_token_idx (int): index handed to nn.Embedding as padding_idx
    @param non_trainable (bool): when true, the weights are frozen
        (requires_grad is switched off)
    @returns (tuple): (embedding layer, num_embeddings, embedding_dim)
    """
    row_count, vector_dim = weights_matrix.shape
    layer = nn.Embedding(row_count, vector_dim, padding_idx=src_pad_token_idx)

    # Replace the freshly initialised weights (padding row included) with the
    # supplied matrix; copy_ converts the values to the layer's dtype.
    pretrained = torch.from_numpy(weights_matrix)
    layer.weight.data.copy_(pretrained)

    if non_trainable:
        layer.weight.requires_grad_(False)

    return layer, row_count, vector_dim

class ModelEmbeddings(nn.Module):
    """
    Class that converts input words to their embeddings.

    Each vocabulary row is filled from the pre-trained vectors when the word is
    present there and with a random normal vector otherwise. The resulting
    embedding layer is frozen.
    """

    def __init__(self, embed_size, vocab, glove_dict):
        """
        Init the Embedding layers.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (VocabEntry)
        @param glove_dict: mapping from word to its pre-trained vector; looking
            up a word that is absent must raise KeyError. Vectors are expected
            to have embed_size entries.
        """
        super(ModelEmbeddings, self).__init__()

        self.embed_size = embed_size

        weights_matrix = np.zeros((len(vocab), self.embed_size))
        words_found = 0
        for word, index in vocab.word2id.items():
            # Only the lookup can legitimately raise KeyError, so only the
            # lookup sits inside the try.
            try:
                pretrained = glove_dict[word]
            except KeyError:
                weights_matrix[index] = np.random.normal(scale=0.6, size=(self.embed_size,))
            else:
                weights_matrix[index] = np.array(pretrained)
                words_found += 1

        # Number of vocabulary words that had a pre-trained vector.
        self.words_found = words_found

        # default values
        src_pad_token_idx = vocab['<pad>']
        emb_layer, num_embeddings, embedding_dim = create_emb_layer(
            weights_matrix, src_pad_token_idx, True)

        # Assigning the layer directly registers it as a submodule, so it
        # follows .to(device) and is saved in state_dict() (which preserves the
        # randomly initialised rows). A layer that lives only inside a tuple is
        # invisible to nn.Module. The weight is frozen, so it receives no
        # gradient updates.
        # NOTE(review): state_dict() gains a `source_embedding.weight` entry;
        # checkpoints saved before this change need load_state_dict(strict=False).
        self.source_embedding = emb_layer

        # Kept for existing callers, which index `source[0]` to get the layer.
        self.source = (emb_layer, num_embeddings, embedding_dim)

In [8]:
class EncoderRNN(nn.Module):
    """
    GRU encoder that maps a padded batch of token indices to one vector per
    sequence: the final GRU hidden state, passed through dropout and a linear
    projection.
    """

    def __init__(self, input_size, hidden_size, vocab, glove_dict):
        """
        @param input_size (int): embedding dimensionality fed to the GRU
        @param hidden_size (int): size of the GRU hidden state and of the projection
        @param vocab (VocabEntry): vocabulary handed to ModelEmbeddings
        @param glove_dict: pre-trained vectors handed to ModelEmbeddings
        """
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = ModelEmbeddings(input_size, vocab, glove_dict)
        self.gru = nn.GRU(input_size, hidden_size)

        self.linear = nn.Linear(self.hidden_size, self.hidden_size, bias = True)

    def forward(self, input_, hidden, lengths , dropout_rate = 0.3):
        """
        Encode a batch of padded sequences.

        @param input_ (Tensor): token indices of shape (seq_len, batch); the GRU
            and pack_padded_sequence are used with their default
            batch_first=False layout
        @param hidden (Tensor): initial hidden state of shape (1, batch, hidden_size)
        @param lengths (List[int]): true length of every sequence; with the
            default pack_padded_sequence settings they must be in decreasing order
        @param dropout_rate (float): dropout probability applied to the final
            hidden state while the module is in training mode
        @returns (tuple): (projected, hidden) where projected has shape
            (batch, 1, hidden_size) and hidden is the raw final GRU state of
            shape (1, batch, hidden_size)
        """
        embedded = self.embedding.source[0](input_)
        embedded = pack_padded_sequence(embedded, lengths)
        _, hidden = self.gru(embedded, hidden)
        # Functional dropout tied to self.training: a Dropout module created
        # inside forward() is always in training mode, which kept dropout
        # active even after model.eval().
        hidden_dropped = F.dropout(hidden.permute(1,0,2), p=dropout_rate, training=self.training)
        projected = self.linear(hidden_dropped)
        return projected, hidden

    def initHidden(self, batch_size, device = None):
        """
        @param batch_size (int): number of sequences in the batch
        @param device: device on which to allocate the tensor
        @returns (Tensor): all-zero initial hidden state of shape (1, batch_size, hidden_size)
        """
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [9]:
# Encoder with 50-dimensional input embeddings and a 50-dimensional hidden state.
model = EncoderRNN(input_size=50, hidden_size=50, vocab=vocab, glove_dict=glove_dict)
# Smooth L1 loss, summed over all elements rather than averaged.
loss_function = nn.SmoothL1Loss(reduction="sum")
# Adam with the learning rate and betas spelled out explicitly.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.001,
    betas=(0.9, 0.999),
)

In [None]:
# #check overfit

# definition_indices = vocab.words2indices(definitions)
# words_in = 0
# words_out = 0

# import timeit
# start = timeit.default_timer()
# losses = []

# batch_size = 256

# for src_sents, tgt_word in batch_iter(training_data, batch_size, True):
#     for i in range(5000):
#         model.zero_grad()
#         x_lengths = [len(sent) for sent in src_sents]
#         x = vocab.to_input_tensor(src_sents, "cpu")
#         init_hidden = model.initHidden(len(src_sents), "cpu")
#         tag_scores = model.forward(x, init_hidden, x_lengths)
#         y_array = model.embedding.source[0](torch.tensor(vocab.words2indices(tgt_word))).double()
#         y_pred = tag_scores[0].squeeze(dim = 1).double()
#         loss = loss_function(y_pred, y_array)
#         loss.backward()
#         optimizer.step() 
#         losses.append(loss)
#         if i % 100 == 0:
#             print(epoch, loss)
#     break
    
# stop = timeit.default_timer()

# print('Time: ', stop - start)

# import matplotlib.pyplot as plt
# print(plt.plot([l.double() for l in losses][:1200]))

# model.zero_grad()
# x_lengths = [len(sent) for sent in src_sents]
# x = vocab.to_input_tensor(src_sents, "cpu")
# init_hidden = model.initHidden(len(src_sents), "cpu")
# tag_scores = model.forward(x, init_hidden, x_lengths)
# y_pred = tag_scores[0].squeeze(dim = 1).double()

# print(y_pred.shape)
# for i in range(len(y_pred)):
#     eval.top_ten_hundred(validate_dict, tgt_word[i], y_pred[i].detach().numpy())

In [None]:
import timeit

import matplotlib.pyplot as plt

# NOTE(review): definition_indices, words_in and words_out are not used in this
# cell; they are kept in case other cells read them.
definition_indices = vocab.words2indices(definitions)
words_in = 0
words_out = 0

start = timeit.default_timer()
losses = []

batch_size = 2048

# Put the model in training mode so train-only layers behave accordingly.
model.train()

for epoch in range(50000):
    for src_sents, tgt_word in batch_iter(training_data, batch_size, True):
        model.zero_grad()
        x_lengths = [len(sent) for sent in src_sents]
        x = vocab.to_input_tensor(src_sents, "cpu")
        init_hidden = model.initHidden(len(src_sents), "cpu")
        # Call the module itself rather than .forward() so that any registered
        # hooks run as well.
        tag_scores = model(x, init_hidden, x_lengths)
        # Regression target: the embedding of each target word.
        y_array = model.embedding.source[0](torch.tensor(vocab.words2indices(tgt_word))).double()
        y_pred = tag_scores[0].squeeze(dim = 1).double()
        loss = loss_function(y_pred, y_array)
        loss.backward()
        optimizer.step()
    # Record the loss of the epoch's last batch. Store a detached tensor so the
    # autograd graph of every recorded loss is not kept alive in `losses`.
    losses.append(loss.detach())
    print(epoch, loss)

stop = timeit.default_timer()

print('Time: ', stop - start)

# Plot plain floats: tensors that still require grad cannot be converted to
# numpy by matplotlib. Only the first 1200 recorded losses are shown.
plt.plot([epoch_loss.item() for epoch_loss in losses[:1200]])
plt.show()

0 tensor(9929.5731, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
1 tensor(9722.5172, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
2 tensor(9432.3986, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
3 tensor(9672.4610, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
4 tensor(9355.1484, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
5 tensor(9280.2919, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
6 tensor(9405.4431, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
7 tensor(9203.3021, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
8 tensor(9418.9172, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
9 tensor(9447.1026, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
10 tensor(9238.8751, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
11 tensor(9284.6304, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
12 tensor(9316.2534, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
13 tensor(9300.5863, dtype=torch.float64, grad_f