In [1]:
import numpy as np
from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from nltk import word_tokenize

#from model_embeddings import ModelEmbeddings
from evaluator import Evaluator
from vocab import Vocab, VocabEntry
from utils import read_corpus, pad_sents, batch_iter

[nltk_data] Downloading package punkt to /Users/yuloucn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
definitions = []
unparsed_definition = []
words = []
src_sents = read_corpus('../data/data_train_definitions.txt')

In [3]:
with open('../data/data_train_words.txt') as f:
    words += f.read().splitlines()
    
with open('../data/data_train_definitions.txt') as f:
    unparsed_definition += f.read().splitlines()
    definitions += [word_tokenize(a) for a in unparsed_definition]
    
training_data = [(definitions[i], words[i]) for i in range(len(words))]

In [4]:
assert(len(words) == len(definitions))

In [5]:
eval = Evaluator()
glove_dict = eval.load_glove_embeddings(max_line = 100000)

src_sents = read_corpus('../data/data_train_definitions.txt')
vocab = VocabEntry.from_corpus(src_sents, 1000000, 0)

for word in words:
    vocab.add(word)
    
print(len(vocab))

number of word types: 23452, number of word types w/ frequency >= 0: 23452
28572


In [6]:
def create_emb_layer(weights_matrix, src_pad_token_idx, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim, src_pad_token_idx)
    emb_layer.weight.data.copy_(torch.from_numpy(weights_matrix)) #figure out what is here
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer, num_embeddings, embedding_dim

class ModelEmbeddings(nn.Module): 
    """
    Class that converts input words to their embeddings.
    """

    def __init__(self, embed_size, vocab, glove_dict):
        """
        Init the Embedding layers.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (VocabEntry)
        """
        super(ModelEmbeddings, self).__init__()

        self.embed_size = embed_size

        matrix_len = len(vocab)
        weights_matrix = np.zeros((matrix_len, self.embed_size))
        words_found = 0
        #print(len(vocab), weights_matrix.shape)
        for word, index in vocab.word2id.items():
            try:
                weights_matrix[index] = np.array(glove_dict[word])
                words_found += 1
            except KeyError:
                weights_matrix[index] = np.random.normal(scale=0.6, size=(self.embed_size,))

        # default values
        src_pad_token_idx = vocab['<pad>']
        self.source = create_emb_layer(weights_matrix, src_pad_token_idx, True)

In [28]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, vocab, glove_dict):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = ModelEmbeddings(input_size, vocab, glove_dict)
        self.gru = nn.GRU(input_size, hidden_size)
        self.linear = nn.Linear(self.hidden_size, self.hidden_size, bias = False)

    def forward(self, input_, hidden, lengths):
        embedded = self.embedding.source[0](input_)
        embedded = pack_padded_sequence(embedded, lengths)
        output, hidden = self.gru(embedded, hidden)
        print("before", output[0].shape)
        output, _ = pad_packed_sequence(output)
        print("after", output[0].shape)
        projected = self.linear(hidden.permute(1,0,2))
        return projected, hidden

    def initHidden(self, batch_size, device = None):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [29]:
model = EncoderRNN(50, 50, vocab, glove_dict)
loss_function = nn.SmoothL1Loss(reduction = "sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

In [30]:
definition_indices = vocab.words2indices(definitions)
words_in = 0
words_out = 0

import timeit
start = timeit.default_timer()
losses = []

batch_size = 2048

for epoch in range(50000):
    for src_sents, tgt_word in batch_iter(training_data, batch_size, False):
        model.zero_grad()
        x_lengths = [len(sent) for sent in src_sents]
        x = vocab.to_input_tensor(src_sents, "cpu")
        init_hidden = model.initHidden(len(src_sents), "cpu")
        tag_scores = model.forward(x, init_hidden, x_lengths)
        y_array = model.embedding.source[0](torch.tensor(vocab.words2indices(tgt_word))).double()
        y_pred = tag_scores[0].squeeze(dim = 1).double()
        loss = loss_function(y_pred, y_array)
        loss.backward()
        optimizer.step() 
    losses.append(loss)
    print(epoch, loss)
    
stop = timeit.default_timer()

print('Time: ', stop - start)


before tensor([[-0.1972,  0.0740,  0.0598,  ..., -0.1167, -0.3241, -0.2236],
        [-0.2561, -0.0915,  0.0117,  ..., -0.1869, -0.1928, -0.1908],
        [-0.1542, -0.0777, -0.1393,  ..., -0.2832, -0.1275, -0.2211],
        ...,
        [-0.1767,  0.2588, -0.3904,  ..., -0.0922, -0.2805, -0.2836],
        [-0.4672,  0.0194, -0.1071,  ..., -0.1755, -0.3842, -0.3940],
        [-0.3899, -0.1375, -0.1852,  ..., -0.2971, -0.2513, -0.1511]],
       grad_fn=<CatBackward>)
after tensor([[-0.1972,  0.0740,  0.0598,  ..., -0.1167, -0.3241, -0.2236],
        [-0.2561, -0.0915,  0.0117,  ..., -0.1869, -0.1928, -0.1908],
        [-0.1542, -0.0777, -0.1393,  ..., -0.2832, -0.1275, -0.2211],
        ...,
        [ 0.2282, -0.1450,  0.1607,  ..., -0.3200,  0.1188,  0.0300],
        [ 0.2394, -0.0607,  0.2097,  ..., -0.1027,  0.0502,  0.1278],
        [ 0.2266,  0.2485,  0.0360,  ..., -0.2232, -0.3111, -0.3713]],
       grad_fn=<SelectBackward>)
before tensor([[ 0.0703, -0.2964,  0.1920,  ..., -0.4197

KeyboardInterrupt: 

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, vocab, glove_dict):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = ModelEmbeddings(input_size, vocab, glove_dict)
        self.gru = nn.GRU(input_size, hidden_size)
        self.linear = nn.Linear(self.hidden_size, self.hidden_size, bias = False)

    def forward(self, input_, hidden):
        embedded = self.embedding.source[0](input_)
        output = embedded
        output, hidden = self.gru(output, hidden)
        projected = self.linear(hidden.permute(1,0,2))
        return projected, hidden

    def initHidden(self, device = None):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
model = EncoderRNN(50, 50, vocab, glove_dict)
loss_function = nn.SmoothL1Loss(reduction = "sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)