In [3]:
import numpy as np
from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from nltk import word_tokenize

#from model_embeddings import ModelEmbeddings
from evaluator import Evaluator
from vocab import Vocab, VocabEntry
from utils import read_corpus, pad_sents
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])

In [4]:
definitions = []
unparsed_definition = []
words = []
src_sents = read_corpus('../data/data_train_definitions.txt')

In [5]:
with open('../data/data_train_words.txt') as f:
    words += f.read().splitlines()
    
with open('../data/data_train_definitions.txt') as f:
    unparsed_definition += f.read().splitlines()
    definitions += [word_tokenize(a) for a in unparsed_definition]

In [6]:
assert(len(words) == len(definitions))

In [None]:
eval = Evaluator()
glove_dict = eval.load_glove_embeddings(max_line = 50000)

src_sents = read_corpus('../data/data_train_definitions.txt')
vocab = VocabEntry.from_corpus(src_sents, 30000, 0)

number of word types: 23452, number of word types w/ frequency >= 0: 23452


In [None]:
def create_emb_layer(weights_matrix, src_pad_token_idx, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim, src_pad_token_idx)
    emb_layer.weight.data.copy_(torch.from_numpy(weights_matrix)) #figure out what is here
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer, num_embeddings, embedding_dim

class ModelEmbeddings(nn.Module): 
    """
    Class that converts input words to their embeddings.
    """

    def __init__(self, embed_size, vocab, glove_dict):
        """
        Init the Embedding layers.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (VocabEntry)
        """
        super(ModelEmbeddings, self).__init__()

        self.embed_size = embed_size

        matrix_len = len(vocab)
        weights_matrix = np.zeros((matrix_len, self.embed_size))
        words_found = 0
        #print(len(vocab), weights_matrix.shape)
        for word, index in vocab.word2id.items():
            try:
                weights_matrix[index] = np.array(glove_dict[word])
                words_found += 1
            except KeyError:
                weights_matrix[index] = np.random.normal(scale=0.6, size=(self.embed_size,))

        # default values
        src_pad_token_idx = vocab['<pad>']
        self.source = create_emb_layer(weights_matrix, src_pad_token_idx, True)
        ### END YOUR CODE


In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, vocab, glove_dict):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = ModelEmbeddings(input_size, vocab, glove_dict)
        self.gru = nn.GRU(input_size, hidden_size)
        self.linear = nn.Linear(self.hidden_size, self.hidden_size, bias = True)

    def forward(self, input_, hidden):
        embedded = self.embedding.source[0](input_)
        output = embedded
        output, hidden = self.gru(output, hidden)
        projected = self.linear(hidden.permute(1,0,2))
        return projected, hidden

    def initHidden(self, device = None):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
model = EncoderRNN(50, 50, vocab, glove_dict)
loss_function = nn.SmoothL1Loss(reduction = "sum")
optimizer = torch.optim.SGD(model.parameters(), lr=0.2)

In [None]:
definition_indices = vocab.words2indices(definitions)
words_in = 0
words_out = 0
for epoch in range(500000):  # again, normally you would NOT do 300 epochs, it is toy data
    for i in range(len(definition_indices)): #should be range(len(definition_indices))
        if words[i] not in glove_dict: 
            continue
        model.zero_grad()
        x = torch.tensor(definition_indices[i])

        init_hidden = model.initHidden()
        tag_scores = model.forward(x.view(x.shape[0], 1), init_hidden)
        y_array = np.array(glove_dict[words[i]]) if words[i] in glove_dict else np.random.normal(scale=0.6, size=(50,))
        y = torch.tensor(y_array).double()
        y_pred = tag_scores[0].view((tag_scores[0].shape[2])).double()

        loss = loss_function(y_pred, y)

        loss.backward()
        optimizer.step()
    print(epoch, loss)

0 tensor(0.0055, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
1 tensor(0.0033, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
2 tensor(0.0079, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
3 tensor(0.0068, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)
4 tensor(0.0040, dtype=torch.float64, grad_fn=<SmoothL1LossBackward>)


In [None]:
validate_dict = {}
with open('../data/data_train_words.txt') as f:
    lines = f.readlines()
lines = [line[:-1] for line in lines]
validate_words = sorted(list(set(lines)))
for word in validate_words:
    if word in glove_dict:
        validate_dict[word] = glove_dict[word]

In [None]:
count = 0
for i in range(200, 250): #len(definition_indices)
    if count >= 100:
        break
    x = torch.tensor(definition_indices[i])
    #print(x.shape)
    init_hidden = model.initHidden()
    tag_scores = model.forward(x.view(x.shape[0], 1), init_hidden)
    #print(tag_scores[0].shape, y.shape)
    if words[i] in validate_dict: 
        count += 1
        #print(words[i])
        y_array = np.array(glove_dict[words[i]]) if words[i] in glove_dict else np.random.normal(scale=0.6, size=(50,))
        y = torch.tensor(y).double()
        y_pred = tag_scores[0].view((tag_scores[0].shape[2])).double()
        eval.top_ten_hundred(validate_dict, words[i], y_pred.detach().numpy())


## print(eval.compute_th_accuracy())