In [32]:
import numpy as np
from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torchnlp.nn import Attention
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from nltk import word_tokenize

from model_embeddings import ModelEmbeddings
from evaluator import Evaluator
from vocab import Vocab, VocabEntry
from utils import read_corpus, pad_sents, batch_iter

In [33]:
import fasttext
import fasttext.util
ft = fasttext.load_model('../data/cc.en.300.bin')
print(ft.get_dimension())
fasttext.util.reduce_model(ft, 100)
print(ft.get_dimension())



300
100


In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [35]:
print(ft.get_word_vector('hdello'))

[ 0.11238685  0.03634512  0.06272481  0.09328334  0.07476841 -0.02136788
  0.03564939  0.06776655 -0.00441328  0.06022001  0.00861067  0.03479845
  0.0326384  -0.0459407   0.06693012 -0.03741141 -0.03098805  0.08830236
  0.01789446 -0.00656855 -0.01357604  0.07741845 -0.06603968  0.00508138
 -0.02323393  0.03813258  0.04005572  0.02402163 -0.0174519   0.04406795
  0.02939343  0.04441914  0.0360864   0.00104841  0.00701074 -0.03081188
  0.02208788  0.03518926 -0.03685221 -0.0505261   0.04664233 -0.00106685
 -0.00812358  0.00405111  0.00817128  0.10642597  0.04042358  0.01328057
 -0.01919837 -0.02258547 -0.00679503 -0.01966     0.04078562 -0.01269364
  0.04565648  0.00711938  0.03343869  0.02761208 -0.07310422 -0.02281325
  0.0070655  -0.03781745  0.03798665  0.0331741   0.02306999 -0.04951117
  0.03532698  0.02406708  0.00787704  0.0381924  -0.00423297  0.0656371
 -0.01671075 -0.00671035  0.02436351 -0.01863142  0.01288848  0.0659969
 -0.04377548  0.03321797  0.01051717 -0.02191262 -0.0

In [36]:
definitions = []
unparsed_definition = []
words = []
src_sents = read_corpus('../data/data_train_definitions.txt')

In [37]:
with open('../data/data_train_words.txt') as f:
    words += f.read().splitlines()
    
with open('../data/data_train_definitions.txt') as f:
    unparsed_definition += f.read().splitlines()
    definitions += [word_tokenize(a) for a in unparsed_definition]

training_data = [(definitions[i], words[i]) for i in range(len(words))]

In [38]:
eval = Evaluator()
# fasttext_dict = eval.load_vectors(fname ="../data/wiki-news-300d-1M-subword.vec", max_line = 10000000)
fasttext_dict = {}
sub_fasttext_dict = {}
#only train words in the dictionary
for i in range(len(words)-1, -1, -1):
    fasttext_dict[words[i]] = ft.get_word_vector(words[i])
    sub_fasttext_dict[words[i]] = fasttext_dict[words[i]]
        
# high_freq_dict = eval.load_vectors(fname ="../data/wiki-news-300d-1M-subword.vec", max_line = 30000)
# sub_fasttext_dict.update()

src_sents = read_corpus('../data/data_train_definitions.txt')
vocab = VocabEntry.from_corpus(src_sents, 1000000, 0)
    
print(len(vocab))

number of word types: 23452, number of word types w/ frequency >= 0: 23452
23456


In [39]:
import pickle
with open("../data/words_defs_dict.train", "wb") as f:
    pickle.dump((words, definitions, sub_fasttext_dict), f)

In [40]:
assert(len(words) == len(definitions))

In [41]:
def create_emb_layer(weights_matrix, src_pad_token_idx, non_trainable=True):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim, src_pad_token_idx)
    emb_layer.weight.data.copy_(torch.from_numpy(weights_matrix)) #figure out what is here
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer, num_embeddings, embedding_dim

class ModelEmbeddings(nn.Module): 
    """
    Class that converts input words to their embeddings.
    """

    def __init__(self, embed_size, vocab, fasttext_dict):
        """
        Init the Embedding layers.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (VocabEntry)
        """
        super(ModelEmbeddings, self).__init__()

        self.embed_size = embed_size

        matrix_len = len(vocab)
        weights_matrix = np.zeros((matrix_len, self.embed_size))
        words_found = 0
        #print(len(vocab), weights_matrix.shape)
        for word, index in vocab.word2id.items():
            try:
                weights_matrix[index] = np.array(fasttext_dict[word])
                words_found += 1
            except KeyError:
                weights_matrix[index] = np.random.normal(scale=0.6, size=(self.embed_size,))

        # default values
        src_pad_token_idx = vocab['<pad>']
        self.source = create_emb_layer(weights_matrix, src_pad_token_idx, True)

In [42]:
# class GRUModel(nn.Module):
#     def __init__(self, input_size, hidden_size, vocab, fasttext_dict):
#         super(GRUModel, self).__init__()
#         self.hidden_size = hidden_size
#         self.embedding = ModelEmbeddings(input_size, vocab, fasttext_dict)
#         self.gru = nn.GRU(input_size, hidden_size)
        
#         self.linear = nn.Linear(self.hidden_size, self.hidden_size, bias = True)

#     def forward(self, input_, hidden, lengths , dropout_rate = 0.3):
#         embedded = self.embedding.source[0](input_)
#         embedded = pack_padded_sequence(embedded, lengths)
#         output, hidden = self.gru(embedded, hidden)
#         dropout = nn.Dropout(dropout_rate)
#         hidden_dropped = dropout(hidden.permute(1,0,2)) # you dont need dropout in validation
#         projected = self.linear(hidden_dropped)
#         return projected, hidden

#     def initHidden(self, batch_size, device = None):
#         return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [43]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, vocab, fasttext_dict):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.vocab = vocab
        self.embedding = ModelEmbeddings(input_size, vocab, fasttext_dict)
        self.lstm = nn.LSTM(input_size, hidden_size, bidirectional = True)
        self.linear = nn.Linear(self.hidden_size * 2, self.hidden_size, bias = True)
        self.attention = Attention(self.hidden_size)

    def forward(self, input_, hidden, lengths, dropout_rate = 0.3):
        embedded = self.embedding.source[0](input_)
        embedded = pack_padded_sequence(embedded, lengths)
        output, (h_n, c_n) = self.lstm(embedded)
        o, w = self.attention(h_n, output)
        #print(h_n.shape, c_n.shape)
        dropout = nn.Dropout(dropout_rate)
        hidden_dropped = dropout(h_n.contiguous().view(1, -1, self.hidden_size * 2).permute(1,0,2)) # you dont need dropout in validation
        projected = self.linear(hidden_dropped)
        return projected, hidden

    def initHidden(self, batch_size, device = None):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [44]:
model = LSTMModel(100, 100, vocab, fasttext_dict)
loss_function = nn.CosineEmbeddingLoss(margin=0.0, reduction='sum')
optimizer = torch.optim.Adagrad(model.parameters(), lr = 0.1)

In [45]:
# model = GRUModel(50, 50, vocab, fasttext_dict)
# loss_function = nn.L1Loss(reduction = "sum")
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01, betas=(0.9, 0.999))

In [46]:
# #check overfit

# definition_indices = vocab.words2indices(definitions)
# words_in = 0
# words_out = 0

# import timeit
# start = timeit.default_timer()
# losses = []

# batch_size = 128

# for src_sents, tgt_word in batch_iter(training_data, batch_size, False):
#     for i in range(300):
#         model.zero_grad()
#         x_lengths = [len(sent) for sent in src_sents]
#         x = vocab.to_input_tensor(src_sents, "cpu")
#         init_hidden = model.initHidden(len(src_sents), "cpu")
#         tag_scores = model.forward(x, init_hidden, x_lengths)
#         y_array = model.embedding.source[0](torch.tensor(vocab.words2indices(tgt_word))).double()
#         y_pred = tag_scores[0].squeeze(dim = 1).double()
#         loss = loss_function(y_pred, y_array, torch.tensor(1))
#         loss.backward()
#         optimizer.step() 
#         losses.append(loss)
#         if i % 100 == 0:
#             print(i, loss)
#     break
    
# stop = timeit.default_timer()

# print('Time: ', stop - start)

# import matplotlib.pyplot as plt
# print(plt.plot([l.double() for l in losses][:1200]))

# model.zero_grad()
# x_lengths = [len(sent) for sent in src_sents]
# x = vocab.to_input_tensor(src_sents, "cpu")
# init_hidden = model.initHidden(len(src_sents), "cpu")
# tag_scores = model.forward(x, init_hidden, x_lengths)
# y_pred = tag_scores[0].squeeze(dim = 1).double()

# validate_dict = dict([(w, model.embedding.source[0](torch.tensor(vocab[w])).numpy()) for w in set(words)])
# print(len(validate_dict))

# print(y_pred.shape)
# for i in range(len(y_pred)):
#     eval.top_ten_hundred(validate_dict, tgt_word[i], y_pred[i].detach().numpy())

In [None]:
definition_indices = vocab.words2indices(definitions)
words_in = 0
words_out = 0

import timeit
start = timeit.default_timer()
losses = []

batch_size = 128

for epoch in range(5000):
    for src_sents, tgt_word in batch_iter(training_data, batch_size, False):
        model.zero_grad()
        x_lengths = [len(sent) for sent in src_sents]
        x = vocab.to_input_tensor(src_sents, device)
        init_hidden = model.initHidden(len(src_sents), device)
        tag_scores = model.forward(x, init_hidden, x_lengths)
        y_array = model.embedding.source[0](torch.tensor(vocab.words2indices(tgt_word))).double()
        y_pred = tag_scores[0].squeeze(dim = 1).double()
        loss = loss_function(y_pred, y_array, torch.tensor(1))
        loss.backward()
        optimizer.step() 
    losses.append(loss)
    print(epoch, loss, timeit.default_timer() - start)
    
stop = timeit.default_timer()

print('Time: ', stop - start)

import matplotlib.pyplot as plt
print(plt.plot([l.double() for l in losses]))

0 tensor(33.2199, dtype=torch.float64, grad_fn=<SumBackward0>) 61.331675972032826
1 tensor(33.1883, dtype=torch.float64, grad_fn=<SumBackward0>) 122.25149954605149
2 tensor(33.2089, dtype=torch.float64, grad_fn=<SumBackward0>) 183.20772688701982
3 tensor(33.1495, dtype=torch.float64, grad_fn=<SumBackward0>) 252.38635815604357
4 tensor(33.1733, dtype=torch.float64, grad_fn=<SumBackward0>) 325.45885230600834
5 tensor(33.1164, dtype=torch.float64, grad_fn=<SumBackward0>) 389.9770028950297
6 tensor(33.2105, dtype=torch.float64, grad_fn=<SumBackward0>) 453.89217092405306
7 tensor(33.2641, dtype=torch.float64, grad_fn=<SumBackward0>) 528.9641185130458
8 tensor(33.1684, dtype=torch.float64, grad_fn=<SumBackward0>) 595.7289804510074
9 tensor(33.0121, dtype=torch.float64, grad_fn=<SumBackward0>) 666.4902101700427
10 tensor(33.0295, dtype=torch.float64, grad_fn=<SumBackward0>) 736.4332790100016
11 tensor(32.6346, dtype=torch.float64, grad_fn=<SumBackward0>) 807.7851033370243
12 tensor(32.4849, d

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
print(plt.plot([l.double() for l in losses]))

In [None]:
model.zero_grad()
x_lengths = [len(sent) for sent in src_sents]
x = vocab.to_input_tensor(src_sents, device)
init_hidden = model.initHidden(len(src_sents), device)
tag_scores = model.forward(x, init_hidden, x_lengths)
y_pred = tag_scores[0].squeeze(dim = 1).double()

validate_dict = dict([(w, model.embedding.source[0](torch.tensor(vocab[w])).detach().numpy()) for w in set(words)])
print(len(validate_dict))

print(y_pred.shape)
for i in range(len(y_pred)):
    eval.top_ten_hundred(validate_dict, tgt_word[i], y_pred[i].detach().numpy())