In [2]:
import numpy as np
from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from nltk import word_tokenize

from model_embeddings import ModelEmbeddings
from evaluator import Evaluator
from vocab import Vocab, VocabEntry
from utils import read_corpus, pad_sents, batch_iter

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\willi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
definitions = []
unparsed_definition = []
words = []
src_sents = read_corpus('../data/data_train_definitions.txt')

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 1799: character maps to <undefined>

In [3]:
with open('../data/data_train_words.txt') as f:
    words += f.read().splitlines()
    
with open('../data/data_train_definitions.txt') as f:
    unparsed_definition += f.read().splitlines()
    definitions += [word_tokenize(a) for a in unparsed_definition]

training_data = [(definitions[i], words[i]) for i in range(len(words))]

In [5]:
eval = Evaluator()
fasttext_dict = eval.load_vectors(fname ="../data/wiki-news-300d-1M-subword.vec", max_line = 10000000)
sub_fasttext_dict = {}
#only train words in the dictionary
for i in range(len(words)-1, -1, -1):
    if words[i] not in fasttext_dict:
        print(words[i])
        words.pop(i)
        definitions.pop(i)
    else:
        sub_fasttext_dict[words[i]] = fasttext_dict[words[i]]
        
high_freq_dict = eval.load_vectors(fname ="../data/wiki-news-300d-1M-subword.vec", max_line = 30000)
sub_fasttext_dict.update()

src_sents = read_corpus('../data/data_train_definitions.txt')
vocab = VocabEntry.from_corpus(src_sents, 1000000, 0)
    
print(len(vocab))

999994it [01:06, 15032.04it/s]
0it [00:00, ?it/s]

disheartenment
disheartenment
disheartenment
disheartenment
disheartenment
siss
siss
siss
siss
siss
unhorse
unhorse
unhorse
unhorse
splasher
splasher
splasher
splasher
brininess
brininess
despicableness
despicableness
despicableness
despicableness
entr'acte
entr'acte
entr'acte
entr'acte
fluidness
fluidness
fluidness
fluidness
perfecter
perfecter
perfecter
perfecter
perfecter
perfecter
butterfingered
butterfingered
butterfingered
butterfingered
butterfingered
undersoil
undersoil
whitewood
whitewood
whitewood
rosiness
rosiness
rosiness
rosiness
rosiness
hoggish
hoggish
hoggish
hoggish
hoggish
hoggish
fleetly
fleetly
fleetly
overshoe
overshoe
overshoe
liquidambar
liquidambar
liquidambar
plower
plower
plower
plower
pigfish
pigfish
pigfish
pigfish
eyehole
eyehole
eyehole
bedevilment
bedevilment
bedevilment
corrugate
corrugate
corrugate
corrugate
corrugate
corrugate
corrugate
footmark
footmark
footmark
egger
egger
egger
egger
countercheck
countercheck
countercheck
countercheck
countercheck
c

29064it [00:02, 14031.55it/s]


number of word types: 23452, number of word types w/ frequency >= 0: 23452
23456


In [16]:
import pickle
with open("../data/words_defs_dict.train", "wb") as f:
    pickle.dump((words, definitions, sub_fasttext_dict), f)

In [6]:
assert(len(words) == len(definitions))

In [7]:
def create_emb_layer(weights_matrix, src_pad_token_idx, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim, src_pad_token_idx)
    emb_layer.weight.data.copy_(torch.from_numpy(weights_matrix)) #figure out what is here
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer, num_embeddings, embedding_dim

class ModelEmbeddings(nn.Module): 
    """
    Class that converts input words to their embeddings.
    """

    def __init__(self, embed_size, vocab, fasttext_dict):
        """
        Init the Embedding layers.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (VocabEntry)
        """
        super(ModelEmbeddings, self).__init__()

        self.embed_size = embed_size

        matrix_len = len(vocab)
        weights_matrix = np.zeros((matrix_len, self.embed_size))
        words_found = 0
        #print(len(vocab), weights_matrix.shape)
        for word, index in vocab.word2id.items():
            try:
                weights_matrix[index] = np.array(fasttext_dict[word])
                words_found += 1
            except KeyError:
                weights_matrix[index] = np.random.normal(scale=0.6, size=(self.embed_size,))

        # default values
        src_pad_token_idx = vocab['<pad>']
        self.source = create_emb_layer(weights_matrix, src_pad_token_idx, False)

In [7]:
# class GRUModel(nn.Module):
#     def __init__(self, input_size, hidden_size, vocab, fasttext_dict):
#         super(GRUModel, self).__init__()
#         self.hidden_size = hidden_size
#         self.embedding = ModelEmbeddings(input_size, vocab, fasttext_dict)
#         self.gru = nn.GRU(input_size, hidden_size)
        
#         self.linear = nn.Linear(self.hidden_size, self.hidden_size, bias = True)

#     def forward(self, input_, hidden, lengths , dropout_rate = 0.3):
#         embedded = self.embedding.source[0](input_)
#         embedded = pack_padded_sequence(embedded, lengths)
#         output, hidden = self.gru(embedded, hidden)
#         dropout = nn.Dropout(dropout_rate)
#         hidden_dropped = dropout(hidden.permute(1,0,2)) # you dont need dropout in validation
#         projected = self.linear(hidden_dropped)
#         return projected, hidden

#     def initHidden(self, batch_size, device = None):
#         return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [4]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, vocab, fasttext_dict):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.vocab = vocab
        self.embedding = ModelEmbeddings(input_size, vocab, fasttext_dict)
        self.lstm = nn.LSTM(input_size, hidden_size, bidirectional = True)
        self.linear = nn.Linear(self.hidden_size * 2, self.hidden_size, bias = True)

    def forward(self, input_, hidden, lengths , dropout_rate = 0.3):
        embedded = self.embedding.source[0](input_)
        embedded = pack_padded_sequence(embedded, lengths)
        output, (h_n, c_n) = self.lstm(embedded)
        #print(h_n.shape, c_n.shape)
        dropout = nn.Dropout(dropout_rate)
        hidden_dropped = dropout(h_n.contiguous().view(1, -1, self.hidden_size * 2).permute(1,0,2)) # you dont need dropout in validation
        projected = self.linear(hidden_dropped)
        return projected, hidden

    def initHidden(self, batch_size, device = None):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [5]:
model = LSTMModel(300, 300, vocab, fasttext_dict)
loss_function = nn.CosineEmbeddingLoss(margin=0.0, reduction='sum')
optimizer = torch.optim.Adagrad(model.parameters(), lr = 0.1)

NameError: name 'vocab' is not defined

In [6]:
# model = GRUModel(50, 50, vocab, fasttext_dict)
# loss_function = nn.L1Loss(reduction = "sum")
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01, betas=(0.9, 0.999))

In [7]:
# #check overfit

# definition_indices = vocab.words2indices(definitions)
# words_in = 0
# words_out = 0

# import timeit
# start = timeit.default_timer()
# losses = []

# batch_size = 128

# for src_sents, tgt_word in batch_iter(training_data, batch_size, False):
#     for i in range(300):
#         model.zero_grad()
#         x_lengths = [len(sent) for sent in src_sents]
#         x = vocab.to_input_tensor(src_sents, "cpu")
#         init_hidden = model.initHidden(len(src_sents), "cpu")
#         tag_scores = model.forward(x, init_hidden, x_lengths)
#         y_array = model.embedding.source[0](torch.tensor(vocab.words2indices(tgt_word))).double()
#         y_pred = tag_scores[0].squeeze(dim = 1).double()
#         loss = loss_function(y_pred, y_array, torch.tensor(1))
#         loss.backward()
#         optimizer.step() 
#         losses.append(loss)
#         if i % 100 == 0:
#             print(i, loss)
#     break
    
# stop = timeit.default_timer()

# print('Time: ', stop - start)

# import matplotlib.pyplot as plt
# print(plt.plot([l.double() for l in losses][:1200]))

# model.zero_grad()
# x_lengths = [len(sent) for sent in src_sents]
# x = vocab.to_input_tensor(src_sents, "cpu")
# init_hidden = model.initHidden(len(src_sents), "cpu")
# tag_scores = model.forward(x, init_hidden, x_lengths)
# y_pred = tag_scores[0].squeeze(dim = 1).double()

# validate_dict = dict([(w, model.embedding.source[0](torch.tensor(vocab[w])).numpy()) for w in set(words)])
# print(len(validate_dict))

# print(y_pred.shape)
# for i in range(len(y_pred)):
#     eval.top_ten_hundred(validate_dict, tgt_word[i], y_pred[i].detach().numpy())

In [8]:
definition_indices = vocab.words2indices(definitions)
words_in = 0
words_out = 0

import timeit
start = timeit.default_timer()
losses = []

batch_size = 128

for epoch in range(5000):
    for src_sents, tgt_word in batch_iter(training_data, batch_size, False):
        model.zero_grad()
        x_lengths = [len(sent) for sent in src_sents]
        x = vocab.to_input_tensor(src_sents, "cpu")
        init_hidden = model.initHidden(len(src_sents), "cpu")
        tag_scores = model.forward(x, init_hidden, x_lengths)
        y_array = model.embedding.source[0](torch.tensor(vocab.words2indices(tgt_word))).double()
        y_pred = tag_scores[0].squeeze(dim = 1).double()
        loss = loss_function(y_pred, y_array, torch.tensor(1))
        loss.backward()
        optimizer.step() 
    losses.append(loss)
    print(epoch, loss, timeit.default_timer() - start)
    
stop = timeit.default_timer()

print('Time: ', stop - start)

import matplotlib.pyplot as plt
print(plt.plot([l.double() for l in losses]))

NameError: name 'vocab' is not defined

In [9]:
import matplotlib.pyplot as plt
print(plt.plot([l.double() for l in losses]))

NameError: name 'losses' is not defined

In [None]:
model.zero_grad()
x_lengths = [len(sent) for sent in src_sents]
x = vocab.to_input_tensor(src_sents, "cpu")
init_hidden = model.initHidden(len(src_sents), "cpu")
tag_scores = model.forward(x, init_hidden, x_lengths)
y_pred = tag_scores[0].squeeze(dim = 1).double()

validate_dict = dict([(w, model.embedding.source[0](torch.tensor(vocab[w])).detach().numpy()) for w in set(words)])
print(len(validate_dict))

print(y_pred.shape)
for i in range(len(y_pred)):
    eval.top_ten_hundred(validate_dict, tgt_word[i], y_pred[i].detach().numpy())