In [11]:
from transformers import BertModel, BertTokenizer
import io

import numpy as np
from collections import namedtuple
import sys
from typing import List, Tuple, Dict, Set, Union
import torch
import torch.nn as nn
import torch.nn.utils
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
from nltk import word_tokenize
import pickle
import timeit
from scipy import spatial

from evaluator import Evaluator
from vocab import Vocab, VocabEntry
from utils import read_corpus, pad_sents, batch_iter

In [2]:
words, defs, ft_dict = pickle.load( open( "../data/words_defs_dict_1M.train", "rb" ))

vocab = VocabEntry.from_corpus(defs, 1000000, 0)
for w in ft_dict:
    vocab.add(w)

number of word types: 23437, number of word types w/ frequency >= 0: 23437


In [3]:
def create_emb_layer(weights_matrix, src_pad_token_idx, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim, src_pad_token_idx)
    emb_layer.weight.data.copy_(torch.from_numpy(weights_matrix)) #figure out what is here
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer, num_embeddings, embedding_dim

class ModelEmbeddings(nn.Module): 
    """
    Class that converts input words to their embeddings.
    """

    def __init__(self, embed_size, vocab, fasttext_dict):
        """
        Init the Embedding layers.

        @param embed_size (int): Embedding size (dimensionality)
        @param vocab (VocabEntry)
        """
        super(ModelEmbeddings, self).__init__()

        self.embed_size = embed_size

        matrix_len = len(vocab)
        weights_matrix = np.zeros((matrix_len, self.embed_size))
        words_found = 0
        #print(len(vocab), weights_matrix.shape)
        for word, index in vocab.word2id.items():
            try:
                weights_matrix[index] = np.array(fasttext_dict[word])
                words_found += 1
            except KeyError:
                weights_matrix[index] = np.random.normal(scale=0.6, size=(self.embed_size,))

        # default values
        src_pad_token_idx = vocab['<pad>']
        self.source = create_emb_layer(weights_matrix, src_pad_token_idx, False)

In [4]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = map(float, tokens[1:])
    return data

In [5]:
class ReverseDictionary(nn.Module):

    def __init__(self, embed_dim, hidden_dim, vocab, ft_dict, freeze_bert = False):
        super(ReverseDictionary, self).__init__()
        #Instantiating BERT model object 
        
        self.ft_embedding = ModelEmbeddings(embed_dim, vocab, ft_dict)
        #self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        
        #Freeze bert layers
#         if freeze_bert:
#             for p in self.bert_layer.parameters():
#                 p.requires_grad = False
        
        #Classification layer
        self.lstm_fasttext = nn.LSTM(embed_dim, hidden_dim)
        self.lin_layer = nn.Linear(hidden_dim, embed_dim)


    def forward(self, bert_input, ft_input, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        #Feeding the input to BERT model to obtain contextualized representations
        embedded = self.ft_embedding.source[0](ft_input)
        
#         cont_reps, _ = self.bert_layer(bert_input, attention_mask = attn_masks)
        
        output, (cn, hn) = self.lstm_fasttext(embedded.unsqueeze(1))
        
#         cls_rep = cont_reps[:, 0]
        
        #print(cn.squeeze(1).shape, cls_rep.shape)

        toLinear = cn.squeeze(1) # torch.cat([cls_rep, cn.squeeze(1)], 1)

        #Obtaining the representation of [CLS] head
        
        #feed cls_rep to -> fasttext layer
        projected = self.lin_layer(toLinear)

        return projected


In [6]:
model = ReverseDictionary(300, 300, vocab, ft_dict)
loss_function = nn.L1Loss(reduction='mean')
optimizer = torch.optim.Adagrad(model.parameters(), lr = 0.0001)

In [7]:
int_sents = vocab.words2indices(defs)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = max(len(x) for x in int_sents)
sents_ft_id = [torch.tensor(i, dtype=torch.long, device="cpu") for i in int_sents]
sents_bert_id = []
masks = []
for d in defs:
    tokens = ['[CLS]'] + d + ['[SEP]']
    padded_tokens = tokens + ['[PAD]' for _ in range(max_len - len(tokens))]
    attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
    seg_ids = [0 for _ in range(len(padded_tokens))]
    token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
    token_ids = torch.tensor(token_ids).unsqueeze(0) 
    attn_mask = torch.tensor(attn_mask).unsqueeze(0) 
    sents_bert_id.append(token_ids)
    masks.append(attn_mask)
assert(len(sents_bert_id) == len(masks))
assert(len(masks) == len(sents_ft_id))

In [12]:
start = timeit.default_timer()
losses = []
for epoch in range(5000):
    for i in range(10,11):
        print(words[i])
        model.zero_grad()
        tag_scores = model.forward(sents_bert_id[i], sents_ft_id[i], masks[i])
        y_pred = tag_scores[0].double().unsqueeze(1)
        y_array = model.ft_embedding.source[0](torch.tensor(vocab[words[i]])).double().unsqueeze(1)
        #print(y_pred.shape, y_array.shape)
        loss = loss_function(y_pred, y_array)
        loss.backward()
        optimizer.step() 
        print(loss)
    losses.append(loss)
    print(epoch, loss, timeit.default_timer() - start)

fault
tensor(0.0408, dtype=torch.float64, grad_fn=<MeanBackward0>)
0 tensor(0.0408, dtype=torch.float64, grad_fn=<MeanBackward0>) 0.1167511000000161
fault
tensor(0.0408, dtype=torch.float64, grad_fn=<MeanBackward0>)
1 tensor(0.0408, dtype=torch.float64, grad_fn=<MeanBackward0>) 0.18973110000001725
fault
tensor(0.0407, dtype=torch.float64, grad_fn=<MeanBackward0>)
2 tensor(0.0407, dtype=torch.float64, grad_fn=<MeanBackward0>) 0.2639163000000053
fault
tensor(0.0406, dtype=torch.float64, grad_fn=<MeanBackward0>)
3 tensor(0.0406, dtype=torch.float64, grad_fn=<MeanBackward0>) 0.33952469999999835
fault
tensor(0.0405, dtype=torch.float64, grad_fn=<MeanBackward0>)
4 tensor(0.0405, dtype=torch.float64, grad_fn=<MeanBackward0>) 0.4112983999999926
fault
tensor(0.0404, dtype=torch.float64, grad_fn=<MeanBackward0>)
5 tensor(0.0404, dtype=torch.float64, grad_fn=<MeanBackward0>) 0.48558479999999804
fault
tensor(0.0403, dtype=torch.float64, grad_fn=<MeanBackward0>)
6 tensor(0.0403, dtype=torch.float64

tensor(0.0364, dtype=torch.float64, grad_fn=<MeanBackward0>)
55 tensor(0.0364, dtype=torch.float64, grad_fn=<MeanBackward0>) 4.37024679999999
fault
tensor(0.0364, dtype=torch.float64, grad_fn=<MeanBackward0>)
56 tensor(0.0364, dtype=torch.float64, grad_fn=<MeanBackward0>) 4.444668500000006
fault
tensor(0.0363, dtype=torch.float64, grad_fn=<MeanBackward0>)
57 tensor(0.0363, dtype=torch.float64, grad_fn=<MeanBackward0>) 4.516454199999998
fault
tensor(0.0362, dtype=torch.float64, grad_fn=<MeanBackward0>)
58 tensor(0.0362, dtype=torch.float64, grad_fn=<MeanBackward0>) 4.5908223000000135
fault
tensor(0.0361, dtype=torch.float64, grad_fn=<MeanBackward0>)
59 tensor(0.0361, dtype=torch.float64, grad_fn=<MeanBackward0>) 4.668869999999998
fault
tensor(0.0360, dtype=torch.float64, grad_fn=<MeanBackward0>)
60 tensor(0.0360, dtype=torch.float64, grad_fn=<MeanBackward0>) 4.742874900000004
fault
tensor(0.0360, dtype=torch.float64, grad_fn=<MeanBackward0>)
61 tensor(0.0360, dtype=torch.float64, grad_f

KeyboardInterrupt: 

In [18]:
eval = Evaluator()
model.zero_grad()

for i in range(10,11):
    model.zero_grad()
    tag_scores = model.forward(sents_bert_id[i], sents_ft_id[i], masks[i])
    y_pred = tag_scores[0].double()#.unsqueeze(1)
    #print(y_pred)
    y_array = model.ft_embedding.source[0](torch.tensor(vocab[words[i]])).double().unsqueeze(1)
    #print(y_array)
    #print(y_pred.shape, y_array.shape)
    loss = loss_function(y_pred, y_array)
    #eval.top_ten_hundred(ft_dict, words[i], y_pred[i].detach().numpy())
    print(spatial.distance.cosine(ft_dict['noon'],y_pred[i].detach().numpy()))
    print(spatial.distance.cosine(ft_dict['fault'],y_pred[i].detach().numpy()))
    print(sorted(ft_dict.keys(), key=lambda word: spatial.distance.cosine(ft_dict[word], y_pred[i].detach().numpy())))[:10]
    print(loss)

AttributeError: module 'scipy.spatial.distance' has no attribute 'coside'