In [1]:
# keras implementation of context2vec (Melamud et al., 2016, CoNLL)
# bi-directional language model using CBOW and negative sampling methods

In [2]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, LSTM, Dense, Dropout, Input, concatenate, Flatten, dot, ReLU, Lambda, Layer
from keras.models import Model, load_model
from keras import optimizers
from keras.utils.data_utils import get_file
from keras.utils import to_categorical
import keras.backend as K

import keras_tqdm
import numpy as np
import re
import os

Using TensorFlow backend.


In [3]:
VALIDATION_SPLIT = 0.2
MAX_SEQUENCE_LENGTH = 64  # following the MSCC setting for ukwac-100M corpus
MAX_NB_WORDS = None
MIN_WORD_FREQ = 3
NEGATIVE_NUM = 10
NEGATIVE_FAC = 0.25
MIN_SENT_LEN = 10
MAX_SENT_LEN = MAX_SEQUENCE_LENGTH

# data preparation 

## load txt file 

In [4]:
# TODO: make this part as a module / save the processed results for modeling

In [5]:
path = "../ukwac/ukwac_subset_1M.txt"
with open(path, encoding='utf-8', errors='ignore') as f:
    raw_text = f.read().splitlines()

In [6]:
# filtering out too short & long sentences
corpus = []
for sents in raw_text:
    sent_li = re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', sents)
    if(len(sent_li)>1):
        for sent in sent_li:
            sent_len = len(sent.split())            
            if (sent_len>=MIN_SENT_LEN) & (sent_len<=MAX_SENT_LEN):
                corpus.append(sent)

In [7]:
corpus[:5]

["Oldham 's NHS Stop Smoking Service , established in 2001 , is here to help smokers who wish to give up .",
 'The support the Stop Smoking Service offer is designed to build motivation to quit , preparation for the quit day , help to survive the first week and beyond , provide useful information on keeping weight off and overcoming cravings , identifying danger zones , and coping with a relapse .',
 "All the evidence shows that people who get support with their quit attempt are more likely to be successful than people who do n't , regardless of whether they use nicotine replacement or good-old fashioned will power .",
 'Stomach ulcers are made worse by smoking Smokers experience more asthma , and respiratory problems .',
 'It can affect eyesight , bone density , and the immune system Smokers have twice as much time off work due to illnesses .']

## tokenizer 

In [8]:
# tokenizer mod
## - <BOS> and <EOS> tokens; set freq as 0
## - minfreq: treat all low freq words as unknown words
## cf: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/49654

def tokenizer_mod(corpus, min_freq):
    tokenizer = Tokenizer(oov_token="<UNK>", lower=True)
    tokenizer.fit_on_texts(corpus)
    
    # low freq words to remove
    low_freq_words = [k for k, v in tokenizer.word_counts.items() if v < min_freq]
    for word in low_freq_words:
        del tokenizer.word_index[word]
        del tokenizer.word_docs[word]
        del tokenizer.word_counts[word]

    # additional word index
    tokenizer.word_index["<BOS>"] = len(tokenizer.word_index)
    tokenizer.word_index["<EOS>"] = tokenizer.word_index["<BOS>"]+1
    tokenizer.word_counts["<BOS>"] = 0
    tokenizer.word_counts["<EOS>"] = 0
    return(tokenizer)

- initialize the tokenizer

In [9]:
tokenizer = tokenizer_mod(corpus, MIN_WORD_FREQ)

## negative sampling 

In [10]:
TOTAL_NUM_WORDS = len(tokenizer.word_index)+1  # +1 for the padding token 0
TOTAL_NUM_WORDS  # ref. MSCC: 100K types, ukwac_full: 160k types

215925

In [11]:
# \alpha for negative sampling freq smoothing
word_sample_prob = {k:(v/TOTAL_NUM_WORDS)**NEGATIVE_FAC for k, v in tokenizer.word_counts.items()}
temp_prob_sum = np.array(list(word_sample_prob.values())).sum()
word_sample_prob = {k:v/temp_prob_sum for k, v in word_sample_prob.items()}

def get_negative_samples(targ_word_index, no_negatives, tokenizer, word_sample_prob):
    negative_samples = np.random.choice(list(word_sample_prob.keys()), 
                                        no_negatives, 
                                        replace=False, 
                                        p=list(word_sample_prob.values()))
    negative_incides = [tokenizer.word_index[w] for w in negative_samples]
    while targ_word_index in negative_incides:
        negative_samples = np.random.choice(list(word_sample_prob.keys()), no_negatives, replace=False, p=list(word_sample_prob.values()))
        negative_incides = [tokenizer.word_index[w] for w in negative_samples]
    return(np.array([negative_incides]))

## sentence -> context sequences and target words 

In [12]:
# optional multithreading
import threading

class threadsafe_iter:
    def __init__(self, it):
        self.it = it
        self.lock = threading.Lock()

    def __iter__(self):
        return self

    def __next__(self):
        with self.lock:
            return next(self.it)


def threadsafe_generator(f):
    def g(*a, **kw):
        return threadsafe_iter(f(*a, **kw))
    return g


# @threadsafe_generator # => uncomment to enable the multithreading option
def generate_batch(tokenizer, sentences, cntx_size, batch_size):
    # TODO: identify bottlenecks
    # output: context sequence, targets (positive and negatives), and labels for + and - targets
    # yield a batch for .fit_generator()
 
    cntx_l2r = []
    cntx_r2l = []
    targets = []
    targets_onehot = []
#     labels = []
    
    counter = 0
    # len(sentences) = batch size 
    ## each batch contains the list of CBOW datapoints that can created by the number of *batch_size* sentences
    while True:
        for sentence in sentences:
            token_list = tokenizer.texts_to_sequences([sentence])[0] # TODO:bottleneck?
            L = len(token_list)
#             rdm_idx = np.random.choice(range(L), size=L, replace=False) # randomize the order of yeilded output
            
            for targ_idx in range(L):
                x_l2r = [token_list[i] for i in range(0, targ_idx) if 0 <= i < L and i!=targ_idx]
                x_r2l = [token_list[i] for i in range(targ_idx, L) if 0 <= i < L and i!=targ_idx]                
                x_l2r = [[tokenizer.word_index["<BOS>"]] + x_l2r]
                x_r2l = [x_r2l + [tokenizer.word_index["<EOS>"]]]
                x_l2r_seq = pad_sequences(x_l2r, maxlen=cntx_size, padding='pre')[0]
                x_r2l_seq = pad_sequences(x_r2l, maxlen=cntx_size, padding='post')[0]
                cntx_l2r.append(x_l2r_seq)
                cntx_r2l.append(x_r2l_seq)

#                 targets.append([token_list[targ_idx]]) # for negative sampling setting
#                 labels.append([1])
                targets.append([token_list[targ_idx]])
                targets_onehot.append(to_categorical(token_list[targ_idx], num_classes=len(tokenizer.word_index)+1))
                
#                 negatives = get_negative_samples(targ_idx, NEGATIVE_NUM, tokenizer, word_sample_prob)[0]
#                 for negative in negatives:
#                     cntx_l2r.append(x_l2r_seq)  # for negative sampling setting
#                     cntx_r2l.append(x_r2l_seq)
#                     targets.append([negative])
#                     labels.append([0])

                counter += 1
                
                if(counter == batch_size):
                    # when the number of processed sentences reaches the batch num
#                     out = ([np.asarray(targets)[rdm_idx], np.asarray(cntx_l2r)[rdm_idx], np.asarray(cntx_r2l)[rdm_idx]], 
#                            [np.asarray(labels)[rdm_idx]])
                    out = ([np.asarray(targets), np.asarray(cntx_l2r), np.asarray(cntx_r2l)],
                           [np.asarray(targets_onehot)])
                    
                    yield(out)
                    
                    cntx_l2r = []
                    cntx_r2l = []
                    targets = []
                    targets_onehot = []
#                     labels = []
                    counter = 0

In [13]:
# example data list from a single sentence
next(generate_batch(tokenizer, corpus[:10], MAX_SEQUENCE_LENGTH, 2))

([array([[9416],
         [  23]]),
  array([[     0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0, 215922],
         [     0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,
 

# create model 

In [14]:
DIM = 600
H_DIM = int(DIM/2)
D_DIM = int(DIM*2)
O_DIM = DIM

idrop = 0.3
rdrop = 0.3
odrop = 0.3

alpha_lr = 1e-3

In [15]:
K.clear_session()

In [16]:
# inputs
# x_targ = Input((1,))
x_targ = Input((1,))
x_cntx_l2r = Input((MAX_SEQUENCE_LENGTH,))
x_cntx_r2l = Input((MAX_SEQUENCE_LENGTH,))

e = Embedding(TOTAL_NUM_WORDS, DIM, name='embedding_shared')
e_targ = e(x_targ)
e_cntx_l2r = e(x_cntx_l2r)
e_cntx_r2l = e(x_cntx_r2l)

In [17]:
# hidden LSTM layers
h_l2r = LSTM(H_DIM, go_backwards=False, dropout=idrop, recurrent_dropout=rdrop, return_sequences=False)(e_cntx_l2r) # h(<bos>) h(a) h(b)
h_r2l = LSTM(H_DIM, go_backwards=True,  dropout=idrop, recurrent_dropout=rdrop, return_sequences=False)(e_cntx_r2l) # h(d) h(e) h(<eos>)
h = concatenate([h_l2r, h_r2l], name='bilstm_concat')
# h = Flatten()(h)

In [18]:
# additional deep layers for the biLSTM layer
h = Dropout(odrop)(h)
h = Dense(D_DIM, activation='linear')(h)
h = ReLU()(h)
h = Dropout(odrop)(h)
h = Dense(O_DIM, activation='linear', name='bilstm_deepout')(h)

In [19]:
# https://stackoverflow.com/questions/47892380/how-can-i-use-tensorflows-sampled-softmax-loss-function-in-a-keras-modelq
# https://datascience.stackexchange.com/questions/28213/keras-negative-sampling-with-custom-layer
class SampledSoftmax(Layer):
    def __init__(self, num_sampled, num_classes, mode, **kwargs):
        self.num_sampled = num_sampled
        self.num_classes = num_classes
        self.mode = mode
        super(SampledSoftmax, self).__init__(**kwargs)

    def build(self, input_shape):
        dense_shape, classes_shape = input_shape
        self.kernel = self.add_weight(name='kernel',
                                      shape=(self.num_classes, dense_shape[1]),
                                      initializer='uniform',
                                      trainable=True)
        self.bias = self.add_weight(name='bias',
                                      shape=(self.num_classes,),
                                      initializer='uniform',
                                      trainable=True)  # Maybe zero

        super(SampledSoftmax, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, inputs_and_labels):
        inputs, labels = inputs_and_labels
        if self.mode == "train":
            loss = K.tf.nn.sampled_softmax_loss(
                weights=self.kernel,
                biases=self.bias,
#                 weights = inputs._keras_history[0].weights,
#                 bias = inputs._keras_history[0].bias,
                labels=labels,
                inputs=inputs,
                num_sampled=self.num_sampled,
                num_classes=self.num_classes,
                num_true=1)

        elif self.mode == "eval":
            logits = K.tf.matmul(inputs, tf.transpose(self.kernel))
            logits = K.tf.nn.bias_add(logits, self.bias)
            labels_one_hot = K.tf.one_hot(labels, self.num_classes)
            loss = K.tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=labels_one_hot,
                logits=logits)

        return loss

    def compute_output_shape(self, input_shape):
        dense_shape, classes_shape = input_shape
        return (dense_shape[0], )
    
    def get_config(self):
        config = super(SampledSoftmax, self).get_config()
        config['num_sampled'] = self.num_sampled
        config['num_classes'] = self.num_classes
        config['mode'] = self.mode
        return config
    
#     def get_config(self):
#         config = {'num_sampled':self.num_sampled, 'num_classes':self.num_classes, 'mode':self.mode}
#         base_config = super(SampledSoftmax, self).get_config()
#         return dict(list(base_config.items()) + list(config.items()))

In [20]:
# # negative sampling sigmoids
# targ_cntx = dot([e_targ, h], axes=-1, normalize=False)
# targ_cntx = Dense(1, activation='sigmoid', name='target_sigmoid')(targ_cntx)
# targ_cntx = Flatten()(targ_cntx)
# targ_cntx = Lambda(lambda x:K.sum(x, axis=2), name='target_loss')(targ_cntx)

# negative sampled softmax
# targ_cntx = Dense(NEGATIVE_NUM+1, activation='softmax')(h)
targ_cntx = SampledSoftmax(NEGATIVE_NUM, TOTAL_NUM_WORDS, mode='train', name='sampled_softmax')([h, x_targ])

Instructions for updating:
Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.


In [21]:
# model compile
# model = Model([x_targ, x_cntx_l2r, x_cntx_r2l], targ_cntx) 
model = Model([x_targ, x_cntx_l2r, x_cntx_r2l], [targ_cntx]) 
adam = optimizers.Adam(lr=alpha_lr)
# model.compile(loss='binary_crossentropy', optimizer=adam)
# model.compile(loss='categorical_crossentropy', optimizer=adam)
# model.compile(loss=custom_loss, optimizer=adam)
model.compile(loss=lambda y_true, loss:loss, optimizer=adam)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 64)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 64)           0                                            
__________________________________________________________________________________________________
embedding_shared (Embedding)    multiple             129555000   input_2[0][0]                    
                                                                 input_3[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 300)          1081200     embedding_shared[1][0]           
__________

In [22]:
from keras.utils.vis_utils import plot_model
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)

# fit

In [23]:
corpus_size = len(corpus)
# train_idx = len(corpus) - round(len(corpus)*VALIDATION_SPLIT)
train_idx = corpus_size
batch_size = 100
n_steps_train = (train_idx//batch_size)
n_steps_val = ((len(corpus)-train_idx)//batch_size)
num_iter = 3
print(corpus_size, train_idx, n_steps_train, n_steps_val, num_iter)

3415481 3415481 34154 0 3


In [24]:
model.fit_generator(generate_batch(tokenizer, corpus, MAX_SEQUENCE_LENGTH, batch_size), 
                    steps_per_epoch=n_steps_train, epochs=num_iter,
#                     validation_data=generate_batch(tokenizer, corpus[train_idx:], MAX_SEQUENCE_LENGTH, batch_size), 
#                     validation_steps=n_steps_val,
                    verbose=0, callbacks=[keras_tqdm.TQDMNotebookCallback(leave_inner=True, leave_outer=True)],
                    max_queue_size=16, 
                    workers=8, use_multiprocessing=True)

  num_elements)


HBox(children=(IntProgress(value=0, description='Training', max=3, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=34154, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 1', max=34154, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Epoch 2', max=34154, style=ProgressStyle(description_width='i…




<keras.callbacks.History at 0x7f2e48e71e80>

In [None]:
# https://jovianlin.io/saving-loading-keras-models/

In [70]:
model.save('cbow_bilstm_ukwac_100M_a_'+str(alpha_lr)+'_dim_'+str(DIM)+'_nfac_'+str(NEGATIVE_FAC)+ \
           '_drop_'+str(odrop)+'_epochs_'+str(num_iter)+'neg.h5')

with open('cbow_bilstm_ukwac_100M_a_'+str(alpha_lr)+'_dim_'+str(DIM)+'_nfac_'+str(NEGATIVE_FAC)+ \
           '_drop_'+str(odrop)+'_epochs_'+str(num_iter)+'neg.json', 'w') as f:
    f.write(model.to_json())

# example results 

In [26]:
# TODO: cannot load the custom later 
# model = load_model('cbow_bilstm_ukwac_1M_a_'+str(alpha_lr)+'_dim_'+str(DIM)+'_nfac_'+str(NEGATIVE_FAC)+ \
#                    '_drop_'+str(odrop)+'_epochs_'+str(num_iter)+'neg.h5', 
#                    custom_objects={'SampledSoftmax':SampledSoftmax(NEGATIVE_NUM, TOTAL_NUM_WORDS, 'eval')})

In [72]:
# from keras.models import model_from_json
# with open('cbow_bilstm_ukwac_100M_a_'+str(alpha_lr)+'_dim_'+str(DIM)+'_nfac_'+str(NEGATIVE_FAC)+ \
#            '_drop_'+str(odrop)+'_epochs_'+str(num_iter)+'neg.json', 'r') as f:
#     model = model_from_json(f.read())

In [27]:
model.load_weights('cbow_bilstm_ukwac_100M_a_'+str(alpha_lr)+'_dim_'+str(DIM)+'_nfac_'+str(NEGATIVE_FAC)+ \
                   '_drop_'+str(odrop)+'_epochs_'+str(num_iter)+'neg.h5')

In [28]:
model.input

[<tf.Tensor 'input_1:0' shape=(?, 1) dtype=float32>,
 <tf.Tensor 'input_2:0' shape=(?, 64) dtype=float32>,
 <tf.Tensor 'input_3:0' shape=(?, 64) dtype=float32>]

## comparing sentence vectors 

In [29]:
model_h1 = Model(model.input[1:], model.get_layer('bilstm_concat').output)
model_h2 = Model(model.input[1:], model.get_layer('bilstm_deepout').output)

In [30]:
sent_vec1_h1 = model_h1.predict([pad_sequences(tokenizer.texts_to_sequences(["<BOS> A bunch of kids broken in and"]), maxlen=MAX_SEQUENCE_LENGTH, padding='pre'), 
                                 pad_sequences(tokenizer.texts_to_sequences([" the paintings <EOS>"]), maxlen=MAX_SEQUENCE_LENGTH, padding='post')])

In [31]:
sent_vec1_h2 = model_h2.predict([pad_sequences(tokenizer.texts_to_sequences(["<BOS> A bunch of kids broken in and"]), maxlen=MAX_SEQUENCE_LENGTH, padding='pre'), 
                                 pad_sequences(tokenizer.texts_to_sequences([" the paintings <EOS>"]), maxlen=MAX_SEQUENCE_LENGTH, padding='post')])

In [32]:
sent_vec2_h1 = model_h1.predict([pad_sequences(tokenizer.texts_to_sequences(["<BOS> A bunch of kids broken in and"]), maxlen=MAX_SEQUENCE_LENGTH, padding='pre'), 
                                 pad_sequences(tokenizer.texts_to_sequences([" the books <EOS>"]), maxlen=MAX_SEQUENCE_LENGTH, padding='post')])

In [33]:
sent_vec2_h2 = model_h2.predict([pad_sequences(tokenizer.texts_to_sequences(["<BOS> A bunch of kids broken in and"]), maxlen=MAX_SEQUENCE_LENGTH, padding='pre'), 
                                 pad_sequences(tokenizer.texts_to_sequences([" the books <EOS>"]), maxlen=MAX_SEQUENCE_LENGTH, padding='post')])

In [34]:
sent_vec3_h1 = model_h1.predict([pad_sequences(tokenizer.texts_to_sequences(["<BOS> This is not a good"]), maxlen=MAX_SEQUENCE_LENGTH, padding='pre'), 
                                 pad_sequences(tokenizer.texts_to_sequences([" for her <EOS>"]), maxlen=MAX_SEQUENCE_LENGTH, padding='post')])

In [35]:
sent_vec3_h2 = model_h2.predict([pad_sequences(tokenizer.texts_to_sequences(["<BOS> This is not a good"]), maxlen=MAX_SEQUENCE_LENGTH, padding='pre'), 
                                 pad_sequences(tokenizer.texts_to_sequences([" for her <EOS>"]), maxlen=MAX_SEQUENCE_LENGTH, padding='post')])

In [36]:
sent_vec4_h1 = model_h1.predict([pad_sequences(tokenizer.texts_to_sequences(["<BOS> This "]), maxlen=MAX_SEQUENCE_LENGTH, padding='pre'), 
                                 pad_sequences(tokenizer.texts_to_sequences([" is due not just to mere luck <EOS>"]), maxlen=MAX_SEQUENCE_LENGTH, padding='post')])

In [37]:
sent_vec4_h2 = model_h2.predict([pad_sequences(tokenizer.texts_to_sequences(["<BOS> This "]), maxlen=MAX_SEQUENCE_LENGTH, padding='pre'), 
                                 pad_sequences(tokenizer.texts_to_sequences([" is due not just to mere luck <EOS>"]), maxlen=MAX_SEQUENCE_LENGTH, padding='post')])

In [38]:
sent_vec1_h1[0][:10]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.01980934,  0.        ,  0.        , -0.        , -0.10852313],
      dtype=float32)

In [39]:
sent_vec1_h2[0][:10]

array([ 0.03524098,  0.01960367,  0.04008206, -0.03422608, -0.02447513,
        0.08366579,  0.06117012, -0.03808985,  0.03539947,  0.06514335],
      dtype=float32)

In [40]:
sent_vec2_h1[0][:10]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.01980934,  0.        ,  0.        , -0.        , -0.10852313],
      dtype=float32)

In [41]:
sent_vec2_h2[0][:10]

array([ 0.02010383,  0.02865812,  0.03589398, -0.036089  , -0.03837456,
        0.07460199,  0.0405308 , -0.02475565,  0.04971936,  0.05991247],
      dtype=float32)

In [42]:
sent_vec3_h1[0][:10]

array([ 0.        , -0.31516954,  0.        ,  0.        , -0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
      dtype=float32)

In [43]:
sent_vec3_h2[0][:10]

array([-0.14028482, -0.04478314,  0.03191046, -0.10079529, -0.2970091 ,
        0.01752447,  0.00672428,  0.1021916 ,  0.08007528,  0.07870147],
      dtype=float32)

In [67]:
sent_vec4_h1[0][:10]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00494273],
      dtype=float32)

In [68]:
sent_vec4_h2[0][:10]

array([-5.5300161e-02,  5.7356510e-02, -5.4492977e-02, -9.4825868e-05,
       -1.3343853e-01,  5.6117371e-02,  4.1513979e-02, -3.0547578e-02,
        4.8666321e-02,  1.0632966e-02], dtype=float32)

In [46]:
from scipy.spatial.distance import cosine

In [47]:
1 - cosine(sent_vec1_h2, sent_vec2_h2)

0.9911181926727295

In [48]:
1 - cosine(sent_vec1_h2, sent_vec3_h2)

0.3632470667362213

In [49]:
1 - cosine(sent_vec2_h2, sent_vec3_h2)

0.37099379301071167

In [50]:
1 - cosine(sent_vec3_h2, sent_vec4_h2)

0.561137318611145

## inferred fillers 

In [51]:
vectors_word = model.get_layer('embedding_shared').get_weights()[0]

In [52]:
1-cosine(vectors_word[tokenizer.word_index["she"]], vectors_word[tokenizer.word_index["her"]])

0.20017202198505402

In [53]:
1-cosine(vectors_word[tokenizer.word_index["she"]], vectors_word[tokenizer.word_index["box"]])

0.09070437401533127

In [54]:
1-cosine(vectors_word[tokenizer.word_index["school"]], vectors_word[tokenizer.word_index["book"]])

0.15406030416488647

In [55]:
1-cosine(vectors_word[tokenizer.word_index["school"]], vectors_word[tokenizer.word_index["tiger"]])

0.036152709275484085

In [64]:
# # def sim_mult(sent):
sent_vec = sent_vec3_h2[0]
sent_vec = sent_vec / np.sqrt((sent_vec*sent_vec).sum())

# targ_sim = vectors_word.dot(vectors_word[tokenizer.word_index["<UNK>"]])
# targ_sim[targ_sim<0] = 0.0
# cntx_sim = vectors_word.dot(sent_vec)
# cntx_sim[cntx_sim<0] = 0.0
# mult_sim = targ_sim * cntx_sim

mult_sim = (vectors_word.dot(sent_vec)+1.0)/2

(-mult_sim).argsort()


array([1985,  455, 1297, ..., 1824, 1722, 1180])

In [65]:
count = 0
for i in (-mult_sim).argsort():
    if np.isnan(mult_sim[i]):
        continue
    if (i != 0):
        print('{0}:{1}'.format(tokenizer.index_word[i], mult_sim[i]))
        count += 1
    if count == 10:
        break

failed:0.9797227382659912
st:0.884330153465271
policies:0.8756725788116455
sport:0.8592591881752014
problems:0.8470306992530823
although:0.8468138575553894
though:0.8395321369171143
mean:0.8349412679672241
deals:0.82281494140625
suggests:0.8119405508041382
