In [54]:
import numpy as np
import six
import sys
import os
import traceback
import re
import pickle
from copy import deepcopy

from chainer import cuda
from context2vec.common.context_models import Toks
from context2vec.common.model_reader import ModelReader
import sklearn
import pandas as pd
import logging
from scipy.stats import spearmanr
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim
import math
import collections



[nltk_data] Downloading package stopwords to /home/ql261/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
def produce_top_n_simwords(w_filter,context_embed,n_result,index2word,debug=False):
    """Return the n_result vocabulary entries most similar to a context vector.

    The context vector is L2-normalised here; the rows of ``w_filter`` are
    assumed (not checked) to be normalised already. Similarity is the dot
    product shifted and halved, ``(dot + 1) / 2``. Entries whose similarity
    is NaN are skipped. Relies on the module-level array backend ``xp``.

    Returns a tuple ``(top_vec, similarity_scores, top_words)``: the selected
    rows of ``w_filter``, their similarities as a numpy array, and the
    corresponding words looked up in ``index2word``, all in descending order
    of similarity.
    """
    unit_context = context_embed / xp.sqrt((context_embed * context_embed).sum())
    print('producing top {0} simwords'.format(n_result))
    similarity = (w_filter.dot(unit_context) + 1.0) / 2

    # Walk the vocabulary from most to least similar and collect row indices.
    picked = []
    for idx in (-similarity).argsort():
        if xp.isnan(similarity[idx]):
            continue
        if debug == True:
            print('{0}: {1}'.format(str(index2word[idx]), str(similarity[idx])))
        picked.append(idx)
        if len(picked) == n_result:
            break

    top_words = [index2word[idx] for idx in picked]
    similarity_scores = [similarity[idx] for idx in picked]
    top_vec = w_filter[picked, :]

    return top_vec, np.array(similarity_scores), top_words
    
def top_mutual_sim(top_vec,similarity_scores):
    """Score a context by how similar its top substitutes are to each other.

    Parameters
    ----------
    top_vec : 2-D array, one row per substitute vector.
    similarity_scores : sequence of substitute-to-context similarities, in
        the same order as the rows of ``top_vec``; the first entry is used
        as the maximum score.

    Returns
    -------
    The sum of all pairwise dot products between the row-normalised
    substitute vectors, each weighted by the mean of the two substitutes'
    similarity scores; the weights are normalised to sum to the first score.

    The input array is left untouched: normalisation is done out of place
    (an in-place ``/=`` would overwrite the caller's data and raises a
    casting error for integer arrays).
    """
    top_vec = np.asarray(top_vec)

    # normalise each row to unit length; all-zero rows are left as they are
    norms = np.sqrt((top_vec * top_vec).sum(1))
    norms[norms == 0.] = 1.
    unit_vecs = top_vec / norms.reshape((norms.shape[0], 1))

    # pairwise weight = mean of the two substitutes' similarity to the context
    similarity_scores = np.asarray(similarity_scores)
    max_score = similarity_scores[0]
    sim_weights = (similarity_scores + similarity_scores.reshape(len(similarity_scores), 1)) / 2.0
    # weights sum to max_score: a higher top score means the context is more
    # certain about its substitutes
    sim_weights = (sim_weights / float(sim_weights.sum())) * max_score

    # pairwise dot products weighted by substitute probability
    inf_score = (unit_vecs.dot(unit_vecs.T) * sim_weights).sum()
    return inf_score

def top_cluster_density(top_vec,similarity_scores):
    """Score a context by how tightly its top substitutes cluster.

    Rows of ``top_vec`` are normalised to unit length (all-zero rows are
    left unchanged), a centroid is formed as the sum of the rows weighted by
    ``similarity_scores`` normalised to sum to one, and the result is the
    mean dot product between the rows and that centroid, multiplied by the
    first similarity score.
    """
    row_norms = np.sqrt((top_vec * top_vec).sum(1))
    row_norms[row_norms == 0.] = 1.
    unit_vecs = top_vec / row_norms.reshape((row_norms.shape[0], 1))

    # the first score is taken as the maximum
    best_score = similarity_scores[0]
    n_subs = len(similarity_scores)
    score_total = sum(similarity_scores)
    column_weights = np.array(similarity_scores).reshape(n_subs, 1) / score_total

    # similarity-weighted centroid of the substitutes
    centroid_vector = sum(unit_vecs * column_weights)

    # mean projection on the centroid, scaled by the best score
    projections = unit_vecs.dot(centroid_vector)
    return sum(projections) / len(unit_vecs) * best_score

In [19]:
def load_w2salience(w2salience_f,weight_type):
    """Read per-word salience weights from a tab-separated file.

    Each non-blank line must hold exactly three tab-separated fields:
    word, word count, sentence count. With ``INVERSE_W_FREQ`` the weight is
    ``1 / word_count``; with ``INVERSE_S_FREQ`` it is
    ``log(1 + 84755431 / sentence_count)``. Any other ``weight_type`` yields
    an empty dictionary.
    """
    salience = {}
    with open(w2salience_f) as handle:
        for raw_line in handle:
            record = raw_line.strip()
            if not record:
                continue
            w, w_count, s_count = record.split('\t')
            if weight_type == INVERSE_W_FREQ:
                salience[w] = 1/float(w_count)
            elif weight_type == INVERSE_S_FREQ:
                # 84755431 is hard-coded; presumably the corpus sentence total -- TODO confirm
                salience[w] = math.log(1+84755431/float(s_count))
    return salience

def skipgram_context(model,words,pos,weight=None,w2entropy=None):
    """Sum the vectors of the words surrounding position ``pos``.

    With ``weight`` equal to ``LDA`` each word is weighted by
    ``1 / (w2entropy[word] + 1)``; with one of the inverse-frequency modes it
    is weighted by ``w2entropy[word]`` directly. In both cases only words
    present in both ``w2entropy`` and ``model`` contribute. For any other
    ``weight`` every word found in ``model`` gets weight 1.0 and missing
    words are reported and skipped.

    Returns ``(sum_of_weights, weighted_sum_of_vectors)``; the vector is not
    normalised here.
    """
    context_wvs = []
    weights = []
    for position, token in enumerate(words):
        if position == pos:
            # the target slot itself is not part of its context
            continue
        try:
            if weight == LDA or weight in [INVERSE_W_FREQ, INVERSE_S_FREQ]:
                if word_missing(token, w2entropy, model) if False else (token not in w2entropy or token not in model):
                    continue
                print (token,w2entropy[token])
                if weight == LDA:
                    weights.append(1/(w2entropy[token]+1.0))
                else:
                    weights.append(w2entropy[token])
                context_wvs.append(model[token])
            else:
                # equal weight per word; the lookup comes first so that a
                # missing word adds neither a vector nor a weight
                vector = model[token]
                context_wvs.append(vector)
                weights.append(1.0)
        except KeyError as e:
            print ('==warning==: key error in context {0}'.format(e))
    print ('per word weights',weights)
    weight_column = np.array(weights).reshape(len(weights),1)
    context_embed = sum(np.array(context_wvs)*weight_column)
    return sum(weights),context_embed #  will be normalized later

def lg_model_out_w2v(top_words,w_target,word2index_target):
    """Look up substitute words in a second embedding matrix.

    For every word of ``top_words`` found in ``word2index_target`` the
    matching row of ``w_target`` is collected. Words that raise ``KeyError``
    are printed and skipped.

    Returns ``(vectors, kept_positions)``: the collected rows as a numpy
    array and the positions in ``top_words`` they came from.
    """
    top_vec = []
    index_list = []
    for position, word in enumerate(top_words):
        try:
            row = w_target[word2index_target[word]]
        except KeyError as e:
            print (e)
            continue
        top_vec.append(row)
        print ('target word substitute',row[:10])
        index_list.append(position)
    return np.array(top_vec),index_list
    
def context_inform(test_s,test_w, model,model_type,n_result,w_filter,index2word,weight,w2entropy=None,w_target=None,word2index_target=None,index2word_target=None):
    """Build the representation of one context sentence and its weight.

    ``test_s`` is the sentence and ``test_w`` the target token inside it.
    ``model_type`` selects how the context vector is produced:

    * ``'context2vec'``: ``model.context2vec(words, pos)``.
    * ``'skipgram'``: weighted sum of context word vectors
      (``skipgram_context``); the score is the sum of the word weights.
    * ``'context2vec-skipgram'``: the top ``n_result`` context2vec
      substitutes are looked up in ``w_target`` and combined, weighted by
      their similarity scores.

    ``weight`` selects the per-sentence score (default 1.0).

    Returns ``(score, context_embed_out)``.

    Raises ``ValueError`` (from ``list.index``) when ``test_w`` is not a
    token of the sentence, and ``ValueError`` when ``weight`` is
    ``SUBSTITUTE_PROB`` but the chosen model type produced no substitute
    scores. Exits the process on an unknown ``model_type``.
    """
    #pad the target so that split() isolates it as a token of its own
    test_s=test_s.replace(test_w, ' '+test_w+' ')
    print(test_s)
    words=test_s.split()
    pos=words.index(test_w)

    score=1.0 #default score
    sim_scores=None #substitute scores; only the context2vec-skipgram branch fills this in

    # Decide on the model
    if model_type=='context2vec':
        context_embed= model.context2vec(words, pos)
        context_embed_out=context_embed

    elif model_type=='skipgram':
        score,context_embed=skipgram_context(model,words,pos,weight,w2entropy)
        context_embed_out=context_embed

    elif model_type=='context2vec-skipgram':
        # context2vec substitutes in skipgram space
        context_embed= model.context2vec(words, pos)
        top_vec,sim_scores,top_words=produce_top_n_simwords(w_filter,context_embed,n_result,index2word)
        top_vec,index_list=lg_model_out_w2v(top_words,w_target,word2index_target)
        #keep only the scores of substitutes that exist in the target space
        sim_scores=sim_scores[index_list]
        if weight==SUBSTITUTE_PROB:
            #raw scores: the caller divides by the summed scores later
            context_embed_out=sum(top_vec*sim_scores.reshape(len(sim_scores),1))
        else:
            context_embed_out=sum(top_vec*((sim_scores/sum(sim_scores)).reshape(len(sim_scores),1)))
    else:
        print ('model type {0} not recognized'.format(model_type))
        sys.exit(1)

    print('context_embed original', context_embed[:10])
    print ('context_embed_out',context_embed_out[:10])
    #decide on weight per sentence
    print ('weight mode',weight)
    if weight==TOP_MUTUAL_SIM:
        top_vec,sim_scores,top_words=produce_top_n_simwords(w_filter,context_embed,n_result,index2word)
        score=top_mutual_sim(top_vec,sim_scores)
    elif weight==TOP_CLUSTER_DENSITY:
        top_vec,sim_scores,top_words=produce_top_n_simwords(w_filter,context_embed,n_result,index2word)
        score=top_cluster_density(top_vec,sim_scores)
    elif weight==SUBSTITUTE_PROB:
        if sim_scores is None:
            #previously this path died with an UnboundLocalError on sim_scores
            raise ValueError('weight mode {0} needs substitute scores, which model type {1} does not produce'.format(weight,model_type))
        score=sum(sim_scores)
        print ('substitute prob score',score)
    elif weight=='learned':
        print ('learned not implemented')
    elif weight=='gaussian':
        print ('gaussian not implemented')
    elif weight ==False or weight in [LDA,INVERSE_S_FREQ,INVERSE_W_FREQ]:
        #keep the score already computed above
        pass
    else:
        print ('weight mode {0} not recognized'.format(weight))
    return score,context_embed_out

def additive_model(test_ss,test_w, model_type,model,n_result,w_filter,index2word,weight=False,w2entropy=None,w_target=None,word2index_target=None,index2word_target=None,f_w=None):
    """Combine several context sentences into one representation.

    ``test_ss`` holds the sentences separated by ``'@@'``. Each sentence is
    passed to ``context_inform``; the per-sentence vectors are then merged:

    * for ``'skipgram'`` models or ``SUBSTITUTE_PROB`` weighting, the
      vectors are summed and divided by the summed scores;
    * otherwise the scores are normalised to sum to one and used as
      weights; when ``f_w`` is given the normalised weights are written to
      it as one comma-separated line.

    Returns the combined vector.
    """
    context_out = []
    context_weights = []
    for sentence in test_ss.split('@@'):
        #produce context representation with scores
        score, context_embed = context_inform(
            sentence.strip(), test_w, model, model_type, n_result, w_filter,
            index2word, weight, w2entropy, w_target, word2index_target,
            index2word_target)
        print ('weight is {0}'.format(score))
        print ('context_embed is ', context_embed[:10])
        context_out.append(context_embed)
        context_weights.append(score)

    print ('context_weights',context_weights)
    context_out = np.array(context_out)

    if model_type=='skipgram' or weight==SUBSTITUTE_PROB:
        # weighted sum of all context words over all contexts
        context_avg = sum(context_out)/sum(context_weights)
    else:
        weight_total = float(sum(context_weights))
        norm_weights = np.array(context_weights).reshape(len(context_weights),1)/weight_total
        if f_w!=None:
            f_w.write(','.join(str(entry[0]) for entry in norm_weights)+'\n')
        print ('normalized weight: \n  {0}'.format(norm_weights))
        # weighted sum of the per-context vectors
        context_avg = sum(norm_weights*context_out)

    # neighbour lookup for the combined embedding; the result is not used
    print('producing top {0} words for new embedding'.format(n_result))
    if index2word_target!=None:
        # target-space neighbours for context2vec-skipgram
        print (w_target.shape)
        top_vec,scores,top_words = produce_top_n_simwords(w_target,context_avg,n_result,index2word_target)
    else:
        top_vec,scores,top_words = produce_top_n_simwords(w_filter,context_avg,n_result,index2word)

    return context_avg




In [27]:
def filter_w(w,word2index,index2word):
    """Drop English stop words from an embedding matrix and its vocabulary.

    Parameters
    ----------
    w : 2-D array indexed by the values of ``word2index``.
    word2index : mapping word -> row index in ``w``.
    index2word : accepted for call-site compatibility; not read here.

    Returns
    -------
    ``(w_filter, word2index_filter, index2word_filter)``: the kept rows of
    ``w`` (in the iteration order of ``word2index``) and fresh, contiguous
    word<->index mappings for them.

    Only stop words are removed; tokens without letters are kept.
    """
    # A set gives constant-time membership; scanning a list for every
    # vocabulary word was needlessly slow on large vocabularies.
    stopw = set(word.encode('utf-8') for word in stopwords.words('english'))
    index2word_filter = {}
    word2index_filter = {}
    index_filter2index = []
    for word in word2index:
        if word in stopw:
            continue
        new_index = len(index_filter2index)
        index_filter2index.append(word2index[word])
        word2index_filter[word] = new_index
        index2word_filter[new_index] = word
    w_filter = w[index_filter2index,:]
    return w_filter,word2index_filter,index2word_filter

def rm_stopw_context(model):
    """Return a plain ``{word: vector}`` dict of ``model.wv`` without English stop words.

    Every word in ``model.wv.vocab`` that is not an NLTK English stop word
    is mapped to its vector ``model.wv[word]``.
    """
    # set membership instead of a list scan for every vocabulary word
    stopw = set(word.encode('utf-8') for word in stopwords.words('english'))
    return {word: model.wv[word] for word in model.wv.vocab if word not in stopw}




In [72]:
def preprocess_nonce(sent):
    """Expand a sentence with several ``'___ '`` slots into single-slot variants.

    For every occurrence of ``'___ '`` a copy of the sentence is produced in
    which that occurrence is kept and all others are removed. The copies are
    joined with ``' @@ '``. A sentence without any slot yields ``''``.
    """
    spans = [found.span() for found in re.finditer('___ ', sent)]
    variants = []
    for keep in range(len(spans)):
        pieces = []
        cursor = 0
        for position, (start, end) in enumerate(spans):
            if position == keep:
                continue
            # copy the text up to the slot being dropped, then jump past it
            pieces.append(sent[cursor:start])
            cursor = end
        pieces.append(sent[cursor:])
        variants.append(''.join(pieces))
    return ' @@ '.join(variants)

def eval_nonce(nonce_data_f,context_model,model_w2v,model_type,n_result,w,index2word,word2index,weight=False,w2entropy=None,w_target=None,word2index_target=None,index2word_target=None):
    """Evaluate context representations on a nonce dataset by rank and MRR.

    ``nonce_data_f`` is a tab-separated file without header (lines starting
    with ``#`` are ignored) whose first column is the nonce word and second
    column its sentence. Rows whose nonce is not in ``model_w2v`` are
    skipped. For the others a context vector is built with
    ``additive_model`` (for ``'context2vec-skipgram?skipgram'`` the two
    sub-model vectors are averaged, and every model argument is expected to
    be a pair), all words of ``model_w2v`` are ranked by similarity to it,
    and the rank of the nonce is recorded.

    Prints the running reciprocal-rank sum per row, the final MRR and the
    median rank. Returns the list of ranks.
    """
    ranks = []
    mrr = 0.0
    data=pd.read_csv(nonce_data_f,delimiter='\t',header=None,comment='#')
    c = 0
    for index, row in data.iterrows():
        sents=preprocess_nonce(row[1])
        nonce=row[0]
        if nonce not in model_w2v:
            print ('{0} not known'.format(nonce))
            continue
        #compute context representation
        if model_type=='context2vec-skipgram?skipgram':
            sub_types=model_type.split('?')
            #context2vec substitutes in skipgram space
            context_avg_1=additive_model(sents.lower(),'___', sub_types[0],context_model[0],n_result,w[0],index2word[0],weight[0],w2entropy[0],w_target[0],word2index_target[0],index2word_target[0])
            print ('context2vec avg embed',context_avg_1[:10])
            #skipgram context words
            context_avg_2=additive_model(sents.lower(),'___', sub_types[1],context_model[1],n_result,w[1],index2word[1],weight[1],w2entropy[1],w_target[1],word2index_target[1],index2word_target[1])
            print ('skipgram avg embed', context_avg_2[:10])
            context_avg=(context_avg_1+context_avg_2)/2
            print ('context2vec avg skipgram', context_avg[:10])
        else:
            context_avg=additive_model(sents.lower(),'___', model_type,context_model,n_result,w,index2word,weight,w2entropy,w_target,word2index_target,index2word_target)
            print ('context avg out', context_avg[:10])

        print ('vector norm: {0}'.format(np.linalg.norm(context_avg)))
        # MRR Rank calculation: rank every vocabulary word against the context vector
        nns=model_w2v.similar_by_vector(context_avg,topn=len(model_w2v.wv.vocab))

        rr = 0
        for n,nn in enumerate(nns,1):
            if nn[0] == nonce:
                print (nn[0])
                rr = n
                ranks.append(rr)
                #the nonce occurs once in the ranking; no need to scan the rest
                break

        if rr != 0:
            mrr+=float(1)/float(rr)
        #same output as the Python 2 statement `print rr,mrr`, but valid syntax on Python 3 too
        print ('{0} {1}'.format(rr,mrr))
        c+=1

    #guard against a dataset in which no nonce could be evaluated
    mean_rr=float(mrr)/float(c) if c else 0.0
    print ("Final MRR: ",mrr,c,mean_rr)

    median_rank=np.median(ranks) if ranks else float('nan')
    print ('mediam : {0}'.format(median_rank))
    return ranks
            
#             #cosine similarity with probe embedding
#             for gold,probe in zip(row[3].split(','),row[2].split(',')):
#                 try:
#                     if word2index_target_out==None:
#                         probe_w_vec=xp.array(w_out[word2index_out[probe]])
#                     else:
#                         probe_w_vec=xp.array(w_target_out[word2index_target_out[probe]])
#                     probe_w_vec=probe_w_vec/xp.sqrt((probe_w_vec*probe_w_vec).sum())
#                     cos=probe_w_vec.dot(context_avg)
#                     if xp.isnan(cos):
#                         continue
#                     else:
#                         model_predict.append(cos)
#                         golds.append(gold)
#                         probes.append(probe)
#                 except KeyError as e:
#                     print ("====warning key error for probe=====: {0}".format(e))
#             print ('probes',probes)
#             print ('gold',golds)
#             print ('model_predict',model_predict)
#             sp=spearmanr(golds,model_predict)[0]
#             print ('spearman correlation is {0}'.format(sp))
#             if not math.isnan(sp):
#                 spearmans.append(sp)
#         print ("AVERAGE RHO:",float(sum(spearmans))/float(len(spearmans)))
        


def eval_chimera(chimeras_data_f,context_model,model_type,n_result,w,index2word,word2index,weight=False,w2entropy=None,w_target=None,word2index_target=None,index2word_target=None):
    """Evaluate context representations on a chimera dataset with Spearman's rho.

    ``chimeras_data_f`` is a tab-separated file without header; column 1
    holds the context sentences, column 2 the comma-separated probe words
    and column 3 the comma-separated gold ratings. For every row the context
    vector built by ``additive_model`` is compared (cosine) with each probe
    embedding and the similarities are correlated with the gold ratings.
    Probes missing from the vocabulary are reported and skipped.

    The normalised per-context weights are written to a file
    ``weights_<n>_<model_type>_<weight>`` next to the dataset, where ``<n>``
    is the second character of the second dot-separated part of the file
    name. Prints the average rho over rows whose correlation is not NaN.
    Relies on the module-level array backend ``xp``.
    """
    chimeras_data_dir=os.path.dirname(chimeras_data_f)
    num_sent=os.path.basename(chimeras_data_f).split('.')[1][1]
    print (chimeras_data_dir)
    print (num_sent)
    #os.path.join keeps a bare file name relative instead of resolving to '/weights_...'
    weights_path=os.path.join(chimeras_data_dir,'weights_{0}_{1}_{2}'.format(num_sent,model_type,str(weight)))
    with open(weights_path,'w') as f_w:
        spearmans=[]
        data=pd.read_csv(chimeras_data_f,delimiter='\t',header=None)

        for index, row in data.iterrows():
            golds=[]
            model_predict=[]
            probes=[]
            #compute context representation
            if model_type=='context2vec-skipgram?skipgram':
                sub_types=model_type.split('?')
                #context2vec substitutes in skipgram space
                context_avg_1=additive_model(row[1].lower(),'___', sub_types[0],context_model[0],n_result,w[0],index2word[0],weight[0],w2entropy[0],w_target[0],word2index_target[0],index2word_target[0],f_w)
                print ('context2vec avg embed',context_avg_1[:10])
                #skipgram context words
                context_avg_2=additive_model(row[1].lower(),'___', sub_types[1],context_model[1],n_result,w[1],index2word[1],weight[1],w2entropy[1],w_target[1],word2index_target[1],index2word_target[1],f_w)
                print ('skipgram avg embed', context_avg_2[:10])
                context_avg=(context_avg_1+context_avg_2)/2
                print ('context2vec avg skipgram', context_avg[:10])
                #compute probe embeddings in skipgram space
                w_out=w[1]
                w_target_out=w_target[1]
                word2index_out=word2index[1]
                word2index_target_out=word2index_target[1]
            else:
                #f_w goes last: passing it first shifted every positional argument by one
                context_avg=additive_model(row[1].lower(),'___', model_type,context_model,n_result,w,index2word,weight,w2entropy,w_target,word2index_target,index2word_target,f_w)
                print ('context avg out', context_avg[:10])
                w_out=w
                w_target_out=w_target
                word2index_out=word2index
                word2index_target_out=word2index_target

            context_avg = context_avg / xp.sqrt((context_avg * context_avg).sum())

            #cosine similarity with probe embedding
            for gold,probe in zip(row[3].split(','),row[2].split(',')):
                try:
                    if word2index_target_out is None:
                        probe_w_vec=xp.array(w_out[word2index_out[probe]])
                    else:
                        probe_w_vec=xp.array(w_target_out[word2index_target_out[probe]])
                    probe_w_vec=probe_w_vec/xp.sqrt((probe_w_vec*probe_w_vec).sum())
                    cos=probe_w_vec.dot(context_avg)
                    if xp.isnan(cos):
                        continue
                    model_predict.append(cos)
                    #ratings are parsed to numbers so that they are ranked numerically, not as text
                    golds.append(float(gold))
                    probes.append(probe)
                except KeyError as e:
                    print ("====warning key error for probe=====: {0}".format(e))
            print ('probes',probes)
            print ('gold',golds)
            print ('model_predict',model_predict)
            sp=spearmanr(golds,model_predict)[0]
            print ('spearman correlation is {0}'.format(sp))
            if not math.isnan(sp):
                spearmans.append(sp)
        #guard against a run in which no row produced a usable correlation
        average_rho=float(sum(spearmans))/float(len(spearmans)) if spearmans else float('nan')
        print ("AVERAGE RHO:",average_rho)

In [62]:
# Identifiers of the weighting schemes understood by the functions in this notebook.
TOP_MUTUAL_SIM = 'top_mutual_sim'
TOP_CLUSTER_DENSITY = 'top_cluster_density'
LDA = 'lda'
INVERSE_S_FREQ = 'inverse_s_freq'
INVERSE_W_FREQ = 'inverse_w_q'
SUBSTITUTE_PROB = 'substitute_prob'

# Numeric command-line code -> weighting scheme; code 0 (False) means no weighting.
WEIGHT_DICT = dict(enumerate([
    False,
    TOP_MUTUAL_SIM,
    LDA,
    INVERSE_S_FREQ,
    INVERSE_W_FREQ,
    TOP_CLUSTER_DENSITY,
    SUBSTITUTE_PROB,
]))


if __name__=="__main__":
    # Script part: read the parameters, load the model(s) for the requested
    # model type, strip stop words from the vocabularies and load the
    # optional per-word salience weights. The evaluation itself is started
    # by a later cell from the globals defined here (data, model, model_type,
    # n_result, w, index2word, word2index, weight, w2salience, w_target,
    # word2index_target, index2word_target and, where loaded, model_w2v).

    #params read in
    if sys.argv[0]=='/usr/local/lib/python2.7/dist-packages/ipykernel_launcher.py':
        # Started by the notebook kernel: parameters are hard-coded below.
        # Alternative settings are kept as comments for quick switching.

#         data='./eval_data/data-chimeras/dataset.l2.fixed.test.txt.punct'
        data='./eval_data/data-nonces/n2v.definitional.dataset.test.txt'
        weight=WEIGHT_DICT[0]
        # Assigned explicitly: without it a fresh kernel failed with a
        # NameError as soon as w2salience_f was read further down.
        w2salience_f=None

####context2vec
#         model_param_file='../models/context2vec/model_dir/context2vec.ukwac.model.params'
#         model_param_file='../models/context2vec/model_dir/MODEL-wiki.params.14'
#         model_type='context2vec'

####skipgram
        model_param_file='../models/wiki_all.model/wiki_all.sent.split.model'
        model_type='skipgram'
#         weight='inverse_w_freq'
#         w2salience_f='../corpora/corpora/wiki.all.utf8.sent.split.tokenized.vocab'
#         w2salience_f='../models/lda/w2entropy'
        n_result=20

####context2vec-skipgram?skipgram
#         model_param_file='../models/context2vec/model_dir/MODEL-wiki.params.14?../models/wiki_all.model/wiki_all.sent.split.model'
#         model_param_file='../models/context2vec/model_dir/context2vec.ukwac.model.params?../models/wiki_all.model/wiki_all.sent.split.model'
#         model_type='context2vec-skipgram?skipgram'
#         n_result=20
#         w2salience_f=None

    else:
        if len(sys.argv) < 5:
            # sys.stderr.write instead of the Python-2-only `print >>` form
            sys.stderr.write("Usage: {0} <model_param_file> <model_type: context2vec; context2vec-skipgram (context2vec substitutes in skipgram space); context2vec-skipgram?skipgram (context2vec substitutes in skipgram space plus skipgram context words)> <weight:{1}> <eval_data> <w2salience>"  .format (sys.argv[0],WEIGHT_DICT.items())+'\n')
            sys.exit(1)

        model_param_file = sys.argv[1]
        model_type=sys.argv[2]

        # third argument is either "<weight code>" or "<weight code>-<n_result>"
        if '-' in sys.argv[3]:
            weight,n_result=sys.argv[3].split('-')
            weight=WEIGHT_DICT[int(weight)]
            n_result=int(n_result)
        else:
            weight=WEIGHT_DICT[int(sys.argv[3])]
            n_result=20 #default is 20 top

        data =sys.argv[4]

        if len(sys.argv)>5:
            # was `argv[5]`, a NameError: only `sys` is imported
            w2salience_f=sys.argv[5]
        else:
            w2salience_f=None

    #gpu setup
    gpu = -1 # todo: make this work with gpu

    if gpu >= 0:
        cuda.check_cuda_available()
        cuda.get_device(gpu).use()
    xp = cuda.cupy if gpu >= 0 else np

    # logging
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)


    #choose model type
    print ('read model....')
    if model_type=='context2vec':
        #read in model
        model_reader = ModelReader(model_param_file)
        w = model_reader.w
        index2word = model_reader.index2word
        word2index=model_reader.word2index
        model = model_reader.model
        w_target=None
        word2index_target=None
        index2word_target=None

    elif model_type=='skipgram':
        model_w2v = gensim.models.Word2Vec.load(model_param_file)
        # work on a copy so that the vectors inside the gensim model stay untouched
        w=deepcopy(model_w2v.wv.vectors)
        #vector normalize for target w embedding, consistent with context2vec w and convenient for cosine computation among substitutes
        s = np.sqrt((w * w).sum(1))
        s[s==0.] = 1.
        w /= s.reshape((s.shape[0], 1))

        index2word=model_w2v.wv.index2word
        word2index={key: model_w2v.wv.vocab[key].index for key in model_w2v.wv.vocab}
        w_target=None
        word2index_target=None
        index2word_target=None

        print ('filter words for context....')

        model=rm_stopw_context(model_w2v)

    elif model_type=='context2vec-skipgram':
        # the parameter holds two paths: <context2vec params>?<word2vec model>
        model_param_context,model_param_w2v=model_param_file.split('?')
        model_reader = ModelReader(model_param_context)
        w = model_reader.w
        index2word = model_reader.index2word
        word2index=model_reader.word2index
        model = model_reader.model

        model_w2v = gensim.models.Word2Vec.load(model_param_w2v)
        w_target=model_w2v.wv.vectors
        index2word_target=model_w2v.wv.index2word
        word2index_target={key: model_w2v.wv.vocab[key].index for key in model_w2v.wv.vocab}

    elif model_type=='context2vec-skipgram?skipgram':
        model_param_context,model_param_w2v=model_param_file.split('?')
        #context2vec-skipgram
        model_reader = ModelReader(model_param_context)
        w = model_reader.w
        index2word = model_reader.index2word
        word2index =model_reader.word2index
        model = model_reader.model

        model_w2v = gensim.models.Word2Vec.load(model_param_w2v)
        w_target=model_w2v.wv.vectors
        index2word_target=model_w2v.wv.index2word
        word2index_target={key: model_w2v.wv.vocab[key].index for key in model_w2v.wv.vocab}

        # skipgram
        model_skipgram = model_w2v
        w_skipgram=deepcopy(model_skipgram.wv.vectors)
        #vector normalize for probe w embedding
        s = np.sqrt((w_skipgram * w_skipgram).sum(1))
        s[s==0.] = 1.
        w_skipgram /= s.reshape((s.shape[0], 1))

        index2word_skipgram=model_skipgram.wv.index2word
        word2index_skipgram={key: model_skipgram.wv.vocab[key].index for key in model_skipgram.wv.vocab}
        w_target_skipgram=None
        word2index_target_skipgram=None
        index2word_target_skipgram=None

        print ('filter words for context....')

        model_skipgram=rm_stopw_context(model_skipgram)

    else:
        # without this branch an unknown type only surfaced later as a NameError on `w`
        sys.stderr.write('model type {0} not recognized\n'.format(model_type))
        sys.exit(1)


    #remove stop words in target word space
    print ('filter words for target....')
    w,word2index,index2word=filter_w(w,word2index,index2word)
    if index2word_target is not None:
        w_target,word2index_target,index2word_target=filter_w(w_target,word2index_target,index2word_target)
    if model_type=='context2vec-skipgram?skipgram':
        w_skipgram,word2index_skipgram,index2word_skipgram=filter_w(w_skipgram,word2index_skipgram,index2word_skipgram)

    #per word weight
    w2salience=None
    if weight in (LDA,INVERSE_W_FREQ,INVERSE_S_FREQ) and w2salience_f is None:
        sys.stderr.write('weight mode {0} needs a <w2salience> file\n'.format(weight))
        sys.exit(1)
    if weight==LDA:
        print ('load vectors and entropy')
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
        # The with-block closes the handle, which the bare open() call left open.
        with open(w2salience_f,'rb') as f_salience:
            w2salience=pickle.load(f_salience)
    elif weight==INVERSE_W_FREQ:
        print ('load w2freq')
        w2salience=load_w2salience(w2salience_f,weight)
    elif weight==INVERSE_S_FREQ:
        print ('load w2freq')
        w2salience=load_w2salience(w2salience_f,weight)


    #combine parameters for skipgram?context2vec-skipgram: every argument
    #becomes a (context2vec-skipgram, skipgram) pair
    if model_type=='context2vec-skipgram?skipgram':
        model=(model,model_skipgram)
        w=(w,w_skipgram)
        index2word=(index2word,index2word_skipgram)
        word2index=(word2index,word2index_skipgram)
        weight=(weight,WEIGHT_DICT[0])#assume that skipgram has no weight
        w2salience=(w2salience,w2salience)
        w_target=(w_target,w_target_skipgram)
        word2index_target=(word2index_target,word2index_target_skipgram)
        index2word_target=(index2word_target,index2word_target_skipgram)

    print (model_param_file,model_type,weight,data,w2salience_f)


read model....
filter words for context....
filter words for target....
('../models/wiki_all.model/wiki_all.sent.split.model', 'skipgram', False, './eval_data/data-nonces/n2v.definitional.dataset.test.txt', None)


In [41]:

# model_w2v.most_similar('hey')

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


[('hey', 1.0),
 ('alright', 0.6344900131225586),
 ('gotta', 0.6183755397796631),
 ('yeah', 0.6179326772689819),
 ('gonna', 0.6031059622764587),
 ('wanna', 0.5750466585159302),
 ('ooh', 0.5629773736000061),
 ('me', 0.5571281909942627),
 ('doin', 0.554619312286377),
 ('crazy', 0.553865909576416),
 ('whatcha', 0.5477524995803833),
 ('oooh', 0.5423804521560669),
 ('darlin', 0.5415685772895813),
 ('lovin', 0.54054856300354),
 ('mama', 0.536620557308197),
 ('lawdy', 0.5365476608276367),
 ('talkin', 0.5342473983764648),
 ('goin', 0.5317614078521729),
 ('mornin', 0.5311395525932312),
 ('missin', 0.5293031334877014)]

In [73]:
# Dispatch the evaluation on the dataset family, which is taken from the name
# of the directory that directly contains the data file
# (.../data-chimeras/<file> or .../data-nonces/<file>).
# os.path replaces data.split('/')[-2] so that a bare file name yields an
# empty family name instead of an IndexError, and the name is computed once
# for both branches.
dataset_family=os.path.basename(os.path.dirname(data))
if dataset_family=='data-chimeras':
    eval_chimera(data,model,model_type,n_result,w,index2word,word2index,weight,w2salience,w_target,word2index_target,index2word_target)
elif dataset_family=='data-nonces':
    # eval_nonce additionally receives model_w2v; its result is kept in ranks.
    ranks=eval_nonce(data,model,model_w2v,model_type,n_result,w,index2word,word2index,weight,w2salience,w_target,word2index_target,index2word_target)
else:
    # Make an unrecognised path visible instead of silently evaluating nothing.
    print ('unknown dataset directory {0!r}: expected data-chimeras or data-nonces'.format(dataset_family))
    



 ___  international inc is an american multinational conglomerate company that produces a variety of commercial and consumer products engineering services and aerospace systems for a wide variety of customers from private consumers to major corporations and governments
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.63182328,  1.11818997,  0.27647363, -3.8246277 , -1.33201902,
       -1.56528724, -0.12704834, -3.29326714,  2.94604603, -2.37727762]))
('context_embed_out', array([ 0.63182328,  1.11818997,  0.27647363, -3.8246277 , -1.33201902,
       -1.56528724, -0.12704834, -3.29326714,  2.94604603, -2.37727762]))
('weight mode', False)
weight is 23.0
('context_embed is ', array([ 0.63182328,  1.11818997,  0.27647363, -3.8246277 , -1.33201902,
       -1.56528724, -0.12704834, -3.29326714,  2.94604603, -2.37727762]))
('context_weights', [23.0])
producing top 20 



2533 0.000394788787998
 ___  http over tls http over ssl or http secure is a communications protocol for secure communication over a computer network with especially wide deployment on the internet
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 1.26683876,  2.43303285, -6.36337206, -0.50383926, -1.85061281,
       -3.60589562,  2.65003114,  0.51107518,  4.63109032,  1.28419057]))
('context_embed_out', array([ 1.26683876,  2.43303285, -6.36337206, -0.50383926, -1.85061281,
       -3.60589562,  2.65003114,  0.51107518,  4.63109032,  1.28419057]))
('weight mode', False)
weight is 16.0
('context_embed is ', array([ 1.26683876,  2.43303285, -6.36337206, -0.50383926, -1.85061281,
       -3.60589562,  2.65003114,  0.51107518,  4.63109032,  1.28419057]))
('context_weights', [16.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', array([ 0.07917742,  0.15206455, -0.3

('context avg out', array([ 0.11780592,  0.16875167, -0.09844666,  0.10154124, -0.06155184,
       -0.16963064,  0.10825009, -0.12968203,  0.26158767, -0.02200434]))
vector norm: 2.53222406865
telewest
105 0.575143108826
 ___  or trevena cornish tre war venydh meaning village on a mountain is a civil parish and village situated on the atlantic coast of cornwall england united kingdom
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.76685999, -1.3890073 ,  1.67334933, -0.49302786,  0.91407079,
       -1.30424475, -2.13726077,  0.66240514,  1.76878126,  0.55131455]))
('context_embed_out', array([ 0.76685999, -1.3890073 ,  1.67334933, -0.49302786,  0.91407079,
       -1.30424475, -2.13726077,  0.66240514,  1.76878126,  0.55131455]))
('weight mode', False)
weight is 17.0
('context_embed is ', array([ 0.76685999, -1.3890073 ,  1.67334933, -0.49302786,  0.91407079,
       -1.30424475, -2.13726077,

('context avg out', array([ 0.05698823,  0.02490599, -0.0303496 , -0.03838561,  0.08030686,
        0.03808874,  0.0006497 , -0.08952101,  0.07810065,  0.01450861]))
vector norm: 1.77469270946
embarrassment
260 0.794780992422
 ___  is a liquid consisting mainly of acetic acid ch3cooh and water
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.27381391, -0.78211817,  0.08397568, -0.37664297, -0.4266242 ,
        0.77980281, -0.85870481,  0.83783614,  0.9846796 ,  0.24901974]))
('context_embed_out', array([-0.27381391, -0.78211817,  0.08397568, -0.37664297, -0.4266242 ,
        0.77980281, -0.85870481,  0.83783614,  0.9846796 ,  0.24901974]))
('weight mode', False)
weight is 6.0
('context_embed is ', array([-0.27381391, -0.78211817,  0.08397568, -0.37664297, -0.4266242 ,
        0.77980281, -0.85870481,  0.83783614,  0.9846796 ,  0.24901974]))
('context_weights', [6.0])
producing top 20 words for new embedding
producing top 20 simwords
('context av

braveheart
744 0.803743685441
 ___  images are a variety of aspect ratios used in film television and computer screens
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-1.72575227,  1.39944405,  0.1605305 , -0.75401249, -0.03673682,
       -1.66219791,  0.25441495, -0.62420383, -0.56631899, -0.02784191]))
('context_embed_out', array([-1.72575227,  1.39944405,  0.1605305 , -0.75401249, -0.03673682,
       -1.66219791,  0.25441495, -0.62420383, -0.56631899, -0.02784191]))
('weight mode', False)
weight is 9.0
('context_embed is ', array([-1.72575227,  1.39944405,  0.1605305 , -0.75401249, -0.03673682,
       -1.66219791,  0.25441495, -0.62420383, -0.56631899, -0.02784191]))
('context_weights', [9.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', array([-0.19175025,  0.15549378,  0.01783672, -0.08377917, -0.00408187,
       -0.18468866,  0.02826833, -0.06935598, -0.06292433, -0.00309355]))
vector 

('context avg out', array([ 0.03951705, -0.0400761 ,  0.03776052,  0.00079949,  0.06443228,
       -0.09075117, -0.01593176,  0.20055582,  0.11783322,  0.0103717 ]))
vector norm: 1.70551288147
midsummer
675 0.818657153432
 ___  is a form of alternative medicine that emphasizes diagnosis treatment and prevention of mechanical disorders of the musculoskeletal system especially the spine under the belief that these disorders affect general health via the nervous system
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-1.92835984,  0.19414493, -1.73294738, -0.49678089, -1.95413171,
        1.83718027, -3.82632577, -1.33556553,  2.69234761, -1.82673925]))
('context_embed_out', array([-1.92835984,  0.19414493, -1.73294738, -0.49678089, -1.95413171,
        1.83718027, -3.82632577, -1.33556553,  2.69234761, -1.82673925]))
('weight mode', False)
weight is 21.0
('context_embed is ', 

('context avg out', array([-0.01639231,  0.17460694,  0.07630818, -0.09762765, -0.11492277,
        0.0013169 , -0.07988894,  0.19580561,  0.29279068,  0.07426012]))
vector norm: 2.93289065128
germination
157 0.946927326961
 ___  or proportion in epidemiology is the proportion of a population found to have a condition typically a disease or a risk factor such as smoking or seat belt use
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-1.11167183,  0.20011535, -1.65571295, -1.26313156,  0.23554823,
       -0.75112212, -1.33686496,  0.8001876 ,  1.19971758, -0.96566136]))
('context_embed_out', array([-1.11167183,  0.20011535, -1.65571295, -1.26313156,  0.23554823,
       -0.75112212, -1.33686496,  0.8001876 ,  1.19971758, -0.96566136]))
('weight mode', False)
weight is 14.0
('context_embed is ', array([-1.11167183,  0.20011535, -1.65571295, -1.26313156,  0.23554823,
       -0.75112212, -1.33686496,  0.8001876 

discrete
5 2.1480128015
 ___  or derdriu is the foremost tragic heroine in irish mythology and probably its best known figure in modern times
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-1.49143192, -0.01965144,  0.08563035,  0.5585991 ,  0.76253736,
       -2.14515807, -0.91668941, -0.31256318, -0.064469  ,  0.23358628]))
('context_embed_out', array([-1.49143192, -0.01965144,  0.08563035,  0.5585991 ,  0.76253736,
       -2.14515807, -0.91668941, -0.31256318, -0.064469  ,  0.23358628]))
('weight mode', False)
weight is 11.0
('context_embed is ', array([-1.49143192, -0.01965144,  0.08563035,  0.5585991 ,  0.76253736,
       -2.14515807, -0.91668941, -0.31256318, -0.064469  ,  0.23358628]))
('context_weights', [11.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', array([-0.13558472, -0.00178649,  0.00778458,  0.05078174,  0.06932158,
       -0.19501437, -0.0833354 , -0.02841483, 

('context avg out', array([ 0.01389134, -0.18466546, -0.11922047, -0.04629944,  0.0609425 ,
       -0.1783283 , -0.05932128,  0.06369563,  0.11370121, -0.01029959]))
vector norm: 1.85755148396
masonry
120 2.19028231031
 ___  is a town and civil parish in the unitary authority of cheshire east and the county of cheshire in england
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 1.16053604, -1.36598577, -0.42865375, -1.29090997, -0.34300149,
       -1.78031707, -2.12364055, -0.49215845,  1.14596689,  0.7902088 ]))
('context_embed_out', array([ 1.16053604, -1.36598577, -0.42865375, -1.29090997, -0.34300149,
       -1.78031707, -2.12364055, -0.49215845,  1.14596689,  0.7902088 ]))
('weight mode', False)
weight is 10.0
('context_embed is ', array([ 1.16053604, -1.36598577, -0.42865375, -1.29090997, -0.34300149,
       -1.78031707, -2.12364055, -0.49215845,  1.14596689,  0.7902088 ]))
('context_weights', [10.0])
producing top 20 wor

('context avg out', array([-0.03179469, -0.05791495, -0.05309641, -0.0664591 , -0.03654198,
       -0.11073651, -0.06117136, -0.02181772,  0.21455231, -0.0464718 ]))
vector norm: 1.72856608253
mumps
39032 2.21244934226
 ___  or shortening of lamport tex is a document preparation system and document markup language
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.4385117 , -0.07483126, -0.21964919,  0.28539826,  0.47439403,
       -0.72205325, -0.25403664,  0.08578088,  1.72333939, -0.15565573]))
('context_embed_out', array([-0.4385117 , -0.07483126, -0.21964919,  0.28539826,  0.47439403,
       -0.72205325, -0.25403664,  0.08578088,  1.72333939, -0.15565573]))
('weight mode', False)
weight is 9.0
('context_embed is ', array([-0.4385117 , -0.07483126, -0.21964919,  0.28539826,  0.47439403,
       -0.72205325, -0.25403664,  0.08578088,  1.72333939, -0.15565573]))
('context_weights', [9.0])
producing top 20 words for new embedding
pr

('context avg out', array([-0.11284134, -0.08543995,  0.12150188, -0.15336273,  0.13431856,
       -0.01514858, -0.01954951, -0.01921344,  0.04261317,  0.07967647]))
vector norm: 2.02008708243
mozzarella
5 2.43172182821
 ___  is a sweet food made by bees using nectar from flowers
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.48642813,  1.37881118, -0.14035472, -1.86901442,  1.11997706,
        0.21197264, -0.20903387,  0.68345709,  0.57180923,  0.83562846]))
('context_embed_out', array([ 0.48642813,  1.37881118, -0.14035472, -1.86901442,  1.11997706,
        0.21197264, -0.20903387,  0.68345709,  0.57180923,  0.83562846]))
('weight mode', False)
weight is 7.0
('context_embed is ', array([ 0.48642813,  1.37881118, -0.14035472, -1.86901442,  1.11997706,
        0.21197264, -0.20903387,  0.68345709,  0.57180923,  0.83562846]))
('context_weights', [7.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', a

('context avg out', array([ 0.01543814, -0.10127957, -0.00462026, -0.08590816, -0.07472952,
        0.05282407, -0.02231619,  0.06256992,  0.30453831,  0.03032037]))
vector norm: 2.45911683981
biodiesel
1830 2.76329564435
 ___  or scepticism see spelling differences is generally any questioning attitude towards knowledge facts or opinionsbeliefs stated as facts or doubt regarding claims that are taken for granted elsewhere
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.65843579, -0.95178238,  0.07681819, -0.35488252,  2.88102636,
        1.16620569,  0.07694748, -0.34576405, -0.24272119,  0.12522055]))
('context_embed_out', array([ 0.65843579, -0.95178238,  0.07681819, -0.35488252,  2.88102636,
        1.16620569,  0.07694748, -0.34576405, -0.24272119,  0.12522055]))
('weight mode', False)
weight is 18.0
('context_embed is ', array([ 0.65843579, -0.95178238,  0.07681819, -0.35488252, 

('context avg out', array([-0.05204343, -0.06460242,  0.00882137, -0.0753868 ,  0.07554441,
       -0.02476164, -0.03691791,  0.08807267,  0.1360522 , -0.04429449]))
vector norm: 1.70957222394
law
38 3.26671531754
 ___  is the southernmost state in the region of the united states known as new england
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.03054249,  0.64184679,  0.44924584,  0.65378723,  0.12110502,
       -1.63743005, -1.21786313,  0.31402099,  0.38940474,  0.37731701]))
('context_embed_out', array([-0.03054249,  0.64184679,  0.44924584,  0.65378723,  0.12110502,
       -1.63743005, -1.21786313,  0.31402099,  0.38940474,  0.37731701]))
('weight mode', False)
weight is 8.0
('context_embed is ', array([-0.03054249,  0.64184679,  0.44924584,  0.65378723,  0.12110502,
       -1.63743005, -1.21786313,  0.31402099,  0.38940474,  0.37731701]))
('context_weights', [8.0])
producing top 20 words for new embedding
producing top 20 simw

('context avg out', array([ 0.06097239, -0.18420595, -0.08024484, -0.10193367,  0.00886516,
        0.1061548 , -0.04952772,  0.07560697, -0.02445294,  0.03237797]))
vector norm: 2.88298627498
limestone
487 3.63818451866
 ___  is a watery substance located in the mouths of animals secreted by the glands
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.55105285,  0.66861763, -0.52171815, -1.38485828,  0.66143103,
        1.03347766, -1.6693273 ,  0.82163912,  0.45356746,  0.20174175]))
('context_embed_out', array([ 0.55105285,  0.66861763, -0.52171815, -1.38485828,  0.66143103,
        1.03347766, -1.6693273 ,  0.82163912,  0.45356746,  0.20174175]))
('weight mode', False)
weight is 7.0
('context_embed is ', array([ 0.55105285,  0.66861763, -0.52171815, -1.38485828,  0.66143103,
        1.03347766, -1.6693273 ,  0.82163912,  0.45356746,  0.20174175]))
is a watery substance located in the mouths of animals secreted by the  ___  glands
('per w

('context avg out', array([-0.077229  ,  0.01402099, -0.05097178,  0.02911009,  0.06826141,
       -0.21137871, -0.04423682,  0.06656227,  0.09444068, -0.01966673]))
vector norm: 1.83039996383
joss
171141 3.65083955883
 ___  is the semi autonomous part of tanzania in east africa
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.38228652,  0.13909072,  0.48151455,  0.40291443,  0.25102032,
       -0.50740135, -0.4365864 ,  1.1210025 ,  1.01561789, -0.84783375]))
('context_embed_out', array([ 0.38228652,  0.13909072,  0.48151455,  0.40291443,  0.25102032,
       -0.50740135, -0.4365864 ,  1.1210025 ,  1.01561789, -0.84783375]))
('weight mode', False)
weight is 6.0
('context_embed is ', array([ 0.38228652,  0.13909072,  0.48151455,  0.40291443,  0.25102032,
       -0.50740135, -0.4365864 ,  1.1210025 ,  1.01561789, -0.84783375]))
('context_weights', [6.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', array([

('context avg out', array([-0.15625763, -0.07986141, -0.07413853, -0.01025209, -0.00507529,
       -0.1110212 , -0.08348662, -0.0207462 ,  0.06328289, -0.0125978 ]))
vector norm: 1.9304173002
fortran
494 3.65852799632
 ___  is a region of historic and modern central italy
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.12178495, -0.42445697,  0.10727876,  0.03196581, -0.55612926,
       -0.94178943, -0.02063657,  0.75693759, -0.10334246,  0.01152851]))
('context_embed_out', array([-0.12178495, -0.42445697,  0.10727876,  0.03196581, -0.55612926,
       -0.94178943, -0.02063657,  0.75693759, -0.10334246,  0.01152851]))
('weight mode', False)
weight is 5.0
('context_embed is ', array([-0.12178495, -0.42445697,  0.10727876,  0.03196581, -0.55612926,
       -0.94178943, -0.02063657,  0.75693759, -0.10334246,  0.01152851]))
('context_weights', [5.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', array([-0.02435699,

ddt
43233 3.77779860245
 ___  is a seaside resort located in portsmouth at the southern end of portsea island in the county of hampshire in england
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 2.63880311, -2.32248848,  1.10150616, -1.25378461,  1.01362468,
       -0.46576144, -1.78019182, -0.36318679,  0.69558292,  0.41405687]))
('context_embed_out', array([ 2.63880311, -2.32248848,  1.10150616, -1.25378461,  1.01362468,
       -0.46576144, -1.78019182, -0.36318679,  0.69558292,  0.41405687]))
('weight mode', False)
weight is 11.0
('context_embed is ', array([ 2.63880311, -2.32248848,  1.10150616, -1.25378461,  1.01362468,
       -0.46576144, -1.78019182, -0.36318679,  0.69558292,  0.41405687]))
('context_weights', [11.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', array([ 0.23989119, -0.21113532,  0.10013692, -0.11398042,  0.0921477 ,
       -0.04234195, -0.16183562, -0.0330

('context avg out', array([ 0.09821838,  0.01021636, -0.03814832,  0.04698352, -0.00551099,
       -0.03478454, -0.14683387,  0.03183125,  0.08509367, -0.00617752]))
vector norm: 2.13395214677
singapore
449 3.86192580589
 ___  is a term for any very thick viscous fluid
('per word weights', [1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.21162631, -1.00149534, -0.03271693,  0.11233522, -0.50091234,
        0.44389817, -0.72867795,  0.99689847,  0.16639778,  0.91362141]))
('context_embed_out', array([ 0.21162631, -1.00149534, -0.03271693,  0.11233522, -0.50091234,
        0.44389817, -0.72867795,  0.99689847,  0.16639778,  0.91362141]))
('weight mode', False)
weight is 4.0
('context_embed is ', array([ 0.21162631, -1.00149534, -0.03271693,  0.11233522, -0.50091234,
        0.44389817, -0.72867795,  0.99689847,  0.16639778,  0.91362141]))
('context_weights', [4.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', array([ 0.05290658, -0.2503

('context avg out', array([ 0.0531275 , -0.03756282, -0.0595494 , -0.06946756,  0.00951563,
       -0.19286291, -0.15661797, -0.11121814,  0.23328888,  0.03756947]))
vector norm: 2.74463026342
utrecht
5658 3.888091872
 ___  is an eight issue comic book limited series a futuristic spin off of the television series buffy the vampire slayer
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-1.19102099,  0.47234779,  1.32835261,  0.34864897,  1.50669445,
       -2.55934076, -0.64577217,  0.15147166,  0.75732637,  0.33305796]))
('context_embed_out', array([-1.19102099,  0.47234779,  1.32835261,  0.34864897,  1.50669445,
       -2.55934076, -0.64577217,  0.15147166,  0.75732637,  0.33305796]))
('weight mode', False)
weight is 13.0
('context_embed is ', array([-1.19102099,  0.47234779,  1.32835261,  0.34864897,  1.50669445,
       -2.55934076, -0.64577217,  0.15147166,  0.75732637,  0.33305796]))
('context_weights', [13.0

('context avg out', array([-0.058389  , -0.06631763,  0.1269034 , -0.01778302,  0.10639981,
       -0.00976416, -0.08723053, -0.08287735,  0.14461202,  0.04116462]))
vector norm: 1.9708587354
lego
1153 4.63904959036
 ___  is an acronym for what you see is what you get
('per word weights', [1.0, 1.0, 1.0])
('context_embed original', array([ 0.04412977,  0.14523249, -0.07799207, -0.37179814,  0.27165757,
       -0.14586437, -0.43271219,  0.4883989 ,  0.84968788,  0.33087436]))
('context_embed_out', array([ 0.04412977,  0.14523249, -0.07799207, -0.37179814,  0.27165757,
       -0.14586437, -0.43271219,  0.4883989 ,  0.84968788,  0.33087436]))
('weight mode', False)
weight is 3.0
('context_embed is ', array([ 0.04412977,  0.14523249, -0.07799207, -0.37179814,  0.27165757,
       -0.14586437, -0.43271219,  0.4883989 ,  0.84968788,  0.33087436]))
('context_weights', [3.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', array([ 0.01470992,  0.04841083, 

('context avg out', array([ 0.01624689, -0.15615224,  0.06705818, -0.22099697, -0.01679084,
       -0.02027483, -0.23168408, -0.04653654,  0.14047044, -0.02412028]))
vector norm: 2.3804315099
romsey
1847 4.65076146332
 ___  fe2 in chemistry indicates a divalent iron compound 2 oxidation state as opposed to ferric which indicates a trivalent iron compound 3 oxidation state
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.65837272,  1.03211088, -0.98088693, -1.76890624, -1.53645107,
       -1.57269597, -0.94287554,  4.40806128,  4.25458623,  2.20036574]))
('context_embed_out', array([-0.65837272,  1.03211088, -0.98088693, -1.76890624, -1.53645107,
       -1.57269597, -0.94287554,  4.40806128,  4.25458623,  2.20036574]))
('weight mode', False)
weight is 18.0
('context_embed is ', array([-0.65837272,  1.03211088, -0.98088693, -1.76890624, -1.53645107,
       -1.57269597, -0.94287554,  4.408

('context avg out', array([ 0.09151339, -0.01522403, -0.14798167, -0.17019365,  0.04800076,
       -0.01300118, -0.08971204,  0.04454549,  0.01249293, -0.01602279]))
vector norm: 1.79698281589
sovereignty
2188 4.65693407245
 ___  is the practice of forcing another party to act in an involuntary manner by use of intimidation or threats or some other form of pressure or force
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.73662768, -1.00631891, -1.15938197,  0.38380298,  1.31976634,
        1.00144988,  0.15362655, -0.09428989, -0.14998556, -1.67778651]))
('context_embed_out', array([ 0.73662768, -1.00631891, -1.15938197,  0.38380298,  1.31976634,
        1.00144988,  0.15362655, -0.09428989, -0.14998556, -1.67778651]))
('weight mode', False)
weight is 13.0
('context_embed is ', array([ 0.73662768, -1.00631891, -1.15938197,  0.38380298,  1.31976634,
        1.00144988,  0.15362655, -0.09428989, -0.14998556, -1.

('context avg out', array([ 0.16210226, -0.02014794, -0.12157919,  0.06998288, -0.16858351,
       -0.22818767, -0.12250601, -0.01110518,  0.05984119, -0.10269653]))
vector norm: 2.24662872283
hackney
71 5.20520353051
 ___  is a practice behavior or habit generally considered immoral sinful depraved or degrading in the associated society
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-1.2099251 ,  0.17403607, -1.64400825,  0.87510665,  1.03222918,
        1.22798324,  0.38361506, -1.52420288, -0.40068602, -0.32550058]))
('context_embed_out', array([-1.2099251 ,  0.17403607, -1.64400825,  0.87510665,  1.03222918,
        1.22798324,  0.38361506, -1.52420288, -0.40068602, -0.32550058]))
('weight mode', False)
weight is 11.0
('context_embed is ', array([-1.2099251 ,  0.17403607, -1.64400825,  0.87510665,  1.03222918,
        1.22798324,  0.38361506, -1.52420288, -0.40068602, -0.32550058]))
('context_weights', [11.0])
produci

('context avg out', array([ 0.17007651, -0.11965472, -0.02622062,  0.20176639,  0.01278268,
        0.17091429, -0.20120502, -0.01094977, -0.00855375, -0.05490526]))
vector norm: 2.61042646969
barbados
12 5.31564842828
 ___  is lean meat that has been trimmed of fat cut into strips and then dried to prevent spoilage
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.30772856,  0.5297319 ,  0.34181052, -3.53793214,  0.62387803,
        1.39144986,  0.30773724,  0.4526327 ,  0.65258893,  1.79997731]))
('context_embed_out', array([ 0.30772856,  0.5297319 ,  0.34181052, -3.53793214,  0.62387803,
        1.39144986,  0.30773724,  0.4526327 ,  0.65258893,  1.79997731]))
('weight mode', False)
weight is 9.0
('context_embed is ', array([ 0.30772856,  0.5297319 ,  0.34181052, -3.53793214,  0.62387803,
        1.39144986,  0.30773724,  0.4526327 ,  0.65258893,  1.79997731]))
('context_weights', [9.0])
producing top 20 words for new embedding


('context avg out', array([ 0.05026123,  0.03850303,  0.03327428, -0.08281386, -0.03300967,
       -0.08418921,  0.05262694, -0.04812104,  0.20385176,  0.05608864]))
vector norm: 2.14304797693
bmx
11 5.74418607393
 ___  is the common name for a large number of species in the anatidae family of birds which also includes swans and geese
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.48030353,  1.91628445, -1.04395729, -1.48442342,  1.48064578,
       -1.92156208, -1.57033673, -0.28775435,  0.0529547 , -0.02227126]))
('context_embed_out', array([-0.48030353,  1.91628445, -1.04395729, -1.48442342,  1.48064578,
       -1.92156208, -1.57033673, -0.28775435,  0.0529547 , -0.02227126]))
('weight mode', False)
weight is 12.0
('context_embed is ', array([-0.48030353,  1.91628445, -1.04395729, -1.48442342,  1.48064578,
       -1.92156208, -1.57033673, -0.28775435,  0.0529547 , -0.02227126]))
('context_weights', [12.0])
produ

('context avg out', array([-0.08085019, -0.07544847, -0.07985232, -0.01653024, -0.03869498,
       -0.07560353, -0.04617987, -0.0064769 ,  0.07111551, -0.06551561]))
vector norm: 2.02980826711
awk
705 5.77435258751
 ___  is a city in the federal state of thuringia germany
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.28954456,  0.11897223,  0.2258999 , -0.90184447, -0.79666971,
       -0.28747964, -0.63646396,  0.26606585,  0.27704042,  0.53244038]))
('context_embed_out', array([ 0.28954456,  0.11897223,  0.2258999 , -0.90184447, -0.79666971,
       -0.28747964, -0.63646396,  0.26606585,  0.27704042,  0.53244038]))
('weight mode', False)
weight is 5.0
('context_embed is ', array([ 0.28954456,  0.11897223,  0.2258999 , -0.90184447, -0.79666971,
       -0.28747964, -0.63646396,  0.26606585,  0.27704042,  0.53244038]))
('context_weights', [5.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', array([ 0.05790891,

gasoline
14 5.8655882566
 ___  is both a specific chemical compound and a class of chemical compounds
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.90295634, -0.08319561,  0.17185441, -0.76545266, -0.4969467 ,
       -0.01326389, -0.22728739, -0.89777154,  1.42497862,  0.00713445]))
('context_embed_out', array([-0.90295634, -0.08319561,  0.17185441, -0.76545266, -0.4969467 ,
       -0.01326389, -0.22728739, -0.89777154,  1.42497862,  0.00713445]))
('weight mode', False)
weight is 6.0
('context_embed is ', array([-0.90295634, -0.08319561,  0.17185441, -0.76545266, -0.4969467 ,
       -0.01326389, -0.22728739, -0.89777154,  1.42497862,  0.00713445]))
('context_weights', [6.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', array([-0.15049272, -0.01386593,  0.0286424 , -0.12757544, -0.08282445,
       -0.00221065, -0.03788123, -0.14962859,  0.23749644,  0.00118908]))
vector norm: 2.88032548319
alum
21931 5

cosmetics
2604 5.89477772177
 ___  or is a common extrusive igneous volcanic rock formed from the rapid cooling of lava exposed at or very near the surface of a planet or moon
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 2.70276744, -2.95092862,  0.40742583, -0.02304387,  0.30876871,
       -0.99703135, -2.14718279,  0.59616957, -0.1654889 ,  1.63985681]))
('context_embed_out', array([ 2.70276744, -2.95092862,  0.40742583, -0.02304387,  0.30876871,
       -0.99703135, -2.14718279,  0.59616957, -0.1654889 ,  1.63985681]))
('weight mode', False)
weight is 14.0
('context_embed is ', array([ 2.70276744, -2.95092862,  0.40742583, -0.02304387,  0.30876871,
       -0.99703135, -2.14718279,  0.59616957, -0.1654889 ,  1.63985681]))
or is a common extrusive igneous volcanic rock formed from the rapid cooling of  ___  lava exposed at or very near the surface of a planet or moon
('per word weights', [1.0, 1.0, 1.0, 

('context avg out', array([ 0.01341773,  0.05404672,  0.05194689,  0.04972314, -0.03983546,
       -0.15158839, -0.15193777,  0.04436782,  0.10973298, -0.00208982]))
vector norm: 2.30886291438
vermont
825 5.95517277365
 ___  is a legal or economic system under which people are treated as property
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.60258744, -0.87326836, -1.30533387, -1.20702778,  0.28623562,
        0.08537226, -1.05768432, -0.25111919,  0.28455124, -1.05970882]))
('context_embed_out', array([-0.60258744, -0.87326836, -1.30533387, -1.20702778,  0.28623562,
        0.08537226, -1.05768432, -0.25111919,  0.28455124, -1.05970882]))
('weight mode', False)
weight is 6.0
('context_embed is ', array([-0.60258744, -0.87326836, -1.30533387, -1.20702778,  0.28623562,
        0.08537226, -1.05768432, -0.25111919,  0.28455124, -1.05970882]))
('context_weights', [6.0])
producing top 20 words for new embedding
producing top 20 simwords
('context

('context avg out', array([-0.06219159,  0.13149242, -0.25307297,  0.07764605,  0.02089623,
       -0.36989309, -0.14443555, -0.10366625, -0.09473736, -0.08851056]))
vector norm: 3.32026089492
condor
106786 5.96111536502
 ___  is principally the name of the german town which has given its name to many other places and entities notably including the royal house of the former independent state in modern germany county new york and the city of bahamas
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.89423729, -1.49188849, -0.53166138, -1.18830597, -1.00036684,
       -2.96719416, -3.45198826,  0.70609657,  1.30120594,  0.9314093 ]))
('context_embed_out', array([-0.89423729, -1.49188849, -0.53166138, -1.18830597, -1.00036684,
       -2.96719416, -3.45198826,  0.70609657,  1.30120594,  0.9314093 ]))
('weight mode', False)
weight is 23.0
('context_embed is ', array([-

('context avg out', array([-0.03887988, -0.06486472, -0.02311571, -0.05166548, -0.04349421,
       -0.12900844, -0.15008645,  0.03069985,  0.05657417,  0.04049606]))
vector norm: 1.69293185267
nassau
9203 5.96122402524
 ___  is the involuntary reddening of a person s face due to emotional stress
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.80974642, -0.24854385,  0.26327443, -0.55328663,  0.25323209,
        0.32140999, -0.86394974, -0.60503148, -0.16370588, -0.01870402]))
('context_embed_out', array([ 0.80974642, -0.24854385,  0.26327443, -0.55328663,  0.25323209,
        0.32140999, -0.86394974, -0.60503148, -0.16370588, -0.01870402]))
('weight mode', False)
weight is 7.0
('context_embed is ', array([ 0.80974642, -0.24854385,  0.26327443, -0.55328663,  0.25323209,
        0.32140999, -0.86394974, -0.60503148, -0.16370588, -0.01870402]))
('context_weights', [7.0])
producing top 20 words for new embedding
producing top 20 simwords
('con

empowerment
26 6.03936327563
 ___  scottish gaelic eilean bharraigh is an island in the outer hebrides in scotland
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 2.08493847,  0.14473881,  2.97815619, -0.0149802 ,  0.30483317,
       -0.39983165, -2.58423407,  0.11899804,  0.5209455 ,  0.5242646 ]))
('context_embed_out', array([ 2.08493847,  0.14473881,  2.97815619, -0.0149802 ,  0.30483317,
       -0.39983165, -2.58423407,  0.11899804,  0.5209455 ,  0.5242646 ]))
('weight mode', False)
weight is 7.0
('context_embed is ', array([ 2.08493847,  0.14473881,  2.97815619, -0.0149802 ,  0.30483317,
       -0.39983165, -2.58423407,  0.11899804,  0.5209455 ,  0.5242646 ]))
scottish gaelic  ___  eilean bharraigh is an island in the outer hebrides in scotland
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 2.08493847,  0.14473881,  2.97815619, -0.0149802 ,  0.30483317,
       -0.39983165, -2.58423407,  0.11

('context avg out', array([-0.07208299, -0.07250836, -0.10082642, -0.10743377,  0.13064035,
        0.18080549,  0.0330259 , -0.00602708,  0.04619455, -0.01125223]))
vector norm: 2.17030520596
altruism
9 6.26710874684
 ___  latin cancellarius is a title of various official positions in the governments of many nations
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.43891397,  0.60111734, -0.42218557, -0.53474095,  0.43727301,
       -0.40624844, -0.4517789 ,  0.85947993,  0.4217362 , -0.78274103]))
('context_embed_out', array([ 0.43891397,  0.60111734, -0.42218557, -0.53474095,  0.43727301,
       -0.40624844, -0.4517789 ,  0.85947993,  0.4217362 , -0.78274103]))
('weight mode', False)
weight is 8.0
('context_embed is ', array([ 0.43891397,  0.60111734, -0.42218557, -0.53474095,  0.43727301,
       -0.40624844, -0.4517789 ,  0.85947993,  0.4217362 , -0.78274103]))
('context_weights', [8.0])
producing top 20 words for new embedding
prod

('context avg out', array([ 0.0549949 , -0.04206046,  0.02488694, -0.07151978,  0.09394054,
       -0.05340269, -0.03493421,  0.09670365, -0.02302539,  0.01465164]))
vector norm: 2.06841298712
alabaster
3839 6.26777789817
 ___  or armor is a protective covering that is used to prevent damage from being inflicted to an object individual or vehicle by direct contact weapons or projectiles usually during combat or from damage caused by a potentially dangerous environment or action eg cycling construction sites etc
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-1.23953013, -2.74454159, -3.31980326, -4.2017462 , -0.14872925,
       -0.54419922, -1.65634358, -0.52252551,  1.74170717, -0.1215967 ]))
('context_embed_out', array([-1.23953013, -2.74454159, -3.31980326, -4.2017462 , -0.14872925,
       -0.54419922, -1.65634358, -0.52252551,  1.74170717,

('context avg out', array([ 0.20111924,  0.12899235, -0.05155123, -0.01768376,  0.06459643,
       -0.08625647, -0.08259655, -0.01181209,  0.01747548,  0.02229414]))
vector norm: 2.40694807197
black
440 6.34371412386
 ___  is a casino game named after a french diminutive for little wheel
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-1.03390994, -0.30341223, -0.07752711, -0.61505721,  0.28660245,
       -1.07803084,  0.30740188, -1.05685105,  0.25411045,  0.76857451]))
('context_embed_out', array([-1.03390994, -0.30341223, -0.07752711, -0.61505721,  0.28660245,
       -1.07803084,  0.30740188, -1.05685105,  0.25411045,  0.76857451]))
('weight mode', False)
weight is 7.0
('context_embed is ', array([-1.03390994, -0.30341223, -0.07752711, -0.61505721,  0.28660245,
       -1.07803084,  0.30740188, -1.05685105,  0.25411045,  0.76857451]))
('context_weights', [7.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg

('context avg out', array([-0.01659909,  0.16926177, -0.1488903 , -0.09583918, -0.11816314,
       -0.08706447, -0.14902104,  0.02083355,  0.19412135,  0.07435317]))
vector norm: 2.33783088849
redox
39 6.42247521063
 ___  may refer to a member or supporter of a parliament as in may also refer to an expert adviser on parliamentary procedure as in daydream nation daydream nation is the fifth studio album by american alternative rock band sonic youth
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.77269974, -3.00494565, -0.22254294,  1.95477109,  1.42390317,
       -5.15782311, -3.92666052,  1.62559188,  4.52578429, -2.33555527]))
('context_embed_out', array([ 0.77269974, -3.00494565, -0.22254294,  1.95477109,  1.42390317,
       -5.15782311, -3.92666052,  1.62559188,  4.52578429, -2.33555527]))
('weight mode', False)
weight is 25.0
('context_embed is ',

('context avg out', array([ 0.01698213,  0.10403726,  0.00680262, -0.26085625,  0.20596372,
        0.06577788,  0.01176237,  0.09523637,  0.13309738,  0.347354  ]))
vector norm: 2.56454670743
marmalade
2792 6.43678727814
 ___  is an extensible multimedia framework developed by apple inc capable of handling various formats of digital video picture sound panoramic images and interactivity
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.52011245,  1.6390742 , -1.52788166, -1.44284841, -0.34626851,
       -2.57491928,  1.06974681, -0.74022684,  3.10515486,  0.39864889]))
('context_embed_out', array([ 0.52011245,  1.6390742 , -1.52788166, -1.44284841, -0.34626851,
       -2.57491928,  1.06974681, -0.74022684,  3.10515486,  0.39864889]))
('weight mode', False)
weight is 17.0
('context_embed is ', array([ 0.52011245,  1.6390742 , -1.52788166, -1.44284841, -0.34626851,
       -2.57491928,  1.06974

('context avg out', array([ 0.00763384,  0.06709494, -0.12725836,  0.03265611,  0.08607414,
       -0.1471266 ,  0.08397754, -0.08489165,  0.06688451, -0.02939954]))
vector norm: 2.10411387248
satanism
1304 6.63769160635
 ___  is a 1972 american dramatic thriller film produced and directed by john boorman and starring jon voight burt reynolds ned beatty and ronny cox with the latter two making their feature film debuts
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.84550471, -4.05477928,  4.46506915, -1.70001272,  2.88563318,
       -2.72198687,  0.85377517, -2.65088789,  3.9024982 ,  1.01898811]))
('context_embed_out', array([-0.84550471, -4.05477928,  4.46506915, -1.70001272,  2.88563318,
       -2.72198687,  0.85377517, -2.65088789,  3.9024982 ,  1.01898811]))
('weight mode', False)
weight is 24.0
('context_embed is ', array([-0.84550471, -4.05477928, 

('context avg out', array([ 0.05708262,  0.04170003,  0.01493662,  0.04978434,  0.04692672,
       -0.05560148, -0.00216977, -0.01052127,  0.13205522,  0.01230047]))
vector norm: 1.75454941752
divx
3559 6.88829322734
 ___  is an action sport which involves riding and performing tricks using a skateboard
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.46736995, -0.57054757,  0.31242932, -1.0443781 , -0.22543615,
       -0.95837895, -0.33741346, -0.80513033,  1.04141808,  1.28662341]))
('context_embed_out', array([-0.46736995, -0.57054757,  0.31242932, -1.0443781 , -0.22543615,
       -0.95837895, -0.33741346, -0.80513033,  1.04141808,  1.28662341]))
('weight mode', False)
weight is 8.0
('context_embed is ', array([-0.46736995, -0.57054757,  0.31242932, -1.0443781 , -0.22543615,
       -0.95837895, -0.33741346, -0.80513033,  1.04141808,  1.28662341]))
('context_weights', [8.0])
producing top 20 words for new embedding
producing top 20 s

('context avg out', array([ 0.01450311,  0.24398252, -0.05877063, -0.08654656, -0.0213214 ,
       -0.12133808, -0.16626561,  0.05277775,  0.04170521,  0.02624471]))
vector norm: 1.79889170765
vulture
12540 7.22233329064
 ___  welsh yr wyddfa is the highest mountain in wales at an elevation of 1085 m above sea level and the highest point in the british isles outside the scottish highlands
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 3.89871877, -0.24856527,  1.65020926, -1.93199785,  0.14452768,
       -1.26991883, -2.82067386,  1.13901746, -0.92823034,  0.71656732]))
('context_embed_out', array([ 3.89871877, -0.24856527,  1.65020926, -1.93199785,  0.14452768,
       -1.26991883, -2.82067386,  1.13901746, -0.92823034,  0.71656732]))
('weight mode', False)
weight is 16.0
('context_embed is ', array([ 3.89871877, -0.24856527,  1.65020926, -1.93199785,  0.14452768,
       -1.26991883, -2.82067386,

('context avg out', array([-0.09866097,  0.08981321,  0.01692507,  0.02350103,  0.01342535,
       -0.17555926,  0.00032394,  0.00444971,  0.1308135 , -0.07609992]))
vector norm: 1.88130003722
compuserve
38162 7.47553457182
 ___  or is a historic county of northern england and the largest in the united kingdom
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.36196312, -0.65443663,  0.60398828, -0.21548822, -0.3226129 ,
       -1.65021002, -1.30959684, -0.30355602,  0.1941866 ,  0.64975256]))
('context_embed_out', array([ 0.36196312, -0.65443663,  0.60398828, -0.21548822, -0.3226129 ,
       -1.65021002, -1.30959684, -0.30355602,  0.1941866 ,  0.64975256]))
('weight mode', False)
weight is 7.0
('context_embed is ', array([ 0.36196312, -0.65443663,  0.60398828, -0.21548822, -0.3226129 ,
       -1.65021002, -1.30959684, -0.30355602,  0.1941866 ,  0.64975256]))
('context_weights', [7.0])
producing top 20 words for new embedding
producing top 20

('context avg out', array([ 0.11331761,  0.05552241,  0.02637562, -0.02241824,  0.11228843,
       -0.26818117, -0.08373717, -0.00227513, -0.01072075,  0.07955397]))
vector norm: 2.14827531128
pluto
44 7.51845005566
 ___  or interment is the ritual act of placing a dead person or animal sometimes with objects into the ground
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.36966596,  1.03334167, -1.07275897, -0.69567357,  1.0771597 ,
       -0.86116559, -1.66063898,  0.52524912,  1.18866983, -0.11523488]))
('context_embed_out', array([ 0.36966596,  1.03334167, -1.07275897, -0.69567357,  1.0771597 ,
       -0.86116559, -1.66063898,  0.52524912,  1.18866983, -0.11523488]))
('weight mode', False)
weight is 10.0
('context_embed is ', array([ 0.36966596,  1.03334167, -1.07275897, -0.69567357,  1.0771597 ,
       -0.86116559, -1.66063898,  0.52524912,  1.18866983, -0.11523488]))
('context_weights', [10.0])
producing top 20 words fo

('context avg out', array([ 0.07664155, -0.09268025,  0.00428237, -0.00097685,  0.07253494,
       -0.15019505,  0.06468154,  0.18636711,  0.03851582,  0.04699681]))
vector norm: 1.92552984102
noon
39 8.29578790143
 ___  is the common name for the genus gadus of demersal fishes belonging to the family gadidae
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.44962006, -0.08893904, -1.5540651 , -0.87808958,  0.58055144,
       -1.8062902 , -0.2035575 ,  0.58941262, -0.84047661, -0.95482061]))
('context_embed_out', array([-0.44962006, -0.08893904, -1.5540651 , -0.87808958,  0.58055144,
       -1.8062902 , -0.2035575 ,  0.58941262, -0.84047661, -0.95482061]))
('weight mode', False)
weight is 8.0
('context_embed is ', array([-0.44962006, -0.08893904, -1.5540651 , -0.87808958,  0.58055144,
       -1.8062902 , -0.2035575 ,  0.58941262, -0.84047661, -0.95482061]))
('context_weights', [8.0])
producing top 20 words for new embedding
producing to

('context avg out', array([-0.10684411, -0.0752805 , -0.12331353,  0.09226411,  0.05906043,
       -0.06361063, -0.04872763, -0.08518904,  0.15429994, -0.13081562]))
vector norm: 2.21805917275
persecution
1199 8.34603761302
 ___  is a common name for fish of the genus perca freshwater gamefish belonging to the family percidae
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-1.49404724,  0.92976277, -1.75056676, -2.31552185,  0.95087052,
       -1.84032567, -0.93478004, -0.28825956, -1.07378881, -1.4308087 ]))
('context_embed_out', array([-1.49404724,  0.92976277, -1.75056676, -2.31552185,  0.95087052,
       -1.84032567, -0.93478004, -0.28825956, -1.07378881, -1.4308087 ]))
('weight mode', False)
weight is 10.0
('context_embed is ', array([-1.49404724,  0.92976277, -1.75056676, -2.31552185,  0.95087052,
       -1.84032567, -0.93478004, -0.28825956, -1.07378881, -1.4308087 ]))
('context_weights', [10.0])
producing top 20 words f

('context avg out', array([ 0.08511935, -0.00939763, -0.13111353, -0.21159112, -0.02432509,
       -0.03774916, -0.03858645, -0.0043583 ,  0.08114971, -0.07153906]))
vector norm: 2.17507209421
infantry
13 8.43231804886
 ___  or fancy dress is the distinctive style of dress of a particular people class or period
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.49149058,  0.14082148, -0.3771113 , -0.07256029,  1.71055677,
       -0.35869296,  0.21943511,  1.14287524,  0.60268756, -0.30075938]))
('context_embed_out', array([-0.49149058,  0.14082148, -0.3771113 , -0.07256029,  1.71055677,
       -0.35869296,  0.21943511,  1.14287524,  0.60268756, -0.30075938]))
('weight mode', False)
weight is 9.0
('context_embed is ', array([-0.49149058,  0.14082148, -0.3771113 , -0.07256029,  1.71055677,
       -0.35869296,  0.21943511,  1.14287524,  0.60268756, -0.30075938]))
('context_weights', [9.0])
producing top 20 words for new embedding
produ

('context avg out', array([ 0.14475659, -0.10557677,  0.0685959 , -0.03638379, -0.11395621,
       -0.1176618 , -0.09888297, -0.10179423,  0.10679349,  0.01907132]))
vector norm: 2.19215607738
heslington
108 8.57601897679
 ___  is a process in which one or more messages communicated to a prophet are then communicated to others
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.4248082 ,  0.22952836, -1.71487501,  0.09063715,  0.6626    ,
       -0.21525003, -0.30797382, -0.16393005,  0.58577719, -0.69529609]))
('context_embed_out', array([ 0.4248082 ,  0.22952836, -1.71487501,  0.09063715,  0.6626    ,
       -0.21525003, -0.30797382, -0.16393005,  0.58577719, -0.69529609]))
('weight mode', False)
weight is 7.0
('context_embed is ', array([ 0.4248082 ,  0.22952836, -1.71487501,  0.09063715,  0.6626    ,
       -0.21525003, -0.30797382, -0.16393005,  0.58577719, -0.69529609]))
('context_weights', [7.0])
producing top 20 words for new embedding

tallinn
8 8.71933140418
 ___  corporation is an american multinational corporation headquartered in santa clara california
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.98918288, -0.07403628,  1.19013613, -1.34040191, -0.67675547,
       -1.23996922,  0.23641838, -0.14513934,  0.89911571, -0.56816516]))
('context_embed_out', array([ 0.98918288, -0.07403628,  1.19013613, -1.34040191, -0.67675547,
       -1.23996922,  0.23641838, -0.14513934,  0.89911571, -0.56816516]))
('weight mode', False)
weight is 8.0
('context_embed is ', array([ 0.98918288, -0.07403628,  1.19013613, -1.34040191, -0.67675547,
       -1.23996922,  0.23641838, -0.14513934,  0.89911571, -0.56816516]))
('context_weights', [8.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', array([ 0.12364786, -0.00925453,  0.14876702, -0.16755024, -0.08459443,
       -0.15499615,  0.0295523 , -0.01814242,  0.11238946, -0.07102065]))
vector n

('context avg out', array([ 0.20144426, -0.07587375, -0.12937355, -0.0094363 , -0.24412773,
       -0.12782704, -0.17570745, -0.11557697,  0.08794624, -0.10716002]))
vector norm: 2.55063989018
welling
7713 9.23989864241
 ___  is the administration of antigenic material a vaccine to stimulate an individual s immune system to develop adaptive immunity to a pathogen
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-1.37561474,  2.29446254, -0.4603924 , -0.09978647,  0.51435129,
        1.08136931, -1.30643029,  0.46104933,  1.30235645, -0.63359002]))
('context_embed_out', array([-1.37561474,  2.29446254, -0.4603924 , -0.09978647,  0.51435129,
        1.08136931, -1.30643029,  0.46104933,  1.30235645, -0.63359002]))
('weight mode', False)
weight is 12.0
('context_embed is ', array([-1.37561474,  2.29446254, -0.4603924 , -0.09978647,  0.51435129,
        1.08136931, -1.30643029,  0.46104933,  1.30235645, -0.63359002]))
('co

('context avg out', array([ 4.00494453e-03,  2.16014440e-01, -2.16291419e-01, -8.89634072e-02,
        7.37806786e-02, -1.27371656e-01,  5.59849044e-05,  1.35159882e-01,
        3.90531251e-02,  1.28581836e-01]))
vector norm: 2.96027488435
jasmine
10232 9.41789506637
 ___  sa is a french multinational vehicle manufacturer established in 1899
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.00753741, -0.34980373,  0.86035236, -0.55542077, -0.19452802,
        0.61342646, -0.11950621, -0.83925012,  1.94447309,  0.29250205]))
('context_embed_out', array([ 0.00753741, -0.34980373,  0.86035236, -0.55542077, -0.19452802,
        0.61342646, -0.11950621, -0.83925012,  1.94447309,  0.29250205]))
('weight mode', False)
weight is 7.0
('context_embed is ', array([ 0.00753741, -0.34980373,  0.86035236, -0.55542077, -0.19452802,
        0.61342646, -0.11950621, -0.83925012,  1.94447309,  0.29250205]))
('context_weights', [7.0])
producing top 20 words fo

('context avg out', array([ 0.02687156,  0.16054218, -0.16328367, -0.00355473,  0.0731652 ,
        0.0460842 , -0.11175033,  0.03480926,  0.06344777, -0.11275183]))
vector norm: 2.00679646576
perception
27 9.49751289946
 ___  refers to using water vessels called yachts for sporting purposes
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.04557598,  0.1148631 ,  0.0103292 , -1.36889539,  0.61352996,
       -0.80696172, -0.31489097, -0.35906876,  0.0945571 , -0.31707558]))
('context_embed_out', array([-0.04557598,  0.1148631 ,  0.0103292 , -1.36889539,  0.61352996,
       -0.80696172, -0.31489097, -0.35906876,  0.0945571 , -0.31707558]))
('weight mode', False)
weight is 8.0
('context_embed is ', array([-0.04557598,  0.1148631 ,  0.0103292 , -1.36889539,  0.61352996,
       -0.80696172, -0.31489097, -0.35906876,  0.0945571 , -0.31707558]))
('context_weights', [8.0])
producing top 20 words for new embedding
producing top 20 simwords
('co

('context avg out', array([ 0.20103956, -0.14107404,  0.09948237,  0.10606244,  0.00624454,
       -0.20675448,  0.02949844,  0.12073553,  0.10752493, -0.05933374]))
vector norm: 2.50983225466
labrador
3 10.3437078858
 ___  is any of various mined and manufactured salts that contain potassium in water soluble form
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([-0.02690019, -0.41251518,  0.16609725, -1.56423709,  0.79264345,
        1.07200401, -1.7880908 ,  1.06623174, -0.01181502,  0.83282033]))
('context_embed_out', array([-0.02690019, -0.41251518,  0.16609725, -1.56423709,  0.79264345,
        1.07200401, -1.7880908 ,  1.06623174, -0.01181502,  0.83282033]))
('weight mode', False)
weight is 9.0
('context_embed is ', array([-0.02690019, -0.41251518,  0.16609725, -1.56423709,  0.79264345,
        1.07200401, -1.7880908 ,  1.06623174, -0.01181502,  0.83282033]))
('context_weights', [9.0])
producing top 20 words for new embedding
pr

('context avg out', array([ 0.12609905, -0.02415598,  0.04589695, -0.09446549,  0.01236384,
       -0.09325459, -0.04852736,  0.06302362,  0.06552331,  0.03245916]))
vector norm: 1.96962906747
hunstanton
2189 10.352582737
 ___  welsh abertawe mouth of the tawe officially known as the city and county of is a coastal city and county in wales
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.43464533, -1.0397515 ,  0.31363665, -2.26027561,  0.62680503,
       -2.76185311, -1.68144311, -0.50445923,  0.63813542,  1.28631871]))
('context_embed_out', array([ 0.43464533, -1.0397515 ,  0.31363665, -2.26027561,  0.62680503,
       -2.76185311, -1.68144311, -0.50445923,  0.63813542,  1.28631871]))
('weight mode', False)
weight is 11.0
('context_embed is ', array([ 0.43464533, -1.0397515 ,  0.31363665, -2.26027561,  0.62680503,
       -2.76185311, -1.68144311, -0.50445923,  0.63813542,  1.28631871]))
welsh abertawe mouth of the tawe 

photosynthesis
6 10.5272715585
 ___  is a numerical description of how far apart objects are
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.61370121, -0.0883739 , -0.33527504,  0.07093984,  0.03540107,
       -0.51520826,  0.05542284, -0.03165051,  0.29709075,  0.5023728 ]))
('context_embed_out', array([ 0.61370121, -0.0883739 , -0.33527504,  0.07093984,  0.03540107,
       -0.51520826,  0.05542284, -0.03165051,  0.29709075,  0.5023728 ]))
('weight mode', False)
weight is 5.0
('context_embed is ', array([ 0.61370121, -0.0883739 , -0.33527504,  0.07093984,  0.03540107,
       -0.51520826,  0.05542284, -0.03165051,  0.29709075,  0.5023728 ]))
('context_weights', [5.0])
producing top 20 words for new embedding
producing top 20 simwords
('context avg out', array([ 0.12274024, -0.01767478, -0.06705501,  0.01418797,  0.00708021,
       -0.10304165,  0.01108457, -0.0063301 ,  0.05941815,  0.10047456]))
vector norm: 2.09033921177
distance
6479 10.527425903

('context avg out', array([ 0.03077202, -0.00213383,  0.0616789 , -0.11245173, -0.10516632,
       -0.21064912, -0.32127746, -0.17901319,  0.13157198,  0.04173174]))
vector norm: 2.00008060593
duran
17564 10.5988725839
 ___  is any liquid liquefiable or mastic composition that after application to a substrate in a thin layer converts to a solid film
('per word weights', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])
('context_embed original', array([ 0.00721545, -1.86989769,  0.0337014 , -0.45578319, -0.17088661,
        0.94100352, -0.23116025,  1.44151491,  0.37699141,  1.42037278]))
('context_embed_out', array([ 0.00721545, -1.86989769,  0.0337014 , -0.45578319, -0.17088661,
        0.94100352, -0.23116025,  1.44151491,  0.37699141,  1.42037278]))
('weight mode', False)
weight is 10.0
('context_embed is ', array([ 0.00721545, -1.86989769,  0.0337014 , -0.45578319, -0.17088661,
        0.94100352, -0.23116025,  1.44151491,  0.37699141,  1.42037278]))
('context_weights', [10.0])


In [69]:
# Norm of the entry stored under the key 'the' in model_w2v, computed with
# numpy's default norm (the Euclidean / L2 norm when the entry is a 1-D vector).
# The bare expression is the last line of the cell, so its value is what the
# notebook displays as the cell result.
# NOTE(review): model_w2v is defined outside this cell -- presumably the gensim
# word2vec model loaded earlier in the notebook; confirm.
# NOTE(review): the saved output of this cell includes a truncated warning;
# it may be gensim's deprecation of indexing the model directly (in favour of
# model_w2v.wv['the']) -- verify against the installed gensim version.
np.linalg.norm(model_w2v['the'])

  """Entry point for launching an IPython kernel.


2.3791163