### Match to get similar neighbors
- Get top terms
- Get context embedding
- Find neighbors that:
> have a semantically similar context <br>
> have the treatment word substituted with another top term <br>

In [3]:
import copy
import io, time
from io import BytesIO
from itertools import combinations, cycle, product
from IPython.display import display
import math
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd
import pickle
import tarfile
import random
import re
import requests
from scipy.sparse import hstack, lil_matrix

from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', 1000)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1) # change None to -1


from collections import Counter, defaultdict
import numpy as np
import re

import sklearn

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import classification_report, accuracy_score


import torch
from transformers import * # here import bert

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Dataset is the project-local container built in get_data_embedding below.
from data_structure import Dataset #, get_IMDB, get_kindle

# Root directory that the loaders in this notebook prepend to their relative
# pickle paths. It is a module-level global and is reassigned by a later cell,
# so the value seen by a loader depends on which cells have already run.
data_path = '/data/zwang/2020_S/Attention/Counterfactual/'

# pickle.dump(ds_imdb, open(data_path+'imdb_embedding_1_01.pkl', 'wb'))
# pickle.dump(df_imdb_ite, open(data_path+'imdb_ite_1_01.pkl', 'wb'))

In [5]:
def get_kindle():
    """
    Load the Kindle review sentences and keep only the 'selected_train' rows.

    Returns:
        DataFrame holding the rows whose `flag` column equals
        'selected_train', re-indexed from 0.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle that the bare open() used to leak.
    with open("/data/zwang/2020_S/Attention/Counterfactual/kindle_ct/causal_sents/kindle_data.pkl",'rb') as fh:
        df_kindle = pickle.load(fh)
    # Build a fresh frame rather than resetting the index in place on a
    # filtered slice (which can trigger SettingWithCopyWarning).
    return df_kindle[df_kindle['flag']=='selected_train'].reset_index(drop=True)

In [6]:
# Load the Kindle training sentences, then show shape, label balance and a preview.
# display() returns None, which is why the echoed tuple ends with None.
df_kindle = get_kindle()
df_kindle.shape, Counter(df_kindle.label), display(df_kindle.head())

Unnamed: 0,text,rating,label,flag
0,"The story was good, but I was getting very irritated at all the grammatical and spelling errors",2,-1,selected_train
1,REALLY enjoyed this series and am hopeful there will be a book 5 to give me some closure on some of the other characters,4,1,selected_train
2,"The lead character was successful and not as sad as usual, but the chemistry was lacking for me",1,-1,selected_train
3,"I liked the characters though, I would have loved to see how Dylan would have handled their relationship",2,-1,selected_train
4,"Teenage, special needs child, ignorant ex-husband and a strong,weak,vulnerable, older woman",2,-1,selected_train


((10000, 4), Counter({-1: 5000, 1: 5000}), None)

In [7]:
class Counterfactual:
    """
    Plain holder for a train / test DataFrame pair plus a dataset nickname.

    NOTE(review): presumably declared here so that pickles containing objects
    of this type (see get_IMDB) can be loaded in this notebook - confirm.
    """
    def __init__(self, df_train, df_test, moniker):
        # Echo the first training row as a quick visual sanity check.
        preview = df_train.head(1)
        display(preview)
        self.moniker, self.train, self.test = moniker, df_train, df_test
        
def get_IMDB():
    """
    Load the training split of the paired IMDB counterfactual sentence data.

    Reads the pickled dataset object stored under the module-level `data_path`
    and returns its `train` frame restricted to the batch_id / text / label
    columns, re-indexed from 0.

    NOTE(review): the original note described the data as sentences with
    len>1 and len<30; no such filter is applied here - presumably it was done
    when the pickle was created.
    """
#     df_imdb = pickle.load(open(data_path+'imdb/imdb_sentences/imdb_sents.pkl','rb'))
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle that the bare open() used to leak.
    with open(data_path+"imdb_ct/sentiment/combined/paired/split_sents/ds_imdb.pkl", "rb") as fh:
        ds_imdb_ct = pickle.load(fh)
    # Return a new frame instead of resetting the index in place on a column
    # slice of ds_imdb_ct.train (which can trigger SettingWithCopyWarning).
    return ds_imdb_ct.train[['batch_id','text','label']].reset_index(drop=True)

In [96]:
# Load the paired IMDB training sentences, then show shape, label balance and a preview.
# display() returns None, which is why the echoed tuple ends with None.
df_imdb = get_IMDB()
df_imdb.shape, Counter(df_imdb.label), display(df_imdb.head())

Unnamed: 0,batch_id,text,label
0,4,"Long, boring, blasphemous.",-1
1,4,Never have I been so glad to see ending credits roll.,-1
2,40,Not good!,-1
3,40,It is like claiming an Elvis actor is as good as the real King.,-1
4,47,"This movie is so bad, it can only be compared to the all-time worst ""comedy"": Police Academy 7.",-1


((8173, 3), Counter({-1: 4059, 1: 4114}), None)

In [4]:
def get_large_IMDB_sentences():
    """
    IMDB sentences from the original large dataset.

    Reads 'large_imdb_sents.pkl' from the directory named by the module-level
    `data_path` and returns the unpickled object unchanged.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle that the bare open() used to leak.
    with open(data_path+'large_imdb_sents.pkl','rb') as fh:
        return pickle.load(fh)

In [22]:
# NOTE(review): this reassigns the global data_path to a nested directory.
# get_IMDB() builds its path as data_path + "imdb_ct/...", so calling it after
# this cell resolves to a different location than with the root path set in
# the configuration cell.
data_path = '/data/zwang/2020_S/Attention/Counterfactual/imdb_ct/sentiment/orig/eighty_percent/sentences/'
# Rebinds df_imdb (previously the paired IMDB sentences) to the large corpus.
df_imdb = get_large_IMDB_sentences()
df_imdb.shape, Counter(df_imdb.label)

((160000, 4), Counter({1: 80000, -1: 80000}))

In [6]:
df_imdb.head()

Unnamed: 0,text,label,type,length
0,"The characters are interesting, vibrant with primary colours and all.",1,train,10
1,<br /><br />the best way to watch this film is to not expect what you have seen in the past by Miyazaki.,1,test,22
2,"This one's worth watching more than once, and showing to all your friends.",1,test,14
3,That's not true in Japanese horror.,1,test,7
4,No one could have played this role any better that Jack Webb.,1,test,12


In [8]:
def simple_vectorize(df):
    """
    Build a binary bag-of-words representation of `df.text`.

    A term is kept when it occurs in at least 5 documents (min_df=5) and in at
    most 80% of them (max_df=.8).
    NOTE(review): an earlier version of this docstring said min_df = 10 "to
    agree with min_df for ite features"; the code uses 5 - confirm which value
    is intended.

    Returns:
        X: sparse document-term matrix (its shape is printed as a side effect)
        y: `df.label.values`
        vec: the fitted CountVectorizer
        feats: numpy array of the vocabulary terms
    """
    vec = CountVectorizer(min_df=5, binary=True, max_df=.8)
    X = vec.fit_transform(df.text)
    print(X.shape)
    y = df.label.values
    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this version provides.
    get_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
    # Going through list() keeps the same string dtype the old API produced.
    feats = np.array(list(get_names()))

    return X, y, vec, feats

In [9]:
def get_top_terms(dataset, coef_thresh, placebo_thresh, C=1):
    """
    Fit a logistic regression on (dataset.X, dataset.y) and split the features
    by coefficient magnitude.

    Top features: abs(coef) >= coef_thresh
    Placebos: abs(coef) <= placebo_thresh

    Args:
        dataset: object exposing the feature matrix `X` and labels `y`.
        coef_thresh: minimum absolute coefficient for a top feature.
        placebo_thresh: maximum absolute coefficient for a placebo feature.
        C: inverse regularisation strength passed to LogisticRegression.

    Returns:
        (top_feature_idx, placebo_feature_idx, coef), where the two index
        arrays are computed from the unrounded coefficients and `coef` holds
        every coefficient rounded to 3 decimals.
    """
    # 'auto' is not an accepted class_weight in current scikit-learn;
    # 'balanced' is the documented replacement.
    # NOTE(review): if the recorded results came from a scikit-learn version
    # that silently ignored 'auto', term counts may shift slightly - confirm.
    clf = LogisticRegression(class_weight='balanced', C=C, solver='lbfgs', max_iter=1000)
    clf.fit(dataset.X, dataset.y)

    coef = clf.coef_[0]
    abs_coef = np.abs(coef)
    top_feature_idx = np.where(abs_coef >= coef_thresh)[0]
    placebo_feature_idx = np.where(abs_coef <= placebo_thresh)[0]

    return top_feature_idx, placebo_feature_idx, np.array([float("%.3f" % c) for c in coef])


In [10]:
def get_wd_context(sentence, word, window=0):
    """
    Split `sentence` around the first whitespace-delimited token that contains
    `word` as a whole word (case-insensitive).

    If window > 0: use up to `window` tokens on each side of the match;
    If window == 0: use all tokens to the left / right.

    Returns:
        (left_context, right_context, context) as space-joined strings, or
        None when no token matches.
        With window == 0, `context` is the sentence without the matched token.
        NOTE(review): with window > 0, `context` still includes the matched
        token itself - confirm whether that is intended.

    Raises:
        ValueError: if `window` is negative (this previously surfaced as an
        UnboundLocalError once a match was found).
    """
    if window < 0:
        raise ValueError("window must be >= 0, got %r" % (window,))

    # Escape the word so regex metacharacters are matched literally, and
    # compile once instead of rebuilding the pattern for every token.
    pattern = re.compile(r'(?i)\b%s\b' % re.escape(word.lower()))
    toks = sentence.split()

    for i, t in enumerate(toks):
        if not pattern.search(t):
            continue
        if window > 0:
            lo = max(0, i-window)
            hi = min(i+window+1, len(toks))
            context = ' '.join(toks[lo:hi])
            left_context = ' '.join(toks[lo:i])
            right_context = ' '.join(toks[(i+1):hi])
        else:
            context = ' '.join(toks[:i] + toks[i+1:])
            left_context = ' '.join(toks[:i])
            right_context = ' '.join(toks[(i+1):])
        return left_context, right_context, context

    # No token contains the word.
    return None

In [8]:
get_wd_context(sentence='nice movie under the direction of Spielberg', word='of', window=0)

('nice movie under the direction',
 'Spielberg',
 'nice movie under the direction Spielberg')

In [11]:
class SentenceEdit:
    """
    A sentence together with the word that was removed from it.

    Attributes are stored exactly as passed in: the removed word, the index of
    the source sentence, the text to the left / right of the removed word, the
    context string and the sentence label.
    """
    def __init__(self, remove_wd, sentence_idx, left_context, right_context, context, label):
        self.remove_wd = remove_wd
        self.sentence_idx = sentence_idx
        self.left_context, self.right_context = left_context, right_context
        self.context = context
        self.label = label

    def __repr__(self):
        """Printable form: 'word ||| left ||| right ||| label', or 'context ||| label' when both sides are blank."""
        sides = str(self.left_context).strip() + str(self.right_context).strip()
        if sides:
            return '%s ||| %s ||| %s ||| %s \n' % (str(self.remove_wd), str(self.left_context), str(self.right_context), str(self.label))
        return '%s ||| %s \n' % (str(self.context), str(self.label))
    

In [12]:
def get_wd_unk_sentences(X, sentences, labels, vec, remove_wd_list, exclude_sent_idx, window=0):
    """
    unk sentence: sentence after removing the word

    Args:
        X: document-term matrix whose rows align with `sentences` / `labels`.
        sentences: indexable collection of raw sentence strings.
        labels: indexable collection of sentence labels.
        vec: fitted vectorizer; `vec.vocabulary_` maps a word to its column in X.
        remove_wd_list: words to remove, e.g.
            - top_words: abs(coef) > thresh
            - placebo_words: abs(coef) < thresh
        exclude_sent_idx: row indices of sentences to skip.
        window: forwarded to get_wd_context (0 = all tokens on each side).
            This argument used to be ignored in favour of a hard-coded 0.

    for each word to be removed
      for each treatment sentence (sentence containing this word)
        make a copy of the sentence with this word removed

    returns:
       unk_sentences: list of SentenceEdit objects
       word2sentences: dict from word to list of SentenceEdit objects
       list of the distinct sentence indices that produced at least one edit
    """
    word2sentences = defaultdict(list)
    unk_sentences = []
    containing_sents = set()
    # Set membership keeps the inner loop cheap when many rows are excluded.
    excluded = set(exclude_sent_idx)

    for word in remove_wd_list: # iterate over all treatment words
        wi = vec.vocabulary_[word]
        for si in X[:,wi].nonzero()[0]: # iterate over sentences containing current treatment word
            if si in excluded:
                continue
            found = get_wd_context(sentences[si], word, window=window)
            if found is None:
                # get_wd_context could not locate the word in the raw text
                # (its token matching differs from the vectorizer's), so
                # there is nothing to edit; skip instead of failing to unpack.
                continue
            left_context, right_context, context = found
            sent_obj = SentenceEdit(word, si, left_context, right_context, context, labels[si])

            word2sentences[word].append(sent_obj)
            unk_sentences.append(sent_obj)
            containing_sents.add(si)

    return unk_sentences, word2sentences, list(containing_sents)

In [13]:
def get_all_sentences(df):
    """
    Wrap every row of `df` in a SentenceEdit with no removed word.

    Side effect: adds (or overwrites) an 'i_th' column on `df` holding each
    row's position; that position becomes the SentenceEdit's sentence_idx.
    The full `text` is used as the context and both side contexts are empty.
    """
    df['i_th'] = range(df.shape[0])
    return [
        SentenceEdit('', row['i_th'], '' , '',  row['text'], row['label'])
        for _, row in df.iterrows()
    ]

In [14]:
def load_bert():
    """
    Load the pretrained 'bert-base-uncased' tokenizer and model.

    The model is configured to also return hidden states and attentions.

    Returns:
        (tokenizer, model)
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states=True, output_attentions=True)
    return (tokenizer, model)

# bert representation of each sentence.
def embed_sentence(sentence, sentence_model, tokenizer, layers=(0, 1, 2, 3)):
    """
    Encode `sentence` as one vector by concatenating selected hidden-state
    layers and averaging over tokens.

    Args:
        sentence: text to encode (special tokens are added by the tokenizer).
        sentence_model: callable whose output holds the tuple of per-layer
            hidden states at index 2 (a BertModel created with
            output_hidden_states=True, as in load_bert).
        tokenizer: object with `encode(text, add_special_tokens=True)`.
        layers: indices into the hidden-state tuple to concatenate. The
            default (0, 1, 2, 3) reproduces what this function has always
            computed, i.e. the FIRST four entries of the tuple.
            NOTE(review): earlier comments described this as "last four
            layers"; pass layers=(-1, -2, -3, -4) if that is what is wanted.
            Changing the default would change every stored embedding.

    Returns:
        1-D numpy array of length len(layers) * hidden_size
        (4 * 768 = 3072 for bert-base with the default layers).
    """
    token_ids = torch.tensor([tokenizer.encode(sentence, add_special_tokens=True)])
    with torch.no_grad():
        # index 2 of the model output is the tuple of hidden states
        hidden_states = sentence_model(token_ids)[2]
        selected_layers = [hidden_states[i] for i in layers] # each element is [1, n_tokens, hidden]
        # concatenate the selected layers over the feature dimension
        cat_hidden_states = torch.cat(tuple(selected_layers), dim=-1) # [1, n_tokens, len(layers)*hidden]
        # average over the token dimension and drop the batch dimension
        cat_sentence_embedding = torch.mean(cat_hidden_states, dim=1).squeeze()
        return cat_sentence_embedding.numpy()
        
def embed_all_sentences(sentences, bert_tokenizer=None, sentence_model=None):
    """
    Attach BERT embeddings to every SentenceEdit in `sentences` (in place).

    For each object this sets:
        context_embedding: embedding of `context`
        word_embedding: embedding of `remove_wd`

    Args:
        sentences: iterable of SentenceEdit objects.
        bert_tokenizer, sentence_model: optional preloaded tokenizer / model.
            Whichever is missing is obtained from load_bert(); pass both to
            avoid reloading BERT on every call.
    """
    # Previously only the tokenizer was checked, so supplying a tokenizer
    # without a model crashed, and a supplied model was discarded whenever the
    # tokenizer was missing.
    if bert_tokenizer is None or sentence_model is None:
        default_tokenizer, default_model = load_bert()
        if bert_tokenizer is None:
            bert_tokenizer = default_tokenizer
        if sentence_model is None:
            sentence_model = default_model
    for s in tqdm(sentences):
#         s.left_embedding = embed_sentence(s.left_context, sentence_model, bert_tokenizer)
#         s.right_embedding = embed_sentence(s.right_context, sentence_model, bert_tokenizer)
        s.context_embedding = embed_sentence(s.context, sentence_model, bert_tokenizer)
        s.word_embedding = embed_sentence(s.remove_wd, sentence_model, bert_tokenizer)


In [29]:
def get_data_embedding():
    """
    Build the embedded dataset for the first active configuration entry
    (currently IMDB).

    For that (loader, moniker, coef_thresh, placebo_thresh) entry:
      1. load the sentences and vectorize them with simple_vectorize;
      2. use get_top_terms to pick top / placebo features;
      3. for every sentence containing a top term, build a SentenceEdit with
         that term removed (window=0: all text left / right of the term);
      4. build one SentenceEdit per full sentence to serve as controls;
      5. attach BERT embeddings to both lists.

    Side effects: seeds the global `random` generator with 42 and prints
    progress information.

    Returns:
        The populated Dataset for the first configuration entry (the function
        returns from inside the loop), or None if the list is empty.
    """
    random.seed(42)
    for get_data_df, moniker, coef_thresh, placebo_thresh in [
            (get_IMDB, 'imdb', 1.0, 0.1),
#             (get_kindle, 'kindle', 1.0, 0.1),
#             (get_toxic_comment, 'toxic', 1.0, 0.05), # limit_len == False
#             (get_toxic_tw, 'toxic_tw', 0.7, 0.2),
#             (get_TV_data, 'tv', 0.9, 0.1),
        ]: 
        
        df = get_data_df()
        X, y, vec, feats = simple_vectorize(df) # vectorize text
        ds = Dataset(X, y, vec, df, moniker) # construct dataset object
        
        print('%s dataset, %d instances' % (moniker,len(df)))
        print('Label distribution: %s' % str(Counter(y).items()))
        print('Feature matrix: %s' % str(X.shape))
        
        # get top / placebo features
        ds.top_feature_idx, ds.placebo_feature_idx, ds.coef = get_top_terms(ds, coef_thresh=coef_thresh, 
                                                                            placebo_thresh=placebo_thresh, C=1)
        ds.top_features = feats[ds.top_feature_idx]
        ds.placebo_features = feats[ds.placebo_feature_idx]
        top_coef = ds.coef[ds.top_feature_idx]
        print('\n%d top terms: %d pos, %d neg\n' % (len(ds.top_features), 
                                                    len(np.where(top_coef>0)[0]), 
                                                    len(np.where(top_coef<0)[0])))
        
#         print('\n%d placebo terms: %d pos, %d neg\n' % (len(ds.placebo_features), 
#                                                         len(np.where(ds.coef[ds.placebo_feature_idx]>0)[0]), 
#                                                         len(np.where(ds.coef[ds.placebo_feature_idx]<0)[0])))
        
        # embed texts
        print('getting treat sentences')                                                                   
        ds.topwd_unk_sentences_list, ds.topwd_unk_sentences_dict, topwd_idx = get_wd_unk_sentences(X,
                                                                                                   df.text,df.label,
                                                                                                   vec,ds.top_features,
                                                                                                   exclude_sent_idx=[],
                                                                                                   window=0)
        print('%d unk sentences with top terms\n' % len(ds.topwd_unk_sentences_list))
        # Load BERT once and reuse it for both embedding passes; letting
        # embed_all_sentences load it would reload the model on every call.
        bert_tokenizer, sentence_model = load_bert()
        embed_all_sentences(ds.topwd_unk_sentences_list, bert_tokenizer=bert_tokenizer, sentence_model=sentence_model) 
#         embed_all_sentences_USE(ds.topwd_unk_sentences_list)

        # The placebo / vocab variants below are disabled, so the attributes
        # they would set (placebowd_unk_sentences_list, vocab_unk_sentences_list)
        # do not exist on the returned Dataset.
#         print('getting placebo sentences')     
#         ds.placebowd_unk_sentences_list, ds.placebowd_unk_sentences_dict, placebo_wd_idx = get_wd_unk_sentences(X,
#                                                                                                                 df.text,
#                                                                                                                 df.label,
#                                                                                                                 vec,
#                                                                                                                 ds.placebo_features,
#                                                                                                                 topwd_idx,
#                                                                                                                 window=window_size)
#         print('%d unk sentences with placebo terms\n' % len(ds.placebowd_unk_sentences_list))
#         embed_all_sentences(ds.placebowd_unk_sentences_list) 
        
#         print('getting vocab sentences')     
#         ds.vocab_unk_sentences_list, ds.vocab_unk_sentences_dict, vocab_wd_idx = get_wd_unk_sentences(X,df.text,df.label,vec,ds.vec.get_feature_names(),exclude_sent_idx=[])
#         print('%d unk sentences with vocab terms\n' % len(ds.vocab_unk_sentences_list))
#         embed_all_sentences(ds.vocab_unk_sentences_list) 
        
        print('getting all sentences as control')
        ds.all_sentences = get_all_sentences(df)
        print('%d control sentences\n\n' % len(ds.all_sentences))
        embed_all_sentences(ds.all_sentences, bert_tokenizer=bert_tokenizer, sentence_model=sentence_model)    
        
#         pickle.dump(ds, open(data_path+'large_imdb_sents_embedding_1_01.pkl', 'wb'))
#         pickle.dump(ds, open(data_path+'kindle/kindle_sentences/kindle_sents_embedding_1_01.pkl', 'wb'))
        
        # Only the first configuration entry is processed.
        return ds
        
        
#         datasets.append(ds)
        
#     return datasets

In [99]:
ds_imdb.topwd_unk_sentences_list[10]

adorable ||| The babies are ||| and it's fun watching them play and grow. ||| 1 

In [34]:
# Build the embedded IMDB dataset and report the elapsed wall-clock minutes.
# The pickle.dump lines are left disabled, so the result lives only in this kernel.
start = time.time() # imdb: 3 hours; toxic: 6 hours
ds_imdb = get_data_embedding() # kindle, 20 min
# pickle.dump(ds_kindle, open(data_path+'kindle_ct/ITE/ds_kindle_emb.pkl','wb'))
# pickle.dump(ds_imdb, open(data_path+'imdb_ct/sentiment/combined/paired/ITE/ds_imdb_emb.pkl','wb'))
end = time.time()
print((end-start)/60)

(8173, 2865)
new dataset with 8173 records


Unnamed: 0,batch_id,text,label
0,4,"Long, boring, blasphemous.",-1


imdb dataset, 8173 instances
Label distribution: dict_items([(-1, 4059), (1, 4114)])
Feature matrix: (8173, 2865)

197 top terms: 99 pos, 98 neg

getting treat sentences
6392 unk sentences with top terms



HBox(children=(FloatProgress(value=0.0, max=6392.0), HTML(value='')))


getting all sentences as control
8173 control sentences




HBox(children=(FloatProgress(value=0.0, max=8173.0), HTML(value='')))


12.552439943949382


In [20]:
# ds_kindle = pickle.load(open(data_path+'kindle/kindle_sentences/kindle_sents_embedding_1_01.pkl', 'rb'))
# term_stats = {}
# term_stats['term'] = ds_kindle.feats[ds_kindle.top_feature_idx]
# term_stats['coef'] = ds_kindle.coef[ds_kindle.top_feature_idx]
# term_df = pd.DataFrame(term_stats)
# term_df.shape

(270, 2)

In [21]:
# term_df.to_csv(data_path+'kindle/kindle_sentences/annotate_top_terms.csv' , index=False)

#### ITE match
- context_A + treat_wd_A
- context_B + placebo_wd_B
- similarity(context_A, context_B)

In [19]:
# ds_data = pickle.load(open(data_path+'imdb_embedding_1_01.pkl', 'rb'))

In [76]:
def avg_embedding(obj,flag):
    """
    Average the embeddings of the requested parts of a sentence object.

    flag: iterable drawn from 'left', 'right', 'word'. A part contributes
    only when its text (left_context / right_context / remove_wd) is
    non-blank; any other flag value is ignored.

    Returns the element-wise mean of the contributing embeddings, or a zero
    vector with the length of obj.left_embedding when nothing contributes.
    """
    # (flag value, attribute holding the text, attribute holding the embedding)
    parts = (
        ('left', 'left_context', 'left_embedding'),
        ('right', 'right_context', 'right_embedding'),
        ('word', 'remove_wd', 'word_embedding'),
    )

    collected = []
    for requested in flag:
        for key, text_attr, emb_attr in parts:
            if requested == key:
                if len(getattr(obj, text_attr).strip())>0:
                    collected.append(getattr(obj, emb_attr))
                break

    if not collected:
        return np.zeros(len(obj.left_embedding))
    return np.mean(collected, axis=0)

In [86]:
# avg_embedding(c_sent_objlist[3478],flag=['left','right'])

In [77]:
def concat_embedding(obj,flag):
    """
    Concatenate the embeddings of the requested parts of a sentence object.

    flag: iterable drawn from 'left', 'right', 'word'. 'left' and 'right'
    always contribute; 'word' contributes only when remove_wd is non-blank.
    Any other flag value is ignored.

    Returns the horizontally stacked embeddings in the order requested. When
    nothing contributes, prints "error" and returns None.
    """
    pieces = []
    for requested in flag:
        if requested in ('left', 'right'):
            pieces.append(getattr(obj, requested + '_embedding'))
        elif requested == 'word':
            if len(obj.remove_wd.strip())>0:
                pieces.append(obj.word_embedding)

    if not pieces:
        print("error")
        return None
    return np.hstack(pieces)

In [21]:
def _embedding_for_match(sent_obj, emb_by):
    """Return the vector used to compare `sent_obj` under the given emb_by mode."""
    if emb_by == 'context':
        return sent_obj.context_embedding
    if emb_by == 'avg':
        return avg_embedding(sent_obj, ['left', 'right'])
    if emb_by == 'concat':
        return concat_embedding(sent_obj, ['left', 'right'])
    raise ValueError("emb_by must be 'context', 'avg' or 'concat', got %r" % (emb_by,))


def _similarities(query_vec, candidates, emb_by):
    """Cosine similarity between `query_vec` and every candidate (empty list if no candidates)."""
    if len(candidates) == 0:
        return []
    return cosine_similarity([query_vec], [_embedding_for_match(c, emb_by) for c in candidates])[0]


def _top_matches(candidates, sims, is_valid, limit=10):
    """Walk candidates by descending similarity and keep the first `limit` that pass `is_valid`."""
    matches = []
    for cand, sim in sorted(zip(candidates, sims), key=lambda x: -x[1]):
        if is_valid(cand):
            matches.append((cand, float("%.3f" % sim)))
            if len(matches) == limit:
                break
    return matches


def get_sentence_match(t_sent_obj, t_sent_objlist, c_sent_objlist, similarity='cosine', emb_by='context',min_sim=.7):
    """
    Find the sentences most similar to one treatment sentence.

    Args:
        t_sent_obj: the query SentenceEdit (context_A + treat_wd).
        t_sent_objlist: candidate treatment SentenceEdits.
        c_sent_objlist: candidate control SentenceEdits; when it holds more
            than 50000 items a fixed random sample of 50000 is used.
        similarity: accepted for interface compatibility; cosine similarity
            is always used.
        emb_by: 'context' (context_embedding), 'avg' (avg_embedding of
            left/right) or 'concat' (concat_embedding of left/right).
            'avg' and 'concat' previously failed with NameError because the
            similarity arrays were only computed for 'context'.
        min_sim: accepted for interface compatibility; currently NOT applied
            as a filter.

    Returns:
        (treat_match, control_match): two lists of up to 10
        (SentenceEdit, similarity rounded to 3 decimals) pairs in descending
        similarity.
        - treat matches come from a different sentence AND removed a
          different word than the query;
        - control matches come from a different sentence whose context does
          not contain the query's removed word.
          NOTE(review): that is a case-sensitive substring test on the raw
          context, so capitalised occurrences are not detected and words
          merely containing it are excluded - confirm this is intended.

    Raises:
        ValueError: if `emb_by` is not one of the supported modes.

    Side effect: reseeds the global `random` generator with 42 on every call
    so that the control sample is identical across calls.
    """
    query_vec = _embedding_for_match(t_sent_obj, emb_by)

    random.seed(42)
    if(len(c_sent_objlist) > 50000):
        c_sent_obj_smp = random.sample(c_sent_objlist,50000)
    else:
        c_sent_obj_smp = c_sent_objlist

    # similarity between the current treatment context and every candidate
    control_sims = _similarities(query_vec, c_sent_obj_smp, emb_by)
    treat_sims = _similarities(query_vec, t_sent_objlist, emb_by)

    treat_match = _top_matches(
        t_sent_objlist, treat_sims,
        lambda cand: (cand.sentence_idx != t_sent_obj.sentence_idx) and (cand.remove_wd != t_sent_obj.remove_wd))

    control_match = _top_matches(
        c_sent_obj_smp, control_sims,
        lambda cand: (cand.sentence_idx != t_sent_obj.sentence_idx) and (t_sent_obj.remove_wd not in cand.context))

    return treat_match, control_match

In [35]:
# Sanity check: cosine similarity between the first treatment sentence's
# context embedding and every treatment sentence (both lists are the same
# here, so the first entry compares the sentence with itself).
ds_data = ds_imdb
t_sent_objlist = ds_data.topwd_unk_sentences_list
c_sent_objlist = ds_data.topwd_unk_sentences_list

sims = cosine_similarity([t_sent_objlist[0].context_embedding], [c_sent_obj.context_embedding for c_sent_obj in c_sent_objlist])[0]
sims
# diff, match_list, match_sim, match_label = get_sentence_match(t_sent_objlist[2000], c_sent_objlist, min_sim=.7)

array([0.9999998 , 0.7434988 , 0.73416686, ..., 0.76551396, 0.65273   ,
       0.775444  ], dtype=float32)

- most similar match with same / opposite label?

In [22]:
def get_data_matches(ds_data, matchby='treat', emb_by='context', min_sim=0.01):
    """
    Collect the most similar treatment and control sentences for every
    treatment sentence in `ds_data`.

    Args:
        ds_data: dataset exposing `topwd_unk_sentences_list` (SentenceEdit
            objects for sentences with a top word removed) and `all_sentences`
            (SentenceEdit objects for every full sentence).
        matchby: treat / placebo / vocab / control. Accepted for interface
            compatibility only: treatment candidates always come from
            `topwd_unk_sentences_list` and control candidates from
            `all_sentences`, as in the previous implementation. The old code
            also looked up a per-mode list that was never used, which raised
            AttributeError for 'placebo' / 'vocab' on datasets lacking those
            attributes.
            NOTE(review): decide whether matchby should select the control pool.
        emb_by: embedding mode forwarded to get_sentence_match.
        min_sim: forwarded to get_sentence_match.

    Returns:
        DataFrame with one row per treatment sentence and the columns
        term, sentence, treat_match, control_match.
    """
    if matchby not in ('treat', 'placebo', 'vocab', 'control'):
        # Kept as a message rather than an error, matching earlier behavior.
        print("combinations of treat + placebo + control")

    treat_pool = ds_data.topwd_unk_sentences_list
    control_pool = ds_data.all_sentences

    matched_ites = []
    for t_sent_obj in tqdm(treat_pool):
        treat_match, control_match = get_sentence_match(t_sent_obj, treat_pool, control_pool,
                                                        similarity='cosine', emb_by=emb_by, min_sim=min_sim)

        matched_ites.append(
            {
                'term': t_sent_obj.remove_wd,
#                 'sentence_id': t_sent_obj.sentence_idx,
                'sentence': t_sent_obj,
                'treat_match': treat_match,
                'control_match': control_match,
            }
        )

    return pd.DataFrame(matched_ites)

In [23]:
# Run the matching for the Kindle dataset and report elapsed minutes.
# NOTE(review): ds_kindle is not created by any visible cell (its loads are
# commented out), so this cell depends on state left in the kernel.
start = time.time() # 10pm ~7am
# ds_imdb_window5 = pickle.load(open(data_path+'imdb/imdb_embedding_1_01_window5.pkl', 'rb'))
# print('vocab')
# df_imdb_vocab = get_data_matches(ds_imdb, matchby='vocab')
# print('control')
# df_imdb_control = get_data_matches(ds_imdb, matchby='control')

# match with context
# ds_kindle = pickle.load(open(data_path+'kindle/kindle_embedding_1_01.pkl', 'rb'))
ds_data = ds_kindle # 8 hours
df_ite_match = get_data_matches(ds_data, matchby='treat', emb_by='context') # 90 min for kindle, 22 min for imdb
# pickle.dump(df_ite_match, open(data_path+'kindle_ct/ITE/kindle_ite_match.pkl','wb'))
# pickle.dump(df_ite_match, open(data_path+'imdb_ct/sentiment/combined/paired/ITE/imdb_ite_match.pkl','wb'))
end = time.time() 
print((end-start)/60)

HBox(children=(FloatProgress(value=0.0, max=13014.0), HTML(value='')))


76.70414376656214


In [26]:
df_ite_match.tail(1)

Unnamed: 0,term,sentence,treat_match,control_match
13013,wrong,wrong ||| The poor victims were just in the ||| place at the wrong time ||| -1 \n,"[(series ||| The other books in the ||| are just as good ||| 1 \n, 0.832), (great ||| The others were ||| but I had a hard time reading the story with her in it ||| -1 \n, 0.823), (boring ||| The book was ||| and the characters were just as bad ||| -1 \n, 0.815), (boring ||| I found it ||| and the story was all over the place ||| -1 \n, 0.811), (falls ||| I WAS HAPPY WITH THE PRICE BUT IT ||| RIGHT OUT OF THE CASE ||| -1 \n, 0.81), (editing ||| The ||| was so poor, however, that it was hard to get through the story ||| -1 \n, 0.809), (love ||| The whole ||| story was thrown in at the last minute ||| -1 \n, 0.807), (series ||| Once more a good ||| written about England and the problems of the average person during that time ||| 1 \n, 0.806), (love ||| The way the unethical partner was handled was well deserved and in the end, the ||| of the family members won out ||| 1 \n, 0.805), (love ||| When you thought that it was impossible to ||| 2 men at the same time ||| 1 \n, 0.803)]","[(The other books in the series are just as good ||| 1 \n, 0.815), (The book was boring, and the characters were just as bad ||| -1 \n, 0.81), (The others were great but I had a hard time reading the story with her in it ||| -1 \n, 0.81), (I WAS HAPPY WITH THE PRICE BUT IT FALLS RIGHT OUT OF THE CASE ||| -1 \n, 0.81), (I found it boring and the story was all over the place ||| -1 \n, 0.805), (The whole love story was thrown in at the last minute ||| -1 \n, 0.803), (It was not funny at all and the romance was the worst ||| -1 \n, 0.803), (The way the unethical partner was handled was well deserved and in the end, the love of the family members won out ||| 1 \n, 0.801), (Once more a good series written about England and the problems of the average person during that time ||| 1 \n, 0.8), (It was nice to see the good guys win in the end ||| 1 \n, 0.798)]"


In [86]:
# Bundle the treat / placebo match tables and save them in one pickle.
# NOTE(review): df_ite_treat and df_ite_placebo are not created by any visible
# cell; this cell depends on state left in the kernel.
df_ite = {
    'treat': df_ite_treat,
    'placebo': df_ite_placebo,
}
# The context manager flushes and closes the file once the dump is done.
with open(data_path+'kindle/kindle_ite_byconcat.pkl', 'wb') as fh:
    pickle.dump(df_ite, fh)

In [32]:
'15' in ds_kindle.top_features

True

In [31]:
df_ite_placebo_window5.head(2)

Unnamed: 0,term,sentence_id,treat_obj,control_ids,control_cos
0,15,308,15 ||| read if he had chosen ||| legends and elaborated on them ||| -1 \n,"[188, 4540, 4370, 3299, 2749, 1507, 6175, 857, 2522, 5803, 6059, 1410, 6108, 1620, 4390, 234, 2982, 5982, 2429, 5148, 7162, 5755, 3136, 175, 6172, 4643, 5773, 80, 2985, 3574, 5553, 3364, 1503, 6244, 2074, 1541, 6586, 2010, 2751, 164, 6224, 2025, 6096, 1257, 4366, 5356, 4996, 7121, 1277, 3651, 3076, 3783, 6974, 692, 733, 744, 629, 1044, 5874, 2333, 859, 1677, 2023, 2729, 5675, 137, 6329, 846, 2071, 4474, 2831, 2805, 880, 1125, 6408, 1922, 6048, 6405, 4636, 5663, 2176, 3118, 2882, 1791, 1183, 745, 2444, 7165, 6163, 1130, 6989, 4187, 105, 616, 3619, 4642, 3760, 6174, 5863, 1865]","[0.774431, 0.7657779, 0.7654418, 0.7586538, 0.75765145, 0.75622255, 0.75387245, 0.7537848, 0.752055, 0.7515459, 0.751132, 0.75104064, 0.75036454, 0.75020015, 0.7485354, 0.74835086, 0.74833584, 0.74815947, 0.7479827, 0.747982, 0.7470428, 0.7463633, 0.74589694, 0.745704, 0.745267, 0.7447803, 0.744725, 0.74398893, 0.7432944, 0.7432766, 0.7427939, 0.74252474, 0.742417, 0.74239707, 0.7420739, 0.7419039, 0.7415648, 0.74129796, 0.74038553, 0.73997355, 0.7397634, 0.73882425, 0.7388134, 0.73840004, 0.73831725, 0.738283, 0.737738, 0.7374506, 0.7374104, 0.7374099, 0.7373049, 0.73705024, 0.73661757, 0.73660856, 0.73660856, 0.73660856, 0.73644245, 0.736059, 0.73586774, 0.73576295, 0.7357268, 0.7356725, 0.7353916, 0.73535186, 0.7352672, 0.73520756, 0.73473483, 0.73451936, 0.73436654, 0.73417914, 0.73410475, 0.73389065, 0.73387575, 0.73384523, 0.7338085, 0.7337475, 0.7335499, 0.73337823, 0.733353, 0.73333454, 0.7333176, 0.7330762, 0.73299396, 0.73235583, 0.73223346, 0.73214984, 0.73211503, 0.73199534, 0.7319912, 0.73147464, 0.7314663, 0.7311471, 0.7310847, 0.73101175, 0.7308252, 0.7308156, 0.7305696, 0.73056245, 0.73031694, 0.7302091]"
1,15,434,15 ||| free but i want my ||| min of life back ||| -1 \n,"[2127, 2170, 3125, 2113, 718, 2121, 6895, 6026, 2160, 6782, 243, 6664, 2126, 1111, 5224, 6805, 629, 6357, 2184, 1812, 2084, 5175, 7037, 1179, 2259, 3416, 3799, 1251, 5773, 6337, 3363, 1342, 5147, 6274, 6172, 4676, 6713, 1554, 2158, 6884, 473, 614, 708, 3676, 3362, 4160, 2167, 2924, 1427, 3539, 7021, 4255, 6251, 1193, 5111, 2177, 5244, 3181, 7123, 3454, 3931, 5234, 7141, 2020, 3555, 1277, 2216, 3248, 3221, 2699, 1762, 6598, 6519, 110, 6553, 7137, 253, 5470, 5478, 1354, 3122, 175, 3212, 3156, 6761, 4560, 6741, 4354, 3952, 3708, 898, 4574, 1191, 1207, 3990, 2236, 244, 773, 3562, 6423]","[0.80159783, 0.80159783, 0.75432986, 0.7523407, 0.7521267, 0.7503412, 0.7487483, 0.74851257, 0.7460893, 0.7457087, 0.74384177, 0.7437999, 0.7436033, 0.7432044, 0.74240184, 0.74160695, 0.74076843, 0.74056995, 0.7400458, 0.73904514, 0.7377364, 0.7349162, 0.7347597, 0.7345468, 0.73429215, 0.73221064, 0.7314495, 0.72963405, 0.7294752, 0.7294397, 0.72929406, 0.7286548, 0.72816515, 0.72712684, 0.72669935, 0.7263777, 0.7255297, 0.72502124, 0.7247044, 0.72465146, 0.72463375, 0.72456336, 0.7242066, 0.7236022, 0.72344446, 0.72297347, 0.7227949, 0.7224593, 0.72219646, 0.722188, 0.7218343, 0.7215353, 0.721472, 0.7212204, 0.7209994, 0.72019124, 0.71988475, 0.719862, 0.71956116, 0.7195289, 0.71947646, 0.7192878, 0.7187964, 0.7187561, 0.7184754, 0.7182376, 0.7180848, 0.7177154, 0.7175374, 0.7174197, 0.7173748, 0.7172474, 0.7171042, 0.7170011, 0.71683997, 0.7166271, 0.7165545, 0.7164749, 0.7164495, 0.71596324, 0.71592975, 0.7158358, 0.7158158, 0.7157743, 0.7155573, 0.71547824, 0.7154068, 0.7153419, 0.7153045, 0.7151293, 0.7151145, 0.71509135, 0.71494764, 0.714921, 0.7148969, 0.71487635, 0.71478707, 0.71476185, 0.71471786, 0.7146066]"


In [138]:
ds_imdb_window5.topwd_unk_sentences_list[1845]

effective ||| subtle , but it is ||| . it's a quirky , ||| 1 

In [108]:
df_ite_window5['byconcat'].head(2)

Unnamed: 0,term,sentence_id,treat_obj,control_obj,similarity,control_list,control_list_sim,control_lb_list,causal_flag
0,51,1685,"51 ||| on an igloo , formula ||| sank from quirky to jerky ||| -1 \n","eager ||| on energy , and too ||| to be quirky at moments ||| -1 \n",0.771962,"[eager ||| on energy , and too ||| to be quirky at moments ||| -1 \n, too ||| low on energy , and ||| eager to be quirky at ||| -1 \n, charming ||| davis is funny , ||| and quirky in her feature ||| 1 \n, charming ||| a ||| , quirky and leisurely paced ||| 1 \n, effective ||| subtle , but it is ||| . it's a quirky , ||| 1 \n, add ||| tries to ||| some spice to its quirky ||| -1 \n, built ||| a premise , a joke ||| entirely from musty memories of ||| -1 \n, funny ||| davis is ||| , charming and quirky in ||| 1 \n, tries ||| , the harder that liman ||| to squeeze his story , ||| 1 \n, unique ||| how the film knows what's ||| and quirky about canadians . ||| 1 \n]","[0.7719615, 0.76660997, 0.7384485, 0.7363356, 0.7339607, 0.7187836, 0.7158764, 0.7153128, 0.712566, 0.7085526]","[-1, -1, 1, 1, 1, -1, -1, 1, 1, 1]",0
1,51,2796,51 ||| formula ||| is so trite that even ||| -1 \n,51 ||| formula ||| promises a new kind of ||| -1 \n,0.853867,"[51 ||| formula ||| promises a new kind of ||| -1 \n, 51 ||| formula ||| has dulled your senses faster ||| -1 \n, manages ||| ||| to be original , even ||| 1 \n, problem ||| the ||| is that for the most ||| -1 \n, none ||| ||| of this is half as ||| -1 \n, jonah ||| ||| is only so-so . . ||| -1 \n, problem ||| the ||| is that the movie has ||| -1 \n, beauty ||| the ||| of the piece is that ||| 1 \n, problem ||| the ||| with this film is that ||| -1 \n, problem ||| the ||| with this film is that ||| -1 \n]","[0.85386735, 0.8460399, 0.79239595, 0.7908349, 0.78753716, 0.785044, 0.7844373, 0.7817038, 0.77946806, 0.77946806]","[-1, -1, 1, -1, -1, -1, -1, 1, -1, -1]",0


In [None]:
# Time the treat / placebo matching on the full-sentence embeddings.
start = time.time()
# match with whole sentence
# Load through a context manager so the file handle is closed right after unpickling.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open(data_path+'imdb/imdb_embedding_1_01.pkl', 'rb') as pkl_file:
    ds_data = pickle.load(pkl_file)
print('treat')
df_ite_treat_sentence = get_data_matches(ds_data, matchby='treat', emb_by='concat')
print('placebo')
df_ite_placebo_sentence = get_data_matches(ds_data, matchby='placebo', emb_by='concat')

end = time.time()
# Elapsed wall-clock time in minutes.
print((end-start)/60)

In [49]:
# pickle.dump(df_imdb_vocab_byconcat_window5,open(data_path+'imdb_vocab_byconcat_window5.pkl', 'wb'))
# df_imdb_treat_byconcat_window5 = pickle.load(open(data_path+'imdb_treat_byconcat_window5.pkl', 'rb'))

In [44]:
# Bundle the three window-5 treatment match tables into one dict keyed by embedding strategy.
df_imdb_treat_window5 = {
    'bymean': df_imdb_treat_bymean_window5,
    'bycontext': df_imdb_treat_bycontext_window5,
    'byconcat': df_imdb_treat_byconcat_window5,
}

# Write through a context manager so the file is flushed and closed as soon as the dump finishes.
# data_path already ends with '/', so this path contains '//' (harmless on POSIX); the literal is kept unchanged.
with open(data_path+'/imdb/imdb_treat_ite_window5.pkl', 'wb') as pkl_file:
    pickle.dump(df_imdb_treat_window5, pkl_file)

In [90]:
# Bundle the treat / placebo / vocab / control tables into one dict and persist it.
df_imdb_ite = {
    'treat': df_imdb_treat,
    'placebo': df_imdb_placebo,
    'vocab': df_imdb_vocab,
    'control': df_imdb_control,
}
# Write through a context manager so the file is flushed and closed as soon as the dump finishes.
with open(data_path+'imdb_ite_1_01.pkl', 'wb') as pkl_file:
    pickle.dump(df_imdb_ite, pkl_file)

#### Check how many sentences (and how many distinct terms) have an empty left or right context

In [51]:
# Preview the first two rows of df_imdb_treat_byconcat_window5.
df_imdb_treat_byconcat_window5.head(2)

Unnamed: 0,term,sentence_id,treat_obj,control_obj,similarity,control_list,control_list_sim,control_lb_list,causal_flag
0,51,1685,"51 ||| on an igloo , formula ||| sank from quirky to jerky ||| -1 \n","eager ||| on energy , and too ||| to be quirky at moments ||| -1 \n",0.771962,"[eager ||| on energy , and too ||| to be quirky at moments ||| -1 \n, too ||| low on energy , and ||| eager to be quirky at ||| -1 \n, charming ||| davis is funny , ||| and quirky in her feature ||| 1 \n, charming ||| a ||| , quirky and leisurely paced ||| 1 \n, effective ||| subtle , but it is ||| . it's a quirky , ||| 1 \n, add ||| tries to ||| some spice to its quirky ||| -1 \n, built ||| a premise , a joke ||| entirely from musty memories of ||| -1 \n, funny ||| davis is ||| , charming and quirky in ||| 1 \n, tries ||| , the harder that liman ||| to squeeze his story , ||| 1 \n, unique ||| how the film knows what's ||| and quirky about canadians . ||| 1 \n]","[0.7719615, 0.76660997, 0.7384485, 0.7363356, 0.7339607, 0.7187836, 0.7158764, 0.7153128, 0.712566, 0.7085526]","[-1, -1, 1, 1, 1, -1, -1, 1, 1, 1]",0
1,51,2796,51 ||| formula ||| is so trite that even ||| -1 \n,51 ||| formula ||| promises a new kind of ||| -1 \n,0.853867,"[51 ||| formula ||| promises a new kind of ||| -1 \n, 51 ||| formula ||| has dulled your senses faster ||| -1 \n, manages ||| ||| to be original , even ||| 1 \n, problem ||| the ||| is that for the most ||| -1 \n, none ||| ||| of this is half as ||| -1 \n, jonah ||| ||| is only so-so . . ||| -1 \n, problem ||| the ||| is that the movie has ||| -1 \n, beauty ||| the ||| of the piece is that ||| 1 \n, problem ||| the ||| with this film is that ||| -1 \n, problem ||| the ||| with this film is that ||| -1 \n]","[0.85386735, 0.8460399, 0.79239595, 0.7908349, 0.78753716, 0.785044, 0.7844373, 0.7817038, 0.77946806, 0.77946806]","[-1, -1, 1, -1, -1, -1, -1, 1, -1, -1]",0


In [53]:
# Number of top-word sentence objects held by ds_imdb.
len(ds_imdb.topwd_unk_sentences_list)

8882

In [52]:
# Number of placebo-word sentence objects held by ds_imdb.
# The recorded output of this cell is a count, so take len() rather than echoing the whole list.
len(ds_imdb.placebowd_unk_sentences_list)

12996

In [58]:
# Compare the word tokens extracted from the first top-word sentence's left context with the raw string.
# A raw string is used for the pattern because '\w' in a normal string literal is an invalid escape sequence.
first_left_context = ds_imdb.topwd_unk_sentences_list[0].left_context
re.findall(r'\w+', first_left_context), first_left_context

(['on', 'an', 'igloo', 'formula'], 'on an igloo , formula')

In [None]:
# ds_imdb = pickle.load(open(data_path+moniker+'_embedding_1_01_window5.pkl', 'rb'))

In [72]:
def check_empty_context(sentObj_list):
    """
    Find the items whose left or right context contains no word characters.

    A context counts as empty when the regex ``\\w+`` finds nothing in it, so
    a context made only of punctuation or whitespace is empty too.

    Parameters
    ----------
    sentObj_list : list
        A list of sentenceEdit objects. Each item must expose string
        attributes ``left_context`` and ``right_context``.

    Returns
    -------
    list of int
        Positions in ``sentObj_list`` of the items with an empty left or
        right context, in ascending order.

    Side effects
    ------------
    Prints the count and the fraction of such items. For an empty input the
    fraction is reported as 0.00 instead of raising ZeroDivisionError.
    """
    # Raw string: '\w' in a normal literal is an invalid escape sequence.
    word_pattern = re.compile(r'\w+')

    empty_id = [
        si
        for si, sent in enumerate(sentObj_list)
        if word_pattern.search(sent.left_context) is None
        or word_pattern.search(sent.right_context) is None
    ]

    total = len(sentObj_list)
    # Guard the ratio so an empty list does not divide by zero.
    empty_ratio = len(empty_id) / total if total else 0.0

    print("%d (%.2f) items with empty contexts." % (len(empty_id), empty_ratio))
    return empty_id

In [73]:
# Positions of the items with an empty left or right context in the top-word and placebo-word lists.
# check_empty_context also prints the count and fraction for each list.
topwd_empty_ids = check_empty_context(ds_imdb.topwd_unk_sentences_list)
placebowd_empty_ids = check_empty_context(ds_imdb.placebowd_unk_sentences_list)

1297 (0.15) items with empty contexts.
1596 (0.12) items with empty contexts.


In [75]:
# Removed word of every placebo sentence that was flagged as having an empty context.
wds = [ds_imdb.placebowd_unk_sentences_list[eid].remove_wd for eid in placebowd_empty_ids]

# Number of distinct removed words among those sentences.
len(Counter(wds))

286

#### Double check current embedding and ite data

In [86]:
# Load both embedded datasets; context managers close each file handle right after unpickling.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open(data_path+'imdb/imdb_embedding_1_01_window5.pkl', 'rb') as pkl_file:
    ds_imdb_window5 = pickle.load(pkl_file)
with open(data_path+'imdb/imdb_embedding_1_01.pkl', 'rb') as pkl_file:
    ds_imdb = pickle.load(pkl_file)

In [147]:
# Preview the first rows of the DataFrame attached to the dataset object.
ds_imdb.df.head()

Unnamed: 0,label,text,i_th
0,-1,"simplistic , silly and tedious .",0
1,-1,"it's so laddish and juvenile , only teenage boys could possibly find it funny .",1
2,-1,exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable .,2
3,-1,"[garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation .",3
4,-1,a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification .,4


In [87]:
# Sizes of the top-word, placebo-word and vocab sentence lists in ds_imdb_window5.
len(ds_imdb_window5.topwd_unk_sentences_list), len(ds_imdb_window5.placebowd_unk_sentences_list), len(ds_imdb_window5.vocab_unk_sentences_list)

(8882, 12996, 156632)

In [89]:
# First three placebo-word sentence objects from ds_imdb_window5.
ds_imdb_window5.placebowd_unk_sentences_list[:3]

[20th ||| the relevance of these two ||| footnotes . ||| -1 ,
 20th ||| key turning point of the ||| century , and returns again ||| 1 ,
 20th ||| addressing the turn of the ||| century into the 21st . ||| 1 ]

In [88]:
# Sizes of the top-word, placebo-word and vocab sentence lists in ds_imdb.
len(ds_imdb.topwd_unk_sentences_list), len(ds_imdb.placebowd_unk_sentences_list), len(ds_imdb.vocab_unk_sentences_list)

(8882, 12996, 156632)

In [90]:
# First three placebo-word sentence objects from ds_imdb.
ds_imdb.placebowd_unk_sentences_list[:3]

[20th ||| weiss and speck never make a convincing case for the relevance of these two ||| footnotes . ||| -1 ,
 20th ||| while it regards 1967 as the key turning point of the ||| century , and returns again and again to images of dissidents in the streets , it's alarmingly current . ||| 1 ,
 20th ||| in capturing the understated comedic agony of an ever-ruminating , genteel yet decadent aristocracy that can no longer pay its bills , the film could just as well be addressing the turn of the ||| century into the 21st . ||| 1 ]

In [81]:
# Load both sets of match tables; context managers close each file handle right after unpickling.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by this project.
with open(data_path+'imdb/imdb_treat_ite_window5.pkl', 'rb') as pkl_file:
    df_ite_window5 = pickle.load(pkl_file)
with open(data_path+'imdb/imdb_treat_ite_BERT.pkl', 'rb') as pkl_file:
    df_ite = pickle.load(pkl_file)

In [83]:
# Keys of both loaded containers, shown side by side.
df_ite_window5.keys(), df_ite.keys()

(dict_keys(['bymean', 'bycontext', 'byconcat']),
 dict_keys(['bymean', 'bycontext', 'byconcat']))

In [84]:
# First row of the 'byconcat' table in df_ite_window5.
df_ite_window5['byconcat'].head(1)

Unnamed: 0,term,sentence_id,treat_obj,control_obj,similarity,control_list,control_list_sim,control_lb_list,causal_flag
0,51,1685,"51 ||| on an igloo , formula ||| sank from quirky to jerky ||| -1 \n","eager ||| on energy , and too ||| to be quirky at moments ||| -1 \n",0.771962,"[eager ||| on energy , and too ||| to be quirky at moments ||| -1 \n, too ||| low on energy , and ||| eager to be quirky at ||| -1 \n, charming ||| davis is funny , ||| and quirky in her feature ||| 1 \n, charming ||| a ||| , quirky and leisurely paced ||| 1 \n, effective ||| subtle , but it is ||| . it's a quirky , ||| 1 \n, add ||| tries to ||| some spice to its quirky ||| -1 \n, built ||| a premise , a joke ||| entirely from musty memories of ||| -1 \n, funny ||| davis is ||| , charming and quirky in ||| 1 \n, tries ||| , the harder that liman ||| to squeeze his story , ||| 1 \n, unique ||| how the film knows what's ||| and quirky about canadians . ||| 1 \n]","[0.7719615, 0.76660997, 0.7384485, 0.7363356, 0.7339607, 0.7187836, 0.7158764, 0.7153128, 0.712566, 0.7085526]","[-1, -1, 1, 1, 1, -1, -1, 1, 1, 1]",0


In [85]:
# First row of the 'byconcat' table in df_ite.
df_ite['byconcat'].head(1)

Unnamed: 0,term,sentence_id,treat_obj,control_obj,similarity,control_list,control_list_sim,control_lb_list,causal_flag
0,51,1685,"51 ||| in exactly 89 minutes , most of which passed as slowly as if i'd been sitting naked on an igloo , formula ||| sank from quirky to jerky to utter turkey . ||| -1 \n","unique ||| the plot of the comeback curlers isn't very interesting actually , but what i like about men with brooms and what is kind of special is how the film knows what's ||| and quirky about canadians . ||| 1 \n",0.778109,"[unique ||| the plot of the comeback curlers isn't very interesting actually , but what i like about men with brooms and what is kind of special is how the film knows what's ||| and quirky about canadians . ||| 1 \n, still ||| while easier to sit through than most of jaglom's self-conscious and gratingly irritating films , it's ||| tainted by cliches , painful improbability and murky points . ||| -1 \n, eager ||| often likable , but just as often it's meandering , low on energy , and too ||| to be quirky at moments when a little old-fashioned storytelling would come in handy . ||| -1 \n, too ||| often likable , but just as often it's meandering , low on energy , and ||| eager to be quirky at moments when a little old-fashioned storytelling would come in handy . ||| -1 \n, flat ||| it's a lot to ask people to sit still for two hours and change watching such a character , especially when rendered in as ||| and impassive a manner as phoenix's . ||| -1 \n, breathtaking ||| jackson tries to keep the plates spinning as best he can , but all the bouncing back and forth can't help but become a bit tedious -- even with the ||| landscapes and villainous varmints there to distract you from the ricocheting . ||| 1 \n, treat ||| yes , 4ever is harmless in the extreme and it'll mute your kids for nearly 80 minutes , but why not just ||| the little yard apes to the real deal and take them to spirited away ? 
||| -1 \n, smarter ||| not only is undercover brother as funny , if not more so , than both austin powers films , but it's also one of the ||| , savvier spoofs to come along in some time . ||| 1 \n, devoid ||| it's drained of life in an attempt to be sober and educational , and yet it's so ||| of realism that its lack of whistles and bells just makes it obnoxious and stiff . ||| -1 \n, performances ||| is there a group of more self-absorbed women than the mother and daughters featured in this film ? i don't think so . nothing wrong with ||| here , but the whiney characters bugged me . ||| -1 \n]","[0.77810943, 0.77367425, 0.7700913, 0.7654258, 0.76197994, 0.76025236, 0.7598947, 0.7573191, 0.7569665, 0.7566486]","[1, -1, -1, -1, -1, 1, -1, 1, -1, -1]",1


#### Embedding with Universal Sentence Encoder

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

def embed_all_sentences_USE(sentences):
    """
    Attach Universal Sentence Encoder embeddings to every sentence object.

    For each item in ``sentences`` the following texts are encoded and the
    resulting vector is stored on the item itself:

    - ``left_context``                        -> ``left_embedding``
    - ``right_context``                       -> ``right_embedding``
    - ``left_context + ' ' + right_context``  -> ``context_embedding``
    - ``remove_wd``                           -> ``word_embedding``

    The items are modified in place; nothing is returned.
    """
    USE_model = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")

    # Target attribute -> texts to encode. Dict order fixes the order of the encoder runs.
    texts_by_attr = {
        'left_embedding': [sent.left_context for sent in sentences],
        'right_embedding': [sent.right_context for sent in sentences],
        'context_embedding': [sent.left_context + ' ' + sent.right_context for sent in sentences],
        'word_embedding': [sent.remove_wd for sent in sentences],
    }

    vectors_by_attr = {}
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        for attr, texts in texts_by_attr.items():
            vectors_by_attr[attr] = sess.run(USE_model(texts))

    # Row idx of each result belongs to the idx-th sentence.
    for idx, sent in enumerate(sentences):
        for attr, vectors in vectors_by_attr.items():
            setattr(sent, attr, vectors[idx])
        

In [None]:
test_context = ds_imdb.topwd_unk_sentences_list[0].left_context + ' ' + ds_imdb.topwd_unk_sentences_list[0].right_context

USE_model = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    test_embedding = sess.run(USE_model([test_context]))