### Lexical Substitution Effect estimation using Virtual Twins RandomForest
- n_trees = min(200, train_vec.shape[0]) <br>
- max_features='log2' <br>
- min_samples_leaf=2 (updated later) <br>

In [1]:
import requests, time, operator, re, json, csv, pickle, copy, ast
import pandas as pd
import numpy as np
from collections import Counter
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from ast import literal_eval
from scipy.sparse import vstack
from scipy.spatial.distance import cdist
import sklearn.metrics.pairwise
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# All four datasets live under one project root; only the last path
# component differs (AMT annotations, Airbnb, Twitter, Yelp).
_DATA_ROOT = '/data/2/zwang/2018_S_WordTreatment/'

AMT_path = _DATA_ROOT + 'V2_AMT/'
airbnb_path = _DATA_ROOT + 'V2_airbnb/'
tw_path = _DATA_ROOT + 'V2_twitter/'
yp_path = _DATA_ROOT + 'V2_yelp/'

In [2]:
def VT_treatment_for_pair(swd,twd,sents,labels,divide,effect):
    """Group (sentence, effect) pairs by word side and label for one word pair.

    Rows at positions below `divide` belong to the source word, the rest to
    the target word. A label equal to 1 is positive, equal to 0 is negative;
    rows with any other label are dropped. Each effect is converted to float
    and rounded to 5 decimals.

    Returns a dict with the two words plus four '<side>_<pos|neg>_sents_treatment'
    entries, each holding a one-element list that wraps the list of pairs.
    """
    grouped = {
        ('source', 'pos'): [],
        ('source', 'neg'): [],
        ('target', 'pos'): [],
        ('target', 'neg'): [],
    }

    for position, sentence in enumerate(sents):
        label = labels[position]
        if label == 1:
            polarity = 'pos'
        elif label == 0:
            polarity = 'neg'
        else:
            # Neither positive nor negative: the row is left out entirely.
            continue
        side = 'source' if position < divide else 'target'
        grouped[(side, polarity)].append((sentence, round(float(effect[position]), 5)))

    result = {'source': swd, 'target': twd}
    for side in ('source', 'target'):
        for polarity in ('pos', 'neg'):
            result[side + '_' + polarity + '_sents_treatment'] = [grouped[(side, polarity)]]
    return result

In [3]:
def train_10RFC(swd,twd,vocab,X_src,y_src,X_src_testidx,X_src_trainidx,X_tar,y_tar,X_tar_testidx,X_tar_trainidx,nfold,n_trees):
    """Cross-fit random forests and estimate the effect of swapping swd <-> twd.

    For every fold i one RandomForestClassifier is trained on the stacked
    source + target training rows of that fold. The held-out rows are then
    scored twice: once as observed and once as a "virtual twin" in which the
    row's own word column is set to 0 and the other word's column is set to 1.
    The effect of a row is P(label 1 | twin) - P(label 1 | observed).

    Args:
        swd, twd: source and target word; both must be keys of `vocab`.
        vocab: mapping word -> feature column index.
        X_src, X_tar: feature matrices for the source-word and target-word
            sentences (presumably scipy sparse matrices, since they are
            stacked with scipy.sparse.vstack -- confirm against the caller).
        y_src, y_tar: labels for those rows; 1 is the positive class.
        X_src_testidx, X_src_trainidx, X_tar_testidx, X_tar_trainidx:
            per-fold lists of row indices into X_src / X_tar.
        nfold: number of folds to iterate over.
        n_trees: upper bound on the forest size; the forest uses
            min(n_trees, number of training rows) trees.

    Returns:
        (src_pred, src_effect, tar_pred, tar_effect): lists aligned with the
        rows of X_src / X_tar. Rows that never appear in a test fold keep the
        sentinel value -2.

    Raises:
        KeyError: if swd or twd is not in `vocab`.
        ValueError: if label 1 is absent from a fold's training labels.
    """
    # -2 marks rows that were never part of any test fold.
    src_pred = [-2] * X_src.shape[0]
    src_effect = [-2] * X_src.shape[0]
    tar_pred = [-2] * X_tar.shape[0]
    tar_effect = [-2] * X_tar.shape[0]

    src_wdi = vocab[swd]
    tar_wdi = vocab[twd]

    # Convert once so the per-fold fancy indexing does not rebuild the arrays.
    y_src_arr = np.array(y_src)
    y_tar_arr = np.array(y_tar)

    for i in range(nfold):
        test_vec_src = X_src[X_src_testidx[i]]
        test_vec_tar = X_tar[X_tar_testidx[i]]
        train_vec = vstack((X_src[X_src_trainidx[i]], X_tar[X_tar_trainidx[i]]))
        train_y = list(y_src_arr[X_src_trainidx[i]]) + list(y_tar_arr[X_tar_trainidx[i]])

        # Honour the caller's n_trees (previously a hard-coded 200), still
        # capped by the number of training rows.
        RF_clf = RandomForestClassifier(n_estimators=min(n_trees, train_vec.shape[0]),
                                        max_features='log2', min_samples_leaf=2,
                                        random_state=42, n_jobs=-1, oob_score=True)
        RF_clf.fit(train_vec, train_y)

        # Column of predict_proba that corresponds to the positive class.
        pos_col = list(RF_clf.classes_).index(1)

        RFC_pred_src = RF_clf.predict_proba(test_vec_src)[:, pos_col]
        RFC_pred_tar = RF_clf.predict_proba(test_vec_tar)[:, pos_col]

        # Virtual twins are built on explicit copies so the observed test
        # matrices are not mutated through an alias.
        src_treat_vec = test_vec_src.copy()
        src_treat_vec[:, src_wdi] = 0
        src_treat_vec[:, tar_wdi] = 1
        RFC_pred_src_treat = RF_clf.predict_proba(src_treat_vec)[:, pos_col]

        tar_treat_vec = test_vec_tar.copy()
        tar_treat_vec[:, tar_wdi] = 0
        tar_treat_vec[:, src_wdi] = 1
        RFC_pred_tar_treat = RF_clf.predict_proba(tar_treat_vec)[:, pos_col]

        # Scatter fold results back to the rows' original positions.
        for j, idx in enumerate(X_src_testidx[i]):
            src_pred[idx] = RFC_pred_src[j]
            src_effect[idx] = RFC_pred_src_treat[j] - RFC_pred_src[j]
        for j, idx in enumerate(X_tar_testidx[i]):
            tar_pred[idx] = RFC_pred_tar[j]
            tar_effect[idx] = RFC_pred_tar_treat[j] - RFC_pred_tar[j]

    return src_pred, src_effect, tar_pred, tar_effect

In [4]:
def make_folds(swd,twd,vocab,X_sents,labels,divide,n_trees,nfold):
    """
    Split the source rows and the target rows into `nfold` stratified folds
    each, then cross-fit the virtual-twins forests via train_10RFC.

    Rows [0, divide) of X_sents / labels belong to the source word, rows
    [divide, end) to the target word.

    Returns (pred, effect): two lists in the row order of X_sents (source
    rows first, then target rows) holding the predicted positive-class
    probability and the estimated substitution effect for each row.
    """
    X_src, X_tar = X_sents[:divide], X_sents[divide:]
    y_src, y_tar = labels[:divide], labels[divide:]

    # No random_state here: without shuffle=True the seed never had any
    # effect, and scikit-learn >= 0.24 raises ValueError for that combination.
    skf = StratifiedKFold(n_splits=nfold)

    X_src_testidx, X_src_trainidx = [], []
    for train_index, test_index in skf.split(X_src, y_src):
        X_src_testidx.append(list(test_index))
        X_src_trainidx.append(list(train_index))

    X_tar_testidx, X_tar_trainidx = [], []
    for train_index, test_index in skf.split(X_tar, y_tar):
        X_tar_testidx.append(list(test_index))
        X_tar_trainidx.append(list(train_index))

    src_pred, src_effect, tar_pred, tar_effect = train_10RFC(swd,twd,vocab,
                                                             X_src,y_src,X_src_testidx,X_src_trainidx,
                                                             X_tar,y_tar,X_tar_testidx,X_tar_trainidx,
                                                             nfold,n_trees)

    return list(src_pred)+list(tar_pred), list(src_effect)+list(tar_effect)

In [8]:
def cal_VT_treatment(wdpair_file,treat_file,log_file,vocab_file='',n_trees=200,nfold=10):
    """Estimate virtual-twins substitution effects for every word pair in a pickle.

    Args:
        wdpair_file: path to a pickle that pandas can turn into a DataFrame
            with columns 'source', 'target', 'source_pos_sents',
            'source_neg_sents', 'target_pos_sents', 'target_neg_sents'.
        treat_file: output CSV; one row per word pair with the per-sentence
            (sentence, effect) lists produced by VT_treatment_for_pair.
        log_file: output CSV logging the feature-matrix shape per word pair.
        vocab_file: optional CSV with a 'word' column used as a fixed
            vocabulary; an empty value lets CountVectorizer learn the
            vocabulary from each pair's sentences.
        n_trees: upper bound on trees per forest (passed to make_folds).
        nfold: number of cross-fitting folds (passed to make_folds).
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at pickle files produced by this project.
    with open(wdpair_file,'rb') as f_pairs:
        wdpairs_pos_neg_pd = pd.DataFrame(pickle.load(f_pairs))

    if vocab_file:
        # Sorted so feature column indices (and hence the seeded forests) are
        # reproducible across runs; plain set order depends on the hash seed.
        my_vocab = sorted(set(pd.read_csv(vocab_file).word.values), key=str)
        ct_vec = CountVectorizer(min_df=3,binary=True,vocabulary=my_vocab)
    else:
        ct_vec = CountVectorizer(min_df=3,binary=True)

    my_fields = ['source','target','source_pos_sents_treatment','source_neg_sents_treatment','target_pos_sents_treatment','target_neg_sents_treatment']
    log_fields = ['swd','twd','shape']

    # newline='' is what the csv module expects of files it writes to, so
    # line endings inside quoted sentences are not translated.
    with open(treat_file,'wt',newline='') as fw, open(log_file,'wt',newline='') as flog:
        csv_f = csv.DictWriter(fw,fieldnames=my_fields)
        csv_f.writeheader()

        log_csv = csv.DictWriter(flog,fieldnames=log_fields)
        log_csv.writeheader()

        for idx, row in wdpairs_pos_neg_pd.iterrows():
            if(idx % 100 == 0):
                print("-------------",idx,"-------------")

            swd = row['source']
            twd = row['target']
            swd_pos_sents = row['source_pos_sents']
            swd_neg_sents = row['source_neg_sents']
            twd_pos_sents = row['target_pos_sents']
            twd_neg_sents = row['target_neg_sents']

            # Row layout: source-positive, source-negative, target-positive,
            # target-negative; `divide` is the first target row.
            sents = swd_pos_sents + swd_neg_sents + twd_pos_sents + twd_neg_sents
            labels = (list(np.ones(len(swd_pos_sents))) + list(np.zeros(len(swd_neg_sents)))
                      + list(np.ones(len(twd_pos_sents))) + list(np.zeros(len(twd_neg_sents))))
            divide = len(swd_pos_sents)+len(swd_neg_sents)

            X_sents = ct_vec.fit_transform(sents)
            vocab = ct_vec.vocabulary_

            log_csv.writerow({'swd':swd, 'twd':twd, 'shape': X_sents.shape})
            RFC_pred, RFC_effect = make_folds(swd,twd,vocab,X_sents,labels,divide,n_trees,nfold)

            # The returned dict has exactly the keys listed in my_fields.
            treat_info = VT_treatment_for_pair(swd,twd,sents,labels,divide,RFC_effect)
            csv_f.writerow(treat_info)


- Only using AMT labeled data

In [11]:
# Virtual-twins run on the AMT-labelled Yelp word pairs; the vocabulary is
# learned from the sentences themselves (no fixed vocabulary file).
prefix = "yp"
out_dir = AMT_path + "AMT_WdSents/3_VT/"
pair_file = AMT_path + 'AMT_WdSents/Data/' + prefix + '_AMT_wdsents_markPPN.pickle'
treat_file = out_dir + prefix + "_vt_200tree_treatment.csv"
log_file = out_dir + prefix + "_vt_200tree_log.csv"

start = time.time()
cal_VT_treatment(pair_file, treat_file, log_file, vocab_file='', n_trees=200, nfold=10)
end = time.time()
# Elapsed wall-clock time in minutes.
print((end - start) / 60)

------------- 0 -------------
4.858896497885386


In [12]:
# Load the treatment CSV written by the run above and display its
# (rows, columns) shape; each row is one word pair.
res_pd = pd.read_csv(treat_file,index_col=False)
res_pd.shape

(23, 6)

In [13]:
# Preview the first rows; the *_sents_treatment columns hold stringified lists.
res_pd.head()

Unnamed: 0,source,target,source_pos_sents_treatment,source_neg_sents_treatment,target_pos_sents_treatment,target_neg_sents_treatment
0,yummy,good,"[[(""Despite the yummy spice tuna roll and teri...","[[('I had a nibble ... well ... okay , I had l...",[[('It was the only part of the meal I had to ...,"[[(""I 'm always on the look out for good deals..."
1,yummy,tasty,"[[(""Despite the yummy spice tuna roll and teri...","[[('I had a nibble ... well ... okay , I had l...","[[(""Love the veal here and pasta dishes I 've ...","[[(""-RRB- , an odd little tasty cake -LRB- i k..."
2,fabulous,amazing,"[[('_NNP_ stage , fantastic lighting and fabul...","[[('This is a fabulous restaurant . ', 0.0928)...",[[('We will definitely go back to have the ama...,"[[('A truly amazing deal . ', -0.09343), ('Thi..."
3,gorgeous,great,[[('A previous reviewer mentioned gorgeous sta...,[[('Gorgeous theater lobby almost like a _NNP_...,"[[(""The atmosphere is great - you 're higher t...","[[('Totally great for a one-time experience , ..."
4,boyfriend,buddy,[[('When I checked in with my boyfriend for a ...,[[('I love meeting girlfriends here for drinks...,"[[('The weakness in your knees , The cigarette...",[[('My buddy was intrigued by the _NNP_ wings ...


In [18]:
# The CSV stores each treatment list as its string repr; parse it back and
# show the first (sentence, effect) tuple of the first word pair.
ast.literal_eval(res_pd.iloc[0]['source_pos_sents_treatment'])[0][0]

("Despite the yummy spice tuna roll and teriyaki salmon , I 'll skip this place next time . ",
 0.15973)

In [8]:
# Virtual-twins run on the Airbnb word pairs with a fixed vocabulary.
this_path = airbnb_path
prefix = "airbnb"
pair_file = this_path+"1_Process/"+prefix+"_wdpair_sents_limit5000.pickle"
treat_file = this_path +"3_VirtualTwins/"+prefix+"_vt_200tree_treatment.csv"
log_file =  this_path+"3_VirtualTwins/"+prefix+"_vt_200tree_log.csv"
vocab_file = this_path+'0_Data/common_wds.csv'
start = time.time()
# Arguments are passed by keyword: the previous positional order put
# vocab_file in the treat_file slot, which would open the vocabulary CSV for
# writing and truncate it.
cal_VT_treatment(pair_file,treat_file=treat_file,log_file=log_file,vocab_file=vocab_file,n_trees=200,nfold=10)
end = time.time()
# Elapsed wall-clock time in minutes.
print((end-start)/60)

------------- 5 -------------
1.0724186539649962


In [9]:
# Virtual-twins run on the Yelp word pairs; vocabulary learned per pair.
this_path = yp_path
prefix = "yp"
pair_file = this_path+"1_Process/"+prefix+"_wdpair_sents_limit5000.pickle"
treat_file = this_path +"3_VirtualTwins/"+prefix+"_vt_200tree_treatment.csv"
log_file =  this_path+"3_VirtualTwins/"+prefix+"_vt_200tree_log.csv"
start = time.time()
# vocab_file is passed explicitly (empty = no fixed vocabulary); omitting it
# is a TypeError against the function's signature.
cal_VT_treatment(pair_file,treat_file,log_file,vocab_file='',n_trees=200,nfold=10)
end = time.time()
# Elapsed wall-clock time in minutes.
print((end-start)/60)

------------- 5 -------------
1.266097374757131


In [8]:
# Virtual-twins run on the Twitter word pairs; vocabulary learned per pair.
this_path = tw_path
prefix = "tw"
pair_file = this_path+"1_Process/"+prefix+"_wdpair_sents_limit5000.pickle"
treat_file = this_path +"3_VirtualTwins/"+prefix+"_vt_200tree_treatment.csv"
log_file =  this_path+"3_VirtualTwins/"+prefix+"_vt_200tree_log.csv"
start = time.time()
# vocab_file is passed explicitly (empty = no fixed vocabulary); omitting it
# is a TypeError against the function's signature.
cal_VT_treatment(pair_file,treat_file,log_file,vocab_file='',n_trees=200,nfold=10)
end = time.time()
# Elapsed wall-clock time in minutes.
print((end-start)/60)

------------- 0 -------------
------------- 100 -------------
------------- 200 -------------
------------- 300 -------------
------------- 400 -------------
------------- 500 -------------
------------- 600 -------------
------------- 700 -------------
------------- 800 -------------
------------- 900 -------------
168.15899878342947
