### Lexical Substitution Effect estimation using KNN
- K=30

In [1]:
import requests, time, operator, re, json, csv, pickle, copy, ast
import pandas as pd
import numpy as np
from collections import Counter
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from ast import literal_eval
from scipy.sparse import vstack
from scipy.spatial.distance import cdist
import sklearn.metrics.pairwise
import matplotlib.pyplot as plt
#%matplotlib inline

import warnings

# Suppress every warning for the remainder of the notebook.
warnings.filterwarnings("ignore")

# Data directories, one per corpus; all of them share the same project root.
_PROJECT_ROOT = '/data/2/zwang/2018_S_WordTreatment/'
AMT_path = _PROJECT_ROOT + 'V2_AMT/'
airbnb_path = _PROJECT_ROOT + 'V2_airbnb/'
tw_path = _PROJECT_ROOT + 'V2_twitter/'
yp_path = _PROJECT_ROOT + 'V2_yelp/'

In [2]:
def _knn_effects_by_label(X_query, query_sents, query_labels,
                          X_treat, treat_labels,
                          X_control, control_labels, n_neighbors):
    """
    KNN effect estimate for every query sentence, split by the sentence's own label.

    Row i of X_query must be the word-masked version of row i of X_control.
    For each query row the effect is
        mean(labels of its nearest rows in X_treat)
      - mean(labels of its nearest rows in X_control, excluding row i itself)
    using cosine similarity and at most n_neighbors rows on each side.
    The effect is NaN when one of the two neighbor sets is empty.

    Returns (pos, neg): lists of (sentence, effect rounded to 5 decimals) for the
    query sentences labeled 1 and 0. Sentences with any other label are dropped.
    """
    treat_labels = np.asarray(treat_labels)
    control_labels = np.asarray(control_labels)
    treat_sim = cosine_similarity(X_query, X_treat)
    control_sim = cosine_similarity(X_query, X_control)

    pos_effects = []
    neg_effects = []
    for i in range(X_query.shape[0]):
        # Most similar rows first.
        treat_idx = treat_sim[i].argsort()[::-1][:n_neighbors]

        # Drop the query sentence itself by index rather than by rank: the query
        # row is masked while its counterpart in X_control is not, so it is not
        # guaranteed to be ranked first.
        control_order = control_sim[i].argsort()[::-1]
        control_idx = control_order[control_order != i][:n_neighbors]

        effect = np.mean(treat_labels[treat_idx]) - np.mean(control_labels[control_idx])
        scored = (query_sents[i], round(float(effect), 5))

        if query_labels[i] == 1:
            pos_effects.append(scored)
        elif query_labels[i] == 0:
            neg_effects.append(scored)

    return pos_effects, neg_effects


def knn_treatment_for_pair(swd,swdi,twd,twdi,X_sents,source_sents,source_labels,X_source,target_sents,target_labels,X_target,n_neighbors):
    """
    LSE for every sentence of one (source word, target word) pair.

    swd, twd:        the source / target word; only echoed in the result and messages.
    swdi, twdi:      column index of the source / target word in the feature matrices.
    X_sents:         feature matrix of all sentences; only its shape is used, as a size guard.
    source_sents, source_labels, X_source:
                     sentences, labels and feature rows of the source word.
    target_sents, target_labels, X_target:
                     the same for the target word.
    n_neighbors:     maximum number of neighbors used on each side.

    The column of a sentence's own word is zeroed in a copy of its matrix before
    neighbors are searched in the unmodified X_source / X_target.

    Returns a dict with 'source', 'target' and four '*_sents_treatment' entries,
    each a one-element list holding a list of (sentence, effect) tuples, or None
    (after printing a message) when X_sents has 600000000 or more cells.
    """
    if X_sents.shape[0]*X_sents.shape[1] >= 600000000: # control for memory error
        print("Large matrix!",swd,twd)
        return None

    # Explicit copies: `X[:]` is only a copy for SciPy sparse matrices, while for a
    # dense ndarray it is a view, and zeroing the column would then also alter the
    # caller's matrix that serves as the unmasked comparison side.
    X_src_ = X_source.copy()
    X_src_[:,swdi] = 0
    X_tar_ = X_target.copy()
    X_tar_[:,twdi] = 0

    # Source sentences: treated neighbors are target sentences, control neighbors
    # are other source sentences.
    source_pos_treatment, source_neg_treatment = _knn_effects_by_label(
        X_src_, source_sents, source_labels,
        X_target, target_labels,
        X_source, source_labels, n_neighbors)

    # Target sentences: the symmetric computation.
    target_pos_treatment, target_neg_treatment = _knn_effects_by_label(
        X_tar_, target_sents, target_labels,
        X_source, source_labels,
        X_target, target_labels, n_neighbors)

    return {'source':swd,'target':twd,
        'source_pos_sents_treatment':[source_pos_treatment],'source_neg_sents_treatment':[source_neg_treatment],
        'target_pos_sents_treatment':[target_pos_treatment],'target_neg_sents_treatment':[target_neg_treatment]}

In [4]:
def cal_KNN_treatment(pair_file,treat_file,vocab_file=None,n_neighbors=30):
    """
    Iteratively calculating LSE for every word pair, writing one CSV row per pair.

    pair_file:   pickle whose content pd.DataFrame accepts; each row must provide
                 'source', 'target', 'source_pos_sents', 'source_neg_sents',
                 'target_pos_sents' and 'target_neg_sents'. The sentence entries are
                 concatenated with `+` (presumably lists of strings).
    treat_file:  path of the CSV file to write; an existing file is overwritten.
    vocab_file:  optional CSV with a 'word' column used as the fixed TF-IDF
                 vocabulary. None or '' lets the vectorizer learn the vocabulary
                 from each pair's own sentences.
    n_neighbors: number of neighbors handed to knn_treatment_for_pair.

    A pair is skipped, with a printed message, when knn_treatment_for_pair returns
    None or when one of its words is not in the TF-IDF vocabulary.
    Returns None; the results are only written to treat_file.
    """
    # NOTE: unpickling can execute arbitrary code; only load pair files you trust.
    with open(pair_file,'rb') as fr:
        wdpairs_pos_neg_pd = pd.DataFrame(pickle.load(fr))

    if vocab_file:
        my_vocab = list(set(pd.read_csv(vocab_file).word.values))
        tfidf_vec = TfidfVectorizer(norm='l2',vocabulary=my_vocab)
        print(len(my_vocab))
    else:
        tfidf_vec = TfidfVectorizer(norm='l2')

    my_fields = ['source','target','source_pos_sents_treatment','source_neg_sents_treatment','target_pos_sents_treatment','target_neg_sents_treatment']

    # newline='' is what the csv module expects for files it writes to.
    with open(treat_file,'wt',newline='') as fw:
        csv_f = csv.DictWriter(fw,fieldnames=my_fields)
        csv_f.writeheader()

        for idx, row in wdpairs_pos_neg_pd.iterrows():
            if idx % 100 == 0:
                print("-------------",idx,"-------------")
            swd = row['source']
            twd = row['target']
            swd_pos_sents = row['source_pos_sents']
            swd_neg_sents = row['source_neg_sents']
            twd_pos_sents = row['target_pos_sents']
            twd_neg_sents = row['target_neg_sents']

            # Positive sentences first, then negative ones; labels follow the same order.
            source_sents = swd_pos_sents + swd_neg_sents
            target_sents = twd_pos_sents + twd_neg_sents
            source_labels = [1.0]*len(swd_pos_sents) + [0.0]*len(swd_neg_sents)
            target_labels = [1.0]*len(twd_pos_sents) + [0.0]*len(twd_neg_sents)

            # The vectorizer is re-fitted on the sentences of this pair only.
            X_sents = tfidf_vec.fit_transform(source_sents+target_sents)
            vocab = tfidf_vec.vocabulary_
            if swd not in vocab or twd not in vocab:
                # Can happen with a fixed vocabulary that lacks one of the words;
                # skip the pair instead of aborting with a half-written CSV.
                print("Word not in vocabulary!",swd,twd)
                continue
            swdi = vocab[swd]
            twdi = vocab[twd]

            # Rows are ordered source sentences first, then target sentences.
            X_source = X_sents[:len(source_sents)]
            X_target = X_sents[len(source_sents):]

            treat_info = knn_treatment_for_pair(swd,swdi,twd,twdi, X_sents,
                                                source_sents,source_labels,X_source,
                                                target_sents,target_labels,X_target,n_neighbors)

            # The keys of treat_info are exactly my_fields.
            if treat_info:
                csv_f.writerow(treat_info)

- Using only the AMT-labeled data

In [22]:
# Yelp sentences labeled on AMT: estimate the LSE of every word pair with K=30
# and print the elapsed wall-clock time in minutes.
prefix = "yp"
pair_file = AMT_path+'AMT_WdSents/Data/'+prefix+'_AMT_wdsents_markPPN.pickle'
treat_file = AMT_path +"AMT_WdSents/3_KNN/"+prefix+"_knn_30_treatment.csv"
# No fixed vocabulary for this run (the limited one would be airbnb_path+'0_Data/common_wds.csv').
start = time.time()
cal_KNN_treatment(pair_file, treat_file, vocab_file='', n_neighbors=30)
end = time.time()
elapsed_minutes = (end - start) / 60
print(elapsed_minutes)

------------- 0 -------------
2.705143372217814


In [17]:
# Load the KNN treatment CSV of the Yelp AMT run and show its dimensions.
results_csv = AMT_path +"AMT_WdSents/3_KNN/yp_knn_30_treatment.csv"
res_pd = pd.read_csv(results_csv, index_col=False)
res_pd.shape

(18, 6)

In [20]:
# Display the last rows of the loaded treatment results.
res_pd.tail()

Unnamed: 0,source,target,source_pos_sents_treatment,source_neg_sents_treatment,target_pos_sents_treatment,target_neg_sents_treatment
13,gorgeous,terrific,[[('A previous reviewer mentioned gorgeous sta...,[[('Gorgeous theater lobby almost like a _NNP_...,[[('The blue cheese really popped and gave it ...,"[[('Had a terrific time . ', -0.63333), ('We h..."
14,gorgeous,excellent,[[('A previous reviewer mentioned gorgeous sta...,[[('Gorgeous theater lobby almost like a _NNP_...,[[('The food was excellent and was visually ap...,"[[('Very cute little place , great food , exce..."
15,fabulous,excellent,[[('_NNP_ atmosphere fabulous pizza amazing co...,"[[(""The chips and salsa are great and it is ea...",[[('The food was excellent and was visually ap...,"[[('Very cute little place , great food , exce..."
16,gorgeous,outstanding,[[('A previous reviewer mentioned gorgeous sta...,[[('This hotel really is gorgeous and luxuriou...,[[('their wine list is decently sized and the ...,[[('More positives : The costumer service is g...
17,cute,excellent,"[[(""I 've been back _NUMBER_ or _NUMBER_ times...","[[('What a cute little local coffee shop ! ', ...","[[('But the food is really excellent . ', -0.5...","[[('Very cute little place , great food , exce..."


In [21]:
# The treatment columns are stored as text in the CSV; parse the last pair's
# positive source sentences back into Python objects.
last_pair = res_pd.iloc[-1]
literal_eval(last_pair['source_pos_sents_treatment'])

[[("I 've been back _NUMBER_ or _NUMBER_ times since I first dropped by and it 's so damn cute ... _NNP_ , the owner , greets me like a long lost son every time ... and she never remembers that I 've been in before ! ",
   0.26667),
  ("The decor here is very cute and goes well in the _NNP_ _NNP_ area ... _NNP_ is prompt and some days get busier than others ... _NNP_ job _NNP_ 's # _NUMBER_ ... ",
   0.63333),
  ("If you like to order your drinks by color and suck from a fistful of straws from a fishbowl ... and watch college kids fight and vomit , _NNP_ 's is the place for you ... and it is fun , the boys are cute and unconcious , what more could you ask for ? ",
   0.4),
  ('Very cute little place perfect for a warm and cozy coffee break ! ', 0.3),
  ('The chopsticks , pots , bowls , and even how kimchi is presented is very cute . ',
   0.4),
  ('_NNP_ a cute used bike -- a cruiser -- for my wife . ', 0.56667),
  ('Cute little spot about _NUMBER_ - _NUMBER_ minutes from the strip . '

In [31]:
# Airbnb test run with the vocabulary restricted to the words in common_wds.csv.
this_path = airbnb_path
prefix = "airbnb"
pair_file = this_path+"1_Process/"+prefix+"_wdpair_sents_limit5000.pickle"
treat_file = this_path +"3_KNN/"+prefix+"_knn_30_treatment_limitvocab_test.csv"
vocab_file=airbnb_path+'0_Data/common_wds.csv'
# cal_KNN_treatment has no return statement (it only writes treat_file), so its
# result must not be unpacked; doing so raised TypeError at the end of the run.
cal_KNN_treatment(pair_file,treat_file,vocab_file,n_neighbors=30)

1550
------------- 0 -------------


In [4]:
# Airbnb: estimate the LSE of every word pair with K=30 and print the elapsed
# wall-clock time in minutes.
this_path = airbnb_path
prefix = "airbnb"
pair_file = this_path+"1_Process/"+prefix+"_wdpair_sents_limit5000.pickle"
treat_file = this_path +"3_KNN/"+prefix+"_knn_30_treatment.csv"
start = time.time()
# Pass vocab_file explicitly; '' makes the vectorizer learn its vocabulary from the sentences.
cal_KNN_treatment(pair_file,treat_file,vocab_file='',n_neighbors=30)
end = time.time()
print((end-start)/60)

------------- 0 -------------
------------- 100 -------------
------------- 200 -------------
------------- 300 -------------
------------- 400 -------------
------------- 500 -------------
------------- 600 -------------
------------- 700 -------------
------------- 800 -------------
8.920055758953094


In [5]:
# Yelp: estimate the LSE of every word pair with K=30 and print the elapsed
# wall-clock time in minutes.
this_path = yp_path
prefix = "yp"
pair_file = this_path+"1_Process/"+prefix+"_wdpair_sents_limit5000.pickle"
treat_file = this_path +"3_KNN/"+prefix+"_knn_30_treatment.csv"
start = time.time()
# Pass vocab_file explicitly; '' makes the vectorizer learn its vocabulary from the sentences.
cal_KNN_treatment(pair_file,treat_file,vocab_file='',n_neighbors=30)
end = time.time()
print((end-start)/60)

------------- 0 -------------
------------- 100 -------------
------------- 200 -------------
------------- 300 -------------
------------- 400 -------------
------------- 500 -------------
------------- 600 -------------
------------- 700 -------------
------------- 800 -------------
23.310502588748932


In [6]:
# Twitter: estimate the LSE of every word pair with K=30 and print the elapsed
# wall-clock time in minutes.
this_path = tw_path
prefix = "tw"
pair_file = this_path+"1_Process/"+prefix+"_wdpair_sents_limit5000.pickle"
treat_file = this_path +"3_KNN/"+prefix+"_knn_30_treatment.csv"
start = time.time()
# Pass vocab_file explicitly; '' makes the vectorizer learn its vocabulary from the sentences.
cal_KNN_treatment(pair_file,treat_file,vocab_file='',n_neighbors=30)
end = time.time()
print((end-start)/60)

------------- 0 -------------
------------- 100 -------------
------------- 200 -------------
------------- 300 -------------
------------- 400 -------------
------------- 500 -------------
------------- 600 -------------
------------- 700 -------------
------------- 800 -------------
------------- 900 -------------
13.739893937110901


### Check neighbors for each individual sentence