### weight KNN neighbors by similarity
> sum of weighted similarity/k <br>
> Re-calculate correlation with weighted AMT in 20180821\_weighted\_AMT.ipynb <br>

In [2]:
import requests, time, operator, re, json, csv, pickle, copy
import pandas as pd
import numpy as np
from collections import Counter
from scipy.stats import pearsonr, spearmanr
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from nltk.corpus import stopwords
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from ast import literal_eval
from scipy.sparse import vstack
from scipy.spatial.distance import cdist
import sklearn.metrics.pairwise
import matplotlib.pyplot as plt
#%matplotlib inline

import warnings
warnings.filterwarnings("ignore")
airbnb_path = '/data/2/zwang/2018_S_WordTreatment/V2_airbnb/'
tw_path = '/data/2/zwang/2018_S_WordTreatment/V2_twitter/'
yp_path = '/data/2/zwang/2018_S_WordTreatment/V2_yelp/'
AMT_path = '/data/2/zwang/2018_S_WordTreatment/V2_AMT/'

In [13]:
def weighted_knn_treatment_for_pair(swd,swdi,twd,twdi,X_sents,source_sents,source_labels,X_source,target_sents,target_labels,X_target,n_neighbors):
    source_pos_treatment = []
    source_neg_treatment = []
    target_pos_treatment = []
    target_neg_treatment = []
    
    if(X_sents.shape[0]*X_sents.shape[1] < 600000000): # control for memory error
        X_src_ = X_source[:]
        X_src_[:,swdi] = 0
        X_tar_ = X_target[:]
        X_tar_[:,twdi] = 0
        
        s_t_sim = cosine_similarity(X_src_,X_target)
        t_s_sim = cosine_similarity(X_tar_,X_source)
        s_s_sim = cosine_similarity(X_src_, X_source)
        t_t_sim = cosine_similarity(X_tar_, X_target)
        
        for i in range(X_src_.shape[0]):
            treat_neighbor_idx = s_t_sim[i].argsort()[::-1][:min(n_neighbors,X_target.shape[0])]
            treat_neighbor_labels = [target_labels[idx] for idx in treat_neighbor_idx]
            treat_neighbor_sims = [s_t_sim[i][idx] for idx in treat_neighbor_idx]
            
            contrl_neighbor_idx = s_s_sim[i].argsort()[::-1][1:min(n_neighbors+1,X_source.shape[0])]
            contrl_neighbor_labels = [source_labels[idx] for idx in contrl_neighbor_idx]
            contrl_neighbor_sims = [s_s_sim[i][idx] for idx in contrl_neighbor_idx]
            
            effect = np.mean(treat_neighbor_labels) - np.mean(contrl_neighbor_labels)
            sim_effect = np.mean([x * y+(x-1) * y for x, y in zip(treat_neighbor_labels, treat_neighbor_sims)]) - np.mean([x * y+(x-1) * y for x, y in zip(contrl_neighbor_labels, contrl_neighbor_sims)])
            
            if(source_labels[i]==1):
                source_pos_treatment.append((source_sents[i],round(float(sim_effect),5)))
            elif(source_labels[i]==0):
                source_neg_treatment.append((source_sents[i],round(float(sim_effect),5)))
    
        for j in range(X_tar_.shape[0]):
            treat_neighbor_idx = t_s_sim[j].argsort()[::-1][:min(n_neighbors,X_source.shape[0])]
            treat_neighbor_labels = [source_labels[idx] for idx in treat_neighbor_idx]
            treat_neighbor_sims = [t_s_sim[j][idx] for idx in treat_neighbor_idx]
        
            contrl_neighbor_idx = t_t_sim[j].argsort()[::-1][1:min(n_neighbors+1,X_target.shape[0])]
            contrl_neighbor_labels = [target_labels[idx] for idx in contrl_neighbor_idx]
            contrl_neighbor_sims = [t_t_sim[j][idx] for idx in contrl_neighbor_idx]
        
            effect = np.mean(treat_neighbor_labels) - np.mean(contrl_neighbor_labels)
            sim_effect = np.mean([x * y+(x-1) * y for x, y in zip(treat_neighbor_labels, treat_neighbor_sims)]) - np.mean([x * y+(x-1) * y for x, y in zip(contrl_neighbor_labels, contrl_neighbor_sims)])
            
        
            if(target_labels[j]==1):
                target_pos_treatment.append((target_sents[j],round(float(sim_effect),5)))
            elif(target_labels[j]==0):
                target_neg_treatment.append((target_sents[j],round(float(sim_effect),5)))
        
        return {'source':swd,'target':twd,
            'source_pos_sents_treatment':[source_pos_treatment],'source_neg_sents_treatment':[source_neg_treatment],
            'target_pos_sents_treatment':[target_pos_treatment],'target_neg_sents_treatment':[target_neg_treatment]}
    else:
        print("Large matrix!",swd,twd)
        return None

In [21]:
def weighted_KNN_treatment(pair_file,treat_file,vocab_file,n_neighbors):
    wdpairs_pos_neg_pd = pd.DataFrame(pickle.load(open(pair_file,'rb')))
    if(vocab_file):
        my_vocab = list(set(pd.read_csv(vocab_file).word.values))
        tfidf_vec = TfidfVectorizer(norm='l2',vocabulary=my_vocab)
        print(len(my_vocab))
    else:
        tfidf_vec = TfidfVectorizer(norm='l2')
    
    with open(treat_file,'wt') as fw:
        my_fields = ['source','target','source_pos_sents_treatment','source_neg_sents_treatment','target_pos_sents_treatment','target_neg_sents_treatment']
        csv_f = csv.DictWriter(fw,fieldnames=my_fields)
        csv_f.writeheader()
            
        for idx, row in wdpairs_pos_neg_pd.iterrows():
            if(idx % 100 == 0):
                print("-------------",idx,"-------------")
            swd = row['source']
            twd = row['target']
            swd_pos_sents = row['source_pos_sents']
            swd_neg_sents = row['source_neg_sents']
            twd_pos_sents = row['target_pos_sents']
            twd_neg_sents = row['target_neg_sents']
            
            source_sents = swd_pos_sents + swd_neg_sents
            target_sents = twd_pos_sents + twd_neg_sents
            source_labels = list(np.ones(len(swd_pos_sents)))+list(np.zeros(len(swd_neg_sents)))
            target_labels = list(np.ones(len(twd_pos_sents)))+list(np.zeros(len(twd_neg_sents)))
    
            X_sents = tfidf_vec.fit_transform(source_sents+target_sents)
            vocab = tfidf_vec.vocabulary_
            
            swdi = vocab[swd]
            twdi = vocab[twd]
            
            X_source = X_sents[:len(source_sents)]
            X_target = X_sents[len(source_sents):]
            
            
            #logger.debug("swd: %s, twd: %s, X_sents.shape: %s" % (swd,twd,X_sents.shape))
            #print(swd, twd, X_sents.shape)
            
            treat_info = {}
            treat_info = weighted_knn_treatment_for_pair(swd,swdi,twd,twdi, X_sents,
                                                source_sents,source_labels,X_source,
                                                target_sents,target_labels,X_target,n_neighbors)
            
            if(treat_info):
                csv_f.writerow({'source':treat_info['source'],'target':treat_info['target'],
                            'source_pos_sents_treatment':treat_info['source_pos_sents_treatment'],'source_neg_sents_treatment':treat_info['source_neg_sents_treatment'],
                            'target_pos_sents_treatment':treat_info['target_pos_sents_treatment'],'target_neg_sents_treatment':treat_info['target_neg_sents_treatment']})

- Only using AMT labeled data

In [25]:
prefix = "yp"
pair_file = AMT_path+'AMT_WdSents/Data/'+prefix+'_AMT_wdsents_markPPN.pickle'
treat_file = AMT_path +"AMT_WdSents/3_KNN/"+prefix+"_weightedknn_30_treatment.csv"
#vocab_file=airbnb_path+'0_Data/common_wds.csv'
start = time.time()
weighted_KNN_treatment(pair_file,treat_file,vocab_file='',n_neighbors=30)
end = time.time()
print((end-start)/60)

------------- 0 -------------


MemoryError: 

In [26]:
res_pd = pd.read_csv(AMT_path +"AMT_WdSents/3_KNN/tw_weightedknn_30_treatment.csv",index_col=False)
res_pd.shape

(25, 6)