# Find matches in each pair group (DM, DF), (DM, UM), (DF, UF), (UM, UF), then analyze the matches of each pair group to reach a conclusion

In [None]:
import os
import tqdm
import pickle
import random
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial import distance
from scipy import linalg
from sklearn.utils import resample
from scipy import stats
from functools import reduce

In [None]:
from pathlib import Path

# Folder (relative to the current working directory) that holds the review CSVs.
csv_files_dir = 'csv_files'
current_dir = Path.cwd()
root_dir = current_dir / csv_files_dir

In [None]:
# dis_male = pd.read_csv('disclosed_male.csv',sep='|')
# Load disclosed male
dis_male = pd.read_csv(root_dir / Path('disclosed_male_l_s_r.csv'), sep='|', index_col=0)
dis_male = dis_male.reset_index(drop=True)
dis_male['Sentiment'] = dis_male.Sentiment.apply(lambda x: ast.literal_eval(x)['compound'])

In [None]:
# dis_female = pd.read_csv('disclosed_female.csv',sep='|')
# Load disclosed female
dis_female = pd.read_csv(root_dir / Path('disclosed_female_l_s_r.csv'), sep='|', index_col=0)
dis_female = dis_female.reset_index(drop=True)
dis_female['Sentiment'] = dis_female.Sentiment.apply(lambda x: ast.literal_eval(x)['compound'])

In [None]:
# undis_male = pd.read_csv('undisclosed_male.csv',sep='|')
# Load undisclosed male
undis_male = pd.read_csv(root_dir / ('undisclosed_male_l_s_r.csv'),sep='|', index_col=0)
undis_male = undis_male.reset_index(drop=True)
undis_male['Sentiment'] = undis_male.Sentiment.apply(lambda x: ast.literal_eval(x)['compound'])

In [None]:
# undis_female = pd.read_csv('undisclosed_female.csv',sep='|')
# Load undisclosed female
undis_female = pd.read_csv(root_dir / Path('undisclosed_female_l_s_r.csv'), sep='|', index_col=0)
undis_female = undis_female.reset_index(drop=True)
undis_female['Sentiment'] = undis_female.Sentiment.apply(lambda x: ast.literal_eval(x)['compound'])

# Legacy code, kept for reference only

In [None]:
# cat = undis_male.iloc[0]['categories'].replace('[','').replace(']','').replace('\'','').split(',')

In [None]:
def getCategoryList(x):
    """Parse `x` as a Python literal and fold its elements together with `+`.

    `x` must evaluate to a non-empty sequence whose elements support `+`
    with each other (e.g. strings are concatenated, lists are joined).
    """
    parsed = ast.literal_eval(x)

    def _combine(left, right):
        return left + right

    return reduce(_combine, parsed)

In [None]:
# Legacy flattening of the 'categories' column, applied frame by frame:
# rename 'categories' -> 'category', rebuild 'categories' with getCategoryList,
# then drop the temporary 'category' column.
# NOTE(review): this first drop presumes undis_male already carries a separate
# 'category' column next to 'categories' — confirm against the CSV schema.
undis_male = undis_male.drop(columns=['category'])

In [None]:
# Peek at the 'categories' value of the first row before it is converted.
undis_male.iloc[0]['categories']

In [None]:
# Keep the original values under 'category' so 'categories' can be rebuilt.
undis_female = undis_female.rename(columns={'categories':'category'})

In [None]:
undis_female.head()

In [None]:
# Rebuild 'categories' by parsing and folding each original value.
undis_female['categories'] = undis_female['category'].apply(lambda x:getCategoryList(x))

In [None]:
undis_female.head()

In [None]:
# The temporary source column is no longer needed.
undis_female = undis_female.drop(columns=['category'])

In [None]:
# Same three steps for the undisclosed-male frame.
undis_male = undis_male.rename(columns={'categories':'category'})

In [None]:
undis_male['categories'] = undis_male['category'].apply(lambda x:getCategoryList(x))
undis_male = undis_male.drop(columns=['category'])

In [None]:
undis_female.head()

In [None]:
undis_male.head()

In [None]:
# Same three steps for the disclosed-female frame.
dis_female = dis_female.rename(columns={'categories':'category'})
dis_female['categories'] = dis_female['category'].apply(lambda x:getCategoryList(x))
dis_female = dis_female.drop(columns=['category'])

In [None]:
# Same three steps for the disclosed-male frame.
dis_male = dis_male.rename(columns={'categories':'category'})
dis_male['categories'] = dis_male['category'].apply(lambda x:getCategoryList(x))
dis_male = dis_male.drop(columns=['category'])

In [None]:
def filterCategory(x, category):
    """Return True when `category` occurs in `x` as a substring, ignoring case.

    Both sides are lower-cased before the comparison, so a capitalised
    category name (e.g. 'Books') matches as well as a lower-case one.

    Parameters
    ----------
    x : str
        Text to search in.
    category : str
        Category name to look for.
    """
    return category.lower() in x.lower()

# Matching

In [None]:
# Mahalanobis Distance
def calculateCdist(df_1_s, df_2, cov_inv, slice_len=250000):
    """Find, for every row of `df_1_s`, the closest row of `df_2`.

    Closeness is the Mahalanobis distance over the columns 'stars',
    'timestamp', 'length', 'Grade_level' and 'Sentiment'. `df_2` is processed
    in consecutive row slices of at most `slice_len` rows so that only one
    distance matrix of shape (len(df_1_s), slice_len) exists at a time.

    Parameters
    ----------
    df_1_s : pandas.DataFrame
        Rows to find a match for.
    df_2 : pandas.DataFrame
        Candidate rows; must not be empty.
    cov_inv : array-like
        Inverse covariance matrix handed to `cdist` as `VI`.
    slice_len : int, optional
        Maximum number of `df_2` rows per distance computation.

    Returns
    -------
    (positions, similarity_val)
        `positions[k]` is the row position (usable with `df_2.iloc`) of the
        closest candidate for row k of `df_1_s`; `similarity_val[k]` is the
        corresponding distance. `positions` is a list when `df_2` needed more
        than one slice and a NumPy array otherwise (as before);
        `similarity_val` is always a list.

    Raises
    ------
    ValueError
        If `slice_len` is not positive or `df_2` has no rows.
    """
    features = ['stars', 'timestamp', 'length', 'Grade_level', 'Sentiment']
    cnt = df_2.shape[0]
    if slice_len < 1:
        raise ValueError('slice_len must be a positive integer.')
    if cnt == 0:
        raise ValueError('df_2 is empty: there is nothing to match against.')

    queries = df_1_s[features]
    row_ids = np.arange(df_1_s.shape[0])
    best_pos = None
    best_val = None

    # range() with a step never yields an empty trailing slice, which the
    # previous int(cnt / slice_len) + 1 computation did whenever cnt was an
    # exact multiple of slice_len (argmin then failed on a zero-width matrix).
    for start in range(0, cnt, slice_len):
        candidates = df_2.iloc[start:start + slice_len][features]
        Y = distance.cdist(queries, candidates, 'mahalanobis', VI=cov_inv)
        local_pos = Y.argmin(axis=1)
        local_val = Y[row_ids, local_pos]

        if best_val is None:
            best_pos = local_pos + start
            best_val = local_val
        else:
            # Strict '<' keeps the match from the earliest slice on ties.
            closer = local_val < best_val
            best_pos = np.where(closer, local_pos + start, best_pos)
            best_val = np.where(closer, local_val, best_val)

    similarity_val = list(best_val)
    if cnt > slice_len:
        return list(best_pos), similarity_val
    return best_pos, similarity_val

In [None]:
# Integer codes stored in the 'group' column.
DISCLOSED_MALE = 0
DISCLOSED_FEMALE = 1
UNDISCLOSED_MALE = 2
UNDISCLOSED_FEMALE = 3


def _withGroup(df, label):
    """Return a copy of `df` with a leading 'group' column (left untouched if already present)."""
    df = df.copy()
    if 'group' not in df.columns:
        df.insert(0, 'group', label)
    return df


def findMatchAllPairs(d_m, d_f, u_m, u_f, sample_size=None, random_state=None):
    """Match sampled rows of every group against every other group.

    The four frames are labelled with their group code and concatenated, and
    `sample_size` rows are drawn from the union. For each unordered pair of
    groups (i, j) with i < j, an inverse covariance matrix is estimated from
    (at most 1,000,000 rows of) the two populations; then each sampled row of
    group i is paired with the row of population j returned by
    `calculateCdist`, and each sampled row of group j with a row of
    population i.

    The input frames are not modified: labelling is done on copies.

    Parameters
    ----------
    d_m, d_f, u_m, u_f : pandas.DataFrame
        Disclosed-male, disclosed-female, undisclosed-male and
        undisclosed-female rows.
    sample_size : int, optional
        Number of rows drawn from the union; defaults to all rows.
    random_state : optional
        Forwarded to `DataFrame.sample` for reproducible draws.

    Returns
    -------
    dict
        Keys are '<i>-<j>' and '<j>-<i>'. Every value is a list of
        (row of group i, row of group j) tuples; under '<i>-<j>' the group-i
        row is the sampled one, under '<j>-<i>' the group-j row is.

    Raises
    ------
    ValueError
        If `sample_size` exceeds the total number of rows.
    """
    total_size = d_m.shape[0] + d_f.shape[0] + u_m.shape[0] + u_f.shape[0]
    if sample_size is None:
        sample_size = total_size
    elif sample_size > total_size:
        raise ValueError('sample size beyond the number of population.')

    print(sample_size)
    pairs = {}
    features = ['stars', 'timestamp', 'length', 'Grade_level', 'Sentiment']
    labels = [DISCLOSED_MALE, DISCLOSED_FEMALE, UNDISCLOSED_MALE, UNDISCLOSED_FEMALE]

    all_data = pd.concat([
        _withGroup(d_m, DISCLOSED_MALE),
        _withGroup(d_f, DISCLOSED_FEMALE),
        _withGroup(u_m, UNDISCLOSED_MALE),
        _withGroup(u_f, UNDISCLOSED_FEMALE),
    ])
    sample_n = all_data.sample(sample_size, random_state=random_state)

    samp_0 = sample_n[sample_n['group'] == DISCLOSED_MALE]
    samp_1 = sample_n[sample_n['group'] == DISCLOSED_FEMALE]
    samp_2 = sample_n[sample_n['group'] == UNDISCLOSED_MALE]
    samp_3 = sample_n[sample_n['group'] == UNDISCLOSED_FEMALE]

    print(f'sample DM - {len(samp_0)}, sample DF - {len(samp_1)}, \
    sample UM - {len(samp_2)}, sample UF - {len(samp_3)}')

    samples = dict(zip(labels, (samp_0, samp_1, samp_2, samp_3)))
    # Filter each population once instead of once per pair group.
    populations = {label: all_data[all_data['group'] == label] for label in labels}

    for i in range(3):
        label_treatment = labels[i]
        samp_treatment = samples[label_treatment]
        popln_treatment = populations[label_treatment]

        for j in range(i + 1, 4):
            label_control = labels[j]
            samp_control = samples[label_control]
            popln_control = populations[label_control]

            pop_size = popln_treatment.shape[0] + popln_control.shape[0]
            m = min(pop_size, 1000000)  # cap the rows used for the covariance estimate

            all_sample = pd.concat([popln_treatment, popln_control])
            cov = np.cov(all_sample[features].sample(m, random_state=random_state).values,
                         rowvar=False)
            cov_inv = linalg.inv(cov)

            print('covariance matrix obtained')

            # Sampled treatment rows -> closest rows of the control population.
            matched, _ = calculateCdist(samp_treatment, popln_control, cov_inv)
            treated_rows = [samp_treatment.iloc[k] for k in range(samp_treatment.shape[0])]
            control_rows = [popln_control.iloc[k] for k in matched]
            pairs[str(label_treatment) + '-' + str(label_control)] = list(zip(treated_rows, control_rows))

            # Sampled control rows -> closest rows of the treatment population.
            matched, _ = calculateCdist(samp_control, popln_treatment, cov_inv)
            treated_rows = [popln_treatment.iloc[k] for k in matched]
            control_rows = [samp_control.iloc[k] for k in range(samp_control.shape[0])]
            pairs[str(label_control) + '-' + str(label_treatment)] = list(zip(treated_rows, control_rows))

    return pairs

# Example of a specific category

In [None]:
# Category substring used by the single-category example cells that follow.
category = 'restaurants'

In [None]:
def filterCategory(x, category):
    """Return True when `category` occurs in `x` as a substring, ignoring case.

    Both sides are lower-cased before the comparison, so a capitalised
    category name (e.g. 'Books') matches as well as a lower-case one.

    Parameters
    ----------
    x : str
        Text to search in.
    category : str
        Category name to look for.
    """
    return category.lower() in x.lower()
# Keep only the disclosed-male rows whose 'categories' text contains `category`.
dis_male_c = dis_male[dis_male['categories'].apply(lambda x:filterCategory(x,category))]

In [None]:
dis_male_c.head()

In [None]:
# Same category filter for the other three groups.
dis_female_c = dis_female[dis_female['categories'].apply(lambda x:filterCategory(x,category))]
undis_male_c = undis_male[undis_male['categories'].apply(lambda x:filterCategory(x,category))]
undis_female_c = undis_female[undis_female['categories'].apply(lambda x:filterCategory(x,category))]

In [None]:
dis_female_c.count()

In [None]:
# Match all pair groups on a sample of 40000 rows drawn from the four filtered frames;
# findMatchAllPairs raises if the four frames together hold fewer rows than that.
pairs_all = findMatchAllPairs(dis_male_c,dis_female_c,undis_male_c,undis_female_c, 40000)

In [None]:
# `pairs` is the name the metric cells further down read from.
pairs = pairs_all
pairs.keys()

# Iterating over all categories

In [None]:
def filterCategory(x, category):
    """Return True when `category` occurs in `x` as a substring, ignoring case.

    Both sides are lower-cased before the comparison, so a capitalised
    category name (e.g. 'Books') matches as well as a lower-case one.

    Parameters
    ----------
    x : str
        Text to search in.
    category : str
        Category name to look for.
    """
    return category.lower() in x.lower()

In [None]:
import json

# Choose which category file drives the per-category runs. The
# 'correlated' file is used as loaded; for the 'top_n' file only its
# keys are kept.
correlated_restaurants = False
categories_file = ('correlated_categories.json' if correlated_restaurants
                   else 'top_n_correlated_categories.json')
with open(categories_file) as json_file:
    categories = json.load(json_file)
if not correlated_restaurants:
    categories = list(categories.keys())
print(categories)

In [None]:
from pathlib import Path

# Tag every full frame with its group code once (skipped when already tagged).
for frame, label in ((dis_male, DISCLOSED_MALE),
                     (dis_female, DISCLOSED_FEMALE),
                     (undis_male, UNDISCLOSED_MALE),
                     (undis_female, UNDISCLOSED_FEMALE)):
    if 'group' not in frame.columns:
        frame.insert(0, 'group', label)

# run n times
sample_times = 1
for i in range(0, sample_times): # one sample enough
    # One output folder per sampling round; exist_ok makes re-runs safe
    # without a separate existence check for every category.
    path = Path('category_pairs/sample_' + str(i))
    path.mkdir(parents=True, exist_ok=True)

    for category in categories:
        print(f'category - {category}')

        # Restrict each group to the rows whose 'categories' text mentions the category.
        d_m_c = dis_male[dis_male['categories'].apply(lambda x:filterCategory(x, category))]
        d_f_c = dis_female[dis_female['categories'].apply(lambda x:filterCategory(x, category))]
        u_m_c = undis_male[undis_male['categories'].apply(lambda x:filterCategory(x, category))]
        u_f_c = undis_female[undis_female['categories'].apply(lambda x:filterCategory(x, category))]

        pairs = findMatchAllPairs(d_m_c, d_f_c, u_m_c, u_f_c, 10000)

        # Persist the matched pairs of this category for the metric cells.
        abs_file = path / Path(category + '.pickle')
        with open(abs_file, 'wb') as ft:
            pickle.dump(pairs, ft)

    print(f'finished sample - {i}')

In [None]:
def perIncr(x, y):
    """Return the difference x - y as a percentage of the smaller of x and y."""
    gap = x - y
    base = min(x, y)
    return gap / base * 100

In [None]:
def calculateMean(h):
    """Return (mean of the first items, mean of the second items) of the pairs in `h`."""
    first_mean = np.mean([entry[0] for entry in h])
    second_mean = np.mean([entry[1] for entry in h])
    return first_mean, second_mean

In [None]:
def getHelfulnessScore(pairs, keys):
    """Collect the `useful` attribute of both members of every matched pair.

    Returns two lists of (first.useful, second.useful) tuples: one for
    `pairs[keys[0]]` and one for `pairs[keys[1]]`.
    """
    def _usefulScores(pair_key):
        return [(left.useful, right.useful) for left, right in pairs[pair_key]]

    first_key, second_key = keys[0], keys[1]
    return _usefulScores(first_key), _usefulScores(second_key)

In [None]:
def calculateMetric(pairs, key=None, bootstrap=False, n_bootstrap=1000):
    """Compute the percentage difference in mean `useful` for every pair group.

    For each i < j in 0..3 the matched pairs stored under '<i>-<j>' and
    '<j>-<i>' are pooled, the mean `useful` of each side is taken, and
    `perIncr(mean_first, mean_second)` is stored under '<i>-<j>'.

    Parameters
    ----------
    pairs : dict
        Matched pairs keyed by '<i>-<j>'.
    key : optional
        Unused; kept so existing calls keep working.
    bootstrap : bool, optional
        When True, the pooled pairs are resampled `n_bootstrap` times and the
        stored value is (mean, stats.sem, std) of the resampled estimates.
        When False, the stored value is the single estimate.
    n_bootstrap : int, optional
        Number of bootstrap resamples.

    Returns
    -------
    (res, est)
        `res` maps '<i>-<j>' to the value described above. `est` is the list
        of bootstrap estimates of the LAST pair group only ('2-3'); it is
        empty when `bootstrap` is False.
    """
    res = {}
    est = []

    for i in range(3):
        for j in range(i + 1, 4):
            key_1 = str(i) + '-' + str(j)
            key_2 = str(j) + '-' + str(i)

            h1, h2 = getHelfulnessScore(pairs, (key_1, key_2))
            # Pool both matching directions once; this does not change
            # between bootstrap iterations.
            total_set = h1 + h2
            est = []
            if bootstrap:
                for _ in range(n_bootstrap):
                    h_s_1, h_s_2 = calculateMean(resample(total_set))
                    est.append(perIncr(h_s_1, h_s_2))

                res[key_1] = (np.mean(est), stats.sem(est), np.std(est))
            else:
                h_s_1, h_s_2 = calculateMean(total_set)
                res[key_1] = perIncr(h_s_1, h_s_2)

    return res, est

# [optional] connect to the part Example of a specific category

In [None]:
# Bootstrap metric on the `pairs` currently in memory.
res, est = calculateMetric(pairs, bootstrap=True)

In [None]:
# `est` holds the bootstrap estimates of the last pair group processed ('2-3').
est, np.mean(est)

In [None]:
len(pairs['2-3'])

In [None]:
pairs['3-2'][0]

In [None]:
# NOTE(review): no assignment to `total` is visible anywhere in this notebook,
# so this cell presumably raises NameError — confirm or remove.
total[3666]

In [None]:
res.keys()

In [None]:
# Replace `pairs` with a previously pickled result.
# NOTE(review): the writer cell visible in this notebook only produces
# 'sample_0' — confirm that 'sample_1/Books.pickle' exists.
with open('category_pairs/sample_1/Books.pickle','rb') as fs:
    pairs = pickle.load(fs)

In [None]:
pairs.keys()

In [None]:
# Point estimate (no bootstrap) for the loaded pairs.
calculateMetric(pairs)

# connect to the part iterating over all categories
# [deprecated] sampling without bootstrap

In [None]:
# categories = ['Books','Electronics','CDs & Vinyl','Clothing, Shoes & Jewelry','Home & Kitchen',\
#              'Kindle Store','Sports & Outdoors','Cell Phones & Accessories', 'Toys & Games','Games','Literature & Fiction',\
#              'Beauty','Health & Personal Care','Movies','Computers']

# For every category, average the point estimates (no bootstrap) over the
# stored sampling rounds and keep (mean, std) per pair group.
all_results = {}
for category in tqdm.tqdm(categories):

    results = {'0-1':[], '0-2':[], '0-3':[], '1-2':[], '1-3':[], '2-3':[]}

    for sample_idx in range(0, 1):
        path = 'category_pairs/sample_' + str(sample_idx) + '/' + category + '.pickle'
        with open(path, 'rb') as fs:
            pairs = pickle.load(fs)
        # bootstrap=False: each res[key] is a single number, which is what
        # the mean/std aggregation below expects.
        res, _ = calculateMetric(pairs, bootstrap=False)
        for key in res:
            results[key].append(res[key])

    mean_res = {}
    for key in results.keys():
        mean_res[key] = (np.mean(results[key]), np.std(results[key])) # scipy.stats.sem(results[key])

    all_results[category] = mean_res

    print(f'done for {category}')

In [None]:
# categories = ['Books','Electronics','CDs & Vinyl','Clothing, Shoes & Jewelry','Home & Kitchen',\
#              'Kindle Store','Sports & Outdoors','Cell Phones & Accessories', 'Toys & Games','Games','Literature & Fiction',\
#              'Beauty','Health & Personal Care','Movies','Computers']

# For every category, average the point estimates over sampling rounds 13
# and 14 and keep (mean, standard error of the mean) per pair group.
all_results = {}
for category in tqdm.tqdm(categories):

    results = {'0-1':[], '0-2':[], '0-3':[], '1-2':[], '1-3':[], '2-3':[]}

    for sample_idx in range(13, 15):
        path = 'category_pairs/sample_' + str(sample_idx) + '/' + category + '.pickle'
        with open(path, 'rb') as fs:
            pairs = pickle.load(fs)
        res, _ = calculateMetric(pairs)
        for key in res:
            results[key].append(res[key])

    mean_res = {}
    for key in results.keys():
        # `stats` is scipy.stats (imported at the top of the notebook); the
        # bare name `scipy` is never imported here.
        mean_res[key] = (np.mean(results[key]), stats.sem(results[key]))

    all_results[category] = mean_res

    print(f'done for {category}')

In [None]:
all_results.keys()

In [None]:
key = '0-2'
mean_results = []
for category in categories:
    print(category, all_results[category][key])
    mean_results.append(all_results[category][key][0])
mean = sum(mean_results) / len(mean_results)    
mean_results.append(mean)

# Plot initialization

In [None]:
categories
# Row labels that plotResults puts on the y axis, top to bottom.
# NOTE(review): presumably these must be in the same order as `categories`,
# with 'overall' matching the across-category mean that getResults appends
# last — verify against the loaded category list.
cat_plot = ('sewing & alterations', 'self storage', 'carpet cleaning', 
            'oral surgeons', "men's hair salons", 'restaurants', 'overall')

In [None]:
import matplotlib as mpl
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
#mpl.style.use('classic')
mpl.rcParams['xtick.labelsize'] = 25
mpl.rcParams['ytick.labelsize'] = 32
mpl.rcParams['font.size'] = 20
mpl.rcParams['figure.figsize'] =  9, 10
mpl.rcParams['axes.labelsize'] = 35
mpl.rcParams['mathtext.fontset'] = 'stix'
mpl.rcParams['font.family'] = 'STIXGeneral'
mpl.rcParams['axes.linewidth'] = 2.5
#plotResults(per_incr,std_err,'UF','UM')

ind = np.arange(5)    
width = 0.35 

In [None]:
def plotResults(per_incr, grp_1, grp_2, std_err=None, dir_='', labels=None):
    """Draw a horizontal bar chart of per-category advantages and save it.

    Bars with a negative value are drawn unfilled, all others filled; the
    x axis shows absolute percentages on both sides of zero. A horizontal
    line separates the last row from the rows above it and the last row's
    label is coloured magenta.

    Parameters
    ----------
    per_incr : sequence of float
        One value per row, in the order of `labels`.
    grp_1, grp_2 : str
        Group names written on the left and right side of the plot; also
        used to build the file name '<grp_1>_<grp_2>.jpg'.
    std_err : sequence of float, optional
        Horizontal error bars, one per row.
    dir_ : str, optional
        Directory the image is written to; '' means the current working
        directory.
    labels : sequence of str, optional
        Row labels; defaults to the notebook-level `cat_plot`.

    Raises
    ------
    ValueError
        If `per_incr` and `labels` differ in length.
    """
    if labels is None:
        labels = cat_plot
    n_rows = len(labels)
    y_pos = np.arange(n_rows)

    print(len(y_pos), len(per_incr))
    if len(per_incr) != n_rows:
        raise ValueError(f'expected {n_rows} values (one per label), got {len(per_incr)}')

    fig, ax = plt.subplots()
    barlist = ax.barh(y_pos, per_incr, align='center', xerr=std_err)
    for value, thisbar in zip(per_incr, barlist.patches):
        thisbar.set_hatch('xxx')
        thisbar.set_color('b')
        # Only the fill distinguishes the sign: negative bars are hollow.
        thisbar.set_fill(not (value < 0))

    majorLocator = MultipleLocator(20)
    majorFormatter = FormatStrFormatter('%d')
    minorLocator = MultipleLocator(2.5)

    ax.xaxis.set_major_locator(majorLocator)
    ax.xaxis.set_major_formatter(majorFormatter)

    # for the minor ticks, use no labels; default NullFormatter
    ax.xaxis.set_minor_locator(minorLocator)

    ax.set_yticks(y_pos)
    # Limits follow the number of rows (equals the former fixed -1..16 for 16 rows).
    plt.ylim(-1, n_rows)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Advantage (%)')
    plt.xlim(-50, 50)
    x_pos = np.arange(-50, 60, step=10)
    ax.set_xticks(x_pos)
    # Both directions are advantages, so tick labels show absolute values.
    ax.set_xticklabels([50, 40, 30, 20, 10, 0, 10, 20, 30, 40, 50])
    plt.axvline(0, color='black')
    # Separator just above the last row (equals the former fixed 14.58 for 16 rows).
    plt.axhline(n_rows - 1.42, color='black')
    plt.text(-40, 0, grp_1, fontsize=23)
    plt.text(35, 0, grp_2, fontsize=23)
    plt.grid(linestyle='--')
    ax.get_yticklabels()[-1].set_color("m")
    title = grp_1 + '_' + grp_2 + '.jpg'
    plt.tight_layout()
    # os.path.join keeps an empty dir_ relative; plain '/' concatenation
    # turned the default into a path at the filesystem root.
    plt.savefig(os.path.join(dir_, title), dpi=250)
    plt.show()

# Bootstrap sampling

In [None]:
# categories = ['Books','Electronics','CDs & Vinyl','Clothing, Shoes & Jewelry','Home & Kitchen',\
#              'Kindle Store','Sports & Outdoors','Cell Phones & Accessories', 'Toys & Games','Games','Literature & Fiction',\
#              'Beauty','Health & Personal Care','Movies','Computers']

all_res = []
sample_times = 1
for i in range(0, sample_times):
    all_results = {}
    for category in categories:
        bootstrap_res = {}
        path = 'category_pairs/sample_' + str(i) + '/' + category + '.pickle'
        with open(path, 'rb') as fs:
            pairs = pickle.load(fs)
        res, _ = calculateMetric(pairs, bootstrap=True)
        # scipy.stats.sem
        all_results[category] = res

        print(f'done for {category}')
    
    print(f'sample {i} finished')
    all_res.append(all_results) 

In [None]:
all_results.keys()

In [None]:
def getResults(res, key, category_names=None):
    """Extract the plotted values and error bars of one pair group.

    Parameters
    ----------
    res : dict
        Maps category -> pair-group key -> sequence whose item 0 is the
        value and item 1 the error.
    key : str
        Pair-group key, e.g. '0-1'.
    category_names : sequence of str, optional
        Categories to read, in output order. Defaults to the notebook-level
        `categories`; pass it explicitly when that global may have been
        rebound in the meantime.

    Returns
    -------
    (val, std_err)
        Two lists with one entry per category followed by a final entry:
        the mean of the category values in `val` and 0 in `std_err`.
    """
    if category_names is None:
        category_names = categories

    val = [res[category][key][0] for category in category_names]
    std_err = [res[category][key][1] for category in category_names]

    # Trailing summary row: across-category mean, drawn without an error bar.
    val.append(np.mean(val))
    std_err.append(0)
    return val, std_err

In [None]:
# Pair-group keys of interest: '0-1', '0-2', '2-3', '1-3'
# DISCLOSED_MALE = 0
# DISCLOSED_FEMALE = 1
# UNDISCLOSED_MALE = 2
# UNDISCLOSED_FEMALE = 3
# Values and error bars of the first sampling round for the chosen key.
val, std_err = getResults(all_res[0], '0-1')

In [None]:
val

In [None]:
# PM: Performative Male (undisclosed), SM: Signal Male (disclosed)
# NOTE(review): the values above were extracted for key '0-1' (disclosed male
# vs disclosed female), while the labels passed here name the undisclosed /
# disclosed male groups — confirm that key and labels belong together.
plotResults(val, 'PM', 'SM', std_err=std_err, dir_='./')

# [deprecated] Calculate covariate distribution

In [None]:
# NOTE: this rebinds the notebook-level `categories`; every later cell or
# function that reads `categories` sees this list from here on.
categories = ['Books','Electronics','CDs & Vinyl','Clothing, Shoes & Jewelry','Home & Kitchen',\
             'Kindle Store','Sports & Outdoors','Cell Phones & Accessories', 'Toys & Games','Games','Literature & Fiction',\
             'Beauty','Health & Personal Care','Movies','Computers']

In [None]:
# Category whose pickled pairs are inspected below.
category = 'Books'

In [None]:
# Load the matched pairs of that category from sampling round 3.
# NOTE(review): the writer cell visible in this notebook only produces
# 'sample_0' — confirm that this file exists.
path = 'category_pairs/sample_3/' + category + '.pickle'
with open(path,'rb') as fs:
    pairs = pickle.load(fs)

In [None]:
pairs.keys()

In [None]:
pairs['1-0'][0]

In [None]:
from scipy.stats import ks_2samp

In [None]:
def createDistribution(lst):
    """Return the sorted distinct values of `lst` and their relative frequencies.

    Parameters
    ----------
    lst : iterable of hashable, mutually orderable values

    Returns
    -------
    (u_el, dist)
        `u_el` is the sorted list of distinct values; `dist[k]` is the share
        of items equal to `u_el[k]`. Both are empty for empty input.
    """
    from collections import Counter

    # One counting pass instead of a full lst.count() scan per distinct value.
    counts = Counter(lst)
    u_el = sorted(counts)
    s = sum(counts.values())
    return u_el, [counts[el] / s for el in u_el]

In [None]:
def calcCovariateHist(pairs, x_label=None, covariate=None):
    """Plot, per group, the relative frequency of a covariate over matched pairs.

    Both members of every pair stored under the keys '0-1', '1-0', '2-3',
    '3-2', '1-3', '3-1', '0-2' and '2-0' contribute their `covariate` value
    to the list of their own 'group'. Only values equal to an integer in
    1..15 are kept. One line per group (0..3) is drawn and the figure is
    saved as 'plotCovariates/<covariate>.jpg'.

    Parameters
    ----------
    pairs : dict
        Matched pairs keyed by '<i>-<j>'; each member must support
        `member['group']` and `member[covariate]`.
    x_label : str, optional
        Label of the x axis.
    covariate : str
        Name of the covariate to plot.

    Returns
    -------
    dict
        Maps each group code 0..3 to its list of kept covariate values.
    """
    keys = ['0-1', '1-0', '2-3', '3-2', '1-3', '3-1', '0-2', '2-0']

    # Collect the covariate of both members of every pair under their group.
    confounder = {}
    for key in keys:
        for pair in pairs[key]:
            for member in (pair[0], pair[1]):
                confounder.setdefault(member['group'], []).append(member[covariate])

    # Keep integer-valued levels 1..15 only; a group without any pair gets
    # an empty list instead of raising KeyError.
    confounder_fil = {
        group: [v for v in confounder.get(group, []) if v in range(1, 16)]
        for group in range(4)
    }

    for group in range(4):
        values = confounder_fil[group]
        if values:
            print(min(values), max(values))
        else:
            print(f'group {group}: no values in 1..15')

    plt.subplots()
    colors = ['b', 'r', 'b', 'r']
    linestyles = ['-', '-', '--', '--']
    labels = ['SM', 'SW', 'PM', 'PW']
    for i in range(4):
        x, dist = createDistribution(confounder_fil[i])
        plt.plot(x, dist, color=colors[i], label=labels[i], linewidth=2, linestyle=linestyles[i])

    # Ticks sit on the integer levels the lines are drawn at (the earlier
    # offset came from a grouped-bar layout that is no longer used).
    index = np.arange(1, 16)
    plt.xticks(index, index)
    plt.grid(linestyle='--')
    plt.xlabel(x_label)
    plt.ylabel('PMF')
    # The legend has to exist before saving, otherwise the image lacks it.
    plt.legend()
    plt.tight_layout()
    plt.savefig('plotCovariates/' + covariate + '.jpg', dpi=250)
    plt.show()

    return confounder_fil

In [None]:
# Readability ('Grade_level') distribution of the matched pairs, per group.
x = calcCovariateHist(pairs,x_label='Readability',covariate='Grade_level')

In [None]:
# NOTE(review): this only works if calcCovariateHist returns a sized object — verify.
len(x)

In [None]:
def calcCovariateDist(pairs, t, x_label='Readability', covariate='Grade_level'):
    """Plot density curves of a covariate for group `t` and its matches.

    For every other group i, the pairs stored under '<t>-<i>' supply the
    covariate of the group-i member (one curve per i). The curve for group
    `t` itself uses the group-t members of the LAST '<t>-<i>' pair list
    visited. The figure is saved as 'plotCovariates/<covariate>.jpg'.

    Parameters
    ----------
    pairs : dict
        Matched pairs keyed by '<i>-<j>'; members must support
        `member[covariate]`.
    t : int
        Group code (0..3) whose samples are the reference.
    x_label : str, optional
        Label of the x axis.
    covariate : str, optional
        Name of the covariate to plot.
    """
    # Imported here because the notebook-level seaborn import sits in the
    # very last cell, after the cells that call this function.
    import seaborn as sns

    # Per-group legend label, colour and line style, indexed by group code.
    group_labels = ['DM', 'DW', 'UM', 'UW']
    group_colors = ['b', 'r', 'b', 'r']
    group_styles = ['-', '-', '--', '--']

    dist_0 = None
    others = []
    for i in range(4):
        if i == t:
            continue
        matched = pairs[str(t) + '-' + str(i)]
        dist_0 = [m[covariate] for m, n in matched]
        others.append((i, [n[covariate] for m, n in matched]))

    sns.distplot(dist_0, hist=False, rug=True, label=group_labels[t],
                 kde_kws={'color': group_colors[t], 'linestyle': group_styles[t], 'linewidth': 2})

    for i, dist in others:
        sns.distplot(dist, hist=False, rug=True, label=group_labels[i],
                     kde_kws={'color': group_colors[i], 'linestyle': group_styles[i], 'linewidth': 2})

    plt.xlabel(x_label)
    plt.ylabel('PDF')
    plt.grid(linestyle='--')
    plt.tight_layout()
    plt.savefig('plotCovariates/' + covariate + '.jpg', dpi=250)
            
            
    

In [None]:
pairs['0-1'][100]

In [None]:
# NOTE(review): the matching code in this notebook uses a 'stars' column; a
# 'Rating' column is not created anywhere visible here — confirm it exists
# in the loaded pairs.
calcCovariateDist(pairs,0,x_label='Rating',covariate='Rating')

In [None]:
from scipy.stats import kruskal

In [None]:
# Draw 1,000,000 rows per group for the Kruskal-Wallis test below
# (DataFrame.sample raises if a frame has fewer rows than that).
d_m_s = dis_male.sample(1000000)

In [None]:
u_m_s = undis_male.sample(1000000)

In [None]:
d_f_s = dis_female.sample(1000000)

In [None]:
u_f_s = undis_female.sample(1000000)

In [None]:
# Kruskal-Wallis H-test across the four samples.
# NOTE(review): the loader cells produce a 'Sentiment' column; no
# 'overall_sentiment' column is created anywhere visible here — confirm.
kruskal(d_m_s['overall_sentiment'],u_m_s['overall_sentiment'],d_f_s['overall_sentiment'],u_f_s['overall_sentiment'])

In [None]:
from matplotlib.ticker import MultipleLocator, FormatStrFormatter
import matplotlib.pyplot as plt
import matplotlib as mpl

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
#mpl.style.use('classic')
mpl.rcParams['xtick.labelsize'] = 25
mpl.rcParams['ytick.labelsize'] = 32
mpl.rcParams['font.size'] = 20
mpl.rcParams['figure.figsize'] =  9,10
mpl.rcParams['axes.labelsize'] = 35
mpl.rcParams['mathtext.fontset'] = 'stix'
mpl.rcParams['font.family'] = 'STIXGeneral'
mpl.rcParams['axes.linewidth'] = 2.5

In [None]:
# Density plot with the defaults (x_label='Readability', covariate='Grade_level'), reference group 0.
# This call needs seaborn; the notebook-level import is in the cell right below.
calcCovariateDist(pairs,0)

In [None]:
import seaborn as sns