In [17]:
%matplotlib inline

import os
import pickle
import re
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [18]:
def get_sort_resdir_by_name(res_root, append_n="stats_cos.res"):
    '''
    Map every result directory under ``res_root`` to a file path, ordered by k then gk.

    Directory names are parsed as ``<c><K>_<cc><GK>_...`` (for example
    ``k10_gk5_outputs``): the first ``_``-separated token minus its first
    character is read as the integer ``k``, the second token minus its first
    two characters as the integer ``gk``.  Entries whose name does not parse
    that way (hidden files, checkpoints, stray notes, ...) are skipped.

    Parameters
    ----------
    res_root : str
        Directory whose entries are listed.
    append_n : str
        File name joined onto each result directory path.

    Returns
    -------
    dict
        ``{k: {gk: os.path.join(res_root, <dir>, append_n)}}`` with both
        levels in ascending key order.  The paths are only built here, they
        are not checked for existence.
    '''
    k_gk_dict = {}
    for rd in os.listdir(res_root):
        parts = rd.split('_')
        try:
            k = int(parts[0][1:])
            gk = int(parts[1][2:])
        except (IndexError, ValueError):
            # Not a result directory name; ignore it instead of aborting the scan.
            continue
        k_gk_dict.setdefault(k, {})[gk] = os.path.join(res_root, rd, append_n)
    # Sort once after everything is collected (outer level by k, inner by gk).
    return {
        k: dict(sorted(gk_paths.items()))
        for k, gk_paths in sorted(k_gk_dict.items())
    }
    
def preprocess_df(df):
    '''
    Add ``prob``, ``cv``, ``k`` and ``gk`` columns parsed from ``df['filename']``.

    File names are expected to look like
    ``k<K>_gk<GK>_cv<CV>_num<N>_prob<P>.csv`` where decimal points in
    ``<CV>`` and ``<P>`` are written as ``_`` (``cv0_05`` -> 0.05,
    ``prob0_25`` -> 0.25).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``filename`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the four columns are added in
        place, so callers may ignore the return value.

    Raises
    ------
    ValueError
        If a numeric field cannot be parsed from a file name.
    '''
    ks = []
    gks = []
    cv_list = []
    abs_prob_list = []
    for name in df['filename']:
        parts = name.split('_')
        ks.append(int(parts[0][1:]))
        gks.append(int(parts[1][2:]))
        # Read the whole "<digits>[_<digits>]" token after the last "prob"
        # rather than a fixed three characters, so e.g. prob0_25 is 0.25, not 0.2.
        prob_match = re.match(r'\d+(?:_\d+)?', name.split('prob')[-1])
        if prob_match is None:
            raise ValueError("cannot parse prob from filename: %r" % (name,))
        abs_prob_list.append(float(prob_match.group(0).replace('_', '.')))
        cv_list.append(float(name.split('_num')[0].split('cv')[-1].replace('_', '.')))
    # Lists are assigned positionally, which also works when the index has
    # duplicate labels (as it does after concatenating several frames).
    df['prob'] = abs_prob_list
    df['cv'] = cv_list
    df['k'] = ks
    df['gk'] = gks
    return df
    
def compute_performance_increase(df_path):
    '''
    Load a stats CSV and append three relative-difference columns.

    Only the ``filename`` column(s) and the columns whose name contains both
    ``"mean"`` and ``"distances"`` are kept.  With ``vec``, ``hy`` and ``hyw``
    denoting ``distances_arxiv_vector_mean``, ``distances_hybrid_mean`` and
    ``distances_weighted_hybrid_mean``, the added columns are:

    * ``hy_vec``  = (vec - hy)  / vec
    * ``hyw_vec`` = (vec - hyw) / vec
    * ``hyw_hy``  = (hy  - hyw) / hy

    Parameters
    ----------
    df_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The kept columns followed by ``hy_vec``, ``hyw_vec`` and ``hyw_hy``.
    '''
    raw = pd.read_csv(df_path)

    def _is_kept(col):
        # Mean distance columns, plus anything carrying the file name.
        return ("mean" in col and "distances" in col) or "filename" in col

    kept = raw[[col for col in raw.columns if _is_kept(col)]]

    # Work on plain arrays so the arithmetic is purely element-wise.
    vec = kept['distances_arxiv_vector_mean'].values
    hy = kept['distances_hybrid_mean'].values
    hyw = kept['distances_weighted_hybrid_mean'].values

    return kept.assign(
        hy_vec=(vec - hy) / vec,
        hyw_vec=(vec - hyw) / vec,
        hyw_hy=(hy - hyw) / hy,
    )

In [19]:
k_gk_dict = get_sort_resdir_by_name("../data/results/")
# k_gk_dict

# Flatten the {k: {gk: path}} mapping into one list, keeping the k-then-gk order.
file_list = [
    stats_path
    for gk_paths in k_gk_dict.values()
    for stats_path in gk_paths.values()
]
file_list

['../data/results/k5_gk3_outputs/stats_cos.res',
 '../data/results/k10_gk3_outputs/stats_cos.res',
 '../data/results/k10_gk5_outputs/stats_cos.res',
 '../data/results/k10_gk7_outputs/stats_cos.res',
 '../data/results/k50_gk5_outputs/stats_cos.res',
 '../data/results/k50_gk15_outputs/stats_cos.res',
 '../data/results/k50_gk25_outputs/stats_cos.res',
 '../data/results/k50_gk35_outputs/stats_cos.res',
 '../data/results/k100_gk10_outputs/stats_cos.res',
 '../data/results/k100_gk30_outputs/stats_cos.res',
 '../data/results/k100_gk50_outputs/stats_cos.res',
 '../data/results/k100_gk70_outputs/stats_cos.res',
 '../data/results/k500_gk50_outputs/stats_cos.res',
 '../data/results/k500_gk150_outputs/stats_cos.res',
 '../data/results/k500_gk250_outputs/stats_cos.res',
 '../data/results/k500_gk350_outputs/stats_cos.res',
 '../data/results/k1000_gk100_outputs/stats_cos.res',
 '../data/results/k1000_gk300_outputs/stats_cos.res',
 '../data/results/k1000_gk500_outputs/stats_cos.res',
 '../data/results

In [20]:
# Load every stats file with its relative-difference columns, then stack the frames.
df_list = [compute_performance_increase(stats_path) for stats_path in file_list]
df = pd.concat(df_list)
print(len(df))
df.head()

400


Unnamed: 0,filename,distances_arxiv_vector_mean,distances_hybrid_mean,distances_weighted_hybrid_mean,hy_vec,hyw_vec,hyw_hy
0,k5_gk3_cv0_05_num20_prob0_1.csv,0.818925,0.82426,0.828432,-0.006515,-0.011609,-0.005061
1,k5_gk3_cv0_05_num20_prob0_3.csv,0.767487,0.781651,0.786764,-0.018455,-0.025117,-0.006542
2,k5_gk3_cv0_05_num20_prob0_5.csv,0.737609,0.759651,0.760378,-0.029883,-0.030869,-0.000957
3,k5_gk3_cv0_05_num20_prob1_0.csv,0.63593,0.668892,0.670768,-0.051832,-0.054783,-0.002805
4,k5_gk3_cv0_1_num10_prob0_1.csv,0.821715,0.824837,0.828276,-0.003798,-0.007984,-0.00417


In [21]:
# preprocess_df adds the prob / cv / k / gk columns to `df` in place and returns
# that same frame, so the following cells keep using `df`; the return value is
# only displayed here.
preprocess_df(df)

Unnamed: 0,filename,distances_arxiv_vector_mean,distances_hybrid_mean,distances_weighted_hybrid_mean,hy_vec,hyw_vec,hyw_hy,prob,cv,k,gk
0,k5_gk3_cv0_05_num20_prob0_1.csv,0.818925,0.824260,0.828432,-0.006515,-0.011609,-0.005061,0.1,0.05,5,3
1,k5_gk3_cv0_05_num20_prob0_3.csv,0.767487,0.781651,0.786764,-0.018455,-0.025117,-0.006542,0.3,0.05,5,3
2,k5_gk3_cv0_05_num20_prob0_5.csv,0.737609,0.759651,0.760378,-0.029883,-0.030869,-0.000957,0.5,0.05,5,3
3,k5_gk3_cv0_05_num20_prob1_0.csv,0.635930,0.668892,0.670768,-0.051832,-0.054783,-0.002805,1.0,0.05,5,3
4,k5_gk3_cv0_1_num10_prob0_1.csv,0.821715,0.824837,0.828276,-0.003798,-0.007984,-0.004170,0.1,0.10,5,3
...,...,...,...,...,...,...,...,...,...,...,...
15,k1000_gk700_cv0_5_num3_prob1_0.csv,0.816152,0.817257,0.797985,-0.001353,0.022260,0.023581,1.0,0.50,1000,700
16,k1000_gk700_cv0_7_num2_prob0_1.csv,0.864041,0.855282,0.832983,0.010137,0.035944,0.026072,0.1,0.70,1000,700
17,k1000_gk700_cv0_7_num2_prob0_3.csv,0.854112,0.847598,0.827666,0.007627,0.030962,0.023515,0.3,0.70,1000,700
18,k1000_gk700_cv0_7_num2_prob0_5.csv,0.843882,0.839951,0.819278,0.004659,0.029157,0.024613,0.5,0.70,1000,700


In [22]:
# Keep only the three relative-difference columns plus k for the summary statistics.
res_df = df.loc[:, ['hy_vec', 'hyw_vec', 'hyw_hy', 'k']]
res_df.describe()

Unnamed: 0,hy_vec,hyw_vec,hyw_hy,k
count,400.0,400.0,400.0,400.0
mean,-0.002041,0.008162,0.010222,331.75
std,0.010473,0.016435,0.009097,377.909018
min,-0.058392,-0.057041,-0.009225,5.0
25%,-0.002947,0.002417,0.004253,50.0
50%,-0.000383,0.008131,0.010073,100.0
75%,0.002989,0.019323,0.017618,500.0
max,0.016647,0.039313,0.02807,1000.0


In [23]:
# Same summary, restricted to the rows whose k is greater than 100.
res_df.loc[res_df['k'].gt(100)].describe()

Unnamed: 0,hy_vec,hyw_vec,hyw_hy,k
count,160.0,160.0,160.0,160.0
mean,0.00249,0.019139,0.016703,750.0
std,0.004357,0.009365,0.006567,250.784931
min,-0.003899,0.005511,0.005872,500.0
25%,-0.000819,0.01181,0.012307,500.0
50%,0.002065,0.019249,0.016883,750.0
75%,0.005094,0.02616,0.022578,1000.0
max,0.016647,0.039313,0.02807,1000.0
