In [1]:
%matplotlib inline

import pickle
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def get_sort_resdir_by_name(res_root, append_n="stats.res"):
    '''
    Map every result directory under ``res_root`` to the path of its stats file.

    Directory names are expected to look like ``k<K>_gk<GK>_...`` (for example
    ``k10_gk5_outputs``): K is read from the first underscore-separated field
    after dropping its first character, GK from the second field after
    dropping its first two characters.

    Parameters
    ----------
    res_root : str
        Directory whose entries are the per-configuration result directories.
    append_n : str, optional
        File name joined onto every result directory path.

    Returns
    -------
    dict
        ``{k: {gk: path}}`` where ``path`` is
        ``os.path.join(res_root, <entry>, append_n)``. Outer keys are in
        ascending k order and inner keys in ascending gk order. The existence
        of ``path`` is not checked.
    '''
    k_gk_dict = {}
    for rd in os.listdir(res_root):
        fields = rd.split('_')
        try:
            k = int(fields[0][1:])
            gk = int(fields[1][2:])
        except (IndexError, ValueError):
            # Entries that do not follow the k<K>_gk<GK> naming scheme
            # (e.g. .ipynb_checkpoints, .DS_Store) are not result directories.
            continue
        k_gk_dict.setdefault(k, {})[gk] = os.path.join(res_root, rd, append_n)

    # Sort once, after the scan, instead of re-sorting on every entry.
    return {
        k: dict(sorted(gk_paths.items()))
        for k, gk_paths in sorted(k_gk_dict.items())
    }
    
def preprocess_df(df):
    '''
    Parse k, gk, cv and prob out of each ``filename`` and add them as columns.

    Filenames are expected to look like
    ``k5_gk3_cv0_05_num20_prob0_1.csv``; an underscore inside a numeric token
    stands for the decimal point (``0_05`` -> 0.05).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``filename`` column of strings. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``prob``, ``cv``, ``k`` and ``gk``
        columns added (or overwritten).

    Raises
    ------
    ValueError
        If a numeric token cannot be parsed from a filename.
    '''
    import re  # local import so the function does not depend on notebook-level imports

    # Leading numeric token after "prob": digits, optionally "_" plus more digits.
    # Matching the whole token (rather than a fixed 3-character slice) keeps
    # values such as "prob0_25" from being truncated to 0.2.
    prob_token = re.compile(r'\d+(?:_\d+)?')

    ks = []
    gks = []
    cv_list = []
    abs_prob_list = []
    for filename in df['filename']:
        fields = filename.split('_')
        ks.append(int(fields[0][1:]))
        gks.append(int(fields[1][2:]))

        # prob is whatever follows the last "prob" in the name.
        matched = prob_token.match(filename.split('prob')[-1])
        if matched is None:
            raise ValueError("cannot parse prob from filename {!r}".format(filename))
        abs_prob_list.append(float(matched.group(0).replace('_', '.')))

        # cv sits between the last "cv" and the first "_num".
        cv_text = filename.split('_num')[0].split('cv')[-1]
        cv_list.append(float(cv_text.replace('_', '.')))

    # Plain lists are assigned positionally, so a non-unique index
    # (e.g. after pd.concat of several frames) is not a problem.
    df['prob'] = abs_prob_list
    df['cv'] = cv_list
    df['k'] = ks
    df['gk'] = gks
    return df
    
def compute_performance_increase(df_path):
    '''
    Load one stats CSV and add relative differences between the mean distances.

    Only ``filename``-like columns and columns whose name contains both
    "mean" and "distances" are kept. Three columns are then appended:

    - ``hy_vec``:  (arxiv_vector - hybrid) / arxiv_vector
    - ``hyw_vec``: (arxiv_vector - weighted_hybrid) / arxiv_vector
    - ``hyw_hy``:  (hybrid - weighted_hybrid) / hybrid

    Parameters
    ----------
    df_path : str or file-like
        Anything ``pd.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The reduced frame with the three relative-difference columns appended.
    '''
    def relative_drop(baseline, candidate):
        # Fraction by which `candidate` is lower than `baseline`.
        return (baseline - candidate) / baseline

    loaded = pd.read_csv(df_path)

    kept_columns = []
    for name in loaded.columns:
        is_distance_mean = "mean" in name and "distances" in name
        if is_distance_mean or "filename" in name:
            kept_columns.append(name)
    reduced = loaded[kept_columns]

    vector_mean = reduced['distances_arxiv_vector_mean'].values
    hybrid_mean = reduced['distances_hybrid_mean'].values
    weighted_mean = reduced['distances_weighted_hybrid_mean'].values

    return reduced.assign(
        hy_vec=relative_drop(vector_mean, hybrid_mean),
        hyw_vec=relative_drop(vector_mean, weighted_mean),
        hyw_hy=relative_drop(hybrid_mean, weighted_mean),
    )

In [3]:
# Locate every stats file under the results root, grouped as {k: {gk: path}}.
k_gk_dict = get_sort_resdir_by_name("../data/results/")

# Flatten the nested mapping into one list of paths, keeping its k-then-gk order.
file_list = [
    stats_path
    for gk_to_path in k_gk_dict.values()
    for stats_path in gk_to_path.values()
]
file_list

['../data/results/k5_gk3_outputs/stats.res',
 '../data/results/k10_gk3_outputs/stats.res',
 '../data/results/k10_gk5_outputs/stats.res',
 '../data/results/k10_gk7_outputs/stats.res',
 '../data/results/k50_gk5_outputs/stats.res',
 '../data/results/k50_gk15_outputs/stats.res',
 '../data/results/k50_gk25_outputs/stats.res',
 '../data/results/k50_gk35_outputs/stats.res',
 '../data/results/k100_gk10_outputs/stats.res',
 '../data/results/k100_gk30_outputs/stats.res',
 '../data/results/k100_gk50_outputs/stats.res',
 '../data/results/k100_gk70_outputs/stats.res',
 '../data/results/k500_gk50_outputs/stats.res',
 '../data/results/k500_gk150_outputs/stats.res',
 '../data/results/k500_gk250_outputs/stats.res',
 '../data/results/k500_gk350_outputs/stats.res',
 '../data/results/k1000_gk100_outputs/stats.res',
 '../data/results/k1000_gk300_outputs/stats.res',
 '../data/results/k1000_gk500_outputs/stats.res',
 '../data/results/k1000_gk700_outputs/stats.res']

In [4]:
# One frame per stats file; row labels restart at 0 for each file,
# so the concatenated index is not unique.
df_list = [compute_performance_increase(stats_path) for stats_path in file_list]
df = pd.concat(df_list)
print(len(df))
df.head()

400


Unnamed: 0,filename,distances_arxiv_vector_mean,distances_hybrid_mean,distances_weighted_hybrid_mean,hy_vec,hyw_vec,hyw_hy
0,k5_gk3_cv0_05_num20_prob0_1.csv,1.259858,1.271147,1.273698,-0.00896,-0.010985,-0.002007
1,k5_gk3_cv0_05_num20_prob0_3.csv,1.209279,1.23001,1.236381,-0.017144,-0.022412,-0.005179
2,k5_gk3_cv0_05_num20_prob0_5.csv,1.182717,1.214093,1.212368,-0.026529,-0.025071,0.00142
3,k5_gk3_cv0_05_num20_prob1_0.csv,1.072682,1.124198,1.129678,-0.048025,-0.053134,-0.004875
4,k5_gk3_cv0_1_num10_prob0_1.csv,1.262373,1.271931,1.274508,-0.007572,-0.009613,-0.002026


In [5]:
# preprocess_df adds prob/cv/k/gk to the frame it is given and returns that same frame.
df = preprocess_df(df)
df

Unnamed: 0,filename,distances_arxiv_vector_mean,distances_hybrid_mean,distances_weighted_hybrid_mean,hy_vec,hyw_vec,hyw_hy,prob,cv,k,gk
0,k5_gk3_cv0_05_num20_prob0_1.csv,1.259858,1.271147,1.273698,-0.008960,-0.010985,-0.002007,0.1,0.05,5,3
1,k5_gk3_cv0_05_num20_prob0_3.csv,1.209279,1.230010,1.236381,-0.017144,-0.022412,-0.005179,0.3,0.05,5,3
2,k5_gk3_cv0_05_num20_prob0_5.csv,1.182717,1.214093,1.212368,-0.026529,-0.025071,0.001420,0.5,0.05,5,3
3,k5_gk3_cv0_05_num20_prob1_0.csv,1.072682,1.124198,1.129678,-0.048025,-0.053134,-0.004875,1.0,0.05,5,3
4,k5_gk3_cv0_1_num10_prob0_1.csv,1.262373,1.271931,1.274508,-0.007572,-0.009613,-0.002026,0.1,0.10,5,3
...,...,...,...,...,...,...,...,...,...,...,...
15,k1000_gk700_cv0_5_num3_prob1_0.csv,1.263550,1.271106,1.256672,-0.005980,0.005443,0.011355,1.0,0.50,1000,700
16,k1000_gk700_cv0_7_num2_prob0_1.csv,1.304176,1.299810,1.281993,0.003348,0.017009,0.013707,0.1,0.70,1000,700
17,k1000_gk700_cv0_7_num2_prob0_3.csv,1.295281,1.293514,1.277628,0.001364,0.013629,0.012282,0.3,0.70,1000,700
18,k1000_gk700_cv0_7_num2_prob0_5.csv,1.286389,1.288281,1.272014,-0.001471,0.011175,0.012627,0.5,0.70,1000,700


In [6]:
# Keep only the relative-difference columns plus k, then summarise them.
res_df = df.loc[:, ['hy_vec', 'hyw_vec', 'hyw_hy', 'k']]
res_df.describe()

Unnamed: 0,hy_vec,hyw_vec,hyw_hy,k
count,400.0,400.0,400.0,400.0
mean,-0.004009,0.000802,0.004809,331.75
std,0.008437,0.011483,0.005113,377.909018
min,-0.049124,-0.053134,-0.007718,5.0
25%,-0.005172,-0.001257,0.001874,50.0
50%,-0.000887,0.003284,0.004592,100.0
75%,0.000412,0.007841,0.008472,500.0
max,0.005849,0.017688,0.014006,1000.0


In [8]:
# Same summary, restricted to the k == 1000 runs.
res_df.loc[res_df['k'] == 1000].describe()

Unnamed: 0,hy_vec,hyw_vec,hyw_hy,k
count,80.0,80.0,80.0,80.0
mean,5e-06,0.008699,0.008691,1000.0
std,0.002316,0.004062,0.004062,0.0
min,-0.006197,0.003159,0.00258,1000.0
25%,-0.000824,0.004315,0.00546,1000.0
50%,0.000627,0.008682,0.009424,1000.0
75%,0.001314,0.011686,0.012399,1000.0
max,0.003735,0.017688,0.014006,1000.0
