# RQ1: APFD per approach on the clean result files (paths without 'missing' or 'noise')

In [59]:
import pandas as pd
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 200)

def get_clean_path(path_dir_compile):
    """Collect the "clean" result CSV files found under a directory tree.

    A file is kept when its joined path ends with ``.csv`` and contains
    neither ``missing`` nor ``noise``.  The substring test is applied to the
    whole joined path, not only to the file name, so a directory whose name
    contains one of those words excludes everything beneath it.

    Args:
        path_dir_compile: Root directory to walk recursively.

    Returns:
        A list of matching paths in ``os.walk`` order; an empty list when
        ``path_dir_compile`` is not an existing directory.
    """
    collected = []
    if not os.path.isdir(path_dir_compile):
        return collected

    for folder, _subdirs, names in os.walk(path_dir_compile, topdown=True):
        for name in names:
            candidate = os.path.join(folder, name)
            if not candidate.endswith('.csv'):
                continue
            # Reject perturbed-data results; the match is on the full path.
            if any(marker in candidate for marker in ('missing', 'noise')):
                continue
            collected.append(candidate)
    return collected


def read_all_csv(path_list):
    """Read every CSV in ``path_list`` into one frame tagged with data/model.

    The tags come from the file name, split on ``_``: field 0 becomes the
    ``model`` column and field 2 becomes the ``data`` column.  The extension
    is not stripped, so the file name needs at least four ``_``-separated
    fields for field 2 to be a bare dataset name.

    Args:
        path_list: Paths of the CSV files to load.

    Returns:
        A single DataFrame with the rows of all files concatenated in
        ``path_list`` order, a fresh 0..n-1 index, and the extra ``data`` and
        ``model`` columns.

    Raises:
        ValueError: If ``path_list`` is empty.
        IndexError: If a file name has fewer than three ``_``-separated
            fields.
    """
    if not path_list:
        # pd.concat would also raise ValueError here, but without saying why.
        raise ValueError('read_all_csv: path_list is empty, no CSV files to read')

    df_list = []
    for path in path_list:
        tmp_df = pd.read_csv(path)
        # os.path.basename honours the platform separator; the paths are built
        # with os.path.join, so splitting on a literal '/' breaks on Windows.
        name_fields = os.path.basename(path).split('_')
        tmp_df['data'] = name_fields[2]
        tmp_df['model'] = name_fields[0]
        df_list.append(tmp_df)
    return pd.concat(df_list, ignore_index=True)
    
# Datasets, in the order their column groups appear in the output table.
data_list = ['adult', 'bank', 'stroke']
# Values matched against the 'Approach' column; one output row per entry.
approach_list = ['random_apfd', 'deepGini_apfd', 'entropy_apfd', 'pcs_apfd', 'vanillasoftmax_apfd',
                 'dt', 'knn', 'lr', 'nb', 'xgb']

# Values matched against the 'model' column; one output column per entry
# inside each dataset group.
model_list = ['dtree', 'knn', 'lr', 'nb', 'xgb']

path_list = get_clean_path('./result/')
df = read_all_csv(path_list)

# Build one row per approach; each row walks datasets (outer) then models
# (inner), which is the column layout used for the DataFrame below.
all_list = []
for approach in approach_list:
    res_list = []
    for data in data_list:
        for model in model_list:
            selected = df[(df['Approach'] == approach) & (df['data'] == data) & (df['model'] == model)]['apfd']
            if selected.empty:
                # Same exception type as indexing an empty array, but it names
                # the combination that has no result row.
                raise IndexError(
                    'no apfd row for approach={!r}, data={!r}, model={!r}'.format(approach, data, model)
                )
            # Only the first matching row is used; values are stored as
            # strings rounded to three decimals.
            values = str(round(selected.values[0], 3))
            res_list.append(values)
    all_list.append(res_list)

# The model names repeat once per dataset, so tie the repeat count to
# data_list instead of a literal 3.
df = pd.DataFrame(all_list, columns=model_list * len(data_list), index=approach_list)
# to_excel does not create missing parent directories.
os.makedirs('./tables', exist_ok=True)
df.to_excel('./tables/apfd_clean.xlsx', index=True)
df



Unnamed: 0,dtree,knn,lr,nb,xgb,dtree.1,knn.1,lr.1,nb.1,xgb.1,dtree.2,knn.2,lr.2,nb.2,xgb.2
random_apfd,0.508,0.506,0.493,0.505,0.5,0.502,0.494,0.504,0.49,0.494,0.519,0.505,0.502,0.497,0.499
deepGini_apfd,0.739,0.738,0.688,0.71,0.793,0.77,0.704,0.769,0.694,0.837,0.768,0.604,0.593,0.615,0.758
entropy_apfd,0.739,0.738,0.688,0.71,0.793,0.77,0.704,0.769,0.694,0.837,0.768,0.604,0.593,0.615,0.758
pcs_apfd,0.739,0.738,0.688,0.71,0.793,0.77,0.704,0.769,0.694,0.837,0.768,0.604,0.593,0.615,0.758
vanillasoftmax_apfd,0.739,0.738,0.688,0.71,0.793,0.77,0.704,0.769,0.694,0.837,0.768,0.604,0.593,0.615,0.758
dt,0.706,0.715,0.786,0.787,0.742,0.747,0.79,0.801,0.817,0.779,0.839,0.753,0.837,0.832,0.889
knn,0.787,0.775,0.737,0.739,0.729,0.823,0.775,0.784,0.782,0.792,0.765,0.626,0.589,0.604,0.669
lr,0.74,0.743,0.722,0.672,0.688,0.786,0.769,0.757,0.771,0.751,0.898,0.621,0.608,0.608,0.703
nb,0.787,0.775,0.737,0.739,0.729,0.823,0.775,0.784,0.782,0.792,0.765,0.626,0.589,0.604,0.669
xgb,0.81,0.811,0.829,0.83,0.813,0.863,0.872,0.878,0.877,0.868,0.99,0.787,0.845,0.839,0.9


# RQ2: mean APFD per approach on the 'mixture' result files with data_type 'noise'

In [91]:
import pandas as pd
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 200)

def get_mixture_path(path_dir_compile):
    """Collect the "mixture" result CSV files found under a directory tree.

    A file is kept when its joined path ends with ``.csv`` and contains the
    substring ``mixture``.  The substring test is applied to the whole joined
    path, so a CSV inside a directory whose name contains ``mixture`` also
    matches.

    Args:
        path_dir_compile: Root directory to walk recursively.

    Returns:
        A list of matching paths in ``os.walk`` order; an empty list when
        ``path_dir_compile`` is not an existing directory.
    """
    mixture_csvs = []
    if not os.path.isdir(path_dir_compile):
        return mixture_csvs

    for folder, _subdirs, names in os.walk(path_dir_compile, topdown=True):
        joined = (os.path.join(folder, name) for name in names)
        mixture_csvs.extend(
            full for full in joined
            if 'mixture' in full and full.endswith('.csv')
        )
    return mixture_csvs


def read_all_csv(path_list):
    """Read every CSV in ``path_list`` into one frame tagged from its name.

    The tags come from the file name, split on ``_``: field 0 becomes the
    ``model`` column, field 1 the ``data_type`` column and field 2 the
    ``data`` column.  The extension is not stripped, so the file name needs
    at least four ``_``-separated fields for field 2 to be a bare dataset
    name.

    Args:
        path_list: Paths of the CSV files to load.

    Returns:
        A single DataFrame with the rows of all files concatenated in
        ``path_list`` order, a fresh 0..n-1 index, and the extra ``data``,
        ``data_type`` and ``model`` columns.

    Raises:
        ValueError: If ``path_list`` is empty.
        IndexError: If a file name has fewer than three ``_``-separated
            fields.
    """
    if not path_list:
        # pd.concat would also raise ValueError here, but without saying why.
        raise ValueError('read_all_csv: path_list is empty, no CSV files to read')

    df_list = []
    for path in path_list:
        tmp_df = pd.read_csv(path)
        # os.path.basename honours the platform separator; the paths are built
        # with os.path.join, so splitting on a literal '/' breaks on Windows.
        name_fields = os.path.basename(path).split('_')
        model_name = name_fields[0]
        data_type = name_fields[1]
        data_name = name_fields[2]
        tmp_df['data'] = data_name
        tmp_df['data_type'] = data_type
        tmp_df['model'] = model_name
        df_list.append(tmp_df)
    return pd.concat(df_list, ignore_index=True)
    
# Datasets, in the order their column groups appear in the output table.
data_list = ['adult', 'bank', 'stroke']
# Values matched against the 'Approach' column; one output row per entry.
approach_list = ['random_apfd', 'deepGini_apfd', 'entropy_apfd', 'pcs_apfd', 'vanillasoftmax_apfd',
                 'dt', 'knn', 'lr', 'nb', 'xgb']

# Values matched against the 'model' column; one output column per entry
# inside each dataset group.
model_list = ['dtree', 'knn', 'lr', 'nb', 'xgb']

path_list = get_mixture_path('./result/')

df = read_all_csv(path_list)

# Keep only the rows whose file-name data_type field is 'noise'.
df_noise = df[df['data_type'] == 'noise']

# Build one row per approach; each row walks datasets (outer) then models
# (inner), which is the column layout used for the DataFrame below.
all_list = []
for approach in approach_list:
    res_list = []
    for data in data_list:
        for model in model_list:
            tmp_df = df_noise[(df_noise['Approach'] == approach) & (df_noise['data'] == data) & (df_noise['model'] == model)]

            # Average over every matching row; an empty selection gives NaN,
            # which ends up in the table as the string 'nan'.
            values = tmp_df['apfd'].mean()
            values = str(round(values, 3))

            res_list.append(values)
    all_list.append(res_list)

# The model names repeat once per dataset, so tie the repeat count to
# data_list instead of a literal 3.
df_noise = pd.DataFrame(all_list, columns=model_list * len(data_list), index=approach_list)
# to_excel does not create missing parent directories.
os.makedirs('./tables', exist_ok=True)
df_noise.to_excel('./tables/apfd_mixture_noise.xlsx', index=True)
df_noise


Unnamed: 0,dtree,knn,lr,nb,xgb,dtree.1,knn.1,lr.1,nb.1,xgb.1,dtree.2,knn.2,lr.2,nb.2,xgb.2
random_apfd,0.499,0.499,0.5,0.5,0.501,0.498,0.502,0.497,0.502,0.5,0.509,0.5,0.499,0.501,0.501
deepGini_apfd,0.46,0.701,0.682,0.711,0.744,0.74,0.707,0.766,0.676,0.826,0.702,0.602,0.593,0.611,0.755
entropy_apfd,0.46,0.701,0.682,0.711,0.744,0.74,0.707,0.766,0.676,0.826,0.702,0.602,0.593,0.611,0.755
pcs_apfd,0.46,0.701,0.682,0.711,0.744,0.74,0.707,0.766,0.676,0.826,0.702,0.602,0.593,0.611,0.755
vanillasoftmax_apfd,0.46,0.701,0.682,0.711,0.744,0.74,0.707,0.766,0.676,0.826,0.702,0.602,0.593,0.611,0.755
dt,0.771,0.74,0.776,0.781,0.773,0.812,0.791,0.801,0.815,0.78,0.848,0.784,0.836,0.831,0.887
knn,0.741,0.746,0.718,0.727,0.736,0.813,0.783,0.778,0.783,0.79,0.727,0.623,0.592,0.593,0.662
lr,0.68,0.714,0.678,0.677,0.688,0.813,0.745,0.76,0.766,0.759,0.83,0.626,0.607,0.605,0.69
nb,0.741,0.746,0.718,0.727,0.736,0.813,0.783,0.778,0.783,0.79,0.727,0.623,0.592,0.593,0.662
xgb,0.83,0.81,0.825,0.827,0.829,0.867,0.872,0.875,0.875,0.868,0.982,0.825,0.845,0.838,0.898
