# Testing on original data

In [318]:
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 200)

dic_approach = {'lgb': 'LGMP', 
               'random_apfd': 'Random', 
               'deepGini_apfd': 'DeepGini', 
               'leastConfidence_apfd': 'LC', 
               'margin_apfd': 'Margin', 
               'rf': 'RFMP', 
               'xgb': 'XGMP', 
               'lr': 'LRMP'}
dic_other = {'lr': 'LR', 
             'xgboost': 'XGBoost', 
             'lgb': 'LightGBM', 
             'rf': 'RF', 
             'bank': 'Bank', 
             'heart': 'Heart', 
             'stroke': 'Stroke', 
             'adult': 'Adult'
            }

def get_path(path_dir_compile):
    path_list = []
    if os.path.isdir(path_dir_compile):
        for root, dirs, files in os.walk(path_dir_compile, topdown=True):
            for file in files:
                file_absolute_path = os.path.join(root, file)
                if file_absolute_path.endswith('.csv'):
                    path_list.append(file_absolute_path)
    return path_list

path_list = get_path('result/original_res/')
compare_list = [i for i in path_list if 'compare' in i]
compare_model = [i.split('/')[-1].split('_')[0] for i in compare_list]
compare_data = [i.split('/')[-1].split('_')[1] for i in compare_list]

model_list = [i for i in path_list if 'compare' not in i]
model_name = [i.split('/')[-1].split('_')[0] for i in model_list]
model_data = [i.split('/')[-1].split('_')[1] for i in model_list]

data_list = []
for i in range(len(compare_list)):
    tmp_df = pd.read_csv(compare_list[i])
    tmp_df['model'] = compare_model[i]
    tmp_df['data'] = compare_data[i]
    data_list.append(tmp_df)
df_compare = pd.concat(data_list, ignore_index=True)

data_list = []
for i in range(len(model_list)):
    tmp_df = pd.read_csv(model_list[i])
    tmp_df['model'] = model_name[i]
    tmp_df['data'] = model_data[i]
    del tmp_df['mutation_feature_apfd']
    del tmp_df['mutation_model_apfd']
    del tmp_df['fusion_2_feature_apfd']
    tmp_df = tmp_df.rename(columns={"fusion_3_feature_apfd": "apfd"})
    data_list.append(tmp_df)
df_model = pd.concat(data_list, ignore_index=True)

df = pd.concat([df_compare, df_model], ignore_index=True)
df['Approach'] = [dic_approach[i] for i in df['Approach']]
df['model'] = [dic_other[i] for i in df['model']]
df['data'] = [dic_other[i] for i in df['data']]
df['apfd'] = [str(i)[:6] for i in df['apfd']]

approach_list = ['Random', 'Margin', 'LC', 'DeepGini', 'LRMP',  'RFMP', 'XGMP', 'LGMP',]
data_list = ['Adult', 'Bank', 'Heart', 'Stroke']
model_list = ['LR', 'RF', 'XGBoost', 'LightGBM']

res_list = []
for approach in approach_list:
    tmp_res = [approach]
    for data in data_list:
        for model in model_list:
            apfd = df[(df['Approach']==approach)&(df['data']==data)&(df['model']==model)]['apfd'].values[0]
            tmp_res.append(apfd)
    res_list.append(tmp_res)
df_re = pd.DataFrame(data=res_list, columns=['Approach']+model_list*4)
df_re.to_excel('result/table//RQ1-1.xlsx', index=False)
df_re

Unnamed: 0,Approach,LR,RF,XGBoost,LightGBM,LR.1,RF.1,XGBoost.1,LightGBM.1,LR.2,RF.2,XGBoost.2,LightGBM.2,LR.3,RF.3,XGBoost.3,LightGBM.3
0,Random,0.4975,0.4924,0.4964,0.4992,0.4995,0.5051,0.4881,0.4994,0.4971,0.4993,0.5061,0.4977,0.4978,0.5074,0.4865,0.4959
1,Margin,0.7366,0.6639,0.7873,0.8048,0.7987,0.7367,0.8262,0.8471,0.7667,0.8259,0.9082,0.8494,0.5959,0.8573,0.7579,0.6942
2,LC,0.7366,0.6639,0.7873,0.8048,0.7987,0.7367,0.8262,0.8471,0.7639,0.8259,0.9069,0.8561,0.5959,0.8573,0.7579,0.6942
3,DeepGini,0.7366,0.6699,0.7875,0.8047,0.7987,0.732,0.8255,0.8467,0.7608,0.8197,0.9066,0.8566,0.5959,0.8661,0.7577,0.6948
4,LRMP,0.7342,0.7749,0.7451,0.7754,0.8127,0.7131,0.8059,0.815,0.803,0.824,0.7558,0.7627,0.6132,0.9774,0.7171,0.683
5,RFMP,0.8053,0.7377,0.7947,0.809,0.8768,0.7448,0.8755,0.881,0.9264,0.8664,0.9507,0.9613,0.8437,0.9842,0.8997,0.9078
6,XGMP,0.8244,0.7775,0.8161,0.8232,0.8785,0.7526,0.8667,0.8821,0.9228,0.8972,0.9303,0.9474,0.8443,0.978,0.8992,0.9078
7,LGMP,0.8258,0.7943,0.8188,0.8245,0.8783,0.7215,0.8682,0.8828,0.9192,0.8669,0.9328,0.945,0.8412,0.9771,0.8975,0.907


In [319]:
res_list = []
res_np = df_re.to_numpy()[:, 1:].astype(np.float64)
improvement = (res_np[4]-res_np[3])/res_np[3]
improvement = ['LRMP']+[str(i*100)[:5] for i in improvement]
res_list.append(improvement)
improvement =(res_np[5]-res_np[3])/res_np[3]
improvement = ['RFMP']+[str(i*100)[:5] for i in improvement]
res_list.append(improvement)
improvement =(res_np[6]-res_np[3])/res_np[3]
improvement = ['XGMP']+[str(i*100)[:5] for i in improvement]
res_list.append(improvement)
improvement =(res_np[7]-res_np[3])/res_np[3]
improvement =['LGMP']+[str(i*100)[:5] for i in improvement]
res_list.append(improvement)
df_re = pd.DataFrame(data=res_list, columns=['Approach']+model_list*4)
df_re.to_excel('result/table/RQ1-2.xlsx', index=False)
df_re

Unnamed: 0,Approach,LR,RF,XGBoost,LightGBM,LR.1,RF.1,XGBoost.1,LightGBM.1,LR.2,RF.2,XGBoost.2,LightGBM.2,LR.3,RF.3,XGBoost.3,LightGBM.3
0,LRMP,-0.32,15.67,-5.38,-3.64,1.752,-2.58,-2.37,-3.74,5.546,0.524,-16.6,-10.9,2.903,12.85,-5.35,-1.69
1,RFMP,9.326,10.12,0.914,0.534,9.778,1.748,6.056,4.051,21.76,5.697,4.864,12.22,41.58,13.63,18.74,30.65
2,XGMP,11.91,16.06,3.631,2.298,9.991,2.814,4.99,4.18,21.29,9.454,2.614,10.6,41.68,12.91,18.67,30.65
3,LGMP,12.1,18.56,3.974,2.46,9.966,-1.43,5.172,4.263,20.82,5.758,2.889,10.31,41.16,12.81,18.45,30.54


# Testing on fairness data

In [25]:
import pandas as pd
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 200)

dic_approach = {'lgb': 'LGMP', 
               'random_apfd': 'Random', 
               'deepGini_apfd': 'DeepGini', 
               'leastConfidence_apfd': 'LC', 
               'margin_apfd': 'Margin', 
               'rf': 'RFMP', 
               'xgb': 'XGMP', 
               'lr': 'LRMP', 'age': 'Age', 'gender': 'Gender', 'race': 'Race'}
dic_other = {'lr': 'LR', 
             'xgboost': 'XGBoost', 
             'lgb': 'LightGBM', 
             'rf': 'RF', 
             'bank': 'Bank', 
             'heart': 'Heart', 
             'stroke': 'Stroke', 
             'adult': 'Adult', 'age': 'Age', 'gender': 'Gender', 'race': 'Race'
            }

def get_path(path_dir_compile):
    path_list = []
    if os.path.isdir(path_dir_compile):
        for root, dirs, files in os.walk(path_dir_compile, topdown=True):
            for file in files:
                file_absolute_path = os.path.join(root, file)
                if file_absolute_path.endswith('.csv'):
                    path_list.append(file_absolute_path)
    return path_list

path_list = get_path('result/fairness_res/')
compare_list = [i for i in path_list if 'compare' in i]
compare_model = [i.split('/')[-1].split('_')[0] for i in compare_list]
compare_data = [i.split('/')[-1].split('_')[2] for i in compare_list]

model_list = [i for i in path_list if 'compare' not in i]
model_name = [i.split('/')[-1].split('_')[0] for i in model_list]
model_data = [i.split('/')[-1].split('_')[2] for i in model_list]

data_list = []
for i in range(len(compare_list)):
    tmp_df = pd.read_csv(compare_list[i])
    tmp_df['model'] = compare_model[i]
    tmp_df['data'] = compare_data[i]
    data_list.append(tmp_df)
df_compare = pd.concat(data_list, ignore_index=True)

data_list = []
for i in range(len(model_list)):
    tmp_df = pd.read_csv(model_list[i])
    tmp_df['model'] = model_name[i]
    tmp_df['data'] = model_data[i]
    del tmp_df['mutation_feature_apfd']
    del tmp_df['mutation_model_apfd']
    del tmp_df['fusion_2_feature_apfd']
    tmp_df = tmp_df.rename(columns={"fusion_3_feature_apfd": "apfd"})
    data_list.append(tmp_df)
df_model = pd.concat(data_list, ignore_index=True)

df = pd.concat([df_compare, df_model], ignore_index=True)
df['Approach'] = [dic_approach[i] for i in df['Approach']]
df['model'] = [dic_other[i] for i in df['model']]
df['data'] = [dic_other[i] for i in df['data']]
df['apfd'] = [str(i)[:6] for i in df['apfd']]

approach_list = ['Random', 'Margin', 'LC', 'DeepGini', 'LRMP',  'RFMP', 'XGMP', 'LGMP',]
data_list = ['Age', 'Gender', 'Race']
model_list = ['LR', 'RF', 'XGBoost', 'LightGBM']

res_list = []
for approach in approach_list:
    tmp_res = [approach]
    for data in data_list:
        for model in model_list:
            apfd = df[(df['Approach']==approach)&(df['data']==data)&(df['model']==model)]['apfd'].values[0]
            tmp_res.append(apfd)
    res_list.append(tmp_res)
df_re = pd.DataFrame(data=res_list, columns=['Approach']+model_list*3)

df_re.to_excel('result/table/RQ2-1.xlsx', index=False)
df_re

Unnamed: 0,Approach,LR,RF,XGBoost,LightGBM,LR.1,RF.1,XGBoost.1,LightGBM.1,LR.2,RF.2,XGBoost.2,LightGBM.2
0,Random,0.5043,0.5005,0.4973,0.4973,0.5003,0.5077,0.5087,0.5007,0.4959,0.5028,0.5,0.5039
1,Margin,0.7981,0.6632,0.8227,0.8323,0.7136,0.6688,0.7869,0.8048,0.7338,0.6593,0.7872,0.8048
2,LC,0.7981,0.6632,0.8227,0.8323,0.7136,0.6688,0.7869,0.8048,0.7338,0.6593,0.7872,0.8048
3,DeepGini,0.7981,0.6647,0.8223,0.8324,0.7136,0.6711,0.7873,0.8047,0.7338,0.6544,0.7874,0.8047
4,LRMP,0.8133,0.8302,0.8211,0.8143,0.7515,0.741,0.7418,0.7754,0.7404,0.7587,0.7418,0.7752
5,RFMP,0.8735,0.8537,0.8773,0.8805,0.8103,0.758,0.7953,0.8093,0.8073,0.7864,0.7955,0.8082
6,XGMP,0.8794,0.8464,0.8726,0.8828,0.8266,0.7882,0.8162,0.8232,0.827,0.801,0.8148,0.823
7,LGMP,0.8793,0.8522,0.8716,0.8821,0.8286,0.7911,0.8189,0.8245,0.8294,0.81,0.8189,0.8246


# Testing on missing data

In [5]:
import pandas as pd
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 200)

dic_approach = {'lgb': 'LGMP', 
               'random_apfd': 'Random', 
               'deepGini_apfd': 'DeepGini', 
               'leastConfidence_apfd': 'LC', 
               'margin_apfd': 'Margin', 
               'rf': 'RFMP', 
               'xgb': 'XGMP', 
               'lr': 'LRMP', 'age': 'Age', 'gender': 'Gender', 'race': 'Race'}
dic_other = {'lr': 'LR', 
             'xgboost': 'XGBoost', 
             'lgb': 'LightGBM', 
             'rf': 'RF', 
             'bank': 'Bank', 
             'heart': 'Heart', 
             'stroke': 'Stroke', 
             'adult': 'Adult', 'age': 'Age', 'gender': 'Gender', 'race': 'Race'
            }

def get_path(path_dir_compile):
    path_list = []
    if os.path.isdir(path_dir_compile):
        for root, dirs, files in os.walk(path_dir_compile, topdown=True):
            for file in files:
                file_absolute_path = os.path.join(root, file)
                if file_absolute_path.endswith('.csv'):
                    path_list.append(file_absolute_path)
    return path_list

path_list = get_path('result/missing/')
compare_list = [i for i in path_list if 'compare' in i]
compare_model = [i.split('/')[-1].split('_')[0] for i in compare_list]
compare_data = [i.split('/')[-1].split('_')[1]+'_'+i.split('/')[-1].split('_')[2]+'_'+i.split('/')[-1].split('_')[3] for i in compare_list]

model_list = [i for i in path_list if 'compare' not in i]
model_name = [i.split('/')[-1].split('_')[0] for i in model_list]
model_data = [i.split('/')[-1].split('_')[1]+'_'+i.split('/')[-1].split('_')[2]+'_'+i.split('/')[-1].split('_')[3] for i in model_list]

data_list = []
for i in range(len(compare_list)):
    tmp_df = pd.read_csv(compare_list[i])
    tmp_df['model'] = compare_model[i]
    tmp_df['data'] = compare_data[i]
    data_list.append(tmp_df)
df_compare = pd.concat(data_list, ignore_index=True)


data_list = []
for i in range(len(model_list)):
    tmp_df = pd.read_csv(model_list[i])
    tmp_df['model'] = model_name[i]
    tmp_df['data'] = model_data[i]
    del tmp_df['mutation_feature_apfd']
    del tmp_df['mutation_model_apfd']
    del tmp_df['fusion_2_feature_apfd']
    tmp_df = tmp_df.rename(columns={"fusion_3_feature_apfd": "apfd"})
    data_list.append(tmp_df)
df_model = pd.concat(data_list, ignore_index=True)

df = pd.concat([df_compare, df_model], ignore_index=True)
df['Approach'] = [dic_approach[i] for i in df['Approach']]
df['model'] = [dic_other[i] for i in df['model']]
df['apfd'] = [str(i)[:6] for i in df['apfd']]

df = df[df['Approach'] != 'Random']

df = df[df['apfd']>'0.62']

df['data_type'] = [i.split('_')[1] for i in df['data']]
df1 = df[df['data_type']=='1']
df2 = df[df['data_type']=='2']
df3 = df[df['data_type']=='3']
df4 = df[df['data_type']=='4']

In [6]:
# missing 1 col
dic = {'DeepGini': 0, 'LC': 0, 'LGMP': 0, 'LRMP': 0, 'Margin': 0, 'RFMP': 0, 'XGMP': 0}

for _, pdf in df1.groupby(['data', 'model']):
    tmp_dic = dict(zip(pdf['Approach'], pdf['apfd']))
    key = max(tmp_dic,key=tmp_dic.get)
    dic[key]+=1
print(dic)

df1['f_apfd'] = [float(i) for i in df1['apfd']]
df1_re = df1.groupby(['Approach']).mean().reset_index(drop=False)
df1_re.sort_values(by=['Approach'])

df_n = pd.DataFrame(columns=['Approach', 'bestcase'])
df_n['Approach'] = dic.keys()
df_n['bestcase'] = dic.values()
df1_re = df_n.merge(df1_re, left_on='Approach', right_on='Approach', how='left')
df1_re['f_apfd'] = [str(i)[:6] for i in df1_re['f_apfd']]
df1_re.to_excel('result/table/RQ3-1.xlsx', index=False)
df1_re

{'DeepGini': 0, 'LC': 0, 'LGMP': 61, 'LRMP': 0, 'Margin': 0, 'RFMP': 80, 'XGMP': 27}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['f_apfd'] = [float(i) for i in df1['apfd']]


Unnamed: 0,Approach,bestcase,f_apfd
0,DeepGini,0,0.7141
1,LC,0,0.7143
2,LGMP,61,0.815
3,LRMP,0,0.7341
4,Margin,0,0.7135
5,RFMP,80,0.8104
6,XGMP,27,0.8154


In [7]:
# missing 2 col
dic = {'DeepGini': 0, 'LC': 0, 'LGMP': 0, 'LRMP': 0, 'Margin': 0, 'RFMP': 0, 'XGMP': 0}

for _, pdf in df2.groupby(['data', 'model']):
    tmp_dic = dict(zip(pdf['Approach'], pdf['apfd']))
    key = max(tmp_dic,key=tmp_dic.get)
    dic[key]+=1
print(dic)

df2['f_apfd'] = [float(i) for i in df2['apfd']]
df2_re = df2.groupby(['Approach']).mean().reset_index(drop=False)
df2_re.sort_values(by=['Approach'])

df_n = pd.DataFrame(columns=['Approach', 'bestcase'])
df_n['Approach'] = dic.keys()
df_n['bestcase'] = dic.values()
df2_re = df_n.merge(df2_re, left_on='Approach', right_on='Approach', how='left')
df2_re['f_apfd'] = [str(i)[:6] for i in df2_re['f_apfd']]
df2_re.to_excel('result/table/RQ3-2.xlsx', index=False)
df2_re

{'DeepGini': 0, 'LC': 0, 'LGMP': 82, 'LRMP': 0, 'Margin': 0, 'RFMP': 119, 'XGMP': 33}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['f_apfd'] = [float(i) for i in df2['apfd']]


Unnamed: 0,Approach,bestcase,f_apfd
0,DeepGini,0,0.7058
1,LC,0,0.7063
2,LGMP,82,0.8089
3,LRMP,0,0.7128
4,Margin,0,0.7061
5,RFMP,119,0.8044
6,XGMP,33,0.8096


In [8]:
# missing 3 col
dic = {'DeepGini': 0, 'LC': 0, 'LGMP': 0, 'LRMP': 0, 'Margin': 0, 'RFMP': 0, 'XGMP': 0}

for _, pdf in df3.groupby(['data', 'model']):
    tmp_dic = dict(zip(pdf['Approach'], pdf['apfd']))
    key = max(tmp_dic,key=tmp_dic.get)
    dic[key]+=1
print(dic)

df3['f_apfd'] = [float(i) for i in df3['apfd']]
df3_re = df3.groupby(['Approach']).mean().reset_index(drop=False)
df3_re.sort_values(by=['Approach'])

df_n = pd.DataFrame(columns=['Approach', 'bestcase'])
df_n['Approach'] = dic.keys()
df_n['bestcase'] = dic.values()
df3_re = df_n.merge(df3_re, left_on='Approach', right_on='Approach', how='left')
df3_re['f_apfd'] = [str(i)[:6] for i in df3_re['f_apfd']]
df3_re.to_excel('result/table/RQ3-3.xlsx', index=False)
df3_re

{'DeepGini': 0, 'LC': 0, 'LGMP': 85, 'LRMP': 0, 'Margin': 0, 'RFMP': 86, 'XGMP': 58}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['f_apfd'] = [float(i) for i in df3['apfd']]


Unnamed: 0,Approach,bestcase,f_apfd
0,DeepGini,0,0.7136
1,LC,0,0.7154
2,LGMP,85,0.798
3,LRMP,0,0.7113
4,Margin,0,0.7152
5,RFMP,86,0.794
6,XGMP,58,0.7998


In [9]:
# missing 4 col
dic = {'DeepGini': 0, 'LC': 0, 'LGMP': 0, 'LRMP': 0, 'Margin': 0, 'RFMP': 0, 'XGMP': 0}

for _, pdf in df4.groupby(['data', 'model']):
    tmp_dic = dict(zip(pdf['Approach'], pdf['apfd']))
    key = max(tmp_dic,key=tmp_dic.get)
    dic[key]+=1
print(dic)

df4['f_apfd'] = [float(i) for i in df4['apfd']]
df4_re = df4.groupby(['Approach']).mean().reset_index(drop=False)
df4_re.sort_values(by=['Approach'])

df_n = pd.DataFrame(columns=['Approach', 'bestcase'])
df_n['Approach'] = dic.keys()
df_n['bestcase'] = dic.values()
df4_re = df_n.merge(df4_re, left_on='Approach', right_on='Approach', how='left')
df4_re['f_apfd'] = [str(i)[:6] for i in df4_re['f_apfd']]
df4_re.to_excel('result/table/RQ3-4.xlsx', index=False)
df4_re

{'DeepGini': 0, 'LC': 0, 'LGMP': 92, 'LRMP': 0, 'Margin': 0, 'RFMP': 88, 'XGMP': 50}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['f_apfd'] = [float(i) for i in df4['apfd']]


Unnamed: 0,Approach,bestcase,f_apfd
0,DeepGini,0,0.7274
1,LC,0,0.7298
2,LGMP,92,0.789
3,LRMP,0,0.7182
4,Margin,0,0.7296
5,RFMP,88,0.7855
6,XGMP,50,0.7912


# Testing on noise data

In [25]:
import pandas as pd
import os

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 200)

dic_approach = {'lgb': 'LGMP', 
               'random_apfd': 'Random', 
               'deepGini_apfd': 'DeepGini', 
               'leastConfidence_apfd': 'LC', 
               'margin_apfd': 'Margin', 
               'rf': 'RFMP', 
               'xgb': 'XGMP', 
               'lr': 'LRMP', 'age': 'Age', 'gender': 'Gender', 'race': 'Race'}
dic_other = {'lr': 'LR', 
             'xgboost': 'XGBoost', 
             'lgb': 'LightGBM', 
             'rf': 'RF', 
             'bank': 'Bank', 
             'heart': 'Heart', 
             'stroke': 'Stroke', 
             'adult': 'Adult', 'age': 'Age', 'gender': 'Gender', 'race': 'Race'
            }

def get_path(path_dir_compile):
    path_list = []
    if os.path.isdir(path_dir_compile):
        for root, dirs, files in os.walk(path_dir_compile, topdown=True):
            for file in files:
                file_absolute_path = os.path.join(root, file)
                if file_absolute_path.endswith('.csv'):
                    path_list.append(file_absolute_path)
    return path_list

path_list = get_path('result/noise/')
compare_list = [i for i in path_list if 'compare' in i]
compare_model = [i.split('/')[-1].split('_')[0] for i in compare_list]
compare_data = [i.split('/')[-1].split('_')[1]+'_'+i.split('/')[-1].split('_')[2]+'_'+i.split('/')[-1].split('_')[3] for i in compare_list]

model_list = [i for i in path_list if 'compare' not in i]
model_name = [i.split('/')[-1].split('_')[0] for i in model_list]
model_data = [i.split('/')[-1].split('_')[1]+'_'+i.split('/')[-1].split('_')[2]+'_'+i.split('/')[-1].split('_')[3] for i in model_list]

data_list = []
for i in range(len(compare_list)):
    tmp_df = pd.read_csv(compare_list[i])
    tmp_df['model'] = compare_model[i]
    tmp_df['data'] = compare_data[i]
    data_list.append(tmp_df)
df_compare = pd.concat(data_list, ignore_index=True)


data_list = []
for i in range(len(model_list)):
    tmp_df = pd.read_csv(model_list[i])
    tmp_df['model'] = model_name[i]
    tmp_df['data'] = model_data[i]
    del tmp_df['mutation_feature_apfd']
    del tmp_df['mutation_model_apfd']
    del tmp_df['fusion_2_feature_apfd']
    tmp_df = tmp_df.rename(columns={"fusion_3_feature_apfd": "apfd"})
    data_list.append(tmp_df)
df_model = pd.concat(data_list, ignore_index=True)

df = pd.concat([df_compare, df_model], ignore_index=True)
df['Approach'] = [dic_approach[i] for i in df['Approach']]
df['model'] = [dic_other[i] for i in df['model']]
df['apfd'] = [str(i)[:6] for i in df['apfd']]

df = df[df['Approach'] != 'Random']

df = df[df['apfd']>'0.55']

df['data_type'] = [i.split('_')[1] for i in df['data']]
df1 = df[df['data_type']=='1']
df2 = df[df['data_type']=='2']
df3 = df[df['data_type']=='3']
df4 = df[df['data_type']=='4']

In [26]:
# noise 1 col
dic = {'DeepGini': 0, 'LC': 0, 'LGMP': 0, 'LRMP': 0, 'Margin': 0, 'RFMP': 0, 'XGMP': 0}

for _, pdf in df1.groupby(['data', 'model']):
    tmp_dic = dict(zip(pdf['Approach'], pdf['apfd']))
    key = max(tmp_dic,key=tmp_dic.get)
    dic[key]+=1
print(dic)

df1['f_apfd'] = [float(i) for i in df1['apfd']]
df1_re = df1.groupby(['Approach']).mean().reset_index(drop=False)
df1_re.sort_values(by=['Approach'])

df_n = pd.DataFrame(columns=['Approach', 'bestcase'])
df_n['Approach'] = dic.keys()
df_n['bestcase'] = dic.values()
df1_re = df_n.merge(df1_re, left_on='Approach', right_on='Approach', how='left')
df1_re['f_apfd'] = [str(i)[:6] for i in df1_re['f_apfd']]
df1_re.to_excel('result/table/RQ4-1.xlsx', index=False)
df1_re

{'DeepGini': 0, 'LC': 0, 'LGMP': 31, 'LRMP': 0, 'Margin': 0, 'RFMP': 40, 'XGMP': 25}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['f_apfd'] = [float(i) for i in df1['apfd']]


Unnamed: 0,Approach,bestcase,f_apfd
0,DeepGini,0,0.7009
1,LC,0,0.7013
2,LGMP,31,0.8095
3,LRMP,0,0.7258
4,Margin,0,0.701
5,RFMP,40,0.808
6,XGMP,25,0.8115


In [27]:
# noise 2 col
dic = {'DeepGini': 0, 'LC': 0, 'LGMP': 0, 'LRMP': 0, 'Margin': 0, 'RFMP': 0, 'XGMP': 0}

for _, pdf in df2.groupby(['data', 'model']):
    tmp_dic = dict(zip(pdf['Approach'], pdf['apfd']))
    key = max(tmp_dic,key=tmp_dic.get)
    dic[key]+=1
print(dic)

df2['f_apfd'] = [float(i) for i in df2['apfd']]
df2_re = df2.groupby(['Approach']).mean().reset_index(drop=False)
df2_re.sort_values(by=['Approach'])

df_n = pd.DataFrame(columns=['Approach', 'bestcase'])
df_n['Approach'] = dic.keys()
df_n['bestcase'] = dic.values()
df2_re = df_n.merge(df2_re, left_on='Approach', right_on='Approach', how='left')
df2_re['f_apfd'] = [str(i)[:6] for i in df2_re['f_apfd']]
df2_re.to_excel('result/table/RQ4-2.xlsx', index=False)
df2_re

{'DeepGini': 0, 'LC': 0, 'LGMP': 31, 'LRMP': 0, 'Margin': 0, 'RFMP': 30, 'XGMP': 34}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['f_apfd'] = [float(i) for i in df2['apfd']]


Unnamed: 0,Approach,bestcase,f_apfd
0,DeepGini,0,0.6892
1,LC,0,0.6919
2,LGMP,31,0.8007
3,LRMP,0,0.7088
4,Margin,0,0.6917
5,RFMP,30,0.7986
6,XGMP,34,0.8039


In [28]:
# noise 3 col
dic = {'DeepGini': 0, 'LC': 0, 'LGMP': 0, 'LRMP': 0, 'Margin': 0, 'RFMP': 0, 'XGMP': 0}

for _, pdf in df3.groupby(['data', 'model']):
    tmp_dic = dict(zip(pdf['Approach'], pdf['apfd']))
    key = max(tmp_dic,key=tmp_dic.get)
    dic[key]+=1
print(dic)

df3['f_apfd'] = [float(i) for i in df3['apfd']]
df3_re = df3.groupby(['Approach']).mean().reset_index(drop=False)
df3_re.sort_values(by=['Approach'])

df_n = pd.DataFrame(columns=['Approach', 'bestcase'])
df_n['Approach'] = dic.keys()
df_n['bestcase'] = dic.values()
df3_re = df_n.merge(df3_re, left_on='Approach', right_on='Approach', how='left')
df3_re['f_apfd'] = [str(i)[:6] for i in df3_re['f_apfd']]
df3_re.to_excel('result/table/RQ4-3.xlsx', index=False)
df3_re

{'DeepGini': 0, 'LC': 0, 'LGMP': 43, 'LRMP': 0, 'Margin': 0, 'RFMP': 25, 'XGMP': 26}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['f_apfd'] = [float(i) for i in df3['apfd']]


Unnamed: 0,Approach,bestcase,f_apfd
0,DeepGini,0,0.6884
1,LC,0,0.6885
2,LGMP,43,0.7967
3,LRMP,0,0.7031
4,Margin,0,0.6883
5,RFMP,25,0.7937
6,XGMP,26,0.8025


In [29]:
# noise 4 col
dic = {'DeepGini': 0, 'LC': 0, 'LGMP': 0, 'LRMP': 0, 'Margin': 0, 'RFMP': 0, 'XGMP': 0}

for _, pdf in df4.groupby(['data', 'model']):
    tmp_dic = dict(zip(pdf['Approach'], pdf['apfd']))
    key = max(tmp_dic,key=tmp_dic.get)
    dic[key]+=1
print(dic)

df4['f_apfd'] = [float(i) for i in df4['apfd']]
df4_re = df4.groupby(['Approach']).mean().reset_index(drop=False)
df4_re.sort_values(by=['Approach'])

df_n = pd.DataFrame(columns=['Approach', 'bestcase'])
df_n['Approach'] = dic.keys()
df_n['bestcase'] = dic.values()
df4_re = df_n.merge(df4_re, left_on='Approach', right_on='Approach', how='left')
df4_re['f_apfd'] = [str(i)[:6] for i in df4_re['f_apfd']]
df4_re.to_excel('result/table/RQ4-4.xlsx', index=False)
df4_re

{'DeepGini': 0, 'LC': 0, 'LGMP': 44, 'LRMP': 0, 'Margin': 0, 'RFMP': 25, 'XGMP': 22}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['f_apfd'] = [float(i) for i in df4['apfd']]


Unnamed: 0,Approach,bestcase,f_apfd
0,DeepGini,0,0.7089
1,LC,0,0.7093
2,LGMP,44,0.7941
3,LRMP,0,0.6978
4,Margin,0,0.7091
5,RFMP,25,0.7863
6,XGMP,22,0.7963


# Feature importance analysis

In [85]:
def get_path(path_dir_compile):
    path_list = []
    if os.path.isdir(path_dir_compile):
        for root, dirs, files in os.walk(path_dir_compile, topdown=True):
            for file in files:
                file_absolute_path = os.path.join(root, file)
                if file_absolute_path.endswith('.csv'):
                    path_list.append(file_absolute_path)
    return path_list

path_list = get_path('result/original_res/')
path_list = [i for i in path_list if 'compare' not in i]
path_list = [i for i in path_list if 'adult' in i]
data_list = [pd.read_csv(i) for i in path_list]

df = pd.DataFrame(columns=['Approach'])
df['Approach'] = ['lr', 'rf', 'xgb', 'lgb']
df['mutation_model_apfd'] = 0
df['fusion_2_feature_apfd'] = 0
df['fusion_3_feature_apfd'] = 0
for i in range(len(data_list)):
    df['mutation_model_apfd'] = df['mutation_model_apfd']+data_list[i]['mutation_model_apfd'].astype(float)
    df['fusion_2_feature_apfd'] = df['fusion_2_feature_apfd']+data_list[i]['fusion_2_feature_apfd'].astype(float)
    df['fusion_3_feature_apfd'] = df['fusion_3_feature_apfd']+data_list[i]['fusion_3_feature_apfd'].astype(float)
df['mutation_model_apfd'] = df['mutation_model_apfd'] / len(data_list)
df['fusion_2_feature_apfd'] = df['fusion_2_feature_apfd'] / len(data_list)
df['fusion_3_feature_apfd'] = df['fusion_3_feature_apfd'] / len(data_list)
df

Unnamed: 0,Approach,mutation_model_apfd,fusion_2_feature_apfd,fusion_3_feature_apfd
0,lr,0.636879,0.68164,0.757448
1,rf,0.628414,0.658665,0.786708
2,xgb,0.630705,0.678002,0.810332
3,lgb,0.634101,0.68376,0.815891


In [86]:
path_list = get_path('result/original_res/')
path_list = [i for i in path_list if 'compare' not in i]
path_list = [i for i in path_list if 'bank' in i]
data_list = [pd.read_csv(i) for i in path_list]

df = pd.DataFrame(columns=['Approach'])
df['Approach'] = ['lr', 'rf', 'xgb', 'lgb']
df['mutation_model_apfd'] = 0
df['fusion_2_feature_apfd'] = 0
df['fusion_3_feature_apfd'] = 0
for i in range(len(data_list)):
    df['mutation_model_apfd'] = df['mutation_model_apfd']+data_list[i]['mutation_model_apfd'].astype(float)
    df['fusion_2_feature_apfd'] = df['fusion_2_feature_apfd']+data_list[i]['fusion_2_feature_apfd'].astype(float)
    df['fusion_3_feature_apfd'] = df['fusion_3_feature_apfd']+data_list[i]['fusion_3_feature_apfd'].astype(float)
df['mutation_model_apfd'] = df['mutation_model_apfd'] / len(data_list)
df['fusion_2_feature_apfd'] = df['fusion_2_feature_apfd'] / len(data_list)
df['fusion_3_feature_apfd'] = df['fusion_3_feature_apfd'] / len(data_list)
df

Unnamed: 0,Approach,mutation_model_apfd,fusion_2_feature_apfd,fusion_3_feature_apfd
0,lr,0.642666,0.67392,0.786727
1,rf,0.640537,0.660799,0.844585
2,xgb,0.653147,0.683356,0.845011
3,lgb,0.648082,0.666844,0.837749


In [87]:
path_list = get_path('result/original_res/')
path_list = [i for i in path_list if 'compare' not in i]
path_list = [i for i in path_list if 'heart' in i]
data_list = [pd.read_csv(i) for i in path_list]

df = pd.DataFrame(columns=['Approach'])
df['Approach'] = ['lr', 'rf', 'xgb', 'lgb']
df['mutation_model_apfd'] = 0
df['fusion_2_feature_apfd'] = 0
df['fusion_3_feature_apfd'] = 0
for i in range(len(data_list)):
    df['mutation_model_apfd'] = df['mutation_model_apfd']+data_list[i]['mutation_model_apfd'].astype(float)
    df['fusion_2_feature_apfd'] = df['fusion_2_feature_apfd']+data_list[i]['fusion_2_feature_apfd'].astype(float)
    df['fusion_3_feature_apfd'] = df['fusion_3_feature_apfd']+data_list[i]['fusion_3_feature_apfd'].astype(float)
df['mutation_model_apfd'] = df['mutation_model_apfd'] / len(data_list)
df['fusion_2_feature_apfd'] = df['fusion_2_feature_apfd'] / len(data_list)
df['fusion_3_feature_apfd'] = df['fusion_3_feature_apfd'] / len(data_list)
df

Unnamed: 0,Approach,mutation_model_apfd,fusion_2_feature_apfd,fusion_3_feature_apfd
0,lr,0.678729,0.724417,0.786439
1,rf,0.696694,0.747915,0.926271
2,xgb,0.696039,0.752517,0.92446
3,lgb,0.696807,0.748837,0.916044


In [88]:
path_list = get_path('result/original_res/')
path_list = [i for i in path_list if 'compare' not in i]
path_list = [i for i in path_list if 'stroke' in i]
data_list = [pd.read_csv(i) for i in path_list]

df = pd.DataFrame(columns=['Approach'])
df['Approach'] = ['lr', 'rf', 'xgb', 'lgb']
df['mutation_model_apfd'] = 0
df['fusion_2_feature_apfd'] = 0
df['fusion_3_feature_apfd'] = 0
for i in range(len(data_list)):
    df['mutation_model_apfd'] = df['mutation_model_apfd']+data_list[i]['mutation_model_apfd'].astype(float)
    df['fusion_2_feature_apfd'] = df['fusion_2_feature_apfd']+data_list[i]['fusion_2_feature_apfd'].astype(float)
    df['fusion_3_feature_apfd'] = df['fusion_3_feature_apfd']+data_list[i]['fusion_3_feature_apfd'].astype(float)
df['mutation_model_apfd'] = df['mutation_model_apfd'] / len(data_list)
df['fusion_2_feature_apfd'] = df['fusion_2_feature_apfd'] / len(data_list)
df['fusion_3_feature_apfd'] = df['fusion_3_feature_apfd'] / len(data_list)
df

Unnamed: 0,Approach,mutation_model_apfd,fusion_2_feature_apfd,fusion_3_feature_apfd
0,lr,0.674679,0.73651,0.747743
1,rf,0.686697,0.763495,0.908897
2,xgb,0.68961,0.762601,0.90739
3,lgb,0.685081,0.761327,0.905748


In [90]:
deepgini_adult = sum([0.7366, 0.6699, 0.7875, 0.8047]) / 4
deepgini_bank = sum([0.7987, 0.7320, 0.8255, 0.8467]) / 4
deepgini_heart = sum([0.7608, 0.8197, 0.9066, 0.8566]) / 4
deepgini_stroke = sum([0.5959, 0.8661, 0.7577, 0.6948]) / 4
print(deepgini_adult)
print(deepgini_bank)
print(deepgini_heart)
print(deepgini_stroke)

0.749675
0.8007249999999999
0.835925
0.728625


In [101]:
random_adult = sum([0.4992,0.4975,0.4924,0.4964]) / 4
random_bank = sum([0.4994,0.4995,0.50513,0.4881]) / 4
random_heart = sum([0.4977880733944954,0.49719365753092404,0.49935658174097663,0.5061374698067632]) / 4
random_stroke = sum([0.4959, 0.4978,0.5074,0.4865]) / 4
print(random_adult)
print(random_bank)
print(random_heart)
print(random_stroke)

0.49637499999999996
0.4980325
0.5001189456182898
0.4969


In [107]:
import numpy as np
np.mean([0.7496,0.8007,0.8359,0.7286])

0.7787000000000001