In [None]:
import numpy as np 
import pandas as pd
import os
import glob
from os.path import join
import pdb
from collections import defaultdict

In [None]:
# Result directories, one per experiment run; the tables below report the
# mean and std over these runs.
# For a single run, list only that run's directory, e.g. ['./output1'].
OUTPUT_DIRS = [f'./output{run}' for run in range(1, 6)]

In [None]:
# The ROC-AUC value files are taken from the first run directory only.
OUTPUT_DIR = OUTPUT_DIRS[0]

# AV block
AVBLOCK_AUCS_FILE = join(OUTPUT_DIR, 'binary_avblock_rocaucs_values.csv')
AVBLOCK_MODAUCS_FILE = join(OUTPUT_DIR, 'binary_avblock_modrocaucs_values.csv')

# LBBB
LBBB_AUCS_FILE = join(OUTPUT_DIR, 'binary_lbbb_rocaucs_values.csv')
LBBB_MODAUCS_FILE = join(OUTPUT_DIR, 'binary_lbbb_modrocaucs_values.csv')

# RBBB
RBBB_AUCS_FILE = join(OUTPUT_DIR, 'binary_rbbb_rocaucs_values.csv')
RBBB_MODAUCS_FILE = join(OUTPUT_DIR, 'binary_rbbb_modrocaucs_values.csv')

In [None]:
# AV block: feature names treated as important / non-discriminating
# when the AV-block rankings are evaluated.
AVBLOCK_IMPORTANT_FEATURES = [
    'Int_PR_Interval',
]
AVBLOCK_NON_DISCRIMINATING_FEATURES = [
    'ILM_S_Amp_I',
    'ILM_S_Amp_V2',
]

In [None]:
# RBBB: feature names treated as important when the RBBB rankings are scored.
RBBB_IMPORTANT_FEATURES = [
    'OnOff_QRS_Duration',
    'ILM_R_Amp_V1', 'ILM_R__Amp_V1',
    'ILM_S_Amp_I', 'ILM_S_Amp_aVL', 'ILM_S_Amp_V1', 'ILM_S_Amp_V6',
]
# The important features followed by three additional ("W_CORR") names.
RBBB_IMPORTANT_FEATURES_W_CORR = [
    *RBBB_IMPORTANT_FEATURES,
    'ILM_R__Amp_V5', 'ILM_S_Amp_V5', 'ILM_P2P_Amp_V1',
]
RBBB_NON_DISCRIMINATING_FEATURES = [
    'ILM_R__Amp_I',
    'ILM_R__Amp_V6',
]

In [None]:
# LBBB: feature names treated as important when the LBBB rankings are scored.
LBBB_IMPORTANT_FEATURES = [
    'OnOff_QRS_Duration', 'ILM_Q_Amp_V1',
    'ILM_R_Amp_I', 'ILM_R_Amp_aVL', 'ILM_R_Amp_V5', 'ILM_R_Amp_V6',
    'ILM_R__Amp_I', 'ILM_R__Amp_aVL', 'ILM_R__Amp_V5', 'ILM_R__Amp_V6',
    'ILM_S_Amp_I', 'ILM_S_Amp_aVL', 'ILM_S_Amp_V5', 'ILM_S_Amp_V6',
]

# The important features plus the additional ("W_CORR") names, kept as a set.
# Each name is listed once; only membership matters for a set.
LBBB_IMPORTANT_FEATURES_W_CORR = set(LBBB_IMPORTANT_FEATURES) | {
    'ILM_Q_Amp_V4', 'ILM_Q_Amp_III', 'ILM_Q_Amp_aVF',
    'ILM_R_Amp_V4', 'ILM_R_Amp_III',
    'ILM_R__Amp_II', 'ILM_R__Amp_III', 'ILM_R__Amp_aVR', 'ILM_R__Amp_V1',
    'ILM_S_Amp_II', 'ILM_S_Amp_III', 'ILM_S_Amp_aVF', 'ILM_S_Amp_V3',
    'ILM_T__Amp_V1', 'ILM_T__Amp_aVR',
    'ILM_T_Morph_I', 'ILM_T_Morph_aVR', 'ILM_T_Morph_V6',
    'ILM_ST_Slope_I', 'ILM_ST_Slope_V1', 'ILM_ST_Slope_V6',
    'ILM_P2P_Amp_I', 'ILM_P2P_Amp_aVL', 'ILM_P2P_Amp_V1', 'ILM_P2P_Amp_V5', 'ILM_P2P_Amp_V6',
    'Int_ST_Duration', 'GM_QrsFrontalAxis',
}

# NOTE(review): both names below also appear in LBBB_IMPORTANT_FEATURES —
# confirm that listing them as non-discriminating is intended.
LBBB_NON_DISCRIMINATING_FEATURES = ['ILM_R_Amp_I', 'ILM_R__Amp_V6']

In [None]:
def get_top5_feats(file):
    """Return the first five entries of the 'Feature' column of a CSV file.

    Parameters
    ----------
    file : str
        Path of a CSV file that has a 'Feature' column.

    Returns
    -------
    array-like
        The first five values of the 'Feature' column, in file order
        (fewer when the file has fewer than five rows).

    Raises
    ------
    Exception
        Whatever reading the file or selecting the column raised (for
        example FileNotFoundError or KeyError). The offending path is
        printed before the error is re-raised.
    """
    try:
        return pd.read_csv(file)['Feature'].iloc[:5].values
    except Exception:
        # Name the file that failed, then propagate the real error rather
        # than returning None, which only broke later in the callers
        # (len(None) / list(None)) with an unrelated TypeError.
        print(file)
        raise
def top5score(file, important_features):
    """Rank-weighted score (0-100) of the important features among a file's top features.

    The features returned by ``get_top5_feats(file)`` get linearly decreasing
    weights (the first one the largest) that sum to 100. The weights of the
    features contained in ``important_features`` are summed, 0.5 is added and
    the result is truncated to an int.
    """
    top_features = get_top5_feats(file)
    count = len(top_features)
    # Weights count, count-1, ..., 1 divided by (their sum / 100).
    scale = count * (count + 1) / 2 / 100
    rank_weights = np.arange(1, count + 1)[::-1] / scale
    # 1.0 where the feature is one of the important ones, 0.0 otherwise.
    hits = np.array([float(name in important_features) for name in top_features])
    return int((rank_weights @ hits) + 0.5)

In [None]:
def table_v_fetch(file, important_features, non_discriminative_features):
    """Look up the 1-based positions of selected features in a ranking CSV.

    Parameters
    ----------
    file : str
        Path of a CSV file with a 'Feature' column; row order is the ranking.
    important_features, non_discriminative_features : iterable of str
        Feature names to locate in the 'Feature' column.

    Returns
    -------
    list
        ``[ranks_of_important, ranks_of_non_discriminative]``, each a list of
        positions (first row is 1) in the order the names were given.

    Raises
    ------
    IndexError
        If a requested name does not occur in the 'Feature' column.
    """
    ranked_names = pd.read_csv(file)['Feature'].values

    def rank_of(name):
        # Index of the first matching row, shifted so that ranks start at 1.
        return np.where(ranked_names == name)[0][0] + 1

    important_ranks = list(map(rank_of, important_features))
    nondiscriminative_ranks = list(map(rank_of, non_discriminative_features))
    return [important_ranks, nondiscriminative_ranks]

In [None]:
def feat_status(feature, important_feats, corr_feats):
    """Classify a feature as "Important", "Correlated" or "Unimportant".

    Membership in ``important_feats`` wins; otherwise the feature is
    "Correlated" when ``corr_feats`` is given (not None) and contains it.
    """
    if feature in important_feats:
        label = "Important"
    elif corr_feats is None or feature not in corr_feats:
        label = "Unimportant"
    else:
        label = "Correlated"
    return label

In [None]:
def get_top5s(files, files_key, important_feats, corr_feats, rocaucs, modrocaucs, n=5):
    """Count how often each feature is among the top features of the selected files.

    The files are selected with ``fetch_files(files, files_key)``. Every key of
    ``rocaucs`` starts at zero and is incremented once for each file whose
    ``get_top5_feats`` result contains it.

    ``important_feats``, ``corr_feats``, ``modrocaucs`` and ``n`` are accepted
    but not used by this function.

    Returns
    -------
    dict
        Feature name -> number of selected files listing it in their top features.
    """
    matching_files = fetch_files(files, files_key)
    counts = {feature: np.float64(0.0) for feature in rocaucs.keys()}
    for path in matching_files:
        for feature in list(get_top5_feats(path)):
            counts[feature] += 1
    return counts

In [None]:
def get_frequency_of_topn(top5_features_per_run, important_feats, corr_feats, rocaucs, modrocaucs, n=5):
    """Summarise per-run top-feature counts and report the ``n`` most frequent features.

    Parameters
    ----------
    top5_features_per_run : list of dict
        One dict per run mapping feature name -> count; the keys of the first
        dict define the features that are tracked.
    important_feats, corr_feats
        Passed to ``feat_status`` to label each reported feature.
    rocaucs, modrocaucs : dict
        Feature name -> value, looked up for each reported feature.
    n : int
        Number of features to report.

    Returns
    -------
    tuple
        ``(top_feats, counts, top5_rocaucs, top5_modrocaucs, top5_status)``
        where ``counts`` holds "mean(std)" strings; both numbers are converted
        by adding 0.5 and truncating to int.
    """
    # Collect, for every feature, its count in each run.
    per_feature = {name: [] for name in top5_features_per_run[0].keys()}
    for run_counts in top5_features_per_run:
        for name, count in run_counts.items():
            per_feature[name].append(count)

    # (feature, [mean, std]) pairs, highest mean first.
    summary = [(name, [np.mean(values), np.std(values)]) for name, values in per_feature.items()]
    summary.sort(key=lambda item: item[1][0], reverse=True)

    top_feats, stats = zip(*summary[:n])
    counts = [f"{int(mean + 0.5)}({int(std + 0.5)})" for mean, std in stats]
    top5_rocaucs = [rocaucs[name] for name in top_feats]
    top5_modrocaucs = [modrocaucs[name] for name in top_feats]
    top5_status = [feat_status(name, important_feats, corr_feats) for name in top_feats]
    return top_feats, counts, top5_rocaucs, top5_modrocaucs, top5_status

In [None]:
def fetch_files(files, key, method=None, model=None):
    """Select the paths whose file name contains the given keywords.

    Matching is case-insensitive and looks only at the file name, not at the
    directory part of the path, so the name chosen for an output directory
    cannot change which files are selected. Files whose name contains
    'values' are always excluded.

    Parameters
    ----------
    files : iterable of str
        Candidate file paths.
    key : str
        Lower-case keyword that must occur in the file name.
    method, model : str, optional
        Further lower-case keywords that must occur in the file name;
        ignored when empty or None.

    Returns
    -------
    list of str
        The matching paths, in their original order.
    """
    required = [key]
    if method:
        required.append(method)
    if model:
        required.append(model)

    selected = []
    for path in files:
        # Only the final path component takes part in the matching.
        name = os.path.basename(path).lower()
        if 'values' in name:
            continue
        if all(word in name for word in required):
            selected.append(path)
    return selected


def populate_results(files, key, results_dict, fetch_fun):
    """Fill ``results_dict`` with one entry per method (and model) that has a file.

    For every combination of the module-level ``modelmethods`` and ``models``
    the entry is stored under ``"<method>_<model>"``; for every entry of the
    module-level ``filtermethods`` it is stored under ``"<method>"``. The value
    is ``fetch_fun`` applied to the first file returned by ``fetch_files``;
    combinations without a matching file are skipped. ``results_dict`` is
    modified in place and nothing is returned.
    """
    def record(label, method, model=None):
        # Store the result of the first matching file, if there is one.
        matches = fetch_files(files, key, method, model)
        if matches:
            results_dict[label] = fetch_fun(matches[0])

    for method in modelmethods:
        for model in models:
            record(f"{method}_{model}", method, model)
    for method in filtermethods:
        record(f"{method}", method)
        

def populate_dataframe(files, files_key, results, fetch_function, order, columns):
    """Build the per-method results table for one set of files.

    The files selected by ``fetch_files(files, files_key)`` are processed with
    ``populate_results``, which fills ``results`` in place. For ``'avblock'``
    each row is the method name followed by the unpacked result; for any other
    key the first two result entries are joined as ``"a/b"`` and the third is
    kept as is. Rows are then selected in the order given by ``order``.

    Returns
    -------
    pandas.DataFrame
        Table with the given ``columns`` (which must include 'Method').
    """
    populate_results(fetch_files(files, files_key), files_key, results, fetch_function)

    if files_key != 'avblock':
        rows = [[method, f"{val[0]}/{val[1]}", val[2]] for method, val in results.items()]
    else:
        rows = [(method, *val) for method, val in results.items()]

    table = pd.DataFrame(rows, columns=columns)
    # Index by method to pick the rows in the requested order, then restore the column.
    return table.set_index('Method').loc[order].reset_index()

In [None]:
def aggregate_scores(vals):
    """Format the mean and std of paired scores as ``"m1(s1)/m2(s2)"``.

    Each ``str`` entry of ``vals`` is expected in the form ``"a/b"`` and
    contributes ``int(a)`` to the first series and ``int(b)`` to the second.
    An entry that is not a ``str`` is used unchanged in both series. Means
    and standard deviations are truncated to int.
    """
    def part(val, position):
        # Parse one side of an "a/b" string; pass anything else through.
        return int(val.split("/")[position]) if type(val) is str else val

    first_parts = [part(val, 0) for val in vals]
    second_parts = [part(val, 1) for val in vals]

    first_mean, first_std = np.mean(first_parts), np.std(first_parts)
    second_mean, second_std = np.mean(second_parts), np.std(second_parts)

    return f'{int(first_mean)}({int(first_std)})/{int(second_mean)}({int(second_std)})'

In [None]:
def aggregate_rankings(vals):
    """Format the mean and std of two ranking series as ``"m1(s1)/m2(s2)"``.

    Every entry of ``vals`` supplies the rank of the first feature at index 0
    and of the second feature at index 1. Means and standard deviations are
    truncated to int.
    """
    first_ranks = [val[0] for val in vals]
    second_ranks = [val[1] for val in vals]

    first_mean, first_std = np.mean(first_ranks), np.std(first_ranks)
    second_mean, second_std = np.mean(second_ranks), np.std(second_ranks)

    return f'{int(first_mean)}({int(first_std)})/{int(second_mean)}({int(second_std)})'

In [None]:
def mean_std(lodfs, key_column, value_column, ranking_column):
    """Combine the per-run tables into one table of "mean(std)" strings.

    Parameters
    ----------
    lodfs : list of pandas.DataFrame
        One table per run; rows are paired by position across the tables.
    key_column : str
        Column copied from the first table to identify each row.
    value_column : str
        Column whose values are combined row by row with ``aggregate_scores``.
    ranking_column : str
        Column whose values are combined row by row with ``aggregate_rankings``.

    Returns
    -------
    pandas.DataFrame
        ``key_column`` plus the two aggregated columns.
    """
    def aggregate_column(column, aggregate):
        # Gather the values of the same row from every run, then aggregate them.
        rowwise = list(zip(*[frame[column] for frame in lodfs]))
        return np.array([aggregate(vals) for vals in rowwise])

    aggregated_scores = aggregate_column(value_column, aggregate_scores)
    aggregated_rankings = aggregate_column(ranking_column, aggregate_rankings)

    output = lodfs[0][[key_column]].copy()
    output["Mean(Std) of Experiments"] = aggregated_scores
    output[f"Mean(Std) of {ranking_column}"] = aggregated_rankings
    return output

In [None]:
# All CSV files found directly inside OUTPUT_DIR.
files = glob.glob(f'{OUTPUT_DIR}/*.csv')

In [None]:
# Keywords that populate_results looks for in the result file names.
# Model-based methods are combined with every model ("<method>_<model>");
# filter methods are used on their own.
models = [
    'lr', 'dn', 'gp', 'rf', 'xgb',
]
modelmethods = [
    'implicit', 'shap', 'lime', 'permutation',
]
filtermethods = [
    'modifiedrocauc', 'chisquared', 'relieff', 'nca', 'mrmr',
]

In [None]:
# Row order of the result tables: "<method>_<model>" entries first, then the
# filter methods.
order = [
    "permutation_rf", "shap_rf", "lime_rf", "implicit_rf",
    "permutation_xgb", "shap_xgb", "lime_xgb",
    "permutation_lr", "shap_lr", "lime_lr", "implicit_lr",
    "permutation_dn", "shap_dn", "lime_dn",
    "implicit_gp",
    "chisquared", "mrmr", "nca", "relieff", "modifiedrocauc",
]

### Table V

In [None]:
tableV_results = {}


def tableV_fetch_function(x):
    """Ranks of the AV-block important and non-discriminating features in file ``x``."""
    return table_v_fetch(x, AVBLOCK_IMPORTANT_FEATURES, AVBLOCK_NON_DISCRIMINATING_FEATURES)


avblock_df_columns = [
    'Method',
    'Ranking of important Features',
    'Ranking of non-discriminative Features',
]
# One table per run directory; every run starts from an empty results dict.
avblock_dfs = [
    populate_dataframe(glob.glob(f'{dirname}/*.csv'), 'avblock', {},
                       tableV_fetch_function, order, avblock_df_columns)
    for dirname in OUTPUT_DIRS
]
mean_std(avblock_dfs, *avblock_df_columns)

In [None]:
# Feature -> value lookups built from the two-column value files.
avblockrocaucs = dict(pd.read_csv(AVBLOCK_AUCS_FILE).values)
avblockmodrocaucs = dict(pd.read_csv(AVBLOCK_MODAUCS_FILE).values)

# Top-feature counts, one dict per run directory.
avblock_top5_features_per_run = [
    get_top5s(glob.glob(f'{dirname}/*.csv'), "avblock", AVBLOCK_IMPORTANT_FEATURES, None,
              avblockrocaucs, avblockmodrocaucs)
    for dirname in OUTPUT_DIRS
]
(avtop5_features_aggregated, avcounts, avtop5_rocaucs,
 avtop5_modrocaucs, avtop5_status) = get_frequency_of_topn(
    avblock_top5_features_per_run, AVBLOCK_IMPORTANT_FEATURES, None,
    avblockrocaucs, avblockmodrocaucs)

### Table VI

In [None]:
# Most frequent AV-block top features with their frequency and ROC-AUC values.
pd.DataFrame({
    'Feature': avtop5_features_aggregated,
    'Frequency in Top5': avcounts,
    'Modified ROCAUC': avtop5_modrocaucs,
    'ROCAUC': avtop5_rocaucs,
})

### Table VII

In [None]:
tableVII_results = {}


def tableVII_fetch_function(x):
    """Top-5 scores (important, important + W_CORR) and non-discriminating ranks for file ``x``."""
    return [
        top5score(x, RBBB_IMPORTANT_FEATURES),
        top5score(x, RBBB_IMPORTANT_FEATURES_W_CORR),
        # Only the ranks of the non-discriminating features are kept.
        table_v_fetch(x, RBBB_IMPORTANT_FEATURES, RBBB_NON_DISCRIMINATING_FEATURES)[1],
    ]


rbbb_df_columns = [
    'Method',
    'Top 5 score imp. / imp. + corr.',
    'Ranking of non-discriminative features',
]
# One table per run directory; every run starts from an empty results dict.
rbbb_dfs = [
    populate_dataframe(glob.glob(f'{dirname}/*.csv'), 'rbbb', {},
                       tableVII_fetch_function, order, rbbb_df_columns)
    for dirname in OUTPUT_DIRS
]
mean_std(rbbb_dfs, *rbbb_df_columns)

In [None]:
# Feature -> value lookups built from the two-column value files.
rbbbrocaucs = dict(pd.read_csv(RBBB_AUCS_FILE).values)
rbbbmodrocaucs = dict(pd.read_csv(RBBB_MODAUCS_FILE).values)

# Top-feature counts, one dict per run directory.
rbbb_top5_features_per_run = [
    get_top5s(glob.glob(f'{dirname}/*.csv'), "rbbb", RBBB_IMPORTANT_FEATURES, None,
              rbbbrocaucs, rbbbmodrocaucs)
    for dirname in OUTPUT_DIRS
]
# Six features are reported for RBBB (n=6).
(rbbbtop5_features_aggregated, rbbbcounts, rbbbtop5_rocaucs,
 rbbbtop5_modrocaucs, rbbbtop5_status) = get_frequency_of_topn(
    rbbb_top5_features_per_run, RBBB_IMPORTANT_FEATURES, RBBB_IMPORTANT_FEATURES_W_CORR,
    rbbbrocaucs, rbbbmodrocaucs, n=6)

### Table VIII

In [None]:
# Most frequent RBBB top features with frequency, feature type and ROC-AUC values.
pd.DataFrame({
    'Feature': rbbbtop5_features_aggregated,
    'Frequency in Top5': rbbbcounts,
    'Type of feature': rbbbtop5_status,
    'Modified ROCAUC': rbbbtop5_modrocaucs,
    'ROCAUC': rbbbtop5_rocaucs,
})

### Table IX

In [None]:
tableIX_results = {}


def tableIX_fetch_function(x):
    """Top-5 scores (important, important + W_CORR) and non-discriminating ranks for file ``x``."""
    return [
        top5score(x, LBBB_IMPORTANT_FEATURES),
        top5score(x, LBBB_IMPORTANT_FEATURES_W_CORR),
        # Only the ranks of the non-discriminating features are kept.
        table_v_fetch(x, LBBB_IMPORTANT_FEATURES, LBBB_NON_DISCRIMINATING_FEATURES)[1],
    ]


lbbb_df_columns = [
    'Method',
    'Top 5 score imp. / imp. + corr.',
    'Ranking of non-discriminative features',
]
# One table per run directory; every run starts from an empty results dict.
lbbb_dfs = [
    populate_dataframe(glob.glob(f'{dirname}/*.csv'), 'lbbb', {},
                       tableIX_fetch_function, order, lbbb_df_columns)
    for dirname in OUTPUT_DIRS
]
mean_std(lbbb_dfs, *lbbb_df_columns)

In [None]:
# Feature -> value lookups built from the two-column value files.
lbbbrocaucs = dict(pd.read_csv(LBBB_AUCS_FILE).values)
lbbbmodrocaucs = dict(pd.read_csv(LBBB_MODAUCS_FILE).values)

# Top-feature counts, one dict per run directory.
lbbb_top5_features_per_run = [
    get_top5s(glob.glob(f'{dirname}/*.csv'), "lbbb", LBBB_IMPORTANT_FEATURES, None,
              lbbbrocaucs, lbbbmodrocaucs)
    for dirname in OUTPUT_DIRS
]
(lbbbtop5_features_aggregated, lbbbcounts, lbbbtop5_rocaucs,
 lbbbtop5_modrocaucs, lbbbtop5_status) = get_frequency_of_topn(
    lbbb_top5_features_per_run, LBBB_IMPORTANT_FEATURES, LBBB_IMPORTANT_FEATURES_W_CORR,
    lbbbrocaucs, lbbbmodrocaucs)

### Table X

In [None]:
# Most frequent LBBB top features with frequency, feature type and ROC-AUC values.
pd.DataFrame({
    'Feature': lbbbtop5_features_aggregated,
    'Frequency in Top5': lbbbcounts,
    'Type of feature': lbbbtop5_status,
    'Modified ROCAUC': lbbbtop5_modrocaucs,
    'ROCAUC': lbbbtop5_rocaucs,
})