In [1]:
import json 
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.metrics import jaccard_score
import sys 
from tqdm.notebook import tqdm
import sklearn.feature_extraction.text as ft 



In [2]:
# Put the parent directory first on the import path so that the local `src`
# package can be imported; this must run before the import below.
sys.path.insert(0, '../')
# NOTE(review): `tp` is not referenced in the cells visible here - confirm it is still needed.
import src.preprocessing.text_preprocessing as tp

In [3]:
# Turn off LaTeX rendering of figure text (matplotlib `text.usetex` option).
plt.rcParams['text.usetex'] = False

In [4]:
# Location of the per-run result files: <base_path><FOLDER>/<file>.
FOLDER = 'ecg_mit'
base_path = '../results/'
path = f'{base_path}{FOLDER}/'
# `os.listdir` returns entries in arbitrary order and also lists directories
# (e.g. `.ipynb_checkpoints`). Sort the listing so the load order - and every
# table derived from it - is reproducible, and keep regular files only so the
# loader never tries to `open()` a directory.
full_paths = [
    f'{path}{file}'
    for file in sorted(os.listdir(path))
    if os.path.isfile(f'{path}{file}')
]


In [5]:
# Index every result file as files[method][n_words] = parsed JSON.
# The file name is split on '_': the first token is ignored, the last token
# (up to its first '.') is the number of words and everything in between is
# the method name, which may itself contain underscores.
files = {}
for p in full_paths:
    tokens = p.split('/')[-1].split('_')
    method = '_'.join(tokens[1:-1])
    n_words = tokens[-1].split('.')[0]
    with open(p, 'r') as file:
        per_method = files.setdefault(method, {})
        per_method[n_words] = json.load(file)
    

In [7]:
# Register tqdm's pandas integration.
# NOTE(review): no progress_* call appears in the visible cells - confirm it is still needed.
tqdm.pandas()
# Load the ';'-separated data file; its column names are used as the vocabulary.
df = pd.read_csv('../data/mitdb_data/mitdb_ecg.csv', sep=';')
# Every column of the CSV is included, in file order.
vocabulary = df.columns.tolist()

In [13]:
def get_extractor_timings(files: dict) -> tuple:
    """Collect the extractor fit time of every (method, n_words) run, in seconds.

    Parameters
    ----------
    files : dict
        Nested mapping ``{method: {n_words: info}}``. ``n_words`` keys must be
        castable to ``int`` and ``info['timing']['extractor_fit']`` must be a
        duration that ``pd.to_timedelta`` can parse.

    Returns
    -------
    tuple
        ``(df, method_list, n_words_list)`` where ``df`` has one row per
        ``n_words`` (integer index, ascending), one column per method (in the
        order of ``files``) and holds the fit time in seconds (NaN where a
        method has no run for that ``n_words``); ``method_list`` is the set of
        method names and ``n_words_list`` the set of ``n_words`` keys as found
        in ``files`` (i.e. not cast to int).
    """
    method_list = set(files)
    n_words_list = set()
    raw_timings = {}
    for method, words_dict in files.items():
        raw_timings[method] = {
            n_words: info['timing']['extractor_fit']
            for n_words, info in words_dict.items()
        }
        n_words_list.update(words_dict)

    # Build the frame in one go instead of enlarging it cell by cell; passing
    # `columns` keeps methods without any run as all-NaN columns.
    df = pd.DataFrame(raw_timings, columns=list(files), dtype=object)
    df.index = df.index.astype('int')
    df = df.sort_index()
    # Raw duration -> Timedelta -> float seconds, column by column.
    df = df.apply(lambda column: pd.to_timedelta(column).dt.total_seconds())
    return df, method_list, n_words_list

def plot_extractors_timings(df, output_path='../figures/mit_bih_extractor_runtime_vs_n_features.png') -> None:
    """Save a log-log line plot of extractor runtime against the number of features.

    Parameters
    ----------
    df : pd.DataFrame
        One column per extractor, indexed by the number of selected features,
        holding runtimes in seconds.
    output_path : str, optional
        Where the figure is written; defaults to the location used so far.

    Returns
    -------
    None
        The figure is written to ``output_path`` and then closed, so nothing
        is rendered inline.
    """
    # Explicit figure/axes instead of the implicit pyplot "current figure",
    # so the plot cannot leak into or pick up another cell's figure.
    fig, ax = plt.subplots(figsize=(16, 10))
    df.plot(logy=True, logx=True, ax=ax)
    ax.set_xlabel('# selected features')
    ax.set_ylabel('runtime (sec)')
    ax.set_title('Feature extractor runtimes vs number of the selected features.')
    fig.savefig(output_path)
    plt.close(fig)

def get_selected_words_per_extractor_per_n_words(files: dict, vocabulary, n_words_list, method_list):
    """Build, per ``n_words``, a 0/1 table of which words each method selected.

    Parameters
    ----------
    files : dict
        Nested mapping ``{method: {n_words: info}}`` where
        ``info['selected_vocabulary']`` lists the selected words; every word
        must be present in ``vocabulary``.
    vocabulary : list
        Row labels of every table.
    n_words_list : iterable
        Keys for which a table is created; must cover every ``n_words`` key
        in ``files``.
    method_list : iterable
        Column labels of every table. A ``set`` is sorted so the column order
        is the same on every run; any other iterable keeps its order.

    Returns
    -------
    dict
        ``{n_words: DataFrame}`` with integer dtype, 1 where the method
        selected the word and 0 elsewhere.
    """
    if isinstance(method_list, (set, frozenset)):
        methods = sorted(method_list)
    else:
        methods = list(method_list)

    # Start from an integer zero frame rather than an all-NaN object frame
    # followed by fillna(0): that relied on implicit downcasting, which is
    # deprecated in pandas and would leave object dtype behind.
    df_dict = {
        n_words: pd.DataFrame(0, index=vocabulary, columns=methods)
        for n_words in n_words_list
    }

    for method, words_dict in files.items():
        for n_words, info in words_dict.items():
            df_dict[n_words].loc[info['selected_vocabulary'], method] = 1
    return df_dict

def get_cross_jaccard_score(df):
    """Return the square table of Jaccard scores between all columns of ``df``.

    Rows and columns of the result are both labelled with the columns of
    ``df``; entry ``(a, b)`` is ``jaccard_score(df[a], df[b])``.
    """
    names = list(df.columns)
    scores = pd.DataFrame(index=names, columns=names)
    for row_name in names:
        row_values = df[row_name]
        for col_name in names:
            scores.loc[row_name, col_name] = jaccard_score(row_values, df[col_name])
    return scores

def get_similarity_metrics(df_dict):
    """Compute column-wise correlation and Jaccard tables for every entry.

    For each ``(n_words, table)`` pair in ``df_dict`` the columns whose sum is
    zero (methods that selected nothing, e.g. lfs) are dropped first; the
    remaining columns are compared with ``DataFrame.corr`` and with
    ``get_cross_jaccard_score``.

    Returns two dicts keyed like ``df_dict``: correlations, then Jaccard scores.
    """
    pearson_by_n, jaccard_by_n = {}, {}
    for key, indicators in df_dict.items():
        has_selection = indicators.sum(axis=0).ne(0).to_numpy()
        active = indicators.loc[:, has_selection]
        pearson_by_n[key] = active.corr()
        jaccard_by_n[key] = get_cross_jaccard_score(active)
    return pearson_by_n, jaccard_by_n

def compare_shap_over_n_words_set_similarity(df_dict: dict, n_words_list, method_list, reference='shap'):
    """Tabulate every method's similarity to a reference method per ``n_words``.

    Parameters
    ----------
    df_dict : dict
        ``{n_words: square similarity DataFrame}`` whose rows and columns are
        method names.
    n_words_list : iterable
        Row labels of the result; must be castable to ``int``.
    method_list : iterable
        Column labels of the result; must contain ``reference``. A ``set`` is
        sorted so the column order is the same on every run; any other
        iterable keeps its order.
    reference : str, optional
        Method every other method is compared with (default ``'shap'``).

    Returns
    -------
    pd.DataFrame
        Rows are ``n_words`` as integers in ascending order, columns are the
        methods without ``reference``. Cells stay NaN when a method, or the
        reference itself, is absent from the similarity table of that
        ``n_words``.
    """
    if isinstance(method_list, (set, frozenset)):
        methods = sorted(method_list)
    else:
        methods = list(method_list)

    df_comp = pd.DataFrame(columns=methods, index=list(n_words_list))
    for n_words, df in df_dict.items():
        if reference not in df.columns:
            # The reference can be missing from a table (methods without any
            # selection are filtered out upstream); leave the row as NaN
            # instead of failing with a KeyError.
            continue
        cols = df.columns
        df_comp.loc[n_words, cols] = df[reference][cols]
    df_comp.index = df_comp.index.astype('int')
    df_comp = df_comp.sort_index()
    # The reference's similarity with itself carries no information.
    df_comp = df_comp.drop(columns=[reference])
    return df_comp

def compare_performance_over_n_words_enron(files, n_words_list, method_list, baseline=None, positive_label='1.0'):
    """Tabulate test precision / recall / f1-score per method and ``n_words``.

    Parameters
    ----------
    files : dict
        ``{method: {n_words: info}}`` where
        ``info['classification_report_test'][positive_label]`` holds the keys
        ``'precision'``, ``'recall'`` and ``'f1-score'``.
    n_words_list : iterable
        Row labels of the result; must be castable to ``int``.
    method_list : iterable
        Methods that get a column group. A ``set`` is sorted so the column
        order is the same on every run; any other iterable keeps its order.
    baseline : dict, optional
        Classification report (``baseline[positive_label][metric]``) repeated
        on every row under a trailing ``'baseline'`` column group.
    positive_label : str, optional
        Key of the class whose metrics are reported (default ``'1.0'``).

    Returns
    -------
    pd.DataFrame
        Float frame with ``n_words`` as integer index in ascending order and
        ``(method, metric)`` MultiIndex columns; NaN where a method has no run
        for that ``n_words``.
    """
    metrics = ['precision', 'recall', 'f1-score']
    if isinstance(method_list, (set, frozenset)):
        methods = sorted(method_list)
    else:
        methods = list(method_list)
    # Declare the baseline columns up front instead of enlarging the frame
    # one cell at a time later on.
    if baseline is not None and 'baseline' not in methods:
        methods.append('baseline')

    # from_product (unlike from_tuples) also accepts an empty method list.
    columns = pd.MultiIndex.from_product([methods, metrics])
    df_metrics = pd.DataFrame(index=list(n_words_list), columns=columns, dtype=float)

    for method, words_dict in files.items():
        for n_words, info in words_dict.items():
            report = info['classification_report_test'][positive_label]
            for metric in metrics:
                df_metrics.loc[n_words, (method, metric)] = report[metric]

    if baseline is not None:
        for metric in metrics:
            # The baseline does not depend on n_words: same value on every row.
            df_metrics.loc[:, ('baseline', metric)] = baseline[positive_label][metric]

    df_metrics.index = df_metrics.index.astype('int')
    df_metrics = df_metrics.sort_index()
    return df_metrics



In [14]:
# Build the runtime table (seconds) together with the sets of method names
# and feature counts found in `files`.
# NOTE(review): this rebinds `df`, which until here held the frame read from
# mitdb_ecg.csv - consider a distinct name so re-running earlier cells stays safe.
df, method_list, n_words_list = get_extractor_timings(files)
# Persist the table and the log-log runtime figure.
df.to_csv('../results/tables/mit_bih_n_features_timings.csv', sep=';')
plot_extractors_timings(df)

In [15]:
# One 0/1 table (vocabulary x method) per feature count, marking the selected features.
df_dict = get_selected_words_per_extractor_per_n_words(files, vocabulary, n_words_list, method_list)
# Pairwise method similarity per feature count: correlation and Jaccard score.
correlations_dict, jaccard_score_dict = get_similarity_metrics(df_dict)

In [16]:
# Reduce each pairwise table to "similarity with shap" per feature count.
shap_correlations = compare_shap_over_n_words_set_similarity(correlations_dict, n_words_list, method_list)
shap_jaccard = compare_shap_over_n_words_set_similarity(jaccard_score_dict, n_words_list, method_list)
# Persist both tables as ';'-separated CSV.
shap_correlations.to_csv('../results/tables/mit_bih_correlations.csv', sep=';')
shap_jaccard.to_csv('../results/tables/mit_bih_jaccard.csv', sep=';')

In [25]:
# Load the baseline results; its 'classification_report_test' entry is used below.
with open('../results/mit_bih_baseline.json', 'r') as file:
    baseline_mit_bih = json.load(file)

In [29]:
# Test precision / recall / f1-score per method and feature count, with the
# baseline repeated on every row.
# NOTE(review): `.iloc[:-1]` drops the last row, i.e. the largest feature count
# (rows are sorted ascending); the reason is not visible here - document or remove.
df_metrics = compare_performance_over_n_words_enron(files, n_words_list, method_list, baseline=baseline_mit_bih['classification_report_test']).iloc[:-1]
df_metrics.to_csv('../results/tables/mit_bih_performance.csv', sep=';')

In [30]:
# Show the performance table as the cell output.
df_metrics

Unnamed: 0_level_0,lfs,lfs,lfs,mutual_information,mutual_information,mutual_information,shap,shap,shap,f_val,f_val,f_val,baseline,baseline,baseline
Unnamed: 0_level_1,precision,recall,f1-score,precision,recall,f1-score,precision,recall,f1-score,precision,recall,f1-score,precision,recall,f1-score
2,0.755747,0.862295,0.805513,0.921233,0.881967,0.901173,0.840237,0.931148,0.883359,0.808442,0.816393,0.812398,0.845481,0.95082,0.895062
3,0.792793,0.865574,0.827586,0.901899,0.934426,0.917874,0.906752,0.92459,0.915584,0.831169,0.839344,0.835237,0.845481,0.95082,0.895062
5,0.822857,0.944262,0.879389,0.891566,0.970492,0.929356,0.905956,0.947541,0.926282,0.86478,0.901639,0.882825,0.845481,0.95082,0.895062
10,0.863905,0.957377,0.908243,0.846377,0.957377,0.898462,0.915094,0.954098,0.934189,0.87988,0.960656,0.918495,0.845481,0.95082,0.895062
15,0.855072,0.967213,0.907692,0.829609,0.97377,0.895928,0.902141,0.967213,0.933544,0.839888,0.980328,0.90469,0.845481,0.95082,0.895062
