In [1]:
import json 
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.metrics import jaccard_score
import sys 
from tqdm.notebook import tqdm
import sklearn.feature_extraction.text as ft 



In [2]:
# Put the parent directory first on the module search path so the local `src`
# package can be imported from this notebook's folder.
sys.path.insert(0, '../')
# NOTE(review): `tp` is not referenced in the cells visible here; confirm it is still needed.
import src.preprocessing.text_preprocessing as tp

In [3]:
# Disable LaTeX-based text rendering for figures.
plt.rcParams['text.usetex'] = False

In [4]:
# Folder (under ../results/) holding the result files analysed by this notebook.
FOLDER = 'ecg_mit_remove'
base_path = '../results/'
path = f'{base_path}{FOLDER}/'
# Every directory entry is taken as-is: os.listdir does not filter by type or
# extension and returns names in arbitrary order.
full_paths = [f'{path}{file}' for file in os.listdir(path)]


In [5]:
# Index every result file as files[method][n_words] -> parsed JSON content.
# The file name is split on '_': the first token is ignored, the last token
# (up to its first '.') is the feature count, and everything in between is the
# method name.
files = {}
for p in full_paths:
    name = p.rsplit('/', 1)[-1]
    split = name.split('_')
    method, n_words = '_'.join(split[1:-1]), split[-1].split('.')[0]
    with open(p, 'r') as file:
        files.setdefault(method, {})
        files[method][n_words] = json.load(file)
    

In [6]:
# Register tqdm's pandas integration.
tqdm.pandas()
# The dataset is loaded here only to obtain its column names, which serve as the vocabulary.
# NOTE(review): every column of the CSV is included; confirm it has no label/id column.
df = pd.read_csv('../data/mitdb_data/mitdb_ecg.csv', sep=';')
vocabulary = df.columns.tolist()

In [7]:
# Display labels for the method identifiers; plot_performance_metrics uses them
# as legend entries, so every plotted method must have a key here.
methods_renames = {
    'shap': 'SHAP',
    'term_strength': 'TS',
    'trl': 'TRL',
    'eccd': 'ECCD',
    'mutual_information': 'MI',
    'chi2': 'chi2',
    'tfidf': 'TF-IDF',
    'linear_measure_5': 'LM',
    'lfs': 'LFS',
    'f_val': 'F-stat',
    'baseline': 'Baseline'
}

In [15]:
# Selects which report is read from the result files (key
# `classification_report_{TEST_TRAIN}`) and is embedded in the output file names.
TEST_TRAIN = 'test'

In [None]:
def get_extractor_timings(files: dict) -> tuple:
    """Collect the extractor fit time of every run into one table.

    Parameters
    ----------
    files : dict
        Nested mapping ``{method: {n_words: info}}`` where each ``info`` holds
        ``info['timing']['extractor_fit']`` in a form ``pd.to_timedelta`` accepts.

    Returns
    -------
    tuple
        ``(df, method_list, n_words_list)``. ``df`` has one column per method and
        one row per feature count (integer index, ascending) holding the fit time
        in seconds, NaN where a method has no run for that count. ``method_list``
        is the set of method names. ``n_words_list`` is the set of feature counts
        exactly as they appear in ``files`` (not converted to int).
    """
    method_list = set(files)
    n_words_list = set()
    raw_timings = {}
    for method, words_dict in files.items():
        n_words_list.update(words_dict)
        raw_timings[method] = {
            n_words: info['timing']['extractor_fit']
            for n_words, info in words_dict.items()
        }

    # Build the table in one step instead of enlarging it cell by cell with .loc.
    df = pd.DataFrame(raw_timings, columns=list(files))
    df.index = df.index.astype('int')
    df = df.sort_index()
    # Column-wise conversion: raw duration -> timedelta -> float seconds.
    df = df.apply(lambda column: pd.to_timedelta(column).dt.total_seconds())
    return df, method_list, n_words_list

def plot_extractors_timings(df) -> None:
    """Plot extractor runtimes on log-log axes and save the figure as a PDF.

    Parameters
    ----------
    df : pd.DataFrame
        Runtime table as produced by ``get_extractor_timings``: one column per
        method, index = feature count.

    Returns
    -------
    None
        The figure is written to
        ``../figures/mit_bih/mit_bih_extractor_runtime_vs_n_features.pdf`` and closed.
    """
    axes = df.plot(logy=True, logx=True, figsize=(16, 10))
    # NOTE(review): the x label says "selected" while the title says "removed"; confirm the wording.
    axes.set_xlabel('# selected features')
    axes.set_ylabel('runtime (sec)')
    axes.set_title('Feature extractor runtimes vs number of the removed features.')
    # Save and close the figure that owns these axes rather than relying on
    # pyplot's notion of the "current" figure.
    fig = axes.get_figure()
    fig.savefig('../figures/mit_bih/mit_bih_extractor_runtime_vs_n_features.pdf')
    plt.close(fig)

def get_selected_words_per_extractor_per_n_words(files: dict, vocabulary, n_words_list, method_list):
    """Build, per feature count, a 0/1 table marking which vocabulary entries each method selected.

    Parameters
    ----------
    files : dict
        Nested mapping ``{method: {n_words: info}}``; the first element of
        ``info['selected_vocabulary']`` is the collection of selected entries.
    vocabulary : list
        Row labels of every table.
    n_words_list : iterable
        Feature counts for which a table is created; must cover every
        ``n_words`` key found in ``files``.
    method_list : iterable
        Column labels of every table.

    Returns
    -------
    dict
        ``{n_words: DataFrame}`` of integer tables (1 = selected, 0 = not selected).
    """
    methods = list(method_list)
    # Create the zero tables with an integer dtype directly; an all-NaN object
    # frame followed by fillna(0) relies on pandas' deprecated silent downcasting.
    df_dict = {
        n_words: pd.DataFrame(0, index=vocabulary, columns=methods)
        for n_words in n_words_list
    }

    for method, words_dict in files.items():
        for n_words, info in words_dict.items():
            df_dict[n_words].loc[info['selected_vocabulary'][0], method] = 1
    return df_dict

def get_cross_jaccard_score(df):
    """Compute ``jaccard_score`` for every ordered pair of columns of ``df``.

    Returns a square DataFrame whose index and columns are the column names of
    ``df``; cell ``[a, b]`` holds ``jaccard_score(df[a], df[b])``.
    """
    names = df.columns.tolist()
    scores = pd.DataFrame(index=names, columns=names)
    for row_name in names:
        for col_name in names:
            pair_score = jaccard_score(df[row_name], df[col_name])
            scores.loc[row_name, col_name] = pair_score
    return scores

def get_similarity_metrics(df_dict):
    """Compute method-to-method similarity tables for every feature count.

    For each ``n_words`` table, columns whose sum is zero (e.g. lfs, or any other
    method with no values) are dropped first. Returns two dicts keyed by
    ``n_words``: the column correlation matrices and the pairwise Jaccard tables.
    """
    correlations_dict, jaccard_score_dict = {}, {}
    for n_words, df in df_dict.items():
        non_empty_mask = (df.sum(axis=0) != 0).values
        df_filtered = df.loc[:, non_empty_mask]
        jaccard_table = get_cross_jaccard_score
        correlations_dict[n_words] = df_filtered.corr()
        jaccard_score_dict[n_words] = jaccard_table(df_filtered)

    return correlations_dict, jaccard_score_dict

def compare_shap_over_n_words_set_similarity(df_dict: dict, n_words_list, method_list):
    """Extract the 'shap' similarity to every other method, per feature count.

    ``df_dict`` maps ``n_words`` to a square method-by-method table. The result
    has one row per feature count (integer index, ascending) and one column per
    method other than 'shap'; methods absent from a table stay NaN in that row.
    """
    shap_rows = pd.DataFrame(columns=method_list, index=n_words_list)
    for n_words, similarity in df_dict.items():
        present = similarity.columns
        shap_rows.loc[n_words, present] = similarity['shap'][present]
    shap_rows.index = shap_rows.index.astype('int')
    ordered = shap_rows.sort_index()
    return ordered.drop(columns=['shap'])

def compare_performance_over_n_words(files, n_words_list, method_list, baseline=None):
    """Tabulate macro-averaged precision/recall/f1 (mean and std) per method and feature count.

    Values are read from ``info['classification_report_<TEST_TRAIN>']['macro avg']``
    for every ``files[method][n_words]`` entry. When ``baseline`` is given, its
    'macro avg' values are repeated on every row under a 'baseline' column group.
    The returned frame has (method, '<metric>_mean' / '<metric>_std') columns and
    an ascending integer index of feature counts.
    """
    metrics = ['precision', 'recall', 'f1-score']
    stats = ('mean', 'std')
    cols = [
        (method, f'{metric}_{stat}')
        for method in method_list
        for metric in metrics
        for stat in stats
    ]
    df_metrics = pd.DataFrame(columns=pd.MultiIndex.from_tuples(cols), index=n_words_list)

    for method, words_dict in files.items():
        for n_words, info in words_dict.items():
            for metric in metrics:
                for stat in stats:
                    field = f'{metric}_{stat}'
                    df_metrics.loc[n_words, (method, field)] = info[f'classification_report_{TEST_TRAIN}']['macro avg'][field]

    if baseline is not None:
        # The 'baseline' columns are not pre-declared; they are added on first assignment.
        for n_words in n_words_list:
            for metric in metrics:
                for stat in stats:
                    field = f'{metric}_{stat}'
                    df_metrics.loc[n_words, ('baseline', field)] = baseline['macro avg'][field]

    df_metrics.index = df_metrics.index.astype('int')
    return df_metrics.sort_index()

def plot_performance_metrics(df_metrics, metrics_to_plot, methods_to_plot):
    """Save one error-bar figure per metric comparing the given methods.

    Parameters
    ----------
    df_metrics : pd.DataFrame
        Table with (method, '<metric>_mean') and (method, '<metric>_std') columns,
        indexed by the number of removed features.
    metrics_to_plot : iterable of str
        Metric names; one PDF is written per metric.
    methods_to_plot : iterable of str
        Methods to draw; each must be a key of ``methods_renames``.

    Returns
    -------
    None
        Each figure is written to
        ``../figures/mit_bih/mit_bih_perf_<metric>_<TEST_TRAIN>_removed.pdf`` and closed.
    """
    for metric in metrics_to_plot:
        fig, ax = plt.subplots(figsize=(9, 5))

        for method in methods_to_plot:
            # The dashed style is given once via fmt; also passing
            # linestyle='dashed' makes matplotlib warn about a redundant definition.
            errbar = ax.errorbar(
                x=df_metrics.index,
                y=df_metrics.loc[:, (method, f'{metric}_mean')],
                yerr=df_metrics.loc[:, (method, f'{metric}_std')],
                label=methods_renames[method],
                alpha=0.5,
                fmt='--',
                barsabove=True,
                elinewidth=2,
                linewidth=1,
            )
            # Last container element holds the error-bar lines; draw the y bars dotted.
            errbar[-1][0].set_linestyle(':')

        ax.set_xlabel('# of removed words')
        ax.set_ylabel(f'{metric}')
        ax.legend(loc="lower right")
        ax.set_title(f'MIT-BIH {metric} vs # of the removed features.')
        # Lay out only after labels and title exist so they are taken into account.
        fig.tight_layout()
        fig.savefig(f'../figures/mit_bih/mit_bih_perf_{metric}_{TEST_TRAIN}_removed.pdf', bbox_inches="tight")
        plt.close(fig)

In [10]:
# Runtime table (seconds) plus the sets of method names and feature counts found in `files`.
# Note: this rebinds `df`, which until here held the dataset loaded for the vocabulary.
df, method_list, n_words_list = get_extractor_timings(files)
# df.to_csv('../results/tables/mit_bih_n_features_timings.csv', sep=';')
# plot_extractors_timings(df)

In [11]:
# One 0/1 selection table (vocabulary x method) per feature count, then the
# pairwise correlation and Jaccard tables derived from each of them.
df_dict = get_selected_words_per_extractor_per_n_words(files, vocabulary, n_words_list, method_list)
correlations_dict, jaccard_score_dict = get_similarity_metrics(df_dict)

In [12]:
# Jaccard score of every method against 'shap' for each feature count, exported as CSV.
# The correlation-based variant of the same table is currently disabled.
# shap_correlations = compare_shap_over_n_words_set_similarity(correlations_dict, n_words_list, method_list)
shap_jaccard = compare_shap_over_n_words_set_similarity(jaccard_score_dict, n_words_list, method_list)
# shap_correlations.to_csv('../results/tables/mit_bih_correlations.csv', sep=';')
shap_jaccard.to_csv('../figures/csv_outputs/jaccard/mit_bih_jaccard_remove.csv', sep=';')

In [13]:
# Baseline results; the `classification_report_{TEST_TRAIN}` entry of this JSON
# is what gets compared against the per-method results.
with open('../results/mit_bih_baseline.json', 'r') as file:
    baseline_mit_bih = json.load(file)

In [14]:
# Macro-averaged metrics per method and feature count, with the baseline report
# repeated on every row; exported as CSV for the split named by TEST_TRAIN.
df_metrics = compare_performance_over_n_words(files, n_words_list, method_list, baseline=baseline_mit_bih[f'classification_report_{TEST_TRAIN}'])
df_metrics.to_csv(f'../figures/csv_outputs/metrics/mit_bih_performance_remove_{TEST_TRAIN}.csv', sep=';')

In [39]:
# Plot every extractor found in the results together with the baseline columns.
methods = [*method_list, 'baseline']
plot_performance_metrics(df_metrics, ['recall', 'precision', 'f1-score'], methods)

In [75]:
# Load a previously exported Jaccard table; its first column becomes the index.
# NOTE(review): this reads the Enron table, while the rest of this notebook
# writes MIT-BIH outputs (e.g. mit_bih_jaccard_remove.csv); confirm this is the
# intended file and not a leftover from another notebook.
jaccard_table = pd.read_csv('../results/tables/enron_jaccard.csv', sep=';', index_col=0)
jaccard_table.index.name = 'n_words'
# jaccard_table = jaccard_table.drop(columns='lfs')

In [80]:
# Print the table as LaTeX source; floats are formatted with three decimals
# only inside this context.
with pd.option_context('display.float_format', '{:,.3f}'.format):
    print(jaccard_table.to_latex(caption='Jaccard score between Shapley values based method and the rest.'))

\begin{table}
\centering
\caption{Jaccard score between Shapley values based method and the rest.}
\begin{tabular}{lrrrrrrr}
\toprule
{} &  eccd &  mutual\_information &   trl &  linear\_measure\_5 &  term\_strength &  tfidf &  chi2 \\
n\_words &       &                     &       &                   &                &        &       \\
\midrule
50      & 0.205 &               0.351 & 0.075 &             0.235 &          0.220 &  0.266 & 0.299 \\
100     & 0.235 &               0.351 & 0.047 &             0.282 &          0.266 &  0.282 & 0.351 \\
200     & 0.286 &               0.394 & 0.042 &             0.356 &          0.338 &  0.270 & 0.375 \\
500     & 0.247 &               0.344 & 0.018 &             0.445 &          0.422 &  0.357 & 0.350 \\
1000    & 0.214 &               0.299 & 0.010 &             0.505 &          0.499 &  0.463 & 0.305 \\
3000    & 0.099 &               0.132 & 0.016 &             0.192 &          0.191 &  0.184 & 0.137 \\
5000    & 0.082 &               0

In [None]:
# Full method-by-method Jaccard table for the '3000' feature-count key, printed as LaTeX.
# NOTE(review): raises KeyError unless a result file with n_words == 3000 was loaded;
# the caption says "kept" words while the results folder is 'ecg_mit_remove' - confirm the wording.
jaccard_3000_df = jaccard_score_dict['3000']
with pd.option_context('display.float_format', '{:,.3f}'.format):
    print(jaccard_3000_df.to_latex(caption='Jaccard score between the methods at 3000 kept words.'))
