In [1]:
import json 
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.metrics import jaccard_score
import sys 
from tqdm.notebook import tqdm
import sklearn.feature_extraction.text as ft 



In [2]:
sys.path.insert(0, '../')
import src.preprocessing.text_preprocessing as tp

In [3]:
plt.rcParams['text.usetex'] = False

In [4]:
FOLDER = 'arcene'
base_path = '../results/'
path = f'{base_path}{FOLDER}/'
full_paths = [f'{path}{file}' for file in os.listdir(path)]


In [5]:
files = {}
for p in full_paths:
    name = p.split('/')[-1]
    if name == 'results_ppfs.json':
        continue # handle separately together with baseline
        if 'ppfs' not in files:
            files['ppfs'] = {}
        with open(p, 'r') as file:
            f = json.load(file)
        files['ppfs'][f['n_words'][0]] = f
        continue
    split = name.split('_')
    method = '_'.join(split[1:-1])
    n_words = split[-1].split('.')[0]
    with open(p, 'r') as file:
        if method not in files:
            files[method] = {}
        files[method][n_words] = json.load(file)
    

In [6]:
methods_renames = {
    'shap': 'SHAP',
    'term_strength': 'TS',
    'trl': 'TRL',
    'eccd': 'ECCD',
    'mutual_information': 'MI',
    'chi2': 'chi2',
    'tfidf': 'TF-IDF',
    'linear_measure_5': 'LM',
    'lfs': 'LFS',
    'baseline': 'Baseline',
    'ppfs': 'PPFS',
    'f_val': 'F-stat'
}

In [7]:
tqdm.pandas()
df = pd.read_csv('../data/arcene/arcene.csv', sep=';', index_col=0)
df = df.fillna('')
df = df.astype('float')
df['Class'] = df['Class'].astype('category')
df['Class'] = df['Class'].cat.codes


vocabulary = df.columns[:-1]
print(len(vocabulary))

10000


In [17]:
TEST_TRAIN = 'test'

In [9]:
def get_extractor_timings(files: dict) -> pd.DataFrame:
    methods = files.keys()
    df = pd.DataFrame(columns=methods)
    n_words_list = set()
    method_list = set()
    for method, words_dict in files.items():
        method_list.add(method)
        for n_words, info in words_dict.items():
                df.loc[n_words, method] = info['timing']['extractor_fit']
                n_words_list.add(n_words)
                
    df.index = df.index.astype('int')
    df = df.sort_index()
    df = df.apply(pd.to_timedelta)
    for col in df.columns:
        df[col] = df[col].dt.total_seconds()
    return df, method_list, n_words_list

def plot_extractors_timings(df) -> plt.figure:
    df = df.copy(deep=True)
    df = df.rename(columns=methods_renames)
    axes = df.plot(logy=True, logx=True, figsize=(16, 10))
    axes.set_xlabel('# selected variables')
    axes.set_ylabel('runtime (sec)')
    # axes.set_label('runtime (sec)')
    axes.legend(loc="lower right")

    # plt.title('Feature extractor runtimes vs number of the selected words.')
    plt.savefig('../figures/arcene/arcene_extractor_runtime_vs_n_words.pdf', bbox_inches="tight")
    plt.close()

def get_selected_words_per_extractor_per_n_words(files: dict, vocabulary, n_words_list, method_list):
    df_dict = {}

    for n_words in n_words_list:
        df_dict[n_words] = pd.DataFrame(index=vocabulary, columns=list(method_list)).fillna(0)

    for method, words_dict in files.items():
        for n_words, info in words_dict.items():
            df_dict[n_words].loc[info['selected_vocabulary'][0], method] = 1
    return df_dict

def get_cross_jaccard_score(df):
    methods = df.columns.tolist()
    jaccard_df = pd.DataFrame(index=methods, columns=methods)
    for method_1 in methods:
        for method_2 in methods:
            jaccard_df.loc[method_1, method_2] = jaccard_score(df[method_1], df[method_2])
    return jaccard_df

def get_similarity_metrics(df_dict):
    correlations_dict = {}
    jaccard_score_dict = {}
    for n_words, df in df_dict.items():
        df_filtered = df.loc[:,(df.sum(axis=0) != 0).values] # remove lfs (or other methods) when they have no values
        correlations_dict[n_words] = df_filtered.corr() 
        jaccard_score_dict[n_words] = get_cross_jaccard_score(df_filtered)
    
    return correlations_dict, jaccard_score_dict

def compare_shap_over_n_words_set_similarity(df_dict: dict, n_words_list, method_list):
    df_comp = pd.DataFrame(columns=method_list, index=n_words_list)
    for n_words, df in df_dict.items():
        cols = df.columns
        df_comp.loc[n_words, cols] = df['shap'][cols]
    df_comp.index = df_comp.index.astype('int')
    df_comp = df_comp.sort_index()
    df_comp = df_comp.drop(columns=['shap'])
    return df_comp

def compare_performance_over_n_words(files, n_words_list, method_list, baseline=None, ppfs=None):
    metrics = ['precision', 'recall', 'f1-score']
    cols = []
    for method in method_list:
        for metric in metrics:
            cols.append((method, f'{metric}_mean'))
            cols.append((method, f'{metric}_std'))
    df_metrics = pd.DataFrame(columns=pd.MultiIndex.from_tuples(cols), index=n_words_list)

    for method, words_dict in files.items():
        for n_words, info in words_dict.items():
            for metric in metrics:
                df_metrics.loc[n_words, (method, f'{metric}_mean')] = info[f'classification_report_{TEST_TRAIN}']['macro avg'][f'{metric}_mean']
                df_metrics.loc[n_words, (method, f'{metric}_std')] = info[f'classification_report_{TEST_TRAIN}']['macro avg'][f'{metric}_std']
    if baseline is not None:
        for n_words in n_words_list:
            for metric in metrics:
                df_metrics.loc[n_words, ('baseline',f'{metric}_mean')] = baseline['macro avg'][f'{metric}_mean']
                df_metrics.loc[n_words, ('baseline',f'{metric}_std')] = baseline['macro avg'][f'{metric}_std']
    
    if ppfs is not None:
        for n_words in n_words_list:
            for metric in metrics:
                df_metrics.loc[n_words, ('ppfs',f'{metric}_mean')] = ppfs['macro avg'][f'{metric}_mean']
                df_metrics.loc[n_words, ('ppfs',f'{metric}_std')] = ppfs['macro avg'][f'{metric}_std']

    df_metrics.index = df_metrics.index.astype('int')
    df_metrics = df_metrics.sort_index()
    return df_metrics

def plot_performance_metrics(df_metrics, metrics_to_plot, methods_to_plot):
    for metric in metrics_to_plot:
        fig, ax = plt.subplots(figsize=(9,5))

        for method in methods_to_plot:
            errbar = ax.errorbar(x=df_metrics.index, y=df_metrics.loc[:, (method, f'{metric}_mean')], yerr=df_metrics.loc[:, (method, f'{metric}_std')], label=methods_renames[method], alpha=0.5, linestyle='dashed', fmt='--', barsabove=True, elinewidth=2, linewidth=1)
            errbar[-1][0].set_linestyle(':')
        ax.set_xscale('log')
        ax.set_ylabel('log')
        ax.set_xlabel('# of kept words')
        ax.set_ylabel(f'{metric}')
        ax.legend()
        plt.title(f'Arcene corpus {metric} vs # of removed words.')
        plt.savefig(f'../figures/arcene/arcene_perf_{metric}_remove_{TEST_TRAIN}.pdf')
        plt.close()

In [12]:
df, method_list, n_words_list = get_extractor_timings(files)
df.to_csv(f'../figures/csv_outputs/timings/arcene_n_word_timings.csv', sep=';')
plot_extractors_timings(df)

In [13]:
df_dict = get_selected_words_per_extractor_per_n_words(files, vocabulary, n_words_list, method_list)
correlations_dict, jaccard_score_dict = get_similarity_metrics(df_dict)

In [14]:
# shap_correlations = compare_shap_over_n_words_set_similarity(correlations_dict, n_words_list, method_list)
shap_jaccard = compare_shap_over_n_words_set_similarity(jaccard_score_dict, n_words_list, method_list)
# shap_correlations.to_csv('../results/tables/brown_correlations.csv', sep=';')
shap_jaccard.to_csv(f'../figures/csv_outputs/jaccard/arcene_jaccard_{TEST_TRAIN}.csv', sep=';')

In [15]:
with open('../results/arcene_baseline.json', 'r') as file:
    baseline_arcene = json.load(file)

with open('../results/arcene/results_ppfs.json', 'r') as file:
    ppfs_arcene = json.load(file)

In [16]:
df_metrics = compare_performance_over_n_words(files, 
                                                n_words_list,
                                                method_list, 
                                                baseline=baseline_arcene[f'classification_report_{TEST_TRAIN}'],
                                                ppfs=ppfs_arcene[f'classification_report_{TEST_TRAIN}']) 
df_metrics.to_csv(f'../figures/csv_outputs/metrics/arcene_performance_{TEST_TRAIN}.csv', sep=';')

In [128]:
methods = list(method_list)[:]
methods.append('baseline')
methods.append('ppfs')
plot_performance_metrics(df_metrics, ['recall', 'precision', 'f1-score'], methods)

In [104]:
jaccard_table = pd.read_csv('../results/tables/arcene_jaccard.csv', sep=';', index_col=0)
jaccard_table.index.name = 'n_words'
# jaccard_table = jaccard_table.drop(columns='lfs')

In [105]:
with pd.option_context('display.float_format', '{:,.3f}'.format):
    jaccard_table = jaccard_table.rename(columns=methods_renames)
    print(jaccard_table.to_latex(caption='Jaccard score between Shapley values based method and the rest.'))

\begin{table}
\centering
\caption{Jaccard score between Shapley values based method and the rest.}
\begin{tabular}{lrr}
\toprule
{} &  F-stat &    MI \\
n\_words &         &       \\
\midrule
10      &   0.000 & 0.000 \\
100     &   0.010 & 0.031 \\
500     &   0.044 & 0.046 \\
1000    &   0.051 & 0.062 \\
3000    &   0.186 & 0.180 \\
5000    &   0.344 & 0.336 \\
8000    &   0.667 & 0.697 \\
\bottomrule
\end{tabular}
\end{table}



In [20]:
jaccard_3000_df = jaccard_score_dict['15000']
jaccard_3000_df = jaccard_3000_df.rename(columns=methods_renames)
jaccard_3000_df = jaccard_3000_df.rename(index=methods_renames)
with pd.option_context('display.float_format', '{:,.3f}'.format):
    print(jaccard_3000_df.to_latex(caption='Jaccard score between the methods at 3000 kept variables.'))


\begin{table}
\centering
\caption{Jaccard score between the methods at 3000 kept words.}
\begin{tabular}{lllllllll}
\toprule
{} & TF-IDF &    LM &   TRL &  ECCD &    TS &  SHAP &    MI &  chi2 \\
\midrule
TF-IDF &  1.000 & 0.536 & 0.477 & 0.674 & 0.630 & 0.357 & 0.719 & 0.777 \\
LM     &  0.536 & 1.000 & 0.403 & 0.537 & 0.740 & 0.435 & 0.644 & 0.545 \\
TRL    &  0.477 & 0.403 & 1.000 & 0.352 & 0.477 & 0.348 & 0.532 & 0.399 \\
ECCD   &  0.674 & 0.537 & 0.352 & 1.000 & 0.544 & 0.336 & 0.581 & 0.817 \\
TS     &  0.630 & 0.740 & 0.477 & 0.544 & 1.000 & 0.396 & 0.795 & 0.592 \\
SHAP   &  0.357 & 0.435 & 0.348 & 0.336 & 0.396 & 1.000 & 0.362 & 0.354 \\
MI     &  0.719 & 0.644 & 0.532 & 0.581 & 0.795 & 0.362 & 1.000 & 0.672 \\
chi2   &  0.777 & 0.545 & 0.399 & 0.817 & 0.592 & 0.354 & 0.672 & 1.000 \\
\bottomrule
\end{tabular}
\end{table}

