In [2]:
import json 
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.metrics import jaccard_score
import sys 
from tqdm.notebook import tqdm
import sklearn.feature_extraction.text as ft 



In [3]:
# Put the parent directory first on the import path so the project-local `src`
# package can be imported below.
# NOTE(review): '../' is resolved against the kernel's working directory —
# presumably the folder holding this notebook; confirm.
sys.path.insert(0, '../')
import src.preprocessing.text_preprocessing as tp

In [4]:
# Explicitly turn off LaTeX-based text rendering for matplotlib figures.
plt.rcParams['text.usetex'] = False

In [5]:
# Directory holding the result files of the dataset analysed in this notebook.
FOLDER = 'enron'
base_path = '../results/'
path = base_path + FOLDER + '/'
# Prefix each directory entry with its folder (order is whatever os.listdir yields).
full_paths = [path + entry for entry in os.listdir(path)]


In [6]:
# Parsed result files, addressed as files[method][n_words].
files = {}
for p in full_paths:
    # The file name is split on '_': the first token is dropped, the last token
    # (up to its first '.') is the word count, and the tokens in between form
    # the method name.
    name = p.rsplit('/', 1)[-1]
    split = name.split('_')
    method = '_'.join(split[1:-1])
    n_words = split[-1].partition('.')[0]
    with open(p, 'r') as file:
        per_method = files.setdefault(method, {})
        per_method[n_words] = json.load(file)
    

In [7]:
# Register `progress_apply` on pandas objects.
tqdm.pandas()
df = pd.read_csv('../data/enron/enron_spam_data.csv', sep=',')
# Missing cells become empty strings and every column is cast to str, so the
# string concatenation below is well defined for every row.
df = df.fillna('')
df = df.astype('str')
# Vectorised column concatenation: yields the same strings as a row-wise
# `apply(lambda x: x['Subject'] + ', ' + x['Message'], axis=1)` without calling
# a Python function once per row.
df['Text'] = df['Subject'] + ', ' + df['Message']
# Binary target: 0 for 'ham', 1 for anything else.
df['Label'] = np.where(df['Spam/Ham'].values == 'ham', 0, 1)
# Project-local text normalisation, applied per document with a progress bar.
df['Text'] = df['Text'].progress_apply(tp.normalize_text)

# The fitted CountVectorizer is only used to obtain the corpus vocabulary.
count_vectorizer = ft.CountVectorizer()
count_vectorizer.fit(df['Text'])
vocabulary = count_vectorizer.get_feature_names_out()

  0%|          | 0/33716 [00:00<?, ?it/s]

In [46]:
def get_extractor_timings(files: dict) -> tuple:
    """Collect the extractor fit runtimes of every method and word count.

    Args:
        files: Nested mapping ``files[method][n_words] -> info`` where
            ``info['timing']['extractor_fit']`` is a value that
            ``pd.to_timedelta`` can parse.

    Returns:
        A 3-tuple ``(df, method_list, n_words_list)``:
        ``df`` has one column per method (in ``files`` order) and one row per
        word count (index cast to int and sorted ascending), holding the
        runtime in seconds (NaN where a method has no entry for a word count);
        ``method_list`` is the set of method names; ``n_words_list`` is the set
        of word-count keys exactly as they appear in ``files`` (not cast).
    """
    method_list = set(files)
    n_words_list = set()
    df = pd.DataFrame(columns=list(files))

    for method, words_dict in files.items():
        for n_words, info in words_dict.items():
            df.loc[n_words, method] = info['timing']['extractor_fit']
            n_words_list.add(n_words)

    # Word counts arrive as strings; cast them so the rows sort numerically.
    df.index = df.index.astype('int')
    df = df.sort_index()
    # Parse each column as timedeltas and express them as float seconds.
    df = df.apply(lambda column: pd.to_timedelta(column).dt.total_seconds())
    return df, method_list, n_words_list

def plot_extractors_timings(df, output_path='../figures/enron_extractor_runtime_vs_n_words.png') -> None:
    """Plot extractor runtimes against the number of selected words and save the figure.

    Args:
        df: Frame whose index holds the word counts and whose columns hold one
            runtime series (seconds) per method; both axes are drawn in log scale.
        output_path: Destination image file. Its parent directory is created
            when it does not exist yet.

    Returns:
        None. The figure is written to ``output_path`` and then closed, so
        nothing is rendered inline.
    """
    axes = df.plot(logy=True, logx=True, figsize=(16, 10))
    axes.set_xlabel('# selected words')
    axes.set_ylabel('runtime (sec)')
    # The former `axes.set_label(...)` call only set the artist's legend label
    # on the Axes itself and had no effect on the saved figure, so it is gone.
    axes.set_title('Feature extractor runtimes vs number of the selected words.')

    figure = axes.get_figure()
    output_dir = os.path.dirname(output_path)
    if output_dir:
        # savefig does not create missing directories on its own.
        os.makedirs(output_dir, exist_ok=True)
    figure.savefig(output_path)
    # Close this specific figure rather than whatever pyplot considers current.
    plt.close(figure)

def get_selected_words_per_extractor_per_n_words(files: dict, vocabulary, n_words_list, method_list):
    """Build, per word count, a 0/1 membership table of vocabulary words by method.

    Args:
        files: Nested mapping ``files[method][n_words] -> info`` where
            ``info['selected_vocabulary']`` lists the words picked by the method.
        vocabulary: Sequence of all words; becomes the row index of every table.
        n_words_list: Iterable of word-count keys; one table is created per key.
        method_list: Iterable of method names; becomes the columns of every table.

    Returns:
        dict mapping each word-count key to an integer DataFrame
        (rows = vocabulary, columns = methods) holding 1 where the method
        selected the word for that word count and 0 elsewhere.

    Raises:
        KeyError: if ``files`` contains a word-count key that is not in
            ``n_words_list``, or a selected word that is not in ``vocabulary``.
    """
    methods = list(method_list)
    # Allocate integer zeros directly instead of creating an empty object-dtype
    # frame and relying on `.fillna(0)` to downcast it; that implicit downcast
    # is deprecated in recent pandas releases.
    df_dict = {
        n_words: pd.DataFrame(0, index=vocabulary, columns=methods)
        for n_words in n_words_list
    }

    for method, words_dict in files.items():
        for n_words, info in words_dict.items():
            df_dict[n_words].loc[info['selected_vocabulary'], method] = 1
    return df_dict

def get_cross_jaccard_score(df):
    """Return the pairwise Jaccard score between all columns of ``df``.

    The result is a square frame whose rows and columns are both labelled with
    the column names of ``df``; cell (a, b) is ``jaccard_score(df[a], df[b])``.
    """
    names = df.columns.tolist()
    scores = pd.DataFrame(index=names, columns=names)
    for row_name in names:
        left = df[row_name]
        for col_name in names:
            scores.loc[row_name, col_name] = jaccard_score(left, df[col_name])
    return scores

def get_similarity_metrics(df_dict):
    """Compute column-wise correlation and Jaccard tables for every word count.

    Args:
        df_dict: Mapping word count -> frame with one column per method.

    Returns:
        ``(correlations_dict, jaccard_score_dict)``, both keyed like ``df_dict``.
    """
    correlations_dict, jaccard_score_dict = {}, {}
    for n_words in df_dict:
        selection = df_dict[n_words]
        # remove lfs (or other methods) when they have no values
        non_empty = (selection.sum(axis=0) != 0).values
        df_filtered = selection.loc[:, non_empty]
        correlations_dict[n_words] = df_filtered.corr()
        jaccard_score_dict[n_words] = get_cross_jaccard_score(df_filtered)

    return correlations_dict, jaccard_score_dict

def compare_shap_over_n_words_set_similarity(df_dict: dict, n_words_list, method_list, reference_method='shap'):
    """Tabulate each method's similarity to a reference method across word counts.

    Args:
        df_dict: Mapping word count -> square similarity frame whose rows and
            columns are both labelled with method names.
        n_words_list: Iterable of word-count keys; becomes the row index.
        method_list: Iterable of method names; becomes the columns.
        reference_method: Method every other method is compared against.
            Defaults to ``'shap'``.

    Returns:
        DataFrame indexed by word count (cast to int, sorted ascending) with
        one column per non-reference method, holding that method's similarity
        to the reference. Cells stay NaN where a method is missing for a word
        count; the whole row stays NaN when the reference itself is missing.
    """
    # Sets are unordered; turn them into lists before using them as axes.
    df_comp = pd.DataFrame(columns=list(method_list), index=list(n_words_list))
    for n_words, df in df_dict.items():
        cols = df.columns
        if reference_method not in cols:
            # The reference can be absent for a word count (e.g. dropped
            # upstream for having no selected words); leave that row as NaN
            # instead of failing with a KeyError.
            continue
        df_comp.loc[n_words, cols] = df.loc[cols, reference_method]
    df_comp.index = df_comp.index.astype('int')
    df_comp = df_comp.sort_index()
    # The reference's similarity to itself carries no information.
    df_comp = df_comp.drop(columns=[reference_method], errors='ignore')
    return df_comp

def compare_performance_over_n_words_enron(files, n_words_list, method_list, label='1'):
    """Gather test-set precision/recall/F1 of one class per method and word count.

    Args:
        files: Nested mapping ``files[method][n_words] -> info`` where
            ``info['classification_report_test'][label]`` is a dict holding
            the keys ``'precision'``, ``'recall'`` and ``'f1-score'``.
        n_words_list: Iterable of word-count keys; becomes the row index.
        method_list: Iterable of method names; first level of the columns.
        label: Key of the class inside the classification report whose
            metrics are collected. Defaults to ``'1'``.

    Returns:
        Float DataFrame indexed by word count (cast to int, sorted ascending)
        with ``(method, metric)`` MultiIndex columns; NaN where no result
        exists for a combination.
    """
    metrics = ['precision', 'recall', 'f1-score']
    columns = pd.MultiIndex.from_tuples(
        [(method, metric) for method in method_list for metric in metrics]
    )
    # Sets are unordered; turn the word counts into a list before indexing by them.
    df_metrics = pd.DataFrame(columns=columns, index=list(n_words_list))

    for method, words_dict in files.items():
        for n_words, info in words_dict.items():
            report = info['classification_report_test'][label]
            for metric in metrics:
                df_metrics.loc[n_words, (method, metric)] = report[metric]

    df_metrics.index = df_metrics.index.astype('int')
    df_metrics = df_metrics.sort_index()
    # The frame starts out as object dtype; return real floats so numeric
    # operations and plotting work on the result.
    return df_metrics.astype(float)



In [13]:
# Build the runtime table (seconds) plus the sets of method names and
# word-count keys that the following cells reuse.
# NOTE(review): this rebinds `df`, which until here held the Enron e-mail frame.
df, method_list, n_words_list = get_extractor_timings(files)
# Writes the runtime figure to disk and closes it, so nothing is shown inline.
plot_extractors_timings(df)

In [14]:
# One 0/1 word-membership table (vocabulary x methods) per word count.
df_dict = get_selected_words_per_extractor_per_n_words(files, vocabulary, n_words_list, method_list)
# Per word count: correlation and Jaccard score between the methods' selections.
correlations_dict, jaccard_score_dict = get_similarity_metrics(df_dict)

In [15]:
# Similarity of every method to 'shap' across word counts, once per metric.
shap_correlations = compare_shap_over_n_words_set_similarity(correlations_dict, n_words_list, method_list)
shap_jaccard = compare_shap_over_n_words_set_similarity(jaccard_score_dict, n_words_list, method_list)

In [47]:
# Test-set precision / recall / F1 of class '1' for each method and word count.
df_metrics = compare_performance_over_n_words_enron(files, n_words_list, method_list)

In [52]:
# Display the metrics table (rows: number of selected words; columns: (method, metric)).
df_metrics

Unnamed: 0_level_0,term_strength,term_strength,term_strength,eccd,eccd,eccd,shap,shap,shap,trl,...,linear_measure_5,mutual_information,mutual_information,mutual_information,chi2,chi2,chi2,tfidf,tfidf,tfidf
Unnamed: 0_level_1,precision,recall,f1-score,precision,recall,f1-score,precision,recall,f1-score,precision,...,f1-score,precision,recall,f1-score,precision,recall,f1-score,precision,recall,f1-score
10,0.653981,0.759744,0.702906,0.744748,0.976858,0.845157,0.840322,0.921833,0.879192,0.649051,...,0.709567,0.755491,0.93042,0.83388,0.751019,0.937146,0.833822,0.702219,0.899003,0.788519
50,0.894914,0.951566,0.922371,0.915214,0.958,0.936118,0.932077,0.977095,0.954056,0.718275,...,0.92125,0.912372,0.974195,0.942271,0.912753,0.96931,0.940182,0.883897,0.965277,0.922797
100,0.922246,0.976129,0.948423,0.937615,0.980177,0.958423,0.943517,0.984167,0.963413,0.72208,...,0.951467,0.944601,0.985178,0.964463,0.946201,0.983851,0.964659,0.917489,0.969316,0.942691
200,0.943976,0.987348,0.965175,0.947718,0.985267,0.966128,0.95511,0.989976,0.97223,0.720287,...,0.963017,0.955897,0.989028,0.97218,0.951466,0.990464,0.970574,0.937942,0.979992,0.958506
500,0.954633,0.990899,0.972428,0.957006,0.991457,0.973927,0.967176,0.992286,0.97957,0.731114,...,0.974574,0.963023,0.99327,0.977912,0.960878,0.992171,0.976273,0.947592,0.987951,0.967351
1000,0.963423,0.993588,0.978273,0.96326,0.992496,0.977659,0.963837,0.992947,0.978176,0.715983,...,0.97833,0.964529,0.993742,0.978918,0.964487,0.992871,0.978473,0.959142,0.988411,0.973557
3000,0.965487,0.993131,0.979114,0.962799,0.993919,0.978111,0.964364,0.993589,0.978758,0.780547,...,0.979067,0.96803,0.993471,0.980586,0.968674,0.992681,0.980531,0.962524,0.98727,0.97474
5000,0.966484,0.993112,0.979617,0.964064,0.994088,0.978846,0.960239,0.99482,0.977223,0.787528,...,0.975752,0.969464,0.993625,0.981396,0.966375,0.992098,0.979067,0.956845,0.994011,0.975074
10000,0.959505,0.99608,0.97745,0.96275,0.993893,0.978074,0.960032,0.994313,0.976872,0.800334,...,0.975393,0.961075,0.994684,0.977591,0.961871,0.995203,0.978253,0.955449,0.993772,0.974234
15000,0.954952,0.995338,0.974727,0.958507,0.994938,0.976382,0.959434,0.99329,0.976068,0.811655,...,0.975449,0.964656,0.995689,0.979927,0.958888,0.99611,0.977144,0.955148,0.994681,0.974513
