In [1]:
import json 
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.metrics import jaccard_score
import sys 
from tqdm.notebook import tqdm
import sklearn.feature_extraction.text as ft 



In [2]:
# Put the parent directory (presumably the repository root -- confirm) first on
# the import path so that the project-local `src` package imported below resolves.
sys.path.insert(0, '../')
import src.preprocessing.text_preprocessing as tp

In [3]:
# Render figure text with matplotlib's built-in engine rather than LaTeX.
plt.rcParams.update({'text.usetex': False})

In [4]:
# Location of the result files to analyse: ../results/<FOLDER>/.
FOLDER = 'brown'
base_path = '../results/'
path = f'{base_path}{FOLDER}/'
# os.listdir returns entries in arbitrary order and also lists sub-directories
# and hidden entries (e.g. .ipynb_checkpoints). Keep only regular, non-hidden
# files, in sorted order, so every collected path can be opened as a file and
# the load order is reproducible.
full_paths = [
    f'{path}{file}'
    for file in sorted(os.listdir(path))
    if not file.startswith('.') and os.path.isfile(f'{path}{file}')
]


In [5]:
# Nested lookup of the parsed result files: files[method][n_words] -> JSON content.
# The file name is split on '_': the first token is dropped, the last token
# (up to its first '.') is the n_words key, and everything in between -- which
# may itself contain underscores -- is the method name.
files = {}
for result_path in full_paths:
    file_name = result_path.rsplit('/', 1)[-1]
    tokens = file_name.split('_')
    method = '_'.join(tokens[1:-1])
    n_words = tokens[-1].split('.')[0]
    with open(result_path, 'r') as handle:
        per_method = files.setdefault(method, {})
        per_method[n_words] = json.load(handle)
    

In [6]:
# Inspect the test classification report stored for method 'shap' under the n_words key '1000'.
files['shap']['1000']['classification_report_test']

{'0': {'precision': 0.620253164556962,
  'recall': 0.5632183908045977,
  'f1-score': 0.5903614457831324,
  'support': 87},
 '1': {'precision': 0.4030612244897959,
  'recall': 0.5563380281690141,
  'f1-score': 0.4674556213017752,
  'support': 142},
 '2': {'precision': 0.4461538461538462,
  'recall': 0.47540983606557374,
  'f1-score': 0.4603174603174603,
  'support': 61},
 '3': {'precision': 0.5151515151515151,
  'recall': 0.5666666666666667,
  'f1-score': 0.5396825396825397,
  'support': 90},
 '4': {'precision': 0.7123287671232876,
  'recall': 0.6753246753246753,
  'f1-score': 0.6933333333333332,
  'support': 77},
 '5': {'precision': 0.7052631578947368,
  'recall': 0.6568627450980392,
  'f1-score': 0.6802030456852792,
  'support': 102},
 '6': {'precision': 0.75,
  'recall': 0.13636363636363635,
  'f1-score': 0.23076923076923075,
  'support': 22},
 '7': {'precision': 0.6879432624113475,
  'recall': 0.6217948717948718,
  'f1-score': 0.6531986531986532,
  'support': 156},
 '8': {'precision

In [7]:
# Registers tqdm's progress_apply on pandas objects; note that the plain
# .apply below does not use it.
tqdm.pandas()

# Read the corpus, blank out missing values and treat every column as text.
df = (
    pd.read_csv('../data/brown_corpus/brown_corpus.csv', sep=';')
    .fillna('')
    .astype('str')
)
# Replace the labels by their integer category codes.
df['Label'] = df['Label'].astype('category').cat.codes
# Project-specific text normalisation (see src.preprocessing.text_preprocessing).
df['Text'] = df['Text'].apply(tp.normalize_text)

# Vocabulary of the normalised corpus as seen by a default CountVectorizer.
count_vectorizer = ft.CountVectorizer().fit(df['Text'])
vocabulary = count_vectorizer.get_feature_names_out()

In [8]:
def get_extractor_timings(files: dict) -> tuple:
    """Collect the feature-extractor fit time of every run into one table.

    Parameters
    ----------
    files : dict
        Nested mapping ``files[method][n_words] -> run info``. Every run info
        must provide ``info['timing']['extractor_fit']`` in a form accepted by
        ``pd.to_timedelta``.

    Returns
    -------
    tuple
        ``(df, method_list, n_words_list)``:

        * ``df`` -- fit times in seconds; rows are the ``n_words`` keys
          converted to ``int`` and sorted ascending, columns are the methods
          in the iteration order of ``files``. Missing runs are NaN.
        * ``method_list`` -- set of method names.
        * ``n_words_list`` -- set of the original (unconverted) ``n_words`` keys.
    """
    method_list = set(files)
    n_words_list = {n_words for words_dict in files.values() for n_words in words_dict}

    # Build the whole table at once instead of enlarging it cell by cell.
    raw_timings = {
        method: {
            n_words: info['timing']['extractor_fit']
            for n_words, info in words_dict.items()
        }
        for method, words_dict in files.items()
    }
    df = pd.DataFrame(raw_timings, columns=list(files), dtype=object)

    # The n_words keys are compared numerically, not lexicographically.
    df.index = df.index.astype('int')
    df = df.sort_index()
    # Parse every column as timedeltas and express it in seconds.
    df = df.apply(lambda column: pd.to_timedelta(column).dt.total_seconds())
    return df, method_list, n_words_list

def plot_extractors_timings(df, save_path='../figures/brown_extractor_runtime_vs_n_words.png') -> None:
    """Plot the runtimes in ``df`` on log-log axes and save the figure to disk.

    Parameters
    ----------
    df : pd.DataFrame
        Table to plot; one line is drawn per column against the index
        (as produced by ``get_extractor_timings``).
    save_path : str, optional
        Destination of the image. Defaults to the path that was previously
        hard-coded, so existing calls behave as before.

    Returns
    -------
    None
        The figure is written to ``save_path`` and then closed.
    """
    axes = df.plot(logy=True, logx=True, figsize=(16, 10))
    axes.set_xlabel('# selected words')
    axes.set_ylabel('runtime (sec)')
    axes.set_label('runtime (sec)')
    axes.set_title('Feature extractor runtimes vs number of the selected words.')
    # Work on the figure that owns these axes instead of pyplot's implicit
    # "current figure", so unrelated open figures cannot be saved or closed.
    figure = axes.get_figure()
    figure.savefig(save_path)
    plt.close(figure)

def get_selected_words_per_extractor_per_n_words(files: dict, vocabulary, n_words_list, method_list):
    """Build, per ``n_words``, a 0/1 table of which words each method selected.

    Parameters
    ----------
    files : dict
        Nested mapping ``files[method][n_words] -> run info``; every run info
        must provide the list ``info['selected_vocabulary']``.
    vocabulary : sequence
        Row labels of every table.
    n_words_list : iterable
        ``n_words`` keys for which a table is created. Every ``n_words`` key
        present in ``files`` must be contained, otherwise a ``KeyError`` is raised.
    method_list : iterable
        Column labels of every table.

    Returns
    -------
    dict
        ``{n_words: DataFrame}`` with integer entries: 1 where the method's
        run lists the word in ``selected_vocabulary``, 0 elsewhere.
    """
    methods = list(method_list)
    # Create integer tables directly. Starting from an all-NaN object frame
    # and calling fillna(0) relies on pandas silently downcasting object
    # columns, which is deprecated and leaves object dtype on newer versions.
    df_dict = {
        n_words: pd.DataFrame(0, index=vocabulary, columns=methods)
        for n_words in n_words_list
    }

    for method, words_dict in files.items():
        for n_words, info in words_dict.items():
            df_dict[n_words].loc[info['selected_vocabulary'], method] = 1
    return df_dict

def get_cross_jaccard_score(df):
    """Return the pairwise Jaccard scores between all columns of ``df``.

    The result is a square table indexed and labelled by the column names of
    ``df``; entry ``[a, b]`` is ``jaccard_score(df[a], df[b])``. Every ordered
    pair is evaluated, including the diagonal.
    """
    labels = df.columns.tolist()
    scores = pd.DataFrame(index=labels, columns=labels)
    ordered_pairs = [(row, col) for row in labels for col in labels]
    for row, col in ordered_pairs:
        scores.loc[row, col] = jaccard_score(df[row], df[col])
    return scores

def get_similarity_metrics(df_dict):
    """Compute pairwise similarity tables for every entry of ``df_dict``.

    For each ``n_words`` key, columns whose sum is zero are dropped first
    (methods without any selected word); the remaining columns are compared
    with ``DataFrame.corr()`` and with ``get_cross_jaccard_score``.

    Returns a pair ``(correlations, jaccard_scores)`` of dicts keyed like
    ``df_dict``.
    """
    correlations, jaccard_scores = {}, {}
    for n_words, selection in df_dict.items():
        has_selection = (selection.sum(axis=0) != 0).values
        active = selection.loc[:, has_selection]
        correlations[n_words] = active.corr()
        jaccard_scores[n_words] = get_cross_jaccard_score(active)
    return correlations, jaccard_scores

def compare_shap_over_n_words_set_similarity(df_dict: dict, n_words_list, method_list):
    """Collect every method's similarity to ``'shap'`` for each ``n_words``.

    Parameters
    ----------
    df_dict : dict
        ``{n_words: square similarity table}``; each table must contain a
        ``'shap'`` row and column.
    n_words_list : iterable
        ``n_words`` keys used as rows of the result (may be a set).
    method_list : iterable
        Method names used as columns of the result (may be a set); must
        contain ``'shap'``.

    Returns
    -------
    pd.DataFrame
        Rows are the ``n_words`` keys converted to ``int`` and sorted
        ascending; columns are the methods except ``'shap'``. Methods missing
        from a table stay NaN for that row.
    """
    # Recent pandas versions refuse sets as index/columns ("index cannot be a
    # set"), and the callers pass sets -- hand the constructor lists.
    df_comp = pd.DataFrame(columns=list(method_list), index=list(n_words_list))
    for n_words, df in df_dict.items():
        cols = df.columns
        df_comp.loc[n_words, cols] = df['shap'][cols]
    df_comp.index = df_comp.index.astype('int')
    df_comp = df_comp.sort_index()
    # The similarity of 'shap' with itself carries no information.
    df_comp = df_comp.drop(columns=['shap'])
    return df_comp

def compare_performance_over_n_words_enron(files, n_words_list, method_list, baseline=None):
    """Tabulate weighted-average test metrics per method and ``n_words``.

    Parameters
    ----------
    files : dict
        Nested mapping ``files[method][n_words] -> run info``; every run info
        must provide ``info['classification_report_test']['weighted avg']``
        with the keys ``'precision'``, ``'recall'`` and ``'f1-score'``.
    n_words_list : iterable
        ``n_words`` keys used as rows of the result (may be a set).
    method_list : iterable
        Method names; each contributes one column per metric. Must not be
        empty (a MultiIndex cannot be built from no columns).
    baseline : dict, optional
        Classification report providing ``baseline['weighted avg'][metric]``.
        When given, its values are repeated on every row of ``n_words_list``
        under the ``'baseline'`` columns.

    Returns
    -------
    pd.DataFrame
        Rows are the ``n_words`` keys converted to ``int`` and sorted
        ascending; columns are a ``(method, metric)`` MultiIndex with the
        baseline columns, if any, last.
    """
    metrics = ['precision', 'recall', 'f1-score']
    # Recent pandas versions refuse a set as index, and the callers pass sets.
    n_words_index = list(n_words_list)

    cols = [(method, metric) for method in method_list for metric in metrics]
    if baseline is not None:
        # Declare the baseline columns up front rather than relying on
        # enlargement of a MultiIndex column axis during assignment.
        cols += [('baseline', metric) for metric in metrics if ('baseline', metric) not in cols]
    df_metrics = pd.DataFrame(columns=pd.MultiIndex.from_tuples(cols), index=n_words_index)

    for method, words_dict in files.items():
        for n_words, info in words_dict.items():
            weighted_avg = info['classification_report_test']['weighted avg']
            for metric in metrics:
                df_metrics.loc[n_words, (method, metric)] = weighted_avg[metric]
    if baseline is not None:
        for n_words in n_words_index:
            for metric in metrics:
                df_metrics.loc[n_words, ('baseline', metric)] = baseline['weighted avg'][metric]

    df_metrics.index = df_metrics.index.astype('int')
    df_metrics = df_metrics.sort_index()
    return df_metrics



In [9]:
# Tabulate the extractor fit times per method and n_words, export the table
# and save the runtime plot.
# NOTE: this rebinds `df`, which held the corpus in the cell that built the vocabulary.
df, method_list, n_words_list = get_extractor_timings(files)
df.to_csv('../results/tables/brown_n_word_timings.csv', sep=';')
plot_extractors_timings(df)

In [10]:
# Build one 0/1 word-selection table per n_words, then compare the methods'
# selections pairwise via correlation and Jaccard score.
df_dict = get_selected_words_per_extractor_per_n_words(files, vocabulary, n_words_list, method_list)
correlations_dict, jaccard_score_dict = get_similarity_metrics(df_dict)

In [11]:
# Keep only each method's similarity to 'shap' (one row per n_words) and
# export both tables.
shap_correlations = compare_shap_over_n_words_set_similarity(correlations_dict, n_words_list, method_list)
shap_jaccard = compare_shap_over_n_words_set_similarity(jaccard_score_dict, n_words_list, method_list)
shap_correlations.to_csv('../results/tables/brown_correlations.csv', sep=';')
shap_jaccard.to_csv('../results/tables/brown_jaccard.csv', sep=';')

In [12]:
# Load the baseline results; their test classification report is used as the
# reference in the performance table.
with open('../results/brown_baseline.json', 'r') as file:
    baseline_brown = json.load(file)

In [21]:
# Weighted-average precision/recall/F1 of the test report per method and
# n_words, with the baseline values repeated on every row; exported as CSV.
df_metrics = compare_performance_over_n_words_enron(files, n_words_list, method_list, baseline=baseline_brown['classification_report_test']) 
df_metrics.to_csv('../results/tables/brown_performance.csv', sep=';')

In [22]:
# Display the performance table (rows: n_words, columns: (method, metric)).
df_metrics

Unnamed: 0_level_0,chi2,chi2,chi2,eccd,eccd,eccd,term_strength,term_strength,term_strength,mutual_information,...,trl,tfidf,tfidf,tfidf,shap,shap,shap,baseline,baseline,baseline
Unnamed: 0_level_1,precision,recall,f1-score,precision,recall,f1-score,precision,recall,f1-score,precision,...,f1-score,precision,recall,f1-score,precision,recall,f1-score,precision,recall,f1-score
10,0.093401,0.103188,0.081946,0.08652,0.138423,0.052191,0.15577,0.122483,0.116472,0.240846,...,0.051524,0.235641,0.145134,0.107318,0.22341,0.213087,0.205448,0.718198,0.692953,0.68397
50,0.327815,0.312081,0.314942,0.286137,0.194631,0.167472,0.335594,0.29698,0.308667,0.335952,...,0.113259,0.352469,0.312919,0.313933,0.333439,0.311242,0.317682,0.718198,0.692953,0.68397
100,0.399376,0.386745,0.387908,0.379083,0.260067,0.259112,0.360713,0.328859,0.338548,0.349741,...,0.213319,0.351809,0.348993,0.34758,0.378142,0.364094,0.367382,0.718198,0.692953,0.68397
200,0.421726,0.408557,0.412039,0.413566,0.410235,0.406668,0.413927,0.395973,0.402467,0.399631,...,0.367004,0.419435,0.407718,0.410236,0.429727,0.417785,0.422438,0.718198,0.692953,0.68397
500,0.516304,0.500839,0.503248,0.517232,0.510906,0.510158,0.465611,0.458054,0.459769,0.49909,...,0.39464,0.504958,0.487416,0.490759,0.506378,0.493289,0.493926,0.718198,0.692953,0.68397
1000,0.599997,0.593121,0.592533,0.599241,0.592282,0.592301,0.527559,0.502517,0.508079,0.547545,...,0.373913,0.570023,0.548658,0.55032,0.57655,0.560403,0.557376,0.718198,0.692953,0.68397
3000,0.649813,0.63255,0.634677,0.687217,0.678691,0.679113,0.628443,0.60906,0.608692,0.609246,...,0.528577,0.649043,0.637584,0.635232,0.667415,0.657718,0.653321,0.718198,0.692953,0.68397
5000,0.677005,0.665268,0.662692,0.695175,0.687081,0.684728,0.671095,0.656879,0.653155,0.689206,...,0.496224,0.66554,0.644295,0.639693,0.645298,0.638423,0.630951,0.718198,0.692953,0.68397
10000,0.684984,0.678691,0.668407,0.709728,0.697148,0.691344,0.688091,0.671141,0.664329,0.690701,...,0.546121,0.700859,0.687919,0.680537,0.669754,0.651846,0.644496,0.718198,0.692953,0.68397
15000,0.704048,0.685403,0.680983,0.709212,0.69547,0.691205,0.713322,0.689597,0.682676,0.724977,...,0.625048,0.702355,0.676174,0.670141,0.673435,0.644295,0.63926,0.718198,0.692953,0.68397
