# Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import random
import time
from tqdm.notebook import tqdm_notebook

# Register tqdm's pandas integration; the `progress_apply` call used later in this
# notebook relies on it.
tqdm_notebook.pandas()

In [2]:
# Root directory of the project data. It defaults to the location used when the
# notebook was written; set the MSAD_E_ROOT environment variable to run the notebook
# on another machine without editing this cell.
project_root = os.environ.get('MSAD_E_ROOT', '/home/sylli/Documents/MSAD-E')

# CSV with the inference time of every detector
exec_time_path = os.path.join(project_root, 'data', 'execution_time', 'detectors_inference_time.csv')
# Directory that is scanned for result CSV files
csv_dir = os.path.join(project_root, 'reports', 'results_06_2024')
# "figures" sub-directory of the results directory
save_fig_path = os.path.join(csv_dir, "figures")

In [3]:
# Experiment vocabulary: the values below are the ones accepted when result file
# names are parsed further down.

# Detector identifiers
detector_names = [
    'AE', 'CNN', 'HBOS', 'IFOREST', 'IFOREST1', 'LOF',
    'LSTM', 'MP', 'NORMA', 'OCSVM', 'PCA', 'POLY',
]

# One "weight_<detector>" column name per detector
weight_names = ["weight_" + name for name in detector_names]

# Model selector identifiers
model_selectors = ["convnet128", "resnet1024", "sit512", "knn1024"]

# Ways of combining detectors
combine_methods = ['average', 'vote']

# Dataset identifiers
datasets = [
    'MGAB', 'GHL', 'MITDB', 'Genesis', 'OPPORTUNITY',
    'SensorScope', 'KDD21', 'Occupancy', 'SVDB', 'IOPS',
    'SMD', 'Daphnet', 'ECG', 'NAB', 'YAHOO',
]

# Evaluation metric column names
metrics = ['AUC-ROC', 'AUC-PR', 'VUS-ROC', 'VUS-PR']

# Candidate values of k: the integers 1 through 12
k_values = np.arange(1, 13)

In [4]:
def load_results(csv_file):
    """Read a results CSV into a DataFrame.

    Parameters
    ----------
    csv_file : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame whose index is taken from the first column of the file.
    """
    results = pd.read_csv(csv_file, index_col=0)
    return results

# Load data

In [5]:
# Read files
# Sort the listing: os.listdir returns entries in arbitrary order, and the order of
# these names decides the order in which the result files are loaded later on.
result_files = sorted(file for file in os.listdir(csv_dir) if file.endswith('.csv'))

# Print up to three distinct example names as a sanity check. np.random.choice raises
# on an empty list, so the preview is skipped when no CSV file was found.
if result_files:
    print(np.random.choice(result_files, min(3, len(result_files)), replace=False))
else:
    print(f"No .csv files found in {csv_dir}")

# File names containing "testsize" are put in the unsupervised group, all others in
# the supervised group.
supervised_files = [file for file in result_files if "testsize" not in file]
unsupervised_files = [file for file in result_files if "testsize" in file]
print((f"Total number of files {len(result_files)}, of which {len(supervised_files)} "
        f"supervised and {len(unsupervised_files)} ({len(supervised_files)} + "
        f"{len(unsupervised_files)} = {len(supervised_files) + len(unsupervised_files)})"))

FileNotFoundError: [Errno 2] No such file or directory: '/home/sylli/Documents/MSAD-E/reports/results_06_2024'

In [None]:
# Load data
# Each supervised result file is expected to be named
# "<dataset>_<model selector>_<combine method>_<X><k>.csv", where <X> is a single
# leading character that is dropped before <k> is parsed as an integer.
bad_name_msg = "STOP! THERE IS A MISTAKE WITH GROUPING THE FILES. FILE NAME HAS PROBABLY CHANGED. TAKE CARE"

all_results = []
curr_experiment_env = set()

for file in supervised_files:
    parts = file.split('_')

    if len(parts) != 4:
        # NOTE(review): a wrong number of name parts only skips the file, whereas an
        # unknown value below aborts the whole loop. Kept as in the original; confirm
        # which of the two behaviours is intended.
        print(f"{bad_name_msg} (file: {file})")
        continue

    dataset = parts[0]
    model_selector = parts[1]
    combine_method = parts[-2]
    try:
        k = int(parts[-1].split('.')[0][1:])
    except ValueError:
        # Non-numeric suffix: report it as a naming problem below instead of
        # crashing with a bare int() error.
        k = None

    if (dataset not in datasets) or \
       (model_selector not in model_selectors) or \
       (combine_method not in combine_methods) or \
       (k is None) or (k not in k_values):
        print(f"{bad_name_msg} (file: {file})")
        break

    # The first, unnamed CSV column holds the time series identifier.
    curr_df = pd.read_csv(os.path.join(csv_dir, file))
    curr_df = curr_df.rename(columns={'Unnamed: 0': 'Time series'})
    curr_df['Model Selector'] = model_selector
    curr_df['k'] = k
    curr_df['Combine Method'] = combine_method
    curr_df['Dataset'] = dataset
    all_results.append(curr_df)

    # Remember every dataset / model selector / combine method / k that was
    # actually encountered (all kept together in one set).
    curr_experiment_env.update((dataset, model_selector, combine_method, k))

if not all_results:
    # pd.concat would fail here anyway; raise the same exception type with a
    # message that points at the actual cause.
    raise ValueError(f"No valid supervised result files could be loaded from {csv_dir}")

# Every file is read with its own 0-based row labels, so a plain concat would leave
# duplicate index labels, which index-aligned operations and plotting code can trip
# over. ignore_index gives the combined frame a fresh unique index.
df = pd.concat(all_results, ignore_index=True)
df

In [None]:
# Update available parameters depending on read data
# curr_experiment_env holds, in a single set, every dataset, model selector, combine
# method and k that was seen while loading. Each candidate list is narrowed to the
# values present in that set, keeping its original order.
datasets, model_selectors, combine_methods, k_values = (
    [value for value in candidates if value in curr_experiment_env]
    for candidates in (datasets, model_selectors, combine_methods, k_values)
)

In [None]:
# Read execution time per detector, per time series
time_df = pd.read_csv(exec_time_path)

# Build a "<dataset>/<filename>" key for every row with vectorised string
# concatenation instead of a row-wise apply (faster, and well-defined on an empty
# table). The key is then used as the index under the name 'Time series'.
time_df['filename'] = time_df['dataset'].astype(str) + "/" + time_df['filename'].astype(str)
time_df = (
    time_df
    .rename(columns={'filename': 'Time series', 'dataset': 'Dataset'})
    .set_index('Time series')
)

# Keep time series that are in the final validation set
# (i.e. rows whose 'Time series' value has an entry in the execution-time table).
df = df[df['Time series'].isin(time_df.index)].copy()

In [None]:
def compute_execution_time(elem, time_df, detector_names, weight_names):
    """Sum the execution times of the detectors that have a non-zero weight in a row.

    Parameters
    ----------
    elem : row (Series-like) containing a 'Time series' entry and the weight columns.
    time_df : DataFrame indexed by time series, with one column per detector.
    detector_names : not used by this function; kept so existing callers keep working.
    weight_names : names of the weight columns to inspect in ``elem``.

    Returns
    -------
    The sum of ``time_df`` values at the row's time series over the detectors whose
    weight differs from zero.
    """
    weights = elem[weight_names]
    active_columns = weights.loc[weights != 0].index

    # A weight column "weight_<detector>" maps to the detector column "<detector>".
    detectors_used = [column.replace('weight_', '') for column in active_columns]

    series_key = elem['Time series']
    return np.sum(time_df.loc[series_key, detectors_used])

In [None]:
# Add an 'Execution Time' column: for every row, the summed execution time of the
# detectors that row uses. progress_apply behaves like apply but reports progress.
def _row_execution_time(row):
    return compute_execution_time(row, time_df, detector_names, weight_names)

df['Execution Time'] = df.progress_apply(_row_execution_time, axis=1)

In [None]:
# k_real: per row, how many of the columns whose name contains "weight_" hold a
# non-zero value.
df["k_real"] = (df.filter(like="weight_") != 0).sum(axis=1)

# Analysis

In [None]:
# k Distribution
# One row of panels per model selector: histogram of k (left) and of k_real (right),
# split by combine method.
# The row count follows model_selectors instead of a fixed 4, and squeeze=False keeps
# `axes` two-dimensional even when there is a single row.
n_rows = max(len(model_selectors), 1)
fig, axes = plt.subplots(n_rows, 2, figsize=(10, 4 * n_rows), sharey=True, squeeze=False)

for i, model_selector in enumerate(model_selectors):
    # Restrict to the current model selector; plotting the full frame would draw
    # the same histogram in every row.
    selector_df = df[df['Model Selector'] == model_selector]

    sns.histplot(ax=axes[i, 0], data=selector_df, x='k', hue='Combine Method')
    sns.histplot(ax=axes[i, 1], data=selector_df, x='k_real', hue='Combine Method')

    axes[i, 0].set_title(model_selector)
    axes[i, 1].set_title(model_selector)

for ax in axes.flatten():
    ax.grid()

plt.tight_layout()
plt.show()

In [None]:
# Execution Time
# One row of panels per model selector: box plots of execution time against k (left)
# and against k_real (right), split by combine method, on a logarithmic y axis.
# The row count follows model_selectors instead of a fixed 4, and squeeze=False keeps
# `axes` two-dimensional even when there is a single row.
n_rows = max(len(model_selectors), 1)
fig, axes = plt.subplots(n_rows, 2, figsize=(10, 4 * n_rows), sharey=True, squeeze=False)

for i, model_selector in enumerate(model_selectors):
    # Rows belonging to the current model selector (computed once for both panels)
    selector_df = df[df['Model Selector'] == model_selector]

    sns.boxplot(ax=axes[i, 0], data=selector_df, x='k', y='Execution Time', hue='Combine Method')
    sns.boxplot(ax=axes[i, 1], data=selector_df, x='k_real', y='Execution Time', hue='Combine Method')

    axes[i, 0].set_title(model_selector)
    axes[i, 1].set_title(model_selector)

for ax in axes.flatten():
    ax.set_yscale('log')
    ax.grid()

plt.tight_layout()
plt.show()

In [None]:
# AUC-PR
# One row of panels per model selector: AUC-PR against k (left) and against k_real
# (right), one line per combine method.
# The row count follows model_selectors instead of a fixed 4, and squeeze=False keeps
# `axes` two-dimensional even when there is a single row.
n_rows = max(len(model_selectors), 1)
fig, axes = plt.subplots(n_rows, 2, figsize=(10, 4 * n_rows), sharey=True, squeeze=False)

for i, model_selector in enumerate(model_selectors):
    # Restrict to the current model selector; plotting the full frame would draw
    # the same curves in every row.
    selector_df = df[df['Model Selector'] == model_selector]

    sns.lineplot(ax=axes[i, 0], data=selector_df, x='k', y='AUC-PR', hue='Combine Method')
    # The right-hand panel was titled but left empty; it is drawn here in the same
    # way as in the VUS-PR figure.
    sns.lineplot(ax=axes[i, 1], data=selector_df, x='k_real', y='AUC-PR', hue='Combine Method')

    axes[i, 0].set_title(model_selector)
    axes[i, 1].set_title(model_selector)

for ax in axes.flatten():
    ax.grid()

plt.tight_layout()
plt.show()

In [None]:
# VUS-PR
# One row of panels per model selector: VUS-PR against k (left) and against k_real
# (right), one line per combine method.
# The row count follows model_selectors instead of a fixed 4, and squeeze=False keeps
# `axes` two-dimensional even when there is a single row.
n_rows = max(len(model_selectors), 1)
fig, axes = plt.subplots(n_rows, 2, figsize=(10, 4 * n_rows), sharey=True, squeeze=False)

for i, model_selector in enumerate(model_selectors):
    # Restrict to the current model selector; plotting the full frame would draw
    # the same curves in every row.
    selector_df = df[df['Model Selector'] == model_selector]

    sns.lineplot(ax=axes[i, 0], data=selector_df, x='k', y='VUS-PR', hue='Combine Method')
    sns.lineplot(ax=axes[i, 1], data=selector_df, x='k_real', y='VUS-PR', hue='Combine Method')

    axes[i, 0].set_title(model_selector)
    axes[i, 1].set_title(model_selector)

for ax in axes.flatten():
    ax.grid()

plt.tight_layout()
plt.show()