<font size="+3"><mark>Classification benchmark / Clean the results</mark></font>

# Introduction

## README

_Associated GitHub repository: <https://github.com/sylvaincom/astride>._

## Configuration parameters

In [1]:
# Short names of the methods whose result files are loaded, cleaned and exported.
METHOD_ABBREVIATES = [
    "sax",
    "saxtslearn",
    "1dsax",
    "astride",
    "fastride",
]
# Name of the experiment sub-folder under `results/` that is read from.
DATE_EXP = "2023_02_08"

# When True, the cleaned DataFrames are written to CSV in the last cell.
IS_EXPORT_DF = True

## Imports

In [2]:
import pandas as pd

import numpy as np
from sklearn.utils import Bunch
import ast

from pathlib import Path
import pprint

from src.utils import concatenate_df, create_path
from src.utils_clean_classification_results import (
    clean_acc_method_alldatasets,
)
from src.metadata import D_REPLACE_METHOD_NAMES;

In [3]:
# Notebook-wide setup: display width, a pretty-printer, and the base directory.
pd.options.display.max_columns = 50  # show up to 50 columns when rendering DataFrames
pp = pprint.PrettyPrinter()
cwd = Path.cwd()  # working directory of the kernel; result paths are built from it

# Load the classification results

In [4]:
# Maps each method abbreviation to its concatenated raw results (filled in the next cell).
d_df_acc_allmethods = {}

In [5]:
# Load the raw accuracy results of every method into `d_df_acc_allmethods`.
#
# The experiment folder does not depend on the method, so it is built once,
# before the loop. This also guarantees that `path` is defined even when
# METHOD_ABBREVIATES is empty.
path = cwd / "results" / DATE_EXP
for METHOD_ABBREVIATE in METHOD_ABBREVIATES:
    # `rglob` yields files in filesystem traversal order; sorting makes the
    # concatenation order (and hence the row order) the same on every run.
    l_csvfiles_acc_method_alldatasets = sorted(
        path.rglob(f"acc/df_acc_{METHOD_ABBREVIATE}_*.csv")
    )
    # Concatenate all per-dataset files of this method and drop exact duplicate rows.
    d_df_acc_allmethods[METHOD_ABBREVIATE] = concatenate_df(
        l_csvfiles_acc_method_alldatasets
    ).drop_duplicates()

In [6]:
# Peek at five randomly drawn rows of the raw "astride" results.
d_df_acc_allmethods["astride"].sample(n=5)

Unnamed: 0,dataset,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_segmentation__mean_or_slope,param_segmentation__n_segments,param_segmentation__pen_factor,param_segmentation__uniform_or_adaptive,param_segmentation__univariate_or_multivariate,param_segmentfeature__features_names,param_symbolicsignaldistance__distance,param_symbolicsignaldistance__n_samples,param_symbolicsignaldistance__weighted_bool,param_symbolization__features_scaling,param_symbolization__lookup_table_type,param_symbolization__n_regime_lengths,param_symbolization__n_symbols,param_symbolization__reconstruct_bool,param_symbolization__seglen_bins_method,param_symbolization__symb_cluster_method,param_symbolization__symb_method,param_symbolization__symb_quantif_method,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
1090,NonInvasiveFetalECGThorax2,935.847596,0.0,933.834386,0.0,mean,15,,adaptive,multivariate,['mean'],lev,,True,,mof,divide_exact,16,True,,,quantif,quantiles,"{'segmentation__mean_or_slope': 'mean', 'segme...",0.714504,0.714504,0.0,6
125,BirdChicken,0.219375,0.0,0.195752,0.0,mean,10,,adaptive,multivariate,['mean'],lev,,True,,mof,divide_exact,9,True,,,quantif,quantiles,"{'segmentation__mean_or_slope': 'mean', 'segme...",0.6,0.6,0.0,15
367,ECGFiveDays,0.291909,0.0,8.599017,0.0,mean,10,,adaptive,multivariate,['mean'],lev,,True,,mof,divide_exact,25,True,,,quantif,quantiles,"{'segmentation__mean_or_slope': 'mean', 'segme...",0.714286,0.714286,0.0,13
1088,NonInvasiveFetalECGThorax2,823.953697,0.0,831.718551,0.0,mean,15,,adaptive,multivariate,['mean'],lev,,True,,mof,divide_exact,4,True,,,quantif,quantiles,"{'segmentation__mean_or_slope': 'mean', 'segme...",0.317048,0.317048,0.0,18
1195,PigArtPressure,2.252927,0.0,2.158841,0.0,mean,20,,adaptive,multivariate,['mean'],lev,,True,,mof,divide_exact,25,True,,,quantif,quantiles,"{'segmentation__mean_or_slope': 'mean', 'segme...",0.110577,0.110577,0.0,4


# Clean the classification results

In [7]:
# Run the project's cleaning helper on each method's raw results and keep the
# first element of the pair it returns (the second one is not used here).
# NOTE(review): the recorded output of this cell shows a pandas
# SettingWithCopyWarning pointing at `df_acc_method_alldatasets[col] = None`,
# which is not in this cell — presumably inside the helper; worth confirming.
d_df_acc_allmethods_alldatasets_clean = {}
for METHOD_ABBREVIATE in METHOD_ABBREVIATES:
    print(METHOD_ABBREVIATE)  # progress marker: one line per method
    df_acc_method_raw = d_df_acc_allmethods[METHOD_ABBREVIATE]
    (df_acc_method_alldatasets_clean, _) = clean_acc_method_alldatasets(
        df_acc_method_alldatasets=df_acc_method_raw,
        method_abbreviate=METHOD_ABBREVIATE,
    )
    d_df_acc_allmethods_alldatasets_clean[METHOD_ABBREVIATE] = (
        df_acc_method_alldatasets_clean
    )

sax
saxtslearn
1dsax
astride


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_acc_method_alldatasets[col] = None


fastride


In [8]:
# Peek at five randomly drawn rows of the cleaned "astride" results.
d_df_acc_allmethods_alldatasets_clean["astride"].sample(n=5)

Unnamed: 0,method,dataset,mean_or_slope,n_segments,pen_factor,uniform_or_adaptive,univariate_or_multivariate,features_names,distance,weighted_bool,features_scaling,lookup_table_type,n_regime_lengths,n_symbols,reconstruct_bool,seglen_bins_method,symb_cluster_method,symb_method,symb_quantif_method,mean_test_score,std_test_score,n_samples
903,ASTRIDE,InsectWingbeatSound,mean,5,,adaptive,multivariate,['mean'],lev,True,,mof,divide_exact,25,True,,,quantif,quantiles,0.460101,0.0,256
1120,ASTRIDE,OliveOil,mean,5,,adaptive,multivariate,['mean'],lev,True,,mof,divide_exact,4,True,,,quantif,quantiles,0.466667,0.0,570
1255,ASTRIDE,PowerCons,mean,20,,adaptive,multivariate,['mean'],lev,True,,mof,divide_exact,25,True,,,quantif,quantiles,0.911111,0.0,144
1294,ASTRIDE,Rock,mean,20,,adaptive,multivariate,['mean'],lev,True,,mof,divide_exact,16,True,,,quantif,quantiles,0.7,0.0,2844
1262,ASTRIDE,RefrigerationDevices,mean,5,,adaptive,multivariate,['mean'],lev,True,,mof,divide_exact,16,True,,,quantif,quantiles,0.426667,0.0,720


# Export the cleaned classification results

In [9]:
# Write one cleaned CSV per method into <cwd>/results/<DATE_EXP>/acc_clean.
if IS_EXPORT_DF:
    # Build the target folder from the configuration values directly instead of
    # reusing a variable left over from another cell's loop, so this cell does
    # not depend on hidden kernel state.
    folder = cwd / "results" / DATE_EXP / "acc_clean"
    # Project helper; presumably creates the folder when missing — confirm in src.utils.
    create_path(folder)
    for METHOD_ABBREVIATE in METHOD_ABBREVIATES:
        d_df_acc_allmethods_alldatasets_clean[METHOD_ABBREVIATE].to_csv(
            folder / f"df_acc_{METHOD_ABBREVIATE}_alldatasets_clean.csv",
            index=False,  # the row index is not written to the file
        )