In [154]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import csv
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [155]:
def preprocessing_base(csv_path):
    """
    Load a CSV dataset and prepare it for binary anomaly classification.

    Preprocessing steps:
      1. Drop every column whose value never changes across rows. The
         'label' column is always kept, even when every row carries the
         same label (otherwise the later label lookup would fail).
      2. Collapse every label other than 'normal' into one anomaly class.
      3. Encode the label as 0 (normal) / 1 (anomaly).

    Parameters
    ----------
    csv_path : path of the CSV file. It must contain a 'label' column and
        at least one data row.

    Returns
    -------
    X  : DataFrame with the remaining feature columns (no 'label').
    y  : Series with the encoded label (0 / 1).
    df : DataFrame with the remaining features plus the encoded 'label'.
    """
    df = pd.read_csv(csv_path)

    # A column is constant when no row differs from the first row.
    varies = (df != df.iloc[0]).any().to_numpy()
    is_label = (df.columns == 'label')
    # .copy() so the label assignment below never writes into a slice.
    df = df.loc[:, varies | is_label].copy()

    # Anything that is not exactly 'normal' counts as an anomaly.
    df['label'] = (df['label'] != 'normal').astype('int64')

    y = df['label']
    X = df.drop(columns='label')

    return X,y,df

In [156]:
def my_LogisticRegression_pipeline(preprocessor, X_train, y_train):
    """
    Build a `preprocessor -> LogisticRegression(max_iter=1000)` pipeline,
    fit it on the training data and return the fitted pipeline.
    """
    classifier = LogisticRegression(max_iter=1000)
    pipeline = make_pipeline(preprocessor, classifier)
    # Pipeline.fit returns the pipeline itself.
    return pipeline.fit(X_train, y_train)

In [157]:
def my_LDA_pipeline(preprocessor,X_train,y_train):
    """
    Build a `preprocessor -> LinearDiscriminantAnalysis()` pipeline, fit it
    on the training data and return the fitted pipeline.
    """
    classifier = LinearDiscriminantAnalysis()
    pipeline = make_pipeline(preprocessor, classifier)
    # Pipeline.fit returns the pipeline itself.
    return pipeline.fit(X_train, y_train)

In [158]:
def is_csv_empty(file_path):
    """
    Tell whether a CSV file has no usable first row.

    Returns True when the file has no rows at all or when its first row is
    empty, False otherwise. Used by the CSV writers to decide whether a
    header still has to be written.
    """
    with open(file_path, 'r') as handle:
        # next(..., None) yields None for a file without rows; both None and
        # an empty first row ([]) are falsy.
        leading_row = next(csv.reader(handle), None)
    return not leading_row

def split_array(array, size): 
    """
    Split a sequence into consecutive chunks of exactly `size` elements.

    A trailing chunk shorter than `size` is discarded, so every returned
    chunk is complete. (The previous version compared the chunk length with
    the literal 5, which returned nothing for any other `size`.)

    Parameters
    ----------
    array : sliceable sequence (list, numpy array, ...).
    size  : positive chunk length.

    Returns
    -------
    list of slices of `array`, each of length `size`.
    """
    chunks = [array[start:start + size] for start in range(0, len(array), size)]
    # Only the last chunk can be short; keep complete chunks only.
    return [chunk for chunk in chunks if len(chunk) == size]

In [159]:
def get_mcc (csv_name,model, X_test, y_test,csv_note=None):
    """
    Compute the Matthews correlation coefficient of `model` on the test set,
    rescaled from [-1, 1] to [0, 1], and optionally log it to a CSV file.

    When `csv_note` is given, a row (csv_note, MCC) is appended to
    `csv_name`; the header is written first if the file is still empty.
    When `csv_note` is None nothing is written and `csv_name` is ignored.

    Returns the rescaled MCC.
    """
    raw_mcc = matthews_corrcoef(y_test, model.predict(X_test))
    MCC = (raw_mcc + 1) / 2  # MCC rescaled to [0,1]

    if csv_note is None:
        return MCC

    with open(csv_name, 'a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['csv_note', 'MCC'])
        if is_csv_empty(csv_name):
            writer.writeheader()
        writer.writerow({'csv_note': csv_note, 'MCC': MCC})

    return MCC

In [160]:
def get_accuracy_cross_val (csv_name,model, X_train, y_train,csv_note=None):
    """
    Compute the mean cross-validation score of `model` on the training set
    (using `cross_val_score` with its default settings) and optionally log
    it to a CSV file.

    When `csv_note` is given, a row (csv_note, Accuracy) is appended to
    `csv_name`; the header is written first if the file is still empty.

    Returns the mean score over the folds.
    """
    fold_scores = cross_val_score(estimator=model, X=X_train, y=y_train)
    avg_accuracy = np.mean(fold_scores)

    if csv_note is None:
        return avg_accuracy

    with open(csv_name, 'a', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['csv_note', 'Accuracy'])
        if is_csv_empty(csv_name):
            writer.writeheader()
        writer.writerow({'csv_note': csv_note, 'Accuracy': avg_accuracy})

    return avg_accuracy

In [161]:
def my_RandomForest_pipeline(preprocessor, X_train, y_train):
    """
    Build a `preprocessor -> RandomForestClassifier(random_state=42)`
    pipeline, fit it on the training data and return the fitted pipeline.
    """
    classifier = RandomForestClassifier(random_state=42)
    pipeline = make_pipeline(preprocessor, classifier)
    # Pipeline.fit returns the pipeline itself.
    return pipeline.fit(X_train, y_train)

In [162]:
def my_XGB_pipeline(preprocessor, X_train, y_train):
    """
    Build a `preprocessor -> XGBClassifier(random_state=42)` pipeline, fit
    it on the training data and return the fitted pipeline.
    """
    classifier = xgb.XGBClassifier(random_state=42)
    pipeline = make_pipeline(preprocessor, classifier)
    # Pipeline.fit returns the pipeline itself.
    return pipeline.fit(X_train, y_train)

In [163]:
def window_input_difference(window_length: int, data: pd.DataFrame) -> pd.DataFrame:
    """
    Turn the dataset into a time-series view based on lagged differences.

    For every lag i in 1 .. window_length-1 and every column x of `data`, a
    column `x_i` holding `x - x.shift(i)` is appended. Rows containing any
    NaN are dropped, then the lagged copies of the label (`label_i`) are
    removed. `data` must contain a 'label' column; it is not modified.
    """
    windowed = data.copy()
    lags = range(1, window_length)

    for lag in lags:
        for column in data.columns:
            windowed[f'{column}_{lag}'] = data[f'{column}'] - data[f'{column}'].shift(lag)

    windowed = windowed.dropna(axis=0)

    # The label differences are not features: drop them one lag at a time.
    for lag in lags:
        windowed = windowed.drop(columns=f'label_{lag}')

    return windowed

In [164]:
def window_input_moving_average(window_length: int, data: pd.DataFrame) -> pd.DataFrame:
    """
    Turn the dataset into a time-series view based on a moving average.

    For every column x of `data`, a column `x_ma` holding the rolling mean
    of x over `window_length` rows is appended. Rows containing any NaN are
    dropped, then the moving average of the label (`label_ma`) is removed.
    `data` must contain a 'label' column; it is not modified.
    """
    smoothed = data.copy()

    for column in data.columns:
        rolling_mean = smoothed[f'{column}'].rolling(window_length).mean()
        smoothed[f'{column}_ma'] = rolling_mean

    # The smoothed label is not a feature.
    return smoothed.dropna(axis=0).drop(columns=f'label_ma')

In [165]:
def speed_detection_score(csv_name,model, X_test: pd.DataFrame, y_test: pd.DataFrame,csv_note=None):
    """
    Compute the detection-speed score of `model` on the anomalous test rows.

    The anomalous rows (label == 1), taken in index order, are cut into
    consecutive windows of 5 predictions; an incomplete trailing window is
    ignored. For each window the position (0..4) of the first prediction
    equal to 1 is counted. The score is the weighted mean of those counts,
    with weights decreasing quadratically: 1, 0.8^2, 0.6^2, 0.4^2, 0.2^2.

    Differences from the previous version:
      * the caller's `X_test` is no longer modified (a 'label' column used
        to be added to it in place);
      * when no window contains a detection the score is 0.0 instead of a
        ZeroDivisionError;
      * `model.predict` is skipped when there are no anomalous rows.

    Parameters
    ----------
    csv_name : CSV file the per-position counts are appended to (only used
        when `csv_note` is given).
    model    : fitted estimator exposing `predict`.
    X_test   : test features.
    y_test   : test labels, aligned with `X_test` on the index.
    csv_note : row identifier for the CSV; None disables CSV logging.

    Returns
    -------
    float score; 1 means every detected window was detected immediately.
    """
    window = 5
    weights = (1, 0.8**2, 0.6**2, 0.4**2, 0.2**2)

    # assign() returns a new frame, so the caller's X_test stays untouched.
    labelled = X_test.assign(label=y_test).sort_index()
    anomalies = labelled[labelled['label'] == 1].drop(columns='label')

    predictions = model.predict(anomalies) if len(anomalies) else []

    counters = {f'count_{i}': 0 for i in range(window)}

    # Walk over complete windows only.
    for start in range(0, len(predictions) - window + 1, window):
        chunk = predictions[start:start + window]
        for position in range(window):
            if chunk[position] == 1:
                # Only the first detection inside the window counts.
                counters[f'count_{position}'] += 1
                break

    numerator = sum(counters[f'count_{i}'] * weight for i, weight in enumerate(weights))
    denominator = sum(counters.values())
    score = numerator / denominator if denominator else 0.0

    if csv_note is not None:
        with open(csv_name, 'a', newline='') as file:
            fieldnames = ['csv_note','speed_0', 'speed_1','speed_2','speed_3','speed_4']
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            if is_csv_empty(csv_name):
                writer.writeheader()

            writer.writerow({'csv_note': csv_note,'speed_0': counters['count_0'],
                             'speed_1':counters['count_1'],'speed_2': counters['count_2'],
                             'speed_3':counters['count_3'],'speed_4':counters['count_4']})

    return score

In [166]:
def feature_importance_tree(note, model ,X_train):
    """
    Rank the features of a tree-based pipeline by `feature_importances_`
    and plot the 20 most important ones.

    `model` is a pipeline whose last step exposes `feature_importances_`;
    `X_train` supplies the feature names. The bar plot is titled with
    `note`, saved to a file named `note`, and shown.

    Returns a DataFrame (columns 'Feature', 'Importance') with the top 20.
    """
    importances = model.steps[-1][1].feature_importances_
    ranking = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': importances,
    })
    top_features = ranking.sort_values(by='Importance', ascending=False).head(20)

    plt.bar(x=top_features['Feature'], height=top_features['Importance'])
    plt.title(f'{note}')
    plt.xticks(rotation='vertical')
    plt.savefig(f'{note}',bbox_inches='tight')
    plt.show()

    return top_features
   

In [167]:
def feature_importance_coef(note, model ,X_train):
    """
    Rank the features of a linear pipeline by the absolute value of the
    first row of `coef_` and plot the 20 most important ones.

    `model` is a pipeline whose last step exposes `coef_`; `X_train`
    supplies the feature names. The bar plot is titled with `note`, saved
    to a file named `note`, and shown.

    Returns a DataFrame (columns 'Feature', 'Importance') with the top 20.
    """
    coefficients = model.steps[-1][1].coef_[0]
    ranking = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': np.abs(coefficients),
    })
    top_features = ranking.sort_values(by='Importance', ascending=False).head(20)

    plt.bar(x=top_features['Feature'], height=top_features['Importance'])
    plt.title(f'{note}')
    plt.xticks(rotation='vertical')
    plt.savefig(f'{note}',bbox_inches='tight')
    plt.show()

    return top_features
   

In [168]:
def training_classic(shuffle,X,y,fig_details):
    """
    Train the four models on the raw (non-windowed) features.

    The data is split 70/30 (random_state=42, `shuffle` as given), the four
    pipelines are fitted with a StandardScaler in front, and a
    feature-importance figure is produced for each of them
    (`fig_details` is appended to every figure name).

    Returns (models, X_train, y_train, X_test, y_test), where `models` maps
    'XGB', 'LogisticRegression', 'RandomForest' and 'LDA' to the fitted
    pipelines.
    """
    scaler = StandardScaler()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, shuffle=shuffle
    )

    # The dict literal is evaluated top to bottom, so the models are fitted
    # in this order. NOTE(review): the same scaler instance is shared by all
    # four pipelines.
    classic_models = {
        'XGB': my_XGB_pipeline(scaler, X_train, y_train),
        'LogisticRegression': my_LogisticRegression_pipeline(scaler, X_train, y_train),
        'RandomForest': my_RandomForest_pipeline(scaler, X_train, y_train),
        'LDA': my_LDA_pipeline(scaler, X_train, y_train),
    }

    figures = (
        (feature_importance_tree, 'Feature Importance XGB Classic_', 'XGB'),
        (feature_importance_coef, 'Feature Importance Logistic Regression Classic_', 'LogisticRegression'),
        (feature_importance_tree, 'Feature Importance Random Forest Classic_', 'RandomForest'),
        (feature_importance_coef, 'Feature Importance LDA Classic_', 'LDA'),
    )
    for plotter, prefix, model_key in figures:
        plotter(prefix + fig_details, classic_models[model_key], X_train)

    return classic_models,X_train,y_train,X_test,y_test

In [169]:
def map_creation_classic(models,X_train,y_train,X_test,y_test,csv_details):
    """
    Build the three result dictionaries for the classic models.

    For each model the rescaled MCC (on the test set), the mean
    cross-validation score (on the training set) and the detection-speed
    score (on the test set) are computed and logged to
    'mcc_<csv_details>.csv', 'accuracy_<csv_details>.csv' and
    'speed_<csv_details>.csv' respectively.

    Returns (mcc_map, accuracy_map, speed_map), each keyed by
    '<model> Classic'.
    """
    # (result key / CSV note, key in `models`), in evaluation order.
    entries = (
        ('Logistic Regression Classic', 'LogisticRegression'),
        ('XGB Classic', 'XGB'),
        ('Random Forest Classic', 'RandomForest'),
        ('LDA Classic', 'LDA'),
    )
    mcc_file = 'mcc_'+csv_details+'.csv'
    accuracy_file = 'accuracy_'+csv_details+'.csv'
    speed_file = 'speed_'+csv_details+'.csv'

    mcc_map = {
        label: get_mcc(mcc_file, models[model_key], X_test, y_test, label)
        for label, model_key in entries
    }
    accuracy_map = {
        label: get_accuracy_cross_val(accuracy_file, models[model_key], X_train, y_train, label)
        for label, model_key in entries
    }
    speed_map = {
        label: speed_detection_score(speed_file, models[model_key], X_test, y_test, label)
        for label, model_key in entries
    }
    return mcc_map,accuracy_map,speed_map

In [170]:
def training_ts_diff(shuffle,X,y,df,size,fig_details):    
    """
    Train the four models on the lagged-difference time-series features.

    `df` is transformed with `window_input_difference(size, df)`; the
    incoming `X` and `y` are ignored and recomputed from the transformed
    frame. The data is split 70/30 (random_state=42, `shuffle` as given),
    the four pipelines are fitted with a StandardScaler in front, and a
    feature-importance figure is produced for each of them.

    Returns (models, X_train, y_train, X_test, y_test), where `models` maps
    'XGB', 'LogisticRegression', 'RandomForest' and 'LDA' to the fitted
    pipelines.
    """
    windowed = window_input_difference(size, df)
    y = windowed['label']
    X = windowed.drop(columns='label')

    scaler = StandardScaler()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, shuffle=shuffle
    )

    # Fit in the original order: Logistic, Random Forest, XGB, LDA.
    trainers = (
        ('LogisticRegression', my_LogisticRegression_pipeline),
        ('RandomForest', my_RandomForest_pipeline),
        ('XGB', my_XGB_pipeline),
        ('LDA', my_LDA_pipeline),
    )
    fitted = {}
    for model_key, trainer in trainers:
        fitted[model_key] = trainer(scaler, X_train, y_train)

    ts_diff_models = {
        model_key: fitted[model_key]
        for model_key in ('XGB', 'LogisticRegression', 'RandomForest', 'LDA')
    }

    figures = (
        (feature_importance_tree, 'Feature Importance XGB TS Diff_', 'XGB'),
        (feature_importance_coef, 'Feature Importance Logistic Regression TS Diff_', 'LogisticRegression'),
        (feature_importance_tree, 'Feature Importance Random Forest TS Diff_', 'RandomForest'),
        (feature_importance_coef, 'Feature Importance LDA TS Diff_', 'LDA'),
    )
    for plotter, prefix, model_key in figures:
        plotter(prefix + fig_details, ts_diff_models[model_key], X_train)

    return ts_diff_models,X_train,y_train,X_test,y_test




In [171]:
def map_update_TS(models,TS_type,X_train,y_train,X_test,y_test,mcc_map,accuracy_map,speed_map,csv_details):
    """
    Add the results of the time-series models to the three result maps.

    For each model the rescaled MCC (test set), the mean cross-validation
    score (training set) and the detection-speed score (test set) are
    computed, logged to 'mcc_/accuracy_/speed_<csv_details>.csv' and stored
    in the given dictionaries, which are updated in place.

    Parameters
    ----------
    models   : dict with keys 'LogisticRegression', 'XGB', 'RandomForest',
        'LDA' holding the fitted pipelines.
    TS_type  : 'Diff' or 'MA'; selects the '<model> TS <TS_type>' keys.

    Returns
    -------
    (mcc_map, accuracy_map, speed_map) — the same objects that were passed.

    Raises
    ------
    ValueError if `TS_type` is neither 'Diff' nor 'MA'. Previously this
    silently stored the results under a None key, which later broke the
    progression plots.
    """
    labels_by_type = {
        'Diff': ['Logistic Regression TS Diff','XGB TS Diff','Random Forest TS Diff','LDA TS Diff'],
        'MA': ['Logistic Regression TS MA','XGB TS MA','Random Forest TS MA','LDA TS MA'],
    }
    if TS_type not in labels_by_type:
        raise ValueError(f"TS_type must be 'Diff' or 'MA', got {TS_type!r}")

    # (result key / CSV note, key in `models`), in evaluation order.
    entries = list(zip(labels_by_type[TS_type], ['LogisticRegression', 'XGB', 'RandomForest', 'LDA']))

    mcc_map.update({
        label: get_mcc('mcc_'+csv_details+'.csv', models[model_key], X_test, y_test, label)
        for label, model_key in entries
    })

    accuracy_map.update({
        label: get_accuracy_cross_val('accuracy_'+csv_details+'.csv', models[model_key], X_train, y_train, label)
        for label, model_key in entries
    })

    speed_map.update({
        label: speed_detection_score('speed_'+csv_details+'.csv', models[model_key], X_test, y_test, label)
        for label, model_key in entries
    })
    return mcc_map,accuracy_map,speed_map

In [172]:
def training_ts_ma(shuffle,X,y,df,size,fig_details):
    """
    Train the four models on the moving-average time-series features.

    `df` is transformed with `window_input_moving_average(size, df)`; the
    incoming `X` and `y` are ignored and recomputed from the transformed
    frame. The data is split 70/30 (random_state=42, `shuffle` as given),
    the four pipelines are fitted with a StandardScaler in front, and a
    feature-importance figure is produced for each of them.

    Returns (models, X_train, y_train, X_test, y_test), where `models` maps
    'XGB', 'LogisticRegression', 'RandomForest' and 'LDA' to the fitted
    pipelines.
    """
    windowed = window_input_moving_average(size, df)
    y = windowed['label']
    X = windowed.drop(columns='label')

    scaler = StandardScaler()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, shuffle=shuffle
    )

    # Fit in the original order: Logistic, Random Forest, XGB, LDA.
    trainers = (
        ('LogisticRegression', my_LogisticRegression_pipeline),
        ('RandomForest', my_RandomForest_pipeline),
        ('XGB', my_XGB_pipeline),
        ('LDA', my_LDA_pipeline),
    )
    fitted = {}
    for model_key, trainer in trainers:
        fitted[model_key] = trainer(scaler, X_train, y_train)

    ts_ma_models = {
        model_key: fitted[model_key]
        for model_key in ('XGB', 'LogisticRegression', 'RandomForest', 'LDA')
    }

    figures = (
        (feature_importance_tree, 'Feature Importance XGB TS MA_', 'XGB'),
        (feature_importance_coef, 'Feature Importance Logistic Regression TS MA_', 'LogisticRegression'),
        (feature_importance_tree, 'Feature Importance Random Forest TS MA_', 'RandomForest'),
        (feature_importance_coef, 'Feature Importance LDA TS MA_', 'LDA'),
    )
    for plotter, prefix, model_key in figures:
        plotter(prefix + fig_details, ts_ma_models[model_key], X_train)

    return ts_ma_models,X_train,y_train,X_test,y_test

In [173]:
def progression_plot(ax,map,note,mcc_map=None):
    """
    Draw one progression panel on the given axes.

    Without `mcc_map`, the values of `map` (speed scores or accuracies) are
    drawn as a line over the keys of `map`.

    With `mcc_map`, `map` is read as accuracies: the error rate
    (1 - accuracy) is drawn as orange bars on `ax`, and the values of
    `mcc_map` are drawn as a line on a twin y-axis. Both artists are
    labelled so the two legends actually have an entry (previously
    `legend()` was called with no labelled artist).

    Parameters
    ----------
    ax      : matplotlib Axes to draw on.
    map     : dict {model label: value}; its keys are the x positions.
    note    : descriptive name of the panel; currently not used for drawing.
    mcc_map : optional dict {model label: MCC}; its values are paired with
        the keys of `map` by position.
    """
    x_values = list(map.keys())
    map_values = list(map.values())  # either speed_map or accuracy_map values

    if mcc_map is None:
        ax.plot(x_values, map_values, marker='o', linestyle='-', label='Speed Score')
        return

    err_values = [1 - value for value in map_values]
    mcc_values = list(mcc_map.values())

    ax.bar(x_values, err_values, color='orange', label='Error Rate')
    ax.set_ylabel('Error Rate', color='orange')

    ax2 = ax.twinx()
    ax2.plot(x_values, mcc_values, marker='o', linestyle='-', label='MCC')
    ax2.set_ylabel('MCC', color='blue')

    # Opposite corners so the two legends do not overlap.
    ax.legend(loc='upper left')
    ax2.legend(loc='upper right')

In [174]:
def mcc_accuracy_progression(mcc_map,accuracy_map,details):
    """
    Plot, per model family, the MCC / error-rate progression and the
    accuracy progression across the training approaches.

    Both maps are split into four per-family dictionaries by looking for
    'Logistic', 'Random', 'XGB' and 'LDA' in the keys. Two 2x2 figures are
    produced and saved as 'MCC_Progression_<details>' and
    'Accuracy_Progression_<details>'.

    Returns the four per-family MCC maps in the order
    (logistic, random forest, xgb, lda).
    """
    def by_family(source, marker):
        # Keep the entries whose key mentions the family marker.
        return {key: value for key, value in source.items() if marker in key}

    logistic_mcc_map = by_family(mcc_map, 'Logistic')
    random_forest_mcc_map = by_family(mcc_map, 'Random')
    xgb_mcc_map = by_family(mcc_map, 'XGB')
    lda_mcc_map = by_family(mcc_map, 'LDA')

    logistic_accuracy_map = by_family(accuracy_map, 'Logistic')
    random_forest_accuracy_map = by_family(accuracy_map, 'Random')
    xgb_accuracy_map = by_family(accuracy_map, 'XGB')
    lda_accuracy_map = by_family(accuracy_map, 'LDA')

    # Figure 1: error-rate bars plus MCC line.
    fig, ax = plt.subplots(2, 2, figsize=(18,12))
    fig.suptitle('MCC Progression & Error Rate Progression '+details)
    fig.subplots_adjust(wspace=0.3)
    fig.subplots_adjust(hspace=0.3)
    mcc_panels = (
        (ax[0,0], logistic_accuracy_map, 'Logistic_Regression_MCC_Progression', logistic_mcc_map),
        (ax[0,1], xgb_accuracy_map, 'XGB_MCC_Progression', xgb_mcc_map),
        (ax[1,0], random_forest_accuracy_map, 'Random_Forest_MCC_Progression', random_forest_mcc_map),
        (ax[1,1], lda_accuracy_map, 'LDA_MCC_Progression', lda_mcc_map),
    )
    for axis, family_accuracy, note, family_mcc in mcc_panels:
        progression_plot(axis, family_accuracy, note, family_mcc)
    plt.savefig('MCC_Progression_'+details)

    # Figure 2: accuracy line only.
    fig, ax = plt.subplots(2, 2, figsize=(18,12))
    fig.suptitle('Accuracy Progression '+details)
    fig.subplots_adjust(wspace=0.3)
    fig.subplots_adjust(hspace=0.3)
    accuracy_panels = (
        (ax[0,0], logistic_accuracy_map, 'Logistic_Regression_Accuracy_Progression'),
        (ax[0,1], xgb_accuracy_map, 'XGB_Accuracy_Progression'),
        (ax[1,0], random_forest_accuracy_map, 'Random_Forest_Accuracy_Progression'),
        (ax[1,1], lda_accuracy_map, 'LDA_Accuracy_Progression'),
    )
    for axis, family_accuracy, note in accuracy_panels:
        progression_plot(axis, family_accuracy, note)
    plt.savefig('Accuracy_Progression_'+details)

    return logistic_mcc_map,random_forest_mcc_map,xgb_mcc_map,lda_mcc_map



In [175]:
def speed_progression(speed_map,details):
    """
    Plot, per model family, the detection-speed score across the training
    approaches.

    `speed_map` is split into four per-family dictionaries by looking for
    'Logistic', 'Random', 'XGB' and 'LDA' in the keys. The 2x2 figure is
    saved as 'Speed_Progression_<details>'.
    """
    def by_family(marker):
        # Keep the entries whose key mentions the family marker.
        return {key: value for key, value in speed_map.items() if marker in key}

    logistic_speed_map = by_family('Logistic')
    random_forest_speed_map = by_family('Random')
    xgb_speed_map = by_family('XGB')
    lda_speed_map = by_family('LDA')

    fig, ax = plt.subplots(2, 2, figsize=(18,12))
    fig.suptitle('Detection Speed Progression '+details)
    fig.subplots_adjust(wspace=0.3)
    fig.subplots_adjust(hspace=0.3)

    panels = (
        (ax[0,0], logistic_speed_map, 'Logistic_Regression_Speed_Progression'),
        (ax[0,1], xgb_speed_map, 'XGB_Speed_Progression'),
        (ax[1,0], random_forest_speed_map, 'Random_Forest_Speed_Progression'),
        (ax[1,1], lda_speed_map, 'LDA_Speed_Progression'),
    )
    for axis, family_speed, note in panels:
        progression_plot(axis, family_speed, note)

    plt.savefig('Speed_Progression_'+details)

In [176]:
def mcc_all3(classic_models,ts_diff_models,ts_ma_models,size,shuffle):
    """
    Compute the rescaled MCC of every model using the 'my_all3' dataset as
    test set.

    'my_all3' is built here as the concatenation of the homenet dataset,
    the mobile dataset and the 30% test split of the unifi dataset (same
    split parameters as in training); it is written to
    'preprocessed_arancino_datasets/my_all3.csv' and reloaded through
    `preprocessing_base`. The classic models are scored on the raw
    features, the TS MA / TS Diff models on the corresponding windowed
    features built with window `size`.

    Returns a dict keyed '<model> Classic', '<model> TS MA' and
    '<model> TS Diff' (inserted in that order); nothing is logged to CSV.
    """
    homenet_df = pd.read_csv('preprocessed_arancino_datasets/homenet_filtered.csv')
    mobile_df = pd.read_csv('preprocessed_arancino_datasets/mobile_filtered.csv')
    unifi_df = pd.read_csv('preprocessed_arancino_datasets/unifi_filtered.csv')

    # Only the test part of unifi goes into my_all3.
    unifi_target = unifi_df['label']
    unifi_features = unifi_df.drop(columns='label')
    _, X_test, _, y_test = train_test_split(
        unifi_features, unifi_target, test_size=0.3, random_state=42, shuffle=shuffle
    )
    my_unifi_df = pd.concat([X_test, y_test], axis=1)

    my_all3_df = pd.concat([homenet_df, mobile_df, my_unifi_df], axis=0)
    my_all3_df.to_csv("preprocessed_arancino_datasets/my_all3.csv", index=False)

    X, y, df = preprocessing_base('preprocessed_arancino_datasets/my_all3.csv')

    def score(models, entries, features, target):
        # An empty csv name and no csv_note mean nothing is written to disk.
        return {
            label: get_mcc('', models[model_key], features, target)
            for label, model_key in entries
        }

    mcc_all_map = score(classic_models, (
        ('Logistic Regression Classic', 'LogisticRegression'),
        ('XGB Classic', 'XGB'),
        ('Random Forest Classic', 'RandomForest'),
        ('LDA Classic', 'LDA'),
    ), X, y)

    ma_frame = window_input_moving_average(size, df)
    mcc_all_map.update(score(ts_ma_models, (
        ('Logistic Regression TS MA', 'LogisticRegression'),
        ('XGB TS MA', 'XGB'),
        ('Random Forest TS MA', 'RandomForest'),
        ('LDA TS MA', 'LDA'),
    ), ma_frame.drop(columns='label'), ma_frame['label']))

    diff_frame = window_input_difference(size, df)
    mcc_all_map.update(score(ts_diff_models, (
        ('Logistic Regression TS Diff', 'LogisticRegression'),
        ('XGB TS Diff', 'XGB'),
        ('Random Forest TS Diff', 'RandomForest'),
        ('LDA TS Diff', 'LDA'),
    ), diff_frame.drop(columns='label'), diff_frame['label']))

    return mcc_all_map

In [177]:
def mcc_all3_plot(mcc_all_map,logistic_mcc_map,random_forest_mcc_map,xgb_mcc_map,lda_mcc_map,details):    
    """
    Plot, per model family, the MCC on the unifi test set next to the MCC
    on my_all3.

    Each of the four panels uses the keys of the per-family unifi map as x
    labels and looks the my_all3 value up in `mcc_all_map` by that same
    key, so the two lines are aligned by model name rather than by
    dictionary order. (Previously the values were paired by position, and
    the first panel took its labels from a different map than the others.)

    The figure is saved as 'MCC Comparison Uni-All3 <details>' and shown.

    Raises
    ------
    KeyError if a key of a per-family map is missing from `mcc_all_map`.
    """
    fig, ax = plt.subplots(2, 2, figsize=(18, 12))

    panels = (
        (ax[0,0], logistic_mcc_map),
        (ax[0,1], xgb_mcc_map),
        (ax[1,0], random_forest_mcc_map),
        (ax[1,1], lda_mcc_map),
    )
    for axis, unifi_map in panels:
        x_values = list(unifi_map.keys())
        y_values = list(unifi_map.values())
        # Same model label on both lines.
        y_all_values = [mcc_all_map[key] for key in x_values]

        axis.plot(x_values, y_values, marker='o', linestyle='-',label='MCC unifi')
        axis.plot(x_values, y_all_values, marker='o', linestyle='-',label='MCC my_all3')
        axis.legend()

    fig.suptitle('MCC Comparison unifi-my_all3 '+details)
    fig.subplots_adjust(wspace=0.3)
    fig.subplots_adjust(hspace=0.3)
    plt.savefig('MCC Comparison Uni-All3 '+details)
    plt.show()


In [178]:
def main_win5_shufTrue():
    """
    Run the whole experiment on the unifi dataset with window size 5 and a
    shuffled train/test split.

    Trains the classic, moving-average and lagged-difference models,
    collects MCC / accuracy / speed results, draws the progression and
    comparison figures, and returns the MCC map computed on my_all3.
    """
    details = 'win5_shufTrue'
    size = 5
    shuffle = True

    X, y, df = preprocessing_base('preprocessed_arancino_datasets/unifi_filtered.csv')

    # Classic approach. Each *_split is [X_train, y_train, X_test, y_test].
    classic_models, *classic_split = training_classic(shuffle, X, y, details)
    mcc_map, accuracy_map, speed_map = map_creation_classic(classic_models, *classic_split, details)

    # Time series with moving average.
    ts_ma_models, *ma_split = training_ts_ma(shuffle, X, y, df, size, details)
    mcc_map, accuracy_map, speed_map = map_update_TS(
        ts_ma_models, 'MA', *ma_split, mcc_map, accuracy_map, speed_map, details
    )

    # Time series with lagged differences.
    ts_diff_models, *diff_split = training_ts_diff(shuffle, X, y, df, size, details)
    mcc_map, accuracy_map, speed_map = map_update_TS(
        ts_diff_models, 'Diff', *diff_split, mcc_map, accuracy_map, speed_map, details
    )

    # Order returned: logistic, random forest, xgb, lda.
    per_family_mcc = mcc_accuracy_progression(mcc_map, accuracy_map, details)
    mcc_all_map = mcc_all3(classic_models, ts_diff_models, ts_ma_models, size, shuffle)
    mcc_all3_plot(mcc_all_map, *per_family_mcc, details)
    speed_progression(speed_map, details)

    return mcc_all_map

mcc_all_map_main=main_win5_shufTrue()



In [179]:
def main_win5_shufFalse():
    """
    Run the whole experiment on the unifi dataset with window size 5 and an
    unshuffled train/test split.

    Trains the classic, moving-average and lagged-difference models,
    collects MCC / accuracy / speed results and draws the progression and
    comparison figures.

    Returns the per-family MCC maps in the order
    (xgb, logistic, random forest, lda).
    """
    details = 'win5_shufFalse'
    size = 5
    shuffle = False

    X, y, df = preprocessing_base('preprocessed_arancino_datasets/unifi_filtered.csv')

    # Classic approach. Each *_split is [X_train, y_train, X_test, y_test].
    classic_models, *classic_split = training_classic(shuffle, X, y, details)
    mcc_map, accuracy_map, speed_map = map_creation_classic(classic_models, *classic_split, details)

    # Time series with moving average.
    ts_ma_models, *ma_split = training_ts_ma(shuffle, X, y, df, size, details)
    mcc_map, accuracy_map, speed_map = map_update_TS(
        ts_ma_models, 'MA', *ma_split, mcc_map, accuracy_map, speed_map, details
    )

    # Time series with lagged differences.
    ts_diff_models, *diff_split = training_ts_diff(shuffle, X, y, df, size, details)
    mcc_map, accuracy_map, speed_map = map_update_TS(
        ts_diff_models, 'Diff', *diff_split, mcc_map, accuracy_map, speed_map, details
    )

    per_family_mcc = mcc_accuracy_progression(mcc_map, accuracy_map, details)
    logistic_mcc_map, random_forest_mcc_map, xgb_mcc_map, lda_mcc_map = per_family_mcc
    mcc_all_map = mcc_all3(classic_models, ts_diff_models, ts_ma_models, size, shuffle)
    mcc_all3_plot(mcc_all_map, *per_family_mcc, details)
    speed_progression(speed_map, details)

    return xgb_mcc_map,logistic_mcc_map,random_forest_mcc_map,lda_mcc_map

xgb_mcc_map_win5,logistic_mcc_map_win5,random_forest_mcc_map_win5,lda_mcc_map_win5=main_win5_shufFalse()

In [180]:
def main_win4_shufFalse():
    """
    Run the whole experiment on the unifi dataset with window size 4 and an
    unshuffled train/test split.

    Trains the classic, moving-average and lagged-difference models,
    collects MCC / accuracy / speed results and draws the progression and
    comparison figures.

    Returns the per-family MCC maps in the order
    (xgb, logistic, random forest, lda).
    """
    details = 'win4_shufFalse'
    size = 4
    shuffle = False

    X, y, df = preprocessing_base('preprocessed_arancino_datasets/unifi_filtered.csv')

    # Classic approach. Each *_split is [X_train, y_train, X_test, y_test].
    classic_models, *classic_split = training_classic(shuffle, X, y, details)
    mcc_map, accuracy_map, speed_map = map_creation_classic(classic_models, *classic_split, details)

    # Time series with moving average.
    ts_ma_models, *ma_split = training_ts_ma(shuffle, X, y, df, size, details)
    mcc_map, accuracy_map, speed_map = map_update_TS(
        ts_ma_models, 'MA', *ma_split, mcc_map, accuracy_map, speed_map, details
    )

    # Time series with lagged differences.
    ts_diff_models, *diff_split = training_ts_diff(shuffle, X, y, df, size, details)
    mcc_map, accuracy_map, speed_map = map_update_TS(
        ts_diff_models, 'Diff', *diff_split, mcc_map, accuracy_map, speed_map, details
    )

    per_family_mcc = mcc_accuracy_progression(mcc_map, accuracy_map, details)
    logistic_mcc_map, random_forest_mcc_map, xgb_mcc_map, lda_mcc_map = per_family_mcc
    mcc_all_map = mcc_all3(classic_models, ts_diff_models, ts_ma_models, size, shuffle)
    mcc_all3_plot(mcc_all_map, *per_family_mcc, details)
    speed_progression(speed_map, details)

    return xgb_mcc_map,logistic_mcc_map,random_forest_mcc_map,lda_mcc_map

xgb_mcc_map_win4,logistic_mcc_map_win4,random_forest_mcc_map_win4,lda_mcc_map_win4=main_win4_shufFalse()

In [181]:
def main_win3_shufFalse():
    """
    Run the whole experiment on the unifi dataset with window size 3 and an
    unshuffled train/test split.

    Trains the classic, moving-average and lagged-difference models,
    collects MCC / accuracy / speed results and draws the progression and
    comparison figures.

    Returns the per-family MCC maps in the order
    (xgb, logistic, random forest, lda).
    """
    details = 'win3_shufFalse'
    size = 3
    shuffle = False

    X, y, df = preprocessing_base('preprocessed_arancino_datasets/unifi_filtered.csv')

    # Classic approach. Each *_split is [X_train, y_train, X_test, y_test].
    classic_models, *classic_split = training_classic(shuffle, X, y, details)
    mcc_map, accuracy_map, speed_map = map_creation_classic(classic_models, *classic_split, details)

    # Time series with moving average.
    ts_ma_models, *ma_split = training_ts_ma(shuffle, X, y, df, size, details)
    mcc_map, accuracy_map, speed_map = map_update_TS(
        ts_ma_models, 'MA', *ma_split, mcc_map, accuracy_map, speed_map, details
    )

    # Time series with lagged differences.
    ts_diff_models, *diff_split = training_ts_diff(shuffle, X, y, df, size, details)
    mcc_map, accuracy_map, speed_map = map_update_TS(
        ts_diff_models, 'Diff', *diff_split, mcc_map, accuracy_map, speed_map, details
    )

    per_family_mcc = mcc_accuracy_progression(mcc_map, accuracy_map, details)
    logistic_mcc_map, random_forest_mcc_map, xgb_mcc_map, lda_mcc_map = per_family_mcc
    mcc_all_map = mcc_all3(classic_models, ts_diff_models, ts_ma_models, size, shuffle)
    mcc_all3_plot(mcc_all_map, *per_family_mcc, details)
    speed_progression(speed_map, details)

    return xgb_mcc_map,logistic_mcc_map,random_forest_mcc_map,lda_mcc_map

xgb_mcc_map_win3,logistic_mcc_map_win3,random_forest_mcc_map_win3,lda_mcc_map_win3=main_win3_shufFalse()

In [182]:
def main_win2_shufFalse():
    """
    Run the whole experiment on the unifi dataset with window size 2 and an
    unshuffled train/test split.

    Trains the classic, moving-average and lagged-difference models,
    collects MCC / accuracy / speed results and draws the progression and
    comparison figures.

    Returns the per-family MCC maps in the order
    (xgb, logistic, random forest, lda).
    """
    details = 'win2_shufFalse'
    size = 2
    shuffle = False

    X, y, df = preprocessing_base('preprocessed_arancino_datasets/unifi_filtered.csv')

    # Classic approach. Each *_split is [X_train, y_train, X_test, y_test].
    classic_models, *classic_split = training_classic(shuffle, X, y, details)
    mcc_map, accuracy_map, speed_map = map_creation_classic(classic_models, *classic_split, details)

    # Time series with moving average.
    ts_ma_models, *ma_split = training_ts_ma(shuffle, X, y, df, size, details)
    mcc_map, accuracy_map, speed_map = map_update_TS(
        ts_ma_models, 'MA', *ma_split, mcc_map, accuracy_map, speed_map, details
    )

    # Time series with lagged differences.
    ts_diff_models, *diff_split = training_ts_diff(shuffle, X, y, df, size, details)
    mcc_map, accuracy_map, speed_map = map_update_TS(
        ts_diff_models, 'Diff', *diff_split, mcc_map, accuracy_map, speed_map, details
    )

    per_family_mcc = mcc_accuracy_progression(mcc_map, accuracy_map, details)
    logistic_mcc_map, random_forest_mcc_map, xgb_mcc_map, lda_mcc_map = per_family_mcc
    mcc_all_map = mcc_all3(classic_models, ts_diff_models, ts_ma_models, size, shuffle)
    mcc_all3_plot(mcc_all_map, *per_family_mcc, details)
    speed_progression(speed_map, details)

    return xgb_mcc_map,logistic_mcc_map,random_forest_mcc_map,lda_mcc_map

xgb_mcc_map_win2,logistic_mcc_map_win2,random_forest_mcc_map_win2,lda_mcc_map_win2=main_win2_shufFalse()

In [188]:
def delete_unreliable_features_classic(columns_to_drop,model_name,csv_name,baseline_csv='mcc_csv/mcc_win5_shufFalse.csv'):  
    """
    Retrain one classic model on the unifi dataset without its 'unreliable'
    features (unshuffled 70/30 split) and log the MCC with and without them.

    The MCC without the features is computed here on the unifi test split;
    the MCC with the features is read from `baseline_csv` (last row whose
    first field equals `model_name`). A row with both values is appended to
    `csv_name`, preceded by a header if the file is still empty.

    Parameters
    ----------
    columns_to_drop : feature columns to remove before training.
    model_name      : 'XGB Classic', 'Logistic Regression Classic',
        'Random Forest Classic' or 'LDA Classic'.
    csv_name        : output CSV file.
    baseline_csv    : CSV holding the baseline MCC rows.
        NOTE(review): `get_mcc` is called elsewhere with 'mcc_<details>.csv'
        (no 'mcc_csv/' folder) — confirm the default path matches where the
        baseline file really lives.

    Returns
    -------
    (model, X, y): the fitted pipeline and the full unifi features / labels
    with `columns_to_drop` removed from X.

    Raises
    ------
    ValueError if `model_name` is not one of the supported names, or if
    `baseline_csv` has no row for it. (These cases used to surface as an
    unbound-variable error.)
    """
    supported = ('XGB Classic', 'Logistic Regression Classic', 'Random Forest Classic', 'LDA Classic')
    # Validate before any I/O or training.
    if model_name not in supported:
        raise ValueError(f"Unknown model_name {model_name!r}; expected one of {supported}")

    X,y,_=preprocessing_base('preprocessed_arancino_datasets/unifi_filtered.csv')
    X=X.drop(columns=columns_to_drop)

    preprocessor=StandardScaler()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=False) 

    trainers = {
        'XGB Classic': my_XGB_pipeline,
        'Logistic Regression Classic': my_LogisticRegression_pipeline,
        'Random Forest Classic': my_RandomForest_pipeline,
        'LDA Classic': my_LDA_pipeline,
    }
    model = trainers[model_name](preprocessor, X_train, y_train)

    MCC=(matthews_corrcoef(y_test, model.predict(X_test))+1)/2 # MCC rescaled to [0,1]

    # Look up the baseline MCC; if several rows match, the last one wins.
    value = None
    with open(baseline_csv, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if row and row[0]==model_name:
                value = row[1]
    if value is None:
        raise ValueError(f"No baseline MCC for {model_name!r} found in {baseline_csv!r}")

    with open(csv_name, 'a', newline='') as file:
        fieldnames = ['csv_note','MCC Without Unreliable Features','MCC With Unreliable Features']
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        if is_csv_empty(csv_name):
            writer.writeheader()

        writer.writerow({'csv_note': model_name,'MCC Without Unreliable Features': MCC,'MCC With Unreliable Features': value})

    return model,X,y

    

In [189]:
import itertools

def mcc_plot_all3_unreliable(mcc_all_map_main1, mcc_map_without_unreliable_features):
    """
    Plot the MCC of the models trained with vs. without unreliable features on my_all3.

    Parameters
    ----------
    mcc_all_map_main1 : dict model name -> MCC obtained with all the features.
        Only its first four entries are plotted (presumably the four classic
        models - verify against the function that builds this map).
    mcc_map_without_unreliable_features : dict model name -> MCC obtained after
        dropping the unreliable features.

    The figure is saved as 'MCC_Comparison_With_Without_Unreliable_Features_All3'.
    """
    x_values = list(itertools.islice(mcc_all_map_main1.keys(), 4))
    with_values = [mcc_all_map_main1[name] for name in x_values]

    # Pair the two series by model name: the two dicts may have been filled in a
    # different order, and a positional pairing would then put a value under the
    # wrong model label. If the names do not match, keep the positional behaviour.
    if all(name in mcc_map_without_unreliable_features for name in x_values):
        without_values = [mcc_map_without_unreliable_features[name] for name in x_values]
    else:
        without_values = list(mcc_map_without_unreliable_features.values())

    plt.suptitle('MCC Comparison With/Without Unreliable Features on my_all3 ')
    plt.plot(x_values, with_values, marker='o', linestyle='-',label='With Unreliable Features')
    plt.plot(x_values, without_values, marker='o', linestyle='-',label='Without Unreliable Features')
    plt.legend()
    plt.savefig('MCC_Comparison_With_Without_Unreliable_Features_All3')

In [None]:
# Retrain every classic model without its unreliable features and score it on my_all3
csv_name='mcc_without_unreliable_features.csv'

# Unreliable features of each model; insertion order is also the processing order
unreliable_features_by_model={
    'Logistic Regression Classic':['meminfo.MemFree','iostat.nice','disk_io.read_time','redis_used_cpu_sys','cpu_times.idle'],
    'XGB Classic':['net_io.bytes_sent','meminfo.Inactive','disk_io.read_bytes','cpu_times.iowait','vmstat.pgfree','temperature.temperature'],
    'Random Forest Classic':['meminfo.MemAvailable','virtual.available','meminfo.Inactive','virtual.inactive'],
    'LDA Classic':['cpu_times.idle','cpu_stats.ctx_switches','vmstat.pgreuse','cpu_times.nice','netinfo.lo.sent.pkts'],
}

# Evaluation dataset, loaded once. It must NOT be overwritten by the values returned
# by delete_unreliable_features_classic: those come from the training dataset, and
# reusing them would report the MCC on the training data instead of on my_all3.
X,y,df=preprocessing_base('preprocessed_arancino_datasets/my_all3.csv')

mcc_unreliable_map={}
for model_name,columns_to_drop in unreliable_features_by_model.items():
    # Also appends the with/without comparison row for this model to csv_name
    model,X_fit,_=delete_unreliable_features_classic(columns_to_drop,model_name,csv_name)

    # Feed the model exactly the columns (and column order) it was fitted on;
    # a missing column raises a KeyError instead of silently scoring other data.
    X_eval=X[X_fit.columns]
    MCC=(matthews_corrcoef(y, model.predict(X_eval))+1)/2 # MCC scaled to [0,1]

    mcc_unreliable_map[model_name]=MCC

mcc_plot_all3_unreliable(mcc_all_map_main,mcc_unreliable_map)


In [192]:
def mcc_progression_window(mcc_win5_map,mcc_win4_map,mcc_win3_map,mcc_win2_map,details):
    """
    Plot how the MCC of one model family changes with the time-window size.

    Each map is read at the keys '<details> TS MA' and '<details> TS Diff'; the
    two resulting series are drawn over the windows win5, win4, win3, win2 and
    the figure is saved as 'MCC_Progression_by_Window_Size_<name>', where <name>
    is `details` ('XGB' is spelled out as 'XGBoost'). The current figure is
    cleared afterwards so the next plot starts from a blank canvas.
    """
    maps_by_window = {
        'win5': mcc_win5_map,
        'win4': mcc_win4_map,
        'win3': mcc_win3_map,
        'win2': mcc_win2_map,
    }
    window_labels = list(maps_by_window)

    ma_key = details+' TS MA'
    diff_key = details+' TS Diff'
    ma_series = [maps_by_window[label][ma_key] for label in window_labels]
    diff_series = [maps_by_window[label][diff_key] for label in window_labels]

    # Only the name shown in the title and in the file name is expanded
    display_name = 'XGBoost' if details=='XGB' else details

    plt.suptitle('MCC Progression by Window Size '+display_name)
    for series, series_label in ((ma_series, 'TS MA'), (diff_series, 'TS Diff')):
        plt.plot(window_labels, series, marker='o', linestyle='-',label=series_label)
    plt.legend()
    plt.savefig('MCC_Progression_by_Window_Size_'+display_name)
    plt.clf()

# Save one "MCC progression by window size" figure per model family. The maps are
# passed in the order win5, win4, win3, win2 expected by mcc_progression_window.
# The *_win2 maps come from main_win2_shufFalse(); the win5/win4/win3 maps must
# already exist in the kernel (they are not assigned in this part of the notebook).
mcc_progression_window(xgb_mcc_map_win5,xgb_mcc_map_win4,xgb_mcc_map_win3,xgb_mcc_map_win2,'XGB')
mcc_progression_window(logistic_mcc_map_win5,logistic_mcc_map_win4,logistic_mcc_map_win3,logistic_mcc_map_win2,'Logistic Regression')
mcc_progression_window(random_forest_mcc_map_win5,random_forest_mcc_map_win4,random_forest_mcc_map_win3,random_forest_mcc_map_win2,'Random Forest')
mcc_progression_window(lda_mcc_map_win5,lda_mcc_map_win4,lda_mcc_map_win3,lda_mcc_map_win2,'LDA')

<Figure size 640x480 with 0 Axes>