In [52]:
import pandas as pd
import numpy as np
import random
from pickleshare import PickleShareDB
import os 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, matthews_corrcoef, balanced_accuracy_score
import time
import memory_profiler

In [53]:
# pour la reproductibilité 
np.random.seed(0)
random.seed(0)
tf.random.set_seed(0)

In [54]:
# charger les données de prep_data

data_dir = '../prep_data' 
db = PickleShareDB(os.path.join(data_dir, 'kity'))

if 'df_phy_1' in db:
    df_phy_1 = db['df_phy_1']
else:
    print("df_phy_1 n'est pas trouvé dans la base de données.")
if 'df_phy_2' in db:
    df_phy_2 = db['df_phy_2']
else:
    print("df_phy_2 n'est pas trouvé dans la base de données.")
if 'df_phy_3' in db:
    df_phy_3 = db['df_phy_3']
else:
    print("df_phy_3 n'est pas trouvé dans la base de données.")
if 'df_phy_4' in db:
    df_phy_4 = db['df_phy_4']
else:
    print("df_phy_4 n'est pas trouvé dans la base de données.")
if 'df_phy_norm' in db:
    df_phy_norm = db['df_phy_norm']
else:
    print("df_phy_norm n'est pas trouvé dans la base de données.")
if 'df_phy_attack' in db:
    df_phy_attack = db['df_phy_attack']
if 'df_phy_all' in db:
    df_phy_all = db['df_phy_all']
else:
    print("df_phy_all n'est pas trouvé dans la base de données.")
if 'dict_dfs' in db:
    dict_dfs = db['dict_dfs']
else:
    print("dict_dfs n'est pas trouvé dans la base de données.")


In [55]:
df_phy_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2420 entries, 0 to 2419
Data columns (total 29 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Time           2420 non-null   datetime64[ns]
 1   Tank_1         2420 non-null   int64         
 2   Tank_2         2420 non-null   int64         
 3   Tank_3         2420 non-null   int64         
 4   Tank_4         2420 non-null   int64         
 5   Tank_5         2420 non-null   int64         
 6   Tank_6         2420 non-null   int64         
 7   Tank_7         2420 non-null   int64         
 8   Tank_8         2420 non-null   int64         
 9   Pump_1         2420 non-null   bool          
 10  Pump_2         2420 non-null   bool          
 11  Pump_4         2420 non-null   bool          
 12  Pump_5         2420 non-null   bool          
 13  Pump_6         2420 non-null   bool          
 14  Flow_sensor_1  2420 non-null   category      
 15  Flow_sensor_2  2420 n

In [56]:
df_phy_1.head()

Unnamed: 0,Time,Tank_1,Tank_2,Tank_3,Tank_4,Tank_5,Tank_6,Tank_7,Tank_8,Pump_1,...,Valv_12,Valv_13,Valv_14,Valv_15,Valv_17,Valv_18,Valv_20,Valv_22,Label_n,Label
0,2021-04-09 18:23:28,0,0,0,0,0,0,0,0,False,...,False,False,False,False,False,False,False,False,False,normal
1,2021-04-09 18:23:29,0,0,0,0,0,0,0,0,False,...,False,False,False,False,False,False,False,False,False,normal
2,2021-04-09 18:23:30,0,0,0,0,0,0,0,0,False,...,False,False,False,False,False,False,False,False,False,normal
3,2021-04-09 18:23:31,0,0,0,0,0,0,0,0,False,...,False,False,False,False,False,False,False,False,False,normal
4,2021-04-09 18:23:32,0,0,0,0,0,0,0,0,True,...,False,False,False,False,False,False,False,False,False,normal


In [57]:
# 5 séries temporelles multivariées
# créer des fenêtres temporelles sur chaque jeu de données sans essayer de les fusionner sur les dates 
# chaque dataset aura ses propres fenêtres glissantes
# supprimer la colonne de temps
# ensuite diviser en X et y (Label et Label_n, on chercher à prédire l'un ou l'autre)

In [58]:
cnn_1D_df_phy_1 = df_phy_1.drop(columns=['Time'])
cnn_1D_df_phy_2 = df_phy_2.drop(columns=['Time'])
cnn_1D_df_phy_3 = df_phy_3.drop(columns=['Time'])
cnn_1D_df_phy_4 = df_phy_4.drop(columns=['Time'])
cnn_1D_df_phy_norm = df_phy_norm.drop(columns=['Time'])

In [59]:
# Diviser en X (features) et y (labels)
X_phy_1 = cnn_1D_df_phy_1.drop(columns=['Label', 'Label_n'])  
y_label_phy_1 = cnn_1D_df_phy_1['Label']                      
y_label_n_phy_1 = cnn_1D_df_phy_1['Label_n']                  

X_phy_2 = cnn_1D_df_phy_2.drop(columns=['Label', 'Label_n'])  
y_label_phy_2 = cnn_1D_df_phy_2['Label']                      
y_label_n_phy_2 = cnn_1D_df_phy_2['Label_n']      

X_phy_3 = cnn_1D_df_phy_3.drop(columns=['Label', 'Label_n'])
y_label_phy_3 = cnn_1D_df_phy_3['Label']
y_label_n_phy_3 = cnn_1D_df_phy_3['Label_n']

X_phy_4 = cnn_1D_df_phy_4.drop(columns=['Label', 'Label_n'])
y_label_phy_4 = cnn_1D_df_phy_4['Label']
y_label_n_phy_4 = cnn_1D_df_phy_4['Label_n']

X_phy_norm = cnn_1D_df_phy_norm.drop(columns=['Label', 'Label_n'])
y_label_phy_norm = cnn_1D_df_phy_norm['Label']
y_label_n_phy_norm = cnn_1D_df_phy_norm['Label_n']

In [60]:
# TODO : pas sur de ce coup là; je pense l'enlever : à voir

# on va concaténer les données, mais la fin d'un dataset n'est pas lié au début du suivant
# on ajoute juste une colonne pour identifier le dataset d'origine
X_phy_1['dataset_id'] = 'dataset_1'
X_phy_2['dataset_id'] = 'dataset_2'
X_phy_3['dataset_id'] = 'dataset_3'
X_phy_4['dataset_id'] = 'dataset_4'
X_phy_norm['dataset_id'] = 'dataset_norm'

X_phy_1['dataset_id'] = X_phy_1['dataset_id'].astype('category')
X_phy_2['dataset_id'] = X_phy_2['dataset_id'].astype('category')
X_phy_3['dataset_id'] = X_phy_3['dataset_id'].astype('category')
X_phy_4['dataset_id'] = X_phy_4['dataset_id'].astype('category')
X_phy_norm['dataset_id'] = X_phy_norm['dataset_id'].astype('category')


In [61]:
# Normaliser : MinMaxScaler nous permet de préparer nos données de manière efficace,
# garantissant que chaque feature contribut de manière égale au processus d'apprentissage

In [62]:
# Fonction pour normaliser uniquement les colonnes numériques d'un DataFrame
def normalize_numeric_columns(df):
    scaler = MinMaxScaler()
    numeric_cols = df.select_dtypes(include=['int64']).columns  # Sélectionne les colonnes numériques
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])  # Applique le scaler uniquement sur les colonnes numériques
    return df

# Appliquez la normalisation sur chaque DataFrame individuellement
X_phy_1 = normalize_numeric_columns(X_phy_1)
X_phy_2 = normalize_numeric_columns(X_phy_2)
X_phy_3 = normalize_numeric_columns(X_phy_3)
X_phy_4 = normalize_numeric_columns(X_phy_4)
X_phy_norm = normalize_numeric_columns(X_phy_norm)


In [63]:
cat_cols = [df.select_dtypes(include=['category']).columns for df in [X_phy_1, X_phy_2, X_phy_3, X_phy_4, X_phy_norm]]  
cat_cols = set(cat_cols[0])  
cat_cols

{'Flow_sensor_1', 'dataset_id'}

In [64]:
# extraire tous les catégories pour les colonnes catégorielles 

def extract_categories(df):
    return {col: list(df[col].cat.categories) for col in df.select_dtypes(include=['category']).columns}

cat_dict_phy_1 = extract_categories(X_phy_1)
cat_dict_phy_2 = extract_categories(X_phy_2)
cat_dict_phy_3 = extract_categories(X_phy_3)
cat_dict_phy_4 = extract_categories(X_phy_4)
cat_dict_phy_norm = extract_categories(X_phy_norm)

# fusionner les catégories de tous les datasets
cat_dict_all = {}
for cat_dict in [cat_dict_phy_1, cat_dict_phy_2, cat_dict_phy_3, cat_dict_phy_4, cat_dict_phy_norm]:
    for key, value in cat_dict.items():
        if key not in cat_dict_all:
            cat_dict_all[key] = value
        else:
            cat_dict_all[key] = list(set(cat_dict_all[key] + value))
cat_dict_all

{'Flow_sensor_1': [0, 100, 4000],
 'dataset_id': ['dataset_norm',
  'dataset_1',
  'dataset_2',
  'dataset_4',
  'dataset_3']}

In [65]:
# Fonction pour appliquer le One-Hot Encoding tout en gardant les mêmes colonnes
def apply_one_hot_encoding(df, cat_dict_all):
    for col, categories in cat_dict_all.items():
        if col in df.columns:  # Appliquer le One-Hot uniquement aux colonnes présentes
            # Convertir la colonne en Categorical avec les catégories globales
            df[col] = pd.Categorical(df[col], categories=categories)
            
            # Créer un DataFrame avec le One-Hot Encoding en utilisant les catégories globales
            dummies = pd.get_dummies(df[col], prefix=col)
            
            # Ajouter les colonnes dummies et supprimer la colonne originale
            df = pd.concat([df, dummies], axis=1).drop(col, axis=1)
    
    return df

# Appliquer One-Hot Encoding pour chaque dataframe
X_phy_1 = apply_one_hot_encoding(X_phy_1, cat_dict_all)
X_phy_2 = apply_one_hot_encoding(X_phy_2, cat_dict_all)
X_phy_3 = apply_one_hot_encoding(X_phy_3, cat_dict_all)
X_phy_4 = apply_one_hot_encoding(X_phy_4, cat_dict_all)
X_phy_norm = apply_one_hot_encoding(X_phy_norm, cat_dict_all)


In [66]:
# vérif de la taille des données
print(X_phy_1.shape)
print(X_phy_2.shape)
print(X_phy_3.shape)
print(X_phy_4.shape)
print(X_phy_norm.shape)

(2420, 33)
(2104, 33)
(1254, 33)
(1717, 33)
(3428, 33)


In [67]:
# vérif info
print(X_phy_1.info())
print()
print(X_phy_norm.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2420 entries, 0 to 2419
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Tank_1                   2420 non-null   float64
 1   Tank_2                   2420 non-null   float64
 2   Tank_3                   2420 non-null   float64
 3   Tank_4                   2420 non-null   float64
 4   Tank_5                   2420 non-null   float64
 5   Tank_6                   2420 non-null   float64
 6   Tank_7                   2420 non-null   float64
 7   Tank_8                   2420 non-null   float64
 8   Pump_1                   2420 non-null   bool   
 9   Pump_2                   2420 non-null   bool   
 10  Pump_4                   2420 non-null   bool   
 11  Pump_5                   2420 non-null   bool   
 12  Pump_6                   2420 non-null   bool   
 13  Flow_sensor_2            2420 non-null   bool   
 14  Flow_sensor_4           

In [68]:
print(X_phy_2.value_counts('Flow_sensor_1_100'))
# que des False -> ok

print(X_phy_3.value_counts('dataset_id_dataset_1'))
# que False -> ok

print(X_phy_3.value_counts('dataset_id_dataset_3'))
# que True -> ok

Flow_sensor_1_100
False    2104
Name: count, dtype: int64
dataset_id_dataset_1
False    1254
Name: count, dtype: int64
dataset_id_dataset_3
True    1254
Name: count, dtype: int64


In [69]:
# Fonction pour récupérer les valeurs uniques de chaque colonne catégorielle à travers tous les datasets
def get_unique_values_all_labels(*dfs):
    unique_values = {}
    
    # Combiner tous les datasets
    combined_df = pd.concat(dfs, ignore_index=True)
    
    # Identifier les colonnes catégorielles
    for col in combined_df.select_dtypes(include=['object']).columns:
        # Ajouter les valeurs uniques de chaque colonne catégorielle
        unique_values[col] = combined_df[col].unique()
    
    return unique_values

unique_values = get_unique_values_all_labels(X_phy_1, X_phy_2, X_phy_3, X_phy_4, X_phy_norm)
unique_values

# plus rien -> ok

{}

In [70]:
print(y_label_phy_1.value_counts())
print(y_label_phy_2.value_counts())

Label
normal            1610
MITM               533
physical fault     277
Name: count, dtype: int64
Label
normal            1798
physical fault     123
MITM                96
DoS                 80
scan                 7
Name: count, dtype: int64


In [71]:
# Concaténer les labels de tous les datasets
all_labels = pd.concat([y_label_phy_1, y_label_phy_2, y_label_phy_3, y_label_phy_4, y_label_phy_norm], ignore_index=True)
print(all_labels.value_counts())

# Définir explicitement l'ordre des classes, avec 'normal' en premier
ordered_classes = ['normal', 'DoS', 'MITM', 'physical fault', 'scan']
nb_class = 5

# Créer un LabelEncoder et assigner directement l'ordre des classes
label_encoder = LabelEncoder()
# Assigner les classes manuellement : on force l'ordre pour que normal soit à 0
label_encoder.classes_ = np.array(ordered_classes)

# Récupérer la correspondance entre les labels d'origine et les labels encodés
label_mapping = {label: encoded for label, encoded in zip(label_encoder.classes_, range(len(label_encoder.classes_)))}
print(label_mapping)

# Appliquer la transformation sur les datasets
y_label_phy_1_encoder = label_encoder.transform(df_phy_1['Label'])
y_label_phy_2_encoder = label_encoder.transform(df_phy_2['Label'])
y_label_phy_3_encoder = label_encoder.transform(df_phy_3['Label'])
y_label_phy_4_encoder = label_encoder.transform(df_phy_4['Label'])
y_label_phy_norm_encoder = label_encoder.transform(df_phy_norm['Label'])

# Convertir en DataFrame
y_label_phy_1_df_enc = pd.DataFrame(y_label_phy_1_encoder, columns=['encoded_label'])
y_label_phy_2_df_enc = pd.DataFrame(y_label_phy_2_encoder, columns=['encoded_label'])
y_label_phy_3_df_enc = pd.DataFrame(y_label_phy_3_encoder, columns=['encoded_label'])
y_label_phy_4_df_enc = pd.DataFrame(y_label_phy_4_encoder, columns=['encoded_label'])
y_label_phy_norm_df_enc = pd.DataFrame(y_label_phy_norm_encoder, columns=['encoded_label'])

Label
normal            8906
MITM              1008
physical fault     685
DoS                310
scan                14
Name: count, dtype: int64
{np.str_('normal'): 0, np.str_('DoS'): 1, np.str_('MITM'): 2, np.str_('physical fault'): 3, np.str_('scan'): 4}


In [72]:
# Affichage pour vérifier
print(y_label_phy_1.value_counts())
print(y_label_phy_1_df_enc['encoded_label'].value_counts())
# ok -> on écrase y_label_phy_1

Label
normal            1610
MITM               533
physical fault     277
Name: count, dtype: int64
encoded_label
0    1610
2     533
3     277
Name: count, dtype: int64


In [73]:
y_label_phy_1 = y_label_phy_1_df_enc
y_label_phy_2 = y_label_phy_2_df_enc
y_label_phy_3 = y_label_phy_3_df_enc
y_label_phy_4 = y_label_phy_4_df_enc
y_label_phy_norm = y_label_phy_norm_df_enc

In [74]:
print(X_phy_1.shape)
print(y_label_n_phy_1.shape)
print(y_label_phy_1.shape)

(2420, 33)
(2420,)
(2420, 1)


## Modélisation pour essayer de détecter les attaques 

### Prédiction de Label_n 

In [75]:
def create_sliding_windows(X, y, window_size):
    X_windows = []
    y_windows = []

    for i in range(len(X) - window_size):
        X_windows.append(X.iloc[i:i + window_size])  
        y_windows.append(y.iloc[i + window_size])   

    return np.array(X_windows), np.array(y_windows)

In [76]:
window_size = 10 # TODO : déterminer 10 secondes via viz ? sinon changer la valeur

# Créer des fenêtres glissantes pour X et y
X_windows_phy_1, y_windows_label_n_phy_1 = create_sliding_windows(X_phy_1, y_label_n_phy_1, window_size)
X_windows_phy_2, y_windows_label_n_phy_2 = create_sliding_windows(X_phy_2, y_label_n_phy_2, window_size)
X_windows_phy_3, y_windows_label_n_phy_3 = create_sliding_windows(X_phy_3, y_label_n_phy_3, window_size)
X_windows_phy_4, y_windows_label_n_phy_4 = create_sliding_windows(X_phy_4, y_label_n_phy_4, window_size)
X_windows_phy_norm, y_windows_label_n_phy_norm = create_sliding_windows(X_phy_norm, y_label_n_phy_norm, window_size)

In [77]:
# Vérifier la forme des données
print(X_windows_phy_1.shape)  # (nombre d'échantillons, window_size, nombre de features)
print(y_windows_label_n_phy_1.shape)  # (nombre d'échantillons,)

print(X_windows_phy_2.shape)
print(y_windows_label_n_phy_2.shape)

print(X_windows_phy_3.shape)
print(y_windows_label_n_phy_3.shape)

print(X_windows_phy_4.shape)
print(y_windows_label_n_phy_4.shape)

print(X_windows_phy_norm.shape)
print(y_windows_label_n_phy_norm.shape)

(2410, 10, 33)
(2410,)
(2094, 10, 33)
(2094,)
(1244, 10, 33)
(1244,)
(1707, 10, 33)
(1707,)
(3418, 10, 33)
(3418,)


In [78]:
# concaténer les données

X_all = np.concatenate([X_windows_phy_1, X_windows_phy_2, X_windows_phy_3, X_windows_phy_4, X_windows_phy_norm], axis=0)
y_all = np.concatenate([y_windows_label_n_phy_1, y_windows_label_n_phy_2, y_windows_label_n_phy_3, y_windows_label_n_phy_4, y_windows_label_n_phy_norm], axis=0)

# Vérifier la forme des données après concaténation
print("Shape of X_all:", X_all.shape)  # (nombre d'échantillons, window_size, nombre de features)
print("Shape of y_all:", y_all.shape)  # (nombre d'échantillons,)

Shape of X_all: (10873, 10, 33)
Shape of y_all: (10873,)


In [79]:
# Convertir les colonnes booléennes de X_train et X_test en float32
X_all = X_all.astype('float64')
y_all = y_all.astype('float64')

In [80]:
# division en train et test
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

In [81]:
print(type(X_train), X_train.dtype)
print(type(y_train), y_train.dtype)

<class 'numpy.ndarray'> float64
<class 'numpy.ndarray'> float64


In [82]:
# Définir et entraîner le modèle CNN 1D
model = Sequential()

# Ajouter une couche Conv1D
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(MaxPooling1D(pool_size=2))

# Ajouter d'autres couches convolutives si nécessaire
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Aplatir les sorties pour les couches fully connected
model.add(Flatten())

# Ajouter une couche Dense pour la classification binaire
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # Une sortie avec activation sigmoïde pour classification binaire

# Compiler le modèle : binary_crossentropy pour une classification binaire
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Mise en place de l'EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [83]:
# Mesurer le temps et la mémoire pour l'entraînement
start_fit_time = time.time()
fit_memory_before = memory_profiler.memory_usage()[0]

# Entraîner le modèle
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping]
)

fit_time = time.time() - start_fit_time
fit_memory_after = memory_profiler.memory_usage()[0]
fit_memory_usage = fit_memory_after - fit_memory_before

Epoch 1/100
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8340 - loss: 0.3837 - val_accuracy: 0.8699 - val_loss: 0.2725
Epoch 2/100
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9033 - loss: 0.2196 - val_accuracy: 0.9099 - val_loss: 0.2024
Epoch 3/100
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9242 - loss: 0.1696 - val_accuracy: 0.9241 - val_loss: 0.1710
Epoch 4/100
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9393 - loss: 0.1432 - val_accuracy: 0.9430 - val_loss: 0.1338
Epoch 5/100
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9474 - loss: 0.1266 - val_accuracy: 0.9517 - val_loss: 0.1153
Epoch 6/100
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9517 - loss: 0.1133 - val_accuracy: 0.9545 - val_loss: 0.1051
Epoch 7/100
[1m272/27

In [84]:
# Mesurer le temps et la mémoire pour la prédiction
start_predict_time = time.time()
pred_memory_before = memory_profiler.memory_usage()[0]

# Prédire sur le jeu de test
y_pred_proba = model.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int).flatten()  # Convertir les probabilités en classes binaires (True/False)

predict_time = time.time() - start_predict_time
pred_memory_after = memory_profiler.memory_usage()[0]
predict_memory_usage = pred_memory_after - pred_memory_before

[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [85]:
# Calculer les métriques de classification
conf_matrix = confusion_matrix(y_test, y_pred)
TN, FP, FN, TP = conf_matrix.ravel() # attention c'est dans cet ordre !

precision = precision_score(y_test, y_pred)
recall_tpr = recall_score(y_test, y_pred)
tnr = TN / (TN + FP) if (TN + FP) != 0 else 0
fpr = FP / (FP + TN) if (FP + TN) != 0 else 0
accuracy = accuracy_score(y_test, y_pred)

f1 = f1_score(y_test, y_pred)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

results_cnn1d_label_n = {
    'data': 'PHY',
    'model_type': 'CNN 1D',
    'attack_type': 'labeln',
    'confusion_matrix': conf_matrix,
    'precision': precision,
    'recall': recall_tpr,
    'tnr': tnr,
    'fpr': fpr,
    'accuracy': accuracy,
    'f1': f1,
    'balanced_accuracy': balanced_acc,
    'mcc': mcc,
    'fit_time': fit_time,
    'predict_time': predict_time,
    'fit_memory_usage': fit_memory_usage,
    'predict_memory_usage': predict_memory_usage
}

In [86]:
# Afficher les résultats
print("Évaluation complète du modèle:")
for metric, value in results_cnn1d_label_n.items():
    print(f"{metric}: {value}")

Évaluation complète du modèle:
data: PHY
model_type: CNN 1D
attack_type: labeln
confusion_matrix: [[1759   22]
 [  28  366]]
precision: 0.9432989690721649
recall: 0.9289340101522843
tnr: 0.9876473891072431
fpr: 0.012352610892756879
accuracy: 0.9770114942528736
f1: 0.9360613810741688
balanced_accuracy: 0.9582906996297638
mcc: 0.9220892760704508
fit_time: 55.926172494888306
predict_time: 0.4510812759399414
fit_memory_usage: -28.0859375
predict_memory_usage: 0.15625


In [87]:
# sauvegarder dans PickleShareDB
db['PHY_results_cnn1d_labeln'] = results_cnn1d_label_n

### Prédiction de Label


In [88]:
window_size = 10 # TODO : déterminer 10 secondes via viz ? sinon changer la valeur

# Créer des fenêtres glissantes pour X et y
X_windows_phy_1, y_windows_label_phy_1 = create_sliding_windows(X_phy_1, y_label_phy_1, window_size)
X_windows_phy_2, y_windows_label_phy_2 = create_sliding_windows(X_phy_2, y_label_phy_2, window_size)
X_windows_phy_3, y_windows_label_phy_3 = create_sliding_windows(X_phy_3, y_label_phy_3, window_size)
X_windows_phy_4, y_windows_label_phy_4 = create_sliding_windows(X_phy_4, y_label_phy_4, window_size)
X_windows_phy_norm, y_windows_label_phy_norm = create_sliding_windows(X_phy_norm, y_label_phy_norm, window_size)

In [89]:
# Vérifier la forme des données
print(X_windows_phy_1.shape)  # (nombre d'échantillons, window_size, nombre de features)
print(y_windows_label_phy_1.shape)  # (nombre d'échantillons,)

print(X_windows_phy_2.shape)
print(y_windows_label_phy_2.shape)

print(X_windows_phy_3.shape)
print(y_windows_label_phy_3.shape)

print(X_windows_phy_4.shape)
print(y_windows_label_phy_4.shape)

print(X_windows_phy_norm.shape)
print(y_windows_label_phy_norm.shape)

(2410, 10, 33)
(2410, 1)
(2094, 10, 33)
(2094, 1)
(1244, 10, 33)
(1244, 1)
(1707, 10, 33)
(1707, 1)
(3418, 10, 33)
(3418, 1)


In [90]:
# concaténer les données

X_all = np.concatenate([X_windows_phy_1, X_windows_phy_2, X_windows_phy_3, X_windows_phy_4, X_windows_phy_norm], axis=0)
y_all = np.concatenate([y_windows_label_phy_1, y_windows_label_phy_2, y_windows_label_phy_3, y_windows_label_phy_4, y_windows_label_phy_norm], axis=0)

# Vérifier la forme des données après concaténation
print("Shape of X_all:", X_all.shape)  # (nombre d'échantillons, window_size, nombre de features)
print("Shape of y_all:", y_all.shape)  # (nombre d'échantillons,)

Shape of X_all: (10873, 10, 33)
Shape of y_all: (10873, 1)


In [91]:
# Convertir les colonnes booléennes de X_train et X_test en float32
X_all = X_all.astype('float64')
y_all = y_all.astype('float64')

In [92]:
# division en train et test
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

In [93]:
# Définir et entraîner le modèle CNN 1D
model = Sequential()

# Ajouter une couche Conv1D
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(MaxPooling1D(pool_size=2))

# Ajouter d'autres couches convolutives si nécessaire
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Aplatir les sorties pour les couches fully connected
model.add(Flatten())

# Ajouter une couche Dense pour la classification multiclasse
model.add(Dense(nb_class, activation='softmax'))  # Une sortie avec activation softmax pour classification multiclasse

# Compiler le modèle : sparse_categorical_crossentropy pour une classification multiclasse
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Mise en place de l'EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [94]:
# Mesurer le temps et la mémoire pour l'entraînement
start_fit_time = time.time()
fit_memory_before = memory_profiler.memory_usage()[0]

# Entraîner le modèle
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_test, y_test),
    callbacks=[early_stopping]
)

fit_time = time.time() - start_fit_time
fit_memory_after = memory_profiler.memory_usage()[0]
fit_memory_usage = fit_memory_after - fit_memory_before

Epoch 1/100
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8020 - loss: 0.6100 - val_accuracy: 0.8469 - val_loss: 0.3309
Epoch 2/100
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8875 - loss: 0.2841 - val_accuracy: 0.8828 - val_loss: 0.2657
Epoch 3/100
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9119 - loss: 0.2172 - val_accuracy: 0.9011 - val_loss: 0.2171
Epoch 4/100
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9294 - loss: 0.1783 - val_accuracy: 0.9177 - val_loss: 0.1964
Epoch 5/100
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9383 - loss: 0.1529 - val_accuracy: 0.9260 - val_loss: 0.1733
Epoch 6/100
[1m272/272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9478 - loss: 0.1316 - val_accuracy: 0.9384 - val_loss: 0.1541
Epoch 7/100
[1m272/27

In [95]:
# Mesurer le temps et la mémoire pour la prédiction
start_predict_time = time.time()
pred_memory_before = memory_profiler.memory_usage()[0]

# Prédire sur le jeu de test
y_pred_proba = model.predict(X_test)
y_pred = y_pred_proba.argmax(axis=1)  # Choisir la classe avec la probabilité la plus élevée

predict_time = time.time() - start_predict_time
pred_memory_after = memory_profiler.memory_usage()[0]
predict_memory_usage = pred_memory_after - pred_memory_before

[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [96]:
# on veut connaitre les perfs pour chaque attaque :
# évaluer pour chaque label != normal 

In [97]:
label_mapping

{np.str_('normal'): 0,
 np.str_('DoS'): 1,
 np.str_('MITM'): 2,
 np.str_('physical fault'): 3,
 np.str_('scan'): 4}

In [98]:
reverse_label_mapping = {v: k for k, v in label_mapping.items()}

In [99]:
# Calcul de la confusion matrix pour tout le test
conf_matrix = confusion_matrix(y_test, y_pred)

# Dictionnaire pour stocker les résultats de chaque classe
class_results = {}

for class_label in range(nb_class):
    # Extraire la matrice de confusion pour la classe spécifique
    TP = conf_matrix[class_label, class_label]
    FP = sum(conf_matrix[:, class_label]) - TP
    FN = sum(conf_matrix[class_label, :]) - TP
    TN = conf_matrix.sum() - (TP + FP + FN)

    # Calcul des métriques pour chaque classe
    precision = precision_score(y_test, y_pred, average=None)[class_label]
    recall_tpr = recall_score(y_test, y_pred, average=None)[class_label]
    tnr = TN / (TN + FP) if (TN + FP) != 0 else 0
    fpr = FP / (FP + TN) if (FP + TN) != 0 else 0
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    f1 = f1_score(y_test, y_pred, average=None)[class_label]
    balanced_acc = (recall_tpr + tnr) / 2
    mcc = (TP * TN - FP * FN) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)) if (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN) != 0 else 0


    # Stocker les résultats dans le dictionnaire
    class_results[class_label] = {
        'data': 'PHY',
        'model_type': 'CNN 1D',
        'attack_type': reverse_label_mapping[class_label],
        'confusion_matrix': conf_matrix,
        'TN': TN,
        'FP': FP,
        'FN': FN,
        'TP': TP,
        'precision': precision,
        'recall': recall_tpr,
        'f1': f1,
        'balanced_accuracy': balanced_acc,
        'mcc': mcc,
        'tnr': tnr,
        'fpr': fpr,
        'accuracy': accuracy,
        'fit_time': fit_time,
        'predict_time': predict_time,
        'fit_memory_usage': fit_memory_usage,
        'predict_memory_usage': predict_memory_usage
    }

# Afficher ou enregistrer les résultats pour chaque classe
print(class_results)


{0: {'data': 'PHY', 'model_type': 'CNN 1D', 'attack_type': np.str_('normal'), 'confusion_matrix': array([[1760,    0,    5,   16,    0],
       [   8,   56,    0,    0,    0],
       [   8,    0,  183,    0,    0],
       [   3,    0,    0,  135,    0],
       [   1,    0,    0,    0,    0]]), 'TN': np.int64(374), 'FP': np.int64(20), 'FN': np.int64(21), 'TP': np.int64(1760), 'precision': np.float64(0.9887640449438202), 'recall': np.float64(0.9882088714205502), 'f1': np.float64(0.9884863802302724), 'balanced_accuracy': np.float64(0.9687237250503766), 'mcc': np.float64(0.9365230132378509), 'tnr': np.float64(0.949238578680203), 'fpr': np.float64(0.050761421319796954), 'accuracy': np.float64(0.9811494252873563), 'fit_time': 64.18473696708679, 'predict_time': 0.49485111236572266, 'fit_memory_usage': -31.1640625, 'predict_memory_usage': -0.15625}, 1: {'data': 'PHY', 'model_type': 'CNN 1D', 'attack_type': np.str_('DoS'), 'confusion_matrix': array([[1760,    0,    5,   16,    0],
       [   8,

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [100]:
# vérif
print(precision_score(y_test, y_pred, average=None))
for i in range(nb_class):
    print(f"Résultats pour la classe {i}: {class_results[i]['precision']}")
    print(f"avec calcul : {class_results[i]['TP'] / (class_results[i]['TP'] + class_results[i]['FP'])}")
# ok c'est bon

[0.98876404 1.         0.97340426 0.89403974 0.        ]
Résultats pour la classe 0: 0.9887640449438202
avec calcul : 0.9887640449438202
Résultats pour la classe 1: 1.0
avec calcul : 1.0
Résultats pour la classe 2: 0.973404255319149
avec calcul : 0.973404255319149
Résultats pour la classe 3: 0.8940397350993378
avec calcul : 0.8940397350993378
Résultats pour la classe 4: 0.0
avec calcul : nan


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  print(f"avec calcul : {class_results[i]['TP'] / (class_results[i]['TP'] + class_results[i]['FP'])}")


In [101]:
tp = class_results[1]['TP']
fn = class_results[1]['FN']
print("test tpr", tp / (tp + fn))
print(class_results[1]['recall'])
# ok c'est bon 

test tpr 0.875
0.875


In [102]:
# sauvegarder les données 

for i in range(1, nb_class):
    class_results[i]['model'] = f'CNN1D - label - {reverse_label_mapping[i]}'
    db[f'PHY_results_cnn1d_{reverse_label_mapping[i]}'] = class_results[i]