In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('default')
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Load one CSV per dataset (Sonata, Soul, Spark); paths are relative to the
# notebook's working directory.
sonata_data = pd.read_csv('sonata_data.csv')
soul_data = pd.read_csv('soul_data.csv')
spark_data = pd.read_csv('spark_data.csv')

In [6]:
# Report (rows, columns) for each loaded frame.
for dataset_name, frame in (("Sonata", sonata_data), ("Soul", soul_data), ("Spark", spark_data)):
    print(f"{dataset_name} shape: ", frame.shape)

Sonata shape:  (535041, 12)
Soul shape:  (797843, 12)
Spark shape:  (402956, 12)


In [7]:
# Preview the first rows of the Spark frame.
spark_data.head()

Unnamed: 0,timestamp,canId,dlc,data0,data1,data2,data3,data4,data5,data6,data7,flag
0,1513920000.0,04C1,8,0,CC,80,5E,52,08,00,00,Benign
1,1513920000.0,04C7,3,10,00,00,-1,-1,-1,-1,-1,Benign
2,1513920000.0,01E1,7,0,00,00,00,00,00,00,-1,Benign
3,1513920000.0,00C1,8,0,F9,05,41,02,85,8B,91,Benign
4,1513920000.0,00C5,8,3,52,0F,1D,C3,F4,03,D4,Benign


In [8]:
# Count how often each `flag` value occurs in every dataset, one column per dataset.
flag_values = pd.DataFrame(columns=['Sonata', 'Soul', 'Spark'])
for column_name, frame in (('Sonata', sonata_data), ('Soul', soul_data), ('Spark', spark_data)):
    flag_values[column_name] = frame['flag'].value_counts()
flag_values

Unnamed: 0_level_0,Sonata,Soul,Spark
flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Benign,468527,717489,366510
Flooding,32422,33141,22587
Fuzzy,18118,39812,5812
Malfunction,15974,7401,8047


In [9]:
# Remove the timestamp column from every frame. The frames are rebound to the
# same names, so errors='ignore' keeps this cell re-runnable: without it a second
# execution raises KeyError because 'timestamp' has already been dropped.
sonata_data = sonata_data.drop(columns=['timestamp'], errors='ignore')
soul_data = soul_data.drop(columns=['timestamp'], errors='ignore')
spark_data = spark_data.drop(columns=['timestamp'], errors='ignore')
spark_data.head()

Unnamed: 0,canId,dlc,data0,data1,data2,data3,data4,data5,data6,data7,flag
0,04C1,8,0,CC,80,5E,52,08,00,00,Benign
1,04C7,3,10,00,00,-1,-1,-1,-1,-1,Benign
2,01E1,7,0,00,00,00,00,00,00,-1,Benign
3,00C1,8,0,F9,05,41,02,85,8B,91,Benign
4,00C5,8,3,52,0F,1D,C3,F4,03,D4,Benign


In [10]:
# Separate the feature columns (X_*) from the `flag` label (Y_*) for each dataset.
# Each Y_* is a one-column DataFrame so further label columns can be added later.
Y_sonata = sonata_data['flag'].to_frame()
X_sonata = sonata_data.drop(columns=['flag'])

Y_soul = soul_data['flag'].to_frame()
X_soul = soul_data.drop(columns=['flag'])

Y_spark = spark_data['flag'].to_frame()
X_spark = spark_data.drop(columns=['flag'])

In [11]:
# Preview the Spark feature frame (the `flag` column has been removed).
X_spark.head()

Unnamed: 0,canId,dlc,data0,data1,data2,data3,data4,data5,data6,data7
0,04C1,8,0,CC,80,5E,52,08,00,00
1,04C7,3,10,00,00,-1,-1,-1,-1,-1
2,01E1,7,0,00,00,00,00,00,00,-1
3,00C1,8,0,F9,05,41,02,85,8B,91
4,00C5,8,3,52,0F,1D,C3,F4,03,D4


In [12]:
# Preview the Sonata label frame.
Y_sonata.head()

Unnamed: 0,flag
0,Benign
1,Benign
2,Benign
3,Benign
4,Benign


In [13]:
def _encode_flags(flags, encoder):
    """Build the label frame for one dataset from its raw `flag` column.

    Args:
        flags: Series of flag strings for one dataset.
        encoder: Fitted LabelEncoder shared by all datasets.

    Returns:
        DataFrame indexed like `flags` with two columns, in this order:
        'label_multiclass' (encoder output) and 'label_binary'
        (0 for 'Benign', 1 for every other flag).
    """
    return pd.DataFrame(
        {
            'label_multiclass': encoder.transform(flags),
            'label_binary': flags.ne('Benign').astype('int64'),
        },
        index=flags.index,
    )


# Fit one encoder on the flags of all three datasets so the integer codes agree
# across datasets.
label_encoder = LabelEncoder()
combined_flags = pd.concat([sonata_data['flag'], soul_data['flag'], spark_data['flag']], axis=0)
label_encoder.fit(combined_flags)

# The label frames are rebuilt from the source `flag` columns instead of being
# mutated in place, so this cell can be re-executed without a KeyError on 'flag'.
Y_sonata = _encode_flags(sonata_data['flag'], label_encoder)
Y_soul = _encode_flags(soul_data['flag'], label_encoder)
Y_spark = _encode_flags(spark_data['flag'], label_encoder)

In [14]:
# Map every encoded integer back to its class name; the scoring functions read
# this mapping.
encoded_ids = label_encoder.transform(label_encoder.classes_)
label_mapping = {code: name for code, name in zip(encoded_ids, label_encoder.classes_)}

print("Label Mapping (Encoded -> Class):")
for code, name in label_mapping.items():
    print(f"{code} -> {name}")

Label Mapping (Encoded -> Class):
0 -> Benign
1 -> Flooding
2 -> Fuzzy
3 -> Malfunction


In [15]:
# Preview the encoded Sonata labels (multiclass and binary columns).
Y_sonata.head()

Unnamed: 0,label_multiclass,label_binary
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [16]:
# Inspect the column dtypes before the hexadecimal fields are converted.
print(X_sonata.dtypes)

canId    object
dlc       int64
data0    object
data1    object
data2    object
data3    object
data4    object
data5    object
data6    object
data7    object
dtype: object


In [17]:
def preprocess_hex_values(data):
    """Convert the hexadecimal text columns of a feature frame to integers.

    Columns named 'canId' or starting with 'data' are parsed with base 16
    (so '04C1' -> 1217 and the '-1' placeholder -> -1). All other columns are
    left unchanged.

    The input frame is not modified; a converted copy is returned. Values that
    are already integers are passed through unchanged, which makes the function
    idempotent: running it again on its own output no longer raises TypeError.
    NOTE: this assumes integer values have already been converted; the hex
    columns should be loaded as strings.

    Args:
        data: DataFrame with string column names.

    Returns:
        A new DataFrame with the hexadecimal columns converted to integers.

    Raises:
        ValueError: if a string value is not valid hexadecimal.
        TypeError: if a value is neither a string nor an integer (e.g. NaN).
    """
    def _parse_hex(value):
        # Already-numeric values come from a previous conversion pass.
        if isinstance(value, (int, np.integer)):
            return value
        return int(value, 16)

    converted = data.copy()
    for col in converted.columns:
        if col.startswith('data') or col == 'canId':
            converted[col] = converted[col].map(_parse_hex)
    return converted

# Convert the hexadecimal columns of all three feature frames, in the same order
# as before (Sonata, Soul, Spark).
X_sonata, X_soul, X_spark = [
    preprocess_hex_values(frame) for frame in (X_sonata, X_soul, X_spark)
]

In [18]:
# Preview the Spark features after the hexadecimal conversion.
X_spark.head()

Unnamed: 0,canId,dlc,data0,data1,data2,data3,data4,data5,data6,data7
0,1217,8,0,204,128,94,82,8,0,0
1,1223,3,16,0,0,-1,-1,-1,-1,-1
2,481,7,0,0,0,0,0,0,0,-1
3,193,8,0,249,5,65,2,133,139,145
4,197,8,3,82,15,29,195,244,3,212


In [19]:
# 80/20 split per dataset with a fixed seed. Stratifying on the whole label frame
# stratifies on the combination of its columns.
split_options = {'test_size': 0.2, 'random_state': 42}

X_train_sonata, X_test_sonata, Y_train_sonata, Y_test_sonata = train_test_split(
    X_sonata, Y_sonata, stratify=Y_sonata, **split_options
)
X_train_soul, X_test_soul, Y_train_soul, Y_test_soul = train_test_split(
    X_soul, Y_soul, stratify=Y_soul, **split_options
)
X_train_spark, X_test_spark, Y_train_spark, Y_test_spark = train_test_split(
    X_spark, Y_spark, stratify=Y_spark, **split_options
)


In [20]:
class CanBusDataset(Dataset):
    """Dataset yielding (features, multiclass label, binary label) tensors.

    Args:
        X: Feature table (DataFrame or array-like), one row per sample.
        Y: Label table (DataFrame or array-like) with at least two columns:
            column 0 is returned as the multiclass label, column 1 as the
            binary label.
        target_device: Optional device for the returned tensors. When omitted,
            the module-level `device` is used, as before.

    Raises:
        ValueError: if `Y` is not 2-D with at least two columns.
    """

    def __init__(self, X, Y, target_device=None):
        features = X.values if isinstance(X, pd.DataFrame) else np.array(X)
        labels = Y.values if isinstance(Y, pd.DataFrame) else np.array(Y)

        # Convert to float32 once here instead of on every __getitem__ call.
        self.X = np.asarray(features, dtype=np.float32)
        self.Y = np.asarray(labels, dtype=np.float32)

        if self.Y.ndim != 2 or self.Y.shape[1] < 2:
            raise ValueError(
                "Y must be 2-D with the multiclass label in column 0 and the "
                "binary label in column 1"
            )

        # The global `device` is only looked up when no override is given.
        self.device = target_device if target_device is not None else device

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # torch.tensor copies, so returned tensors never alias the stored arrays.
        features = torch.tensor(self.X[idx]).to(self.device)
        label_multiclass = torch.tensor(self.Y[idx, 0]).to(self.device)
        label_binary = torch.tensor(self.Y[idx, 1]).to(self.device)
        return features, label_multiclass, label_binary


In [21]:
def _make_loaders(X_train, Y_train, X_test, Y_test):
    """Return (train_dataset, test_dataset, train_loader, test_loader) for one dataset.

    Both loaders use batches of 512; only the training loader shuffles.
    """
    train_dataset = CanBusDataset(X_train, Y_train)
    test_dataset = CanBusDataset(X_test, Y_test)
    train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)
    return train_dataset, test_dataset, train_loader, test_loader


(train_dataset_sonata, test_dataset_sonata,
 train_loader_sonata, test_loader_sonata) = _make_loaders(
    X_train_sonata, Y_train_sonata, X_test_sonata, Y_test_sonata)

(train_dataset_soul, test_dataset_soul,
 train_loader_soul, test_loader_soul) = _make_loaders(
    X_train_soul, Y_train_soul, X_test_soul, Y_test_soul)

(train_dataset_spark, test_dataset_spark,
 train_loader_spark, test_loader_spark) = _make_loaders(
    X_train_spark, Y_train_spark, X_test_spark, Y_test_spark)

In [22]:
# Sanity check: print the tensor shapes of the first training batch only.
first_batch = next(iter(train_loader_sonata), None)
if first_batch is not None:
    datas, labels = first_batch[0], first_batch[1]
    print("Datas shape:", datas.shape)
    print("Labels shape:", labels.shape)

Datas shape: torch.Size([512, 10])
Labels shape: torch.Size([512])


In [23]:
# Number of samples in the Sonata train and test datasets.
len(train_dataset_sonata), len(test_dataset_sonata)

(428032, 107009)

In [24]:
class LSTMNetwork(nn.Module):
    """Stack of single-layer LSTMs followed by a linear output layer.

    Args:
        input_size: Number of features per time step.
        hidden_sizes: Hidden size of each stacked LSTM. Needs at least
            `num_layers` entries; the output layer is sized from the last entry.
        output_size: Width of the final linear layer.
        num_layers: Number of LSTM layers to build.
        network_type: 'binary' applies a sigmoid to the output and trains on the
            binary label; any other value returns raw scores and trains on the
            multiclass label.
    """

    def __init__(self, input_size, hidden_sizes, output_size, num_layers, network_type):
        super().__init__()
        self.hidden_sizes = hidden_sizes
        self.num_layers = num_layers

        # Separate nn.LSTM modules so every layer can have its own hidden size.
        self.lstm_layers = nn.ModuleList()
        for i in range(num_layers):
            self.lstm_layers.append(
                nn.LSTM(
                    input_size=input_size if i == 0 else hidden_sizes[i - 1],
                    hidden_size=hidden_sizes[i],
                    num_layers=1,
                    batch_first=True,
                )
            )

        self.fc = nn.Linear(hidden_sizes[-1], output_size)
        self.network_type = network_type

    def forward(self, X):
        """Return the output for a batch; 2-D input is treated as sequences of length 1."""
        if X.dim() == 2:
            X = X.unsqueeze(1)

        for lstm in self.lstm_layers:
            X, _ = lstm(X)

        # Classify from the output at the last time step.
        output = self.fc(X[:, -1])
        if self.network_type == 'binary':
            return torch.sigmoid(output)
        return output

    def _model_device(self):
        """Device the model parameters currently live on."""
        return next(self.parameters()).device

    def _prepare_batch(self, X_batch, y_batch_multiclass, y_batch_binary):
        """Move one batch to the model device and pick the target for this network type."""
        target_device = self._model_device()
        X_batch = X_batch.to(target_device)
        if self.network_type == 'binary':
            y_batch = y_batch_binary.to(target_device).float().reshape(-1)
        else:
            y_batch = y_batch_multiclass.to(target_device).long().reshape(-1)
        return X_batch, y_batch

    def _batch_loss(self, X_batch, y_batch, loss_function):
        """Forward one prepared batch and return its loss."""
        y_pred = self(X_batch)
        if self.network_type == 'binary':
            # Drop only the trailing output axis; a bare squeeze() would also
            # collapse the batch axis when a batch holds a single sample.
            y_pred = y_pred.squeeze(-1)
        return loss_function(y_pred, y_batch)

    def fit(self, train_dataloader, test_dataloader, epochs, optimizer, loss_function):
        """Train for `epochs` epochs and evaluate on the test loader after each one.

        Both loaders must yield (X, y_multiclass, y_binary) batches. Batches are
        moved to the model's device, consistent with `predict`.

        Returns:
            (train_losses, test_losses): lists with the mean per-batch loss of
            every epoch.
        """
        train_losses = []
        test_losses = []

        for epoch in range(epochs):
            self.train()
            train_loss = 0.0
            for batch in train_dataloader:
                X_batch, y_batch = self._prepare_batch(*batch)
                optimizer.zero_grad()
                loss = self._batch_loss(X_batch, y_batch, loss_function)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
            # max(..., 1) avoids a ZeroDivisionError on an empty loader.
            train_losses.append(train_loss / max(len(train_dataloader), 1))

            self.eval()
            test_loss = 0.0
            with torch.no_grad():
                for batch in test_dataloader:
                    X_batch, y_batch = self._prepare_batch(*batch)
                    test_loss += self._batch_loss(X_batch, y_batch, loss_function).item()
            test_losses.append(test_loss / max(len(test_dataloader), 1))

            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_losses[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}")

        return train_losses, test_losses

    def predict(self, dataloader):
        """Return hard predictions for every sample of `dataloader` as a NumPy array.

        For 'binary' networks the result holds 0.0/1.0 (output thresholded at
        0.5) and keeps the trailing output axis; otherwise it holds the index of
        the highest-scoring class per sample.
        """
        target_device = self._model_device()
        batches = []
        self.eval()
        with torch.no_grad():
            for X_batch, _, _ in dataloader:
                outputs = self(X_batch.to(target_device))
                if self.network_type == 'binary':
                    predicted = (outputs > 0.5).float()
                else:
                    predicted = outputs.argmax(dim=1)
                batches.append(predicted.cpu().numpy())
        if not batches:
            return np.array([])
        return np.concatenate(batches, axis=0)

In [25]:
def verify_parameters_multiclass_classification(y_test, y_pred_tab, parameters):
    """Score several prediction sets against the same multiclass ground truth.

    Args:
        y_test: True class labels.
        y_pred_tab: Sequence of prediction arrays; entry i belongs to parameters[i].
        parameters: Sequence of parameter values; str(value) becomes the row label.

    Returns:
        DataFrame with one row per parameter and the columns Accuracy, Recall
        (macro), F1 Score (macro), FPR and FNR. FPR and FNR are unweighted means
        of the per-class rates derived from the confusion matrix.
    """
    metric_names = ['Accuracy', 'Recall', 'F1 Score', 'FPR', 'FNR']
    results = pd.DataFrame(columns=metric_names)

    for position, parameter in enumerate(parameters):
        y_pred = y_pred_tab[position]

        # One-vs-rest counts per class, read off the confusion matrix.
        cm = confusion_matrix(y_test, y_pred)
        true_pos = np.diag(cm)
        false_pos = cm.sum(axis=0) - true_pos
        false_neg = cm.sum(axis=1) - true_pos
        true_neg = cm.sum() - (true_pos + false_pos + false_neg)

        results.loc[str(parameter)] = [
            accuracy_score(y_test, y_pred),
            recall_score(y_test, y_pred, average='macro'),
            f1_score(y_test, y_pred, average='macro'),
            (false_pos / (false_pos + true_neg)).mean(),
            (false_neg / (false_neg + true_pos)).mean(),
        ]

    return results

In [26]:
def _binary_score_row(y_true, y_pred):
    """Return [accuracy, recall, f1, FPR, FNR] for one binary label/prediction pair."""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = fp + tn
    positives = fn + tp
    return [
        accuracy_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        fp / negatives if negatives > 0 else 0,
        fn / positives if positives > 0 else 0,
    ]


def calculate_scores_binary_classification(y_test, y_pred_tab):
    """Score binary predictions per attack class and over the whole test set.

    Args:
        y_test: DataFrame with 'label_multiclass' and 'label_binary' columns.
        y_pred_tab: Binary predictions, positionally aligned with `y_test`.

    Returns:
        DataFrame with the columns Accuracy, Recall, F1 Score, FPR and FNR. It
        has one row per entry of `label_mapping` except the first, each scored
        on the samples of that class plus those with multiclass label 0; an
        'Average' row with the mean of those rows; and an 'Overall' row scored
        on every sample.
    """
    results = pd.DataFrame(columns=['Accuracy', 'Recall', 'F1 Score', 'FPR', 'FNR'])
    multiclass = y_test['label_multiclass']

    # The first mapping entry is skipped; label 0 is kept as the reference class.
    attack_classes = list(label_mapping.items())[1:]
    for encoded_label, class_name in attack_classes:
        keep = (multiclass == encoded_label) | (multiclass == 0)
        results.loc[class_name] = _binary_score_row(
            y_test[keep]['label_binary'], y_pred_tab[keep]
        )

    # 'Average' must be added before 'Overall' so it covers only the class rows.
    results.loc['Average'] = results.mean()
    results.loc['Overall'] = _binary_score_row(y_test['label_binary'], y_pred_tab)

    return results

In [27]:
def calculate_scores_multiclass_classification(y_test, y_pred_tab):
    """Compute per-class one-vs-rest scores for multiclass predictions.

    Args:
        y_test: DataFrame with a 'label_multiclass' column of true labels.
        y_pred_tab: Predicted class labels, aligned with `y_test`.

    Returns:
        DataFrame with the columns Accuracy, Recall, F1 Score, FPR and FNR, one
        row per class in `label_mapping` plus an 'Average' row. 'Accuracy' is
        the overall accuracy and is therefore the same in every class row.

    Ratios with a zero denominator (for example the precision of a class that
    is never predicted) are reported as 0 instead of NaN. NaN values would be
    skipped by `results.mean()` and silently inflate the 'Average' row.
    """
    def _safe_ratio(numerator, denominator):
        # Element-wise numerator / denominator, with 0.0 where the denominator is 0.
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        return np.divide(
            numerator,
            denominator,
            out=np.zeros(numerator.shape, dtype=float),
            where=denominator != 0,
        )

    results = pd.DataFrame(columns=['Accuracy', 'Recall', 'F1 Score', 'FPR', 'FNR'])

    # Take the label list from the mapping so the matrix rows and the class
    # names below cannot get out of sync.
    labels = list(label_mapping)
    cm = confusion_matrix(y_test['label_multiclass'], y_pred_tab, labels=labels)

    tp = np.diag(cm)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    tn = cm.sum() - (tp + fp + fn)

    total = cm.sum()
    accuracy = tp.sum() / total if total > 0 else 0.0
    recall = _safe_ratio(tp, tp + fn)
    precision = _safe_ratio(tp, tp + fp)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    fpr = _safe_ratio(fp, fp + tn)
    fnr = _safe_ratio(fn, fn + tp)

    # Confusion-matrix row i corresponds to the i-th entry of label_mapping.
    for position, class_name in enumerate(label_mapping.values()):
        results.loc[class_name] = [accuracy, recall[position], f1[position], fpr[position], fnr[position]]

    results.loc['Average'] = results.mean()

    return results

In [28]:
# Number of feature columns per sample. The model cells below pass the literal 10
# rather than this variable.
input_size = X_train_sonata.shape[1]
input_size

10

#### Sonata - binary classification

In [None]:
# Binary model for Sonata: one 512-unit LSTM layer, Adam at lr=0.0001, 20 epochs.
lstm = LSTMNetwork(input_size=10, hidden_sizes=[512], output_size=1, num_layers=1, network_type='binary')
lstm = lstm.to(device)
optimizer = optim.Adam(lstm.parameters(), lr=0.0001)
criterion = nn.BCELoss()
# fit returns (train_losses, test_losses); the pair is kept under the name `loss`.
loss = lstm.fit(train_loader_sonata, test_loader_sonata, 20, optimizer, criterion)
predictions = lstm.predict(test_loader_sonata)

Epoch 1/20 - Train Loss: 0.1269, Test Loss: 0.0346
Epoch 2/20 - Train Loss: 0.0174, Test Loss: 0.0119
Epoch 3/20 - Train Loss: 0.0082, Test Loss: 0.0069
Epoch 4/20 - Train Loss: 0.0053, Test Loss: 0.0047
Epoch 5/20 - Train Loss: 0.0038, Test Loss: 0.0035
Epoch 6/20 - Train Loss: 0.0029, Test Loss: 0.0029
Epoch 7/20 - Train Loss: 0.0023, Test Loss: 0.0023
Epoch 8/20 - Train Loss: 0.0019, Test Loss: 0.0020
Epoch 9/20 - Train Loss: 0.0015, Test Loss: 0.0017
Epoch 10/20 - Train Loss: 0.0012, Test Loss: 0.0014
Epoch 11/20 - Train Loss: 0.0010, Test Loss: 0.0011
Epoch 12/20 - Train Loss: 0.0008, Test Loss: 0.0010
Epoch 13/20 - Train Loss: 0.0006, Test Loss: 0.0009
Epoch 14/20 - Train Loss: 0.0005, Test Loss: 0.0008
Epoch 15/20 - Train Loss: 0.0004, Test Loss: 0.0015
Epoch 16/20 - Train Loss: 0.0004, Test Loss: 0.0014
Epoch 17/20 - Train Loss: 0.0003, Test Loss: 0.0015
Epoch 18/20 - Train Loss: 0.0003, Test Loss: 0.0014
Epoch 19/20 - Train Loss: 0.0002, Test Loss: 0.0013
Epoch 20/20 - Train L

In [None]:
# Per-attack, average and overall binary metrics for the Sonata test split.
df_scores = calculate_scores_binary_classification(Y_test_sonata, predictions)

In [None]:
# Display the score table computed in the previous cell.
df_scores

Unnamed: 0,Accuracy,Recall,F1 Score,FPR,FNR
Flooding,0.99999,1.0,0.999923,1.1e-05,0.0
Fuzzy,0.999928,0.998344,0.999034,1.1e-05,0.001656
Malfunction,0.99999,1.0,0.999844,1.1e-05,0.0
Average,0.999969,0.999448,0.9996,1.1e-05,0.000552
Overall,0.999935,0.999549,0.999737,1.1e-05,0.000451


#### Soul - binary classification

In [None]:
# Binary model for Soul: one 512-unit LSTM layer, Adam at lr=0.0001, 20 epochs.
lstm = LSTMNetwork(input_size=10, hidden_sizes=[512], output_size=1, num_layers=1, network_type='binary')
lstm = lstm.to(device)
optimizer = optim.Adam(lstm.parameters(), lr=0.0001)
criterion = nn.BCELoss()
# fit returns (train_losses, test_losses); the pair is kept under the name `loss`.
loss = lstm.fit(train_loader_soul, test_loader_soul, 20, optimizer, criterion)
predictions = lstm.predict(test_loader_soul)

Epoch 1/20 - Train Loss: 0.0811, Test Loss: 0.0139
Epoch 2/20 - Train Loss: 0.0087, Test Loss: 0.0055
Epoch 3/20 - Train Loss: 0.0042, Test Loss: 0.0032
Epoch 4/20 - Train Loss: 0.0025, Test Loss: 0.0020
Epoch 5/20 - Train Loss: 0.0017, Test Loss: 0.0016
Epoch 6/20 - Train Loss: 0.0012, Test Loss: 0.0011
Epoch 7/20 - Train Loss: 0.0009, Test Loss: 0.0009
Epoch 8/20 - Train Loss: 0.0007, Test Loss: 0.0007
Epoch 9/20 - Train Loss: 0.0006, Test Loss: 0.0007
Epoch 10/20 - Train Loss: 0.0005, Test Loss: 0.0006
Epoch 11/20 - Train Loss: 0.0004, Test Loss: 0.0006
Epoch 12/20 - Train Loss: 0.0004, Test Loss: 0.0006
Epoch 13/20 - Train Loss: 0.0003, Test Loss: 0.0005
Epoch 14/20 - Train Loss: 0.0003, Test Loss: 0.0005
Epoch 15/20 - Train Loss: 0.0003, Test Loss: 0.0004
Epoch 16/20 - Train Loss: 0.0002, Test Loss: 0.0005
Epoch 17/20 - Train Loss: 0.0002, Test Loss: 0.0004
Epoch 18/20 - Train Loss: 0.0002, Test Loss: 0.0004
Epoch 19/20 - Train Loss: 0.0002, Test Loss: 0.0004
Epoch 20/20 - Train L

In [None]:
# Per-attack, average and overall binary metrics for the Soul test split.
df_scores = calculate_scores_binary_classification(Y_test_soul, predictions)
df_scores

Unnamed: 0,Accuracy,Recall,F1 Score,FPR,FNR
Flooding,0.999987,1.0,0.999849,1.4e-05,0.0
Fuzzy,0.999947,0.999247,0.999498,1.4e-05,0.000753
Malfunction,0.999972,0.998649,0.998649,1.4e-05,0.001351
Average,0.999969,0.999298,0.999332,1.4e-05,0.000702
Overall,0.999937,0.999502,0.999689,1.4e-05,0.000498


#### Spark - binary classification

In [None]:
# Binary model for Spark: one 512-unit LSTM layer, Adam at lr=0.0001, 20 epochs.
lstm = LSTMNetwork(input_size=10, hidden_sizes=[512], output_size=1, num_layers=1, network_type='binary')
lstm = lstm.to(device)
optimizer = optim.Adam(lstm.parameters(), lr=0.0001)
criterion = nn.BCELoss()
# fit returns (train_losses, test_losses); the pair is kept under the name `loss`.
loss = lstm.fit(train_loader_spark, test_loader_spark, 20, optimizer, criterion)
predictions = lstm.predict(test_loader_spark)

Epoch 1/20 - Train Loss: 0.1186, Test Loss: 0.0331
Epoch 2/20 - Train Loss: 0.0217, Test Loss: 0.0156
Epoch 3/20 - Train Loss: 0.0127, Test Loss: 0.0114
Epoch 4/20 - Train Loss: 0.0096, Test Loss: 0.0093
Epoch 5/20 - Train Loss: 0.0080, Test Loss: 0.0081
Epoch 6/20 - Train Loss: 0.0069, Test Loss: 0.0073
Epoch 7/20 - Train Loss: 0.0062, Test Loss: 0.0068
Epoch 8/20 - Train Loss: 0.0056, Test Loss: 0.0062
Epoch 9/20 - Train Loss: 0.0051, Test Loss: 0.0059
Epoch 10/20 - Train Loss: 0.0047, Test Loss: 0.0054
Epoch 11/20 - Train Loss: 0.0043, Test Loss: 0.0052
Epoch 12/20 - Train Loss: 0.0040, Test Loss: 0.0049
Epoch 13/20 - Train Loss: 0.0037, Test Loss: 0.0045
Epoch 14/20 - Train Loss: 0.0034, Test Loss: 0.0044
Epoch 15/20 - Train Loss: 0.0031, Test Loss: 0.0042
Epoch 16/20 - Train Loss: 0.0029, Test Loss: 0.0039
Epoch 17/20 - Train Loss: 0.0027, Test Loss: 0.0037
Epoch 18/20 - Train Loss: 0.0025, Test Loss: 0.0036
Epoch 19/20 - Train Loss: 0.0024, Test Loss: 0.0035
Epoch 20/20 - Train L

In [None]:
# Per-attack, average and overall binary metrics for the Spark test split.
df_scores = calculate_scores_binary_classification(Y_test_spark, predictions)
df_scores

Unnamed: 0,Accuracy,Recall,F1 Score,FPR,FNR
Flooding,0.99955,1.0,0.996142,0.000477,0.0
Fuzzy,0.999235,0.981067,0.97561,0.000477,0.018933
Malfunction,0.999533,1.0,0.989241,0.000477,0.0
Average,0.999439,0.993689,0.986997,0.000477,0.006311
Overall,0.999293,0.996982,0.996093,0.000477,0.003018


#### Sonata - multiclass classification

In [None]:
# Multiclass model for Sonata: one 512-unit LSTM layer, 4 outputs, NAdam at lr=0.0001, 20 epochs.
lstm = LSTMNetwork(input_size=10, hidden_sizes=[512], output_size=4, num_layers=1, network_type='multiclass')
lstm = lstm.to(device)
optimizer = optim.NAdam(lstm.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
# fit returns (train_losses, test_losses); the pair is kept under the name `loss`.
loss = lstm.fit(train_loader_sonata, test_loader_sonata, 20, optimizer, criterion)
predictions = lstm.predict(test_loader_sonata)

Epoch 1/20 - Train Loss: 0.1653, Test Loss: 0.0287
Epoch 2/20 - Train Loss: 0.0146, Test Loss: 0.0094
Epoch 3/20 - Train Loss: 0.0065, Test Loss: 0.0051
Epoch 4/20 - Train Loss: 0.0040, Test Loss: 0.0034
Epoch 5/20 - Train Loss: 0.0028, Test Loss: 0.0027
Epoch 6/20 - Train Loss: 0.0022, Test Loss: 0.0021
Epoch 7/20 - Train Loss: 0.0017, Test Loss: 0.0018
Epoch 8/20 - Train Loss: 0.0014, Test Loss: 0.0014
Epoch 9/20 - Train Loss: 0.0011, Test Loss: 0.0014
Epoch 10/20 - Train Loss: 0.0009, Test Loss: 0.0011
Epoch 11/20 - Train Loss: 0.0007, Test Loss: 0.0009
Epoch 12/20 - Train Loss: 0.0006, Test Loss: 0.0009
Epoch 13/20 - Train Loss: 0.0004, Test Loss: 0.0008
Epoch 14/20 - Train Loss: 0.0004, Test Loss: 0.0007
Epoch 15/20 - Train Loss: 0.0003, Test Loss: 0.0006
Epoch 16/20 - Train Loss: 0.0003, Test Loss: 0.0006
Epoch 17/20 - Train Loss: 0.0002, Test Loss: 0.0006
Epoch 18/20 - Train Loss: 0.0002, Test Loss: 0.0006
Epoch 19/20 - Train Loss: 0.0002, Test Loss: 0.0005
Epoch 20/20 - Train L

In [None]:
# Per-class multiclass metrics for the Sonata test split.
df_scores = calculate_scores_multiclass_classification(Y_test_sonata, predictions)

[[93705     0     1     0]
 [    0  6484     0     0]
 [    4     0  3620     0]
 [    0     0     0  3195]]
[93705  6484  3620  3195]
[4 0 1 0]


In [None]:
# Display the score table computed in the previous cell.
df_scores

Unnamed: 0,Accuracy,Recall,F1 Score,FPR,FNR
Benign,0.999953,0.999989,0.999973,0.000301,1.1e-05
Flooding,0.999953,1.0,1.0,0.0,0.0
Fuzzy,0.999953,0.998896,0.99931,1e-05,0.001104
Malfunction,0.999953,1.0,1.0,0.0,0.0
Average,0.999953,0.999721,0.999821,7.8e-05,0.000279


#### Soul - multiclass classification

In [None]:
# Multiclass model for Soul: one 512-unit LSTM layer, 4 outputs, NAdam at lr=0.0001, 20 epochs.
lstm = LSTMNetwork(input_size=10, hidden_sizes=[512], output_size=4, num_layers=1, network_type='multiclass')
lstm = lstm.to(device)
optimizer = optim.NAdam(lstm.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
# fit returns (train_losses, test_losses); the pair is kept under the name `loss`.
loss = lstm.fit(train_loader_soul, test_loader_soul, 20, optimizer, criterion)
predictions = lstm.predict(test_loader_soul)

Epoch 1/20 - Train Loss: 0.1101, Test Loss: 0.0226
Epoch 2/20 - Train Loss: 0.0178, Test Loss: 0.0153
Epoch 3/20 - Train Loss: 0.0136, Test Loss: 0.0128
Epoch 4/20 - Train Loss: 0.0119, Test Loss: 0.0117
Epoch 5/20 - Train Loss: 0.0110, Test Loss: 0.0112
Epoch 6/20 - Train Loss: 0.0104, Test Loss: 0.0107
Epoch 7/20 - Train Loss: 0.0098, Test Loss: 0.0102
Epoch 8/20 - Train Loss: 0.0094, Test Loss: 0.0099
Epoch 9/20 - Train Loss: 0.0091, Test Loss: 0.0095
Epoch 10/20 - Train Loss: 0.0088, Test Loss: 0.0094
Epoch 11/20 - Train Loss: 0.0086, Test Loss: 0.0095
Epoch 12/20 - Train Loss: 0.0084, Test Loss: 0.0090
Epoch 13/20 - Train Loss: 0.0082, Test Loss: 0.0089
Epoch 14/20 - Train Loss: 0.0079, Test Loss: 0.0092
Epoch 15/20 - Train Loss: 0.0078, Test Loss: 0.0089
Epoch 16/20 - Train Loss: 0.0077, Test Loss: 0.0085
Epoch 17/20 - Train Loss: 0.0075, Test Loss: 0.0085
Epoch 18/20 - Train Loss: 0.0074, Test Loss: 0.0082
Epoch 19/20 - Train Loss: 0.0072, Test Loss: 0.0084
Epoch 20/20 - Train L

In [None]:
# Per-class multiclass metrics for the Soul test split.
df_scores = calculate_scores_multiclass_classification(Y_test_soul, predictions)
df_scores

Unnamed: 0,Accuracy,Recall,F1 Score,FPR,FNR
Benign,0.996622,0.999986,0.999937,0.000996,1.4e-05
Flooding,0.996622,1.0,1.0,0.0,0.0
Fuzzy,0.996622,0.947633,0.965703,0.000785,0.052367
Malfunction,0.996622,0.918919,0.838471,0.002556,0.081081
Average,0.996622,0.966634,0.951028,0.001084,0.033366


#### Spark - multiclass classification


In [None]:
# Multiclass model for Spark: one 512-unit LSTM layer, 4 outputs, NAdam at lr=0.0001, 20 epochs.
lstm = LSTMNetwork(input_size=10, hidden_sizes=[512], output_size=4, num_layers=1, network_type='multiclass')
lstm = lstm.to(device)
optimizer = optim.NAdam(lstm.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
# fit returns (train_losses, test_losses); the pair is kept under the name `loss`.
loss = lstm.fit(train_loader_spark, test_loader_spark, 20, optimizer, criterion)
predictions = lstm.predict(test_loader_spark)

Epoch 1/20 - Train Loss: 0.1717, Test Loss: 0.0303
Epoch 2/20 - Train Loss: 0.0195, Test Loss: 0.0143
Epoch 3/20 - Train Loss: 0.0114, Test Loss: 0.0102
Epoch 4/20 - Train Loss: 0.0085, Test Loss: 0.0084
Epoch 5/20 - Train Loss: 0.0069, Test Loss: 0.0073
Epoch 6/20 - Train Loss: 0.0059, Test Loss: 0.0063
Epoch 7/20 - Train Loss: 0.0051, Test Loss: 0.0057
Epoch 8/20 - Train Loss: 0.0045, Test Loss: 0.0052
Epoch 9/20 - Train Loss: 0.0040, Test Loss: 0.0049
Epoch 10/20 - Train Loss: 0.0036, Test Loss: 0.0045
Epoch 11/20 - Train Loss: 0.0033, Test Loss: 0.0043
Epoch 12/20 - Train Loss: 0.0030, Test Loss: 0.0040
Epoch 13/20 - Train Loss: 0.0027, Test Loss: 0.0038
Epoch 14/20 - Train Loss: 0.0025, Test Loss: 0.0035
Epoch 15/20 - Train Loss: 0.0023, Test Loss: 0.0033
Epoch 16/20 - Train Loss: 0.0021, Test Loss: 0.0032
Epoch 17/20 - Train Loss: 0.0020, Test Loss: 0.0030
Epoch 18/20 - Train Loss: 0.0018, Test Loss: 0.0030
Epoch 19/20 - Train Loss: 0.0017, Test Loss: 0.0030
Epoch 20/20 - Train L

In [None]:
# Per-class multiclass metrics for the Spark test split.
df_scores = calculate_scores_multiclass_classification(Y_test_spark, predictions)
df_scores

Unnamed: 0,Accuracy,Recall,F1 Score,FPR,FNR
Benign,0.999429,0.999659,0.999686,0.002881,0.000341
Flooding,0.999429,1.0,1.0,0.0,0.0
Fuzzy,0.999429,0.981928,0.980241,0.000315,0.018072
Malfunction,0.999429,1.0,1.0,0.0,0.0
Average,0.999429,0.995397,0.994982,0.000799,0.004603


#### Experimenting with layers (multiclass, using Sonata dataset)

In [31]:
# Compare network depth: train one multiclass model per configuration on Sonata
# and collect its test-set predictions.
dict_sizes = {
    'L1': [512],
    'L2': [512, 512],
    'L3': [512, 512, 256],
    'L4': [512, 512, 256, 128],
    'L5': [512, 512, 256, 128, 64],
}
layers = list(dict_sizes.keys())
sizes = list(dict_sizes.values())
predictions = []

for layer_name, hidden_sizes in dict_sizes.items():
    print(f"Number of Layers: {layer_name}, Hidden Sizes: {hidden_sizes}")
    lstm = LSTMNetwork(
        input_size=10,
        hidden_sizes=hidden_sizes,
        output_size=4,
        num_layers=len(hidden_sizes),
        network_type='multiclass',
    ).to(device)
    layer_optimizer = optim.NAdam(lstm.parameters(), lr=0.0001)
    loss = lstm.fit(train_loader_sonata, test_loader_sonata, 20, layer_optimizer, nn.CrossEntropyLoss())
    predictions.append(lstm.predict(test_loader_sonata))
    print()

# One result row per layer configuration.
df_param_layers = verify_parameters_multiclass_classification(Y_test_sonata['label_multiclass'], predictions, layers)
df_param_layers

Number of Layers: L1, Hidden Sizes: [512]
Epoch 1/20 - Train Loss: 0.1574, Test Loss: 0.0239
Epoch 2/20 - Train Loss: 0.0130, Test Loss: 0.0084
Epoch 3/20 - Train Loss: 0.0061, Test Loss: 0.0048
Epoch 4/20 - Train Loss: 0.0039, Test Loss: 0.0034
Epoch 5/20 - Train Loss: 0.0029, Test Loss: 0.0027
Epoch 6/20 - Train Loss: 0.0022, Test Loss: 0.0022
Epoch 7/20 - Train Loss: 0.0018, Test Loss: 0.0019
Epoch 8/20 - Train Loss: 0.0014, Test Loss: 0.0017
Epoch 9/20 - Train Loss: 0.0012, Test Loss: 0.0014
Epoch 10/20 - Train Loss: 0.0010, Test Loss: 0.0011
Epoch 11/20 - Train Loss: 0.0008, Test Loss: 0.0010
Epoch 12/20 - Train Loss: 0.0006, Test Loss: 0.0009
Epoch 13/20 - Train Loss: 0.0005, Test Loss: 0.0007
Epoch 14/20 - Train Loss: 0.0004, Test Loss: 0.0007
Epoch 15/20 - Train Loss: 0.0003, Test Loss: 0.0007
Epoch 16/20 - Train Loss: 0.0003, Test Loss: 0.0006
Epoch 17/20 - Train Loss: 0.0002, Test Loss: 0.0006
Epoch 18/20 - Train Loss: 0.0002, Test Loss: 0.0006
Epoch 19/20 - Train Loss: 0.000

Unnamed: 0,Accuracy,Recall,F1 Score,FPR,FNR
L1,0.999953,0.999721,0.999821,7.8e-05,0.000279
L2,0.999925,0.999581,0.999713,0.000118,0.000419
L3,0.999869,0.999101,0.999498,0.000247,0.000899
L4,0.999794,0.999212,0.999212,0.000233,0.000788
L5,0.99985,0.998963,0.999426,0.000284,0.001037


#### Experimenting with learning rate (multiclass, using Sonata dataset)

In [29]:
# Compare learning rates: same single-layer multiclass model and NAdam optimizer,
# only the learning rate varies.
learning_rates = [0.0001, 0.001, 0.01, 0.5]
predictions = []

for lr in learning_rates:
    print(f"Learning Rate: {lr}")
    lstm = LSTMNetwork(
        input_size=10,
        hidden_sizes=[512],
        output_size=4,
        num_layers=1,
        network_type='multiclass',
    ).to(device)
    lr_optimizer = optim.NAdam(lstm.parameters(), lr=lr)
    loss = lstm.fit(train_loader_sonata, test_loader_sonata, 20, lr_optimizer, nn.CrossEntropyLoss())
    predictions.append(lstm.predict(test_loader_sonata))
    print()

# One result row per learning rate.
df_param_lr = verify_parameters_multiclass_classification(Y_test_sonata['label_multiclass'], predictions, learning_rates)
df_param_lr

Learning Rate: 0.0001
Epoch 1/20 - Train Loss: 0.1444, Test Loss: 0.0256
Epoch 2/20 - Train Loss: 0.0128, Test Loss: 0.0084
Epoch 3/20 - Train Loss: 0.0058, Test Loss: 0.0047
Epoch 4/20 - Train Loss: 0.0037, Test Loss: 0.0033
Epoch 5/20 - Train Loss: 0.0027, Test Loss: 0.0025
Epoch 6/20 - Train Loss: 0.0021, Test Loss: 0.0021
Epoch 7/20 - Train Loss: 0.0017, Test Loss: 0.0018
Epoch 8/20 - Train Loss: 0.0013, Test Loss: 0.0016
Epoch 9/20 - Train Loss: 0.0011, Test Loss: 0.0013
Epoch 10/20 - Train Loss: 0.0008, Test Loss: 0.0010
Epoch 11/20 - Train Loss: 0.0006, Test Loss: 0.0009
Epoch 12/20 - Train Loss: 0.0005, Test Loss: 0.0008
Epoch 13/20 - Train Loss: 0.0004, Test Loss: 0.0007
Epoch 14/20 - Train Loss: 0.0003, Test Loss: 0.0006
Epoch 15/20 - Train Loss: 0.0002, Test Loss: 0.0006
Epoch 16/20 - Train Loss: 0.0002, Test Loss: 0.0006
Epoch 17/20 - Train Loss: 0.0002, Test Loss: 0.0006
Epoch 18/20 - Train Loss: 0.0001, Test Loss: 0.0005
Epoch 19/20 - Train Loss: 0.0001, Test Loss: 0.0006

Unnamed: 0,Accuracy,Recall,F1 Score,FPR,FNR
0.0001,0.999916,0.999445,0.999677,0.000153,0.000555
0.001,0.999654,0.997514,0.998668,0.000679,0.002486
0.01,0.998094,0.989707,0.992485,0.002851,0.010293
0.5,0.936304,0.500207,0.49164,0.128091,0.499793


#### Experimenting with optimizers (multiclass, using Sonata dataset)

In [None]:
# Compare optimizers: same single-layer multiclass model, each optimizer class is
# looked up by name in torch.optim.
# NOTE(review): every optimizer is run with lr=0.0001; this is not tuned per
# optimizer, so the comparison may understate optimizers designed for larger steps.
optimizers = ['RMSprop', 'Adam', 'NAdam', 'Adagrad', 'Adadelta', 'Adamax']
predictions = []

for optimizer_name in optimizers:
    print(f"Optimizer: {optimizer_name}")
    optimizer = getattr(optim, optimizer_name)
    lstm = LSTMNetwork(
        input_size=10,
        hidden_sizes=[512],
        output_size=4,
        num_layers=1,
        network_type='multiclass',
    ).to(device)
    configured_optimizer = optimizer(lstm.parameters(), lr=0.0001)
    loss = lstm.fit(train_loader_sonata, test_loader_sonata, 20, configured_optimizer, nn.CrossEntropyLoss())
    predictions.append(lstm.predict(test_loader_sonata))
    print()

# One result row per optimizer.
df_param_optim = verify_parameters_multiclass_classification(Y_test_sonata['label_multiclass'], predictions, optimizers)
df_param_optim

Optimizer: RMSprop
Epoch 1/20 - Train Loss: 0.0838, Test Loss: 0.0115
Epoch 2/20 - Train Loss: 0.0056, Test Loss: 0.0032
Epoch 3/20 - Train Loss: 0.0025, Test Loss: 0.0022
Epoch 4/20 - Train Loss: 0.0018, Test Loss: 0.0018
Epoch 5/20 - Train Loss: 0.0014, Test Loss: 0.0017
Epoch 6/20 - Train Loss: 0.0011, Test Loss: 0.0014
Epoch 7/20 - Train Loss: 0.0009, Test Loss: 0.0012
Epoch 8/20 - Train Loss: 0.0007, Test Loss: 0.0010
Epoch 9/20 - Train Loss: 0.0006, Test Loss: 0.0010
Epoch 10/20 - Train Loss: 0.0005, Test Loss: 0.0008
Epoch 11/20 - Train Loss: 0.0004, Test Loss: 0.0008
Epoch 12/20 - Train Loss: 0.0003, Test Loss: 0.0007
Epoch 13/20 - Train Loss: 0.0003, Test Loss: 0.0006
Epoch 14/20 - Train Loss: 0.0002, Test Loss: 0.0006
Epoch 15/20 - Train Loss: 0.0002, Test Loss: 0.0006
Epoch 16/20 - Train Loss: 0.0002, Test Loss: 0.0005
Epoch 17/20 - Train Loss: 0.0001, Test Loss: 0.0005
Epoch 18/20 - Train Loss: 0.0001, Test Loss: 0.0005
Epoch 19/20 - Train Loss: 0.0001, Test Loss: 0.0005
Ep

Unnamed: 0,Accuracy,Recall,F1 Score,FPR,FNR
RMSprop,0.999944,0.999652,0.999785,9.6e-05,0.000348
Adam,0.999925,0.999514,0.999713,0.000134,0.000486
NAdam,0.999888,0.999239,0.99957,0.000209,0.000761
Adagrad,0.959994,0.699345,0.77388,0.078071,0.300655
Adadelta,0.936276,0.5,0.491223,0.128148,0.5
Adamax,0.999645,0.997511,0.998632,0.000681,0.002489
