In [65]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('default')
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [66]:
# Pick the compute device once: CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [67]:
# Read the three CSV files; the names on the left are bound in the same order
# as the paths listed on the right.
sonata_data, soul_data, spark_data = (
    pd.read_csv(csv_path)
    for csv_path in ('sonata_data.csv', 'soul_data.csv', 'spark_data.csv')
)

In [68]:
# Report the (rows, columns) shape of each loaded frame.
for shape_label, frame in (
    ("Sonata shape: ", sonata_data),
    ("Soul shape: ", soul_data),
    ("Spark shape: ", spark_data),
):
    print(shape_label, frame.shape)

Sonata shape:  (535041, 12)
Soul shape:  (797843, 12)
Spark shape:  (402956, 12)


In [69]:
# Peek at the first rows of the Spark frame to see the raw column layout.
spark_data.head()

Unnamed: 0,timestamp,canId,dlc,data0,data1,data2,data3,data4,data5,data6,data7,flag
0,1513920000.0,04C1,8,0,CC,80,5E,52,08,00,00,Benign
1,1513920000.0,04C7,3,10,00,00,-1,-1,-1,-1,-1,Benign
2,1513920000.0,01E1,7,0,00,00,00,00,00,00,-1,Benign
3,1513920000.0,00C1,8,0,F9,05,41,02,85,8B,91,Benign
4,1513920000.0,00C5,8,3,52,0F,1D,C3,F4,03,D4,Benign


In [70]:
# Count how often each flag value occurs, one column per dataset.
# Building the frame from a dict aligns the three Series on the union of their
# labels. Assigning columns one by one to an empty frame instead adopts the
# index of the first column, which silently drops any flag that is missing
# from that first dataset. Labels absent from a dataset are reported as 0.
# Note: when the three label orders differ, pandas may sort the row labels.
flag_values = (
    pd.DataFrame({
        'Sonata': sonata_data['flag'].value_counts(),
        'Soul': soul_data['flag'].value_counts(),
        'Spark': spark_data['flag'].value_counts(),
    })
    .fillna(0)
    .astype(int)
)
flag_values

Unnamed: 0_level_0,Sonata,Soul,Spark
flag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Benign,468527,717489,366510
Flooding,32422,33141,22587
Fuzzy,18118,39812,5812
Malfunction,15974,7401,8047


In [71]:
# Remove the timestamp column from every frame.
# errors='ignore' keeps the cell re-runnable: on a second run the column is
# already gone and the drop is a no-op instead of raising KeyError.
sonata_data = sonata_data.drop(columns=['timestamp'], errors='ignore')
soul_data = soul_data.drop(columns=['timestamp'], errors='ignore')
spark_data = spark_data.drop(columns=['timestamp'], errors='ignore')
spark_data.head()

Unnamed: 0,canId,dlc,data0,data1,data2,data3,data4,data5,data6,data7,flag
0,04C1,8,0,CC,80,5E,52,08,00,00,Benign
1,04C7,3,10,00,00,-1,-1,-1,-1,-1,Benign
2,01E1,7,0,00,00,00,00,00,00,-1,Benign
3,00C1,8,0,F9,05,41,02,85,8B,91,Benign
4,00C5,8,3,52,0F,1D,C3,F4,03,D4,Benign


In [72]:
# Split every frame into its feature columns (X_*) and a one-column
# 'flag' target frame (Y_*).
X_sonata, Y_sonata = (
    sonata_data.drop(columns=['flag']),
    pd.DataFrame(sonata_data['flag'], columns=['flag']),
)

X_soul, Y_soul = (
    soul_data.drop(columns=['flag']),
    pd.DataFrame(soul_data['flag'], columns=['flag']),
)

X_spark, Y_spark = (
    spark_data.drop(columns=['flag']),
    pd.DataFrame(spark_data['flag'], columns=['flag']),
)

In [73]:
# First rows of the Spark features after the 'flag' column was split off.
X_spark.head()

Unnamed: 0,canId,dlc,data0,data1,data2,data3,data4,data5,data6,data7
0,04C1,8,0,CC,80,5E,52,08,00,00
1,04C7,3,10,00,00,-1,-1,-1,-1,-1
2,01E1,7,0,00,00,00,00,00,00,-1
3,00C1,8,0,F9,05,41,02,85,8B,91
4,00C5,8,3,52,0F,1D,C3,F4,03,D4


In [74]:
# First rows of the Sonata target frame (still the raw 'flag' strings at this point).
Y_sonata.head()

Unnamed: 0,flag
0,Benign
1,Benign
2,Benign
3,Benign
4,Benign


In [75]:
# Fit one encoder on the flags of all three datasets so that every dataset
# shares the same class -> integer mapping.
label_encoder = LabelEncoder()
combined_flags = pd.concat([sonata_data['flag'], soul_data['flag'], spark_data['flag']], axis=0)
label_encoder.fit(combined_flags)


def _encode_flags(flags):
    """Build the two-column label frame for one dataset's 'flag' Series.

    Returns a DataFrame indexed like ``flags`` with:
      * 'label_multiclass' - the integer code assigned by ``label_encoder``;
      * 'label_binary'     - 0 where the flag is 'Benign', 1 otherwise.
    """
    return pd.DataFrame(
        {
            'label_multiclass': label_encoder.transform(flags),
            'label_binary': flags.ne('Benign').astype(int),
        },
        index=flags.index,
    )


# The targets are rebuilt from the source frames instead of being edited in
# place, so re-running this cell reproduces the same frames rather than failing
# on an already-dropped 'flag' column.
Y_sonata = _encode_flags(sonata_data['flag'])
Y_soul = _encode_flags(soul_data['flag'])
Y_spark = _encode_flags(spark_data['flag'])

In [76]:
# Pair every encoded integer with the class name it stands for, then print the table.
class_names = label_encoder.classes_
label_mapping = {
    code: name
    for code, name in zip(label_encoder.transform(class_names), class_names)
}
print("Label Mapping (Encoded -> Class):")
for code, name in label_mapping.items():
    print(f"{code} -> {name}")

Label Mapping (Encoded -> Class):
0 -> Benign
1 -> Flooding
2 -> Fuzzy
3 -> Malfunction


In [77]:
# Check that the Sonata targets now hold the two encoded label columns.
Y_sonata.head()

Unnamed: 0,label_multiclass,label_binary
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [78]:
# Show the dtype of every Sonata feature column before any conversion.
print(X_sonata.dtypes)

canId    object
dlc       int64
data0    object
data1    object
data2    object
data3    object
data4    object
data5    object
data6    object
data7    object
dtype: object


In [79]:
def preprocess_hex_values(data):
    """Return a copy of ``data`` with its hexadecimal string columns parsed to integers.

    The columns converted are 'canId' and every column whose name starts with
    'data'. Each value in those columns is parsed with ``int(value, 16)``, so
    'CC' becomes 204 and '-1' stays -1. All other columns are copied unchanged.

    The input frame is not modified; callers must use the returned frame.

    Parameters:
        data: pandas DataFrame whose hex columns contain strings.

    Returns:
        A new DataFrame with the same columns and index as ``data``.

    Raises:
        TypeError / ValueError: propagated from ``int(value, 16)`` when a value
        in a hex column is not a string or is not valid hexadecimal.
    """
    hex_columns = [
        col for col in data.columns
        # Non-string column labels cannot match either naming rule.
        if isinstance(col, str) and (col.startswith('data') or col == 'canId')
    ]
    # Work on a copy so the caller's frame keeps its original values.
    converted = data.copy()
    for col in hex_columns:
        converted[col] = data[col].apply(lambda value: int(value, 16))
    return converted

# Convert the hex-encoded columns of all three feature frames to integers.
X_sonata, X_soul, X_spark = (
    preprocess_hex_values(features)
    for features in (X_sonata, X_soul, X_spark)
)

In [80]:
# Inspect the Spark features after the hex-to-integer conversion.
X_spark.head()

Unnamed: 0,canId,dlc,data0,data1,data2,data3,data4,data5,data6,data7
0,1217,8,0,204,128,94,82,8,0,0
1,1223,3,16,0,0,-1,-1,-1,-1,-1
2,481,7,0,0,0,0,0,0,0,-1
3,193,8,0,249,5,65,2,133,139,145
4,197,8,3,82,15,29,195,244,3,212


In [81]:
# Hold out 20% of every dataset for testing. The split is seeded and stratified
# on that dataset's own label frame.
_SPLIT_OPTIONS = {'test_size': 0.2, 'random_state': 42}

X_train_sonata, X_test_sonata, Y_train_sonata, Y_test_sonata = train_test_split(
    X_sonata, Y_sonata, stratify=Y_sonata, **_SPLIT_OPTIONS
)
X_train_soul, X_test_soul, Y_train_soul, Y_test_soul = train_test_split(
    X_soul, Y_soul, stratify=Y_soul, **_SPLIT_OPTIONS
)
X_train_spark, X_test_spark, Y_train_spark, Y_test_spark = train_test_split(
    X_spark, Y_spark, stratify=Y_spark, **_SPLIT_OPTIONS
)


In [82]:
class CanBusDataset(Dataset):
    """Dataset yielding ``(features, multiclass_label, binary_label)`` tensors.

    Parameters:
        X: pandas DataFrame or array-like with one row of numeric features per sample.
        Y: pandas DataFrame or array-like with one row per sample; column 0 is
           returned as the multiclass label and column 1 as the binary label.

    Every returned tensor is float32 and is moved to the module-level ``device``.
    """

    def __init__(self, X, Y):
        features = X.values if isinstance(X, pd.DataFrame) else np.array(X)
        targets = Y.values if isinstance(Y, pd.DataFrame) else np.array(Y)
        # Cast once here rather than on every __getitem__ call: the values are
        # the same, but the per-sample conversion cost is paid a single time.
        self.X = features.astype(np.float32)
        self.Y = targets.astype(np.float32)

    def __len__(self):
        """Number of samples (rows of the feature array)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the feature vector and the two scalar labels of sample ``idx``."""
        label_row = self.Y[idx]
        return (
            torch.tensor(self.X[idx]).to(device),
            torch.tensor(label_row[0]).to(device),
            torch.tensor(label_row[1]).to(device),
        )


In [83]:
# Wrap every train/test split in a CanBusDataset.
train_dataset_sonata = CanBusDataset(X_train_sonata, Y_train_sonata)
test_dataset_sonata = CanBusDataset(X_test_sonata, Y_test_sonata)
train_dataset_soul = CanBusDataset(X_train_soul, Y_train_soul)
test_dataset_soul = CanBusDataset(X_test_soul, Y_test_soul)
train_dataset_spark = CanBusDataset(X_train_spark, Y_train_spark)
test_dataset_spark = CanBusDataset(X_test_spark, Y_test_spark)

# Training loaders shuffle their samples; test loaders keep the dataset order.
# Both use batches of 512.
_TRAIN_LOADER_OPTIONS = {'batch_size': 512, 'shuffle': True}
_TEST_LOADER_OPTIONS = {'batch_size': 512, 'shuffle': False}

train_loader_sonata = DataLoader(train_dataset_sonata, **_TRAIN_LOADER_OPTIONS)
test_loader_sonata = DataLoader(test_dataset_sonata, **_TEST_LOADER_OPTIONS)

train_loader_soul = DataLoader(train_dataset_soul, **_TRAIN_LOADER_OPTIONS)
test_loader_soul = DataLoader(test_dataset_soul, **_TEST_LOADER_OPTIONS)

train_loader_spark = DataLoader(train_dataset_spark, **_TRAIN_LOADER_OPTIONS)
test_loader_spark = DataLoader(test_dataset_spark, **_TEST_LOADER_OPTIONS)

In [84]:
# Pull a single batch from the Sonata training loader to confirm tensor shapes.
for first_batch in train_loader_sonata:
    datas, labels = first_batch[0], first_batch[1]
    print("Datas shape:", datas.shape)
    print("Labels shape:", labels.shape)
    break

Datas shape: torch.Size([512, 10])
Labels shape: torch.Size([512])


In [85]:
# Sample counts of the Sonata training and test datasets.
len(train_dataset_sonata), len(test_dataset_sonata)

(428032, 107009)

In [91]:
class LSTMNetwork(nn.Module):
    """LSTM followed by a linear layer and a sigmoid, with train/predict helpers.

    Parameters:
        input_size: number of features per time step.
        hidden_size: width of the LSTM hidden state.
        output_size: number of sigmoid outputs produced per sample.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.activation = nn.Sigmoid()

    def _model_device(self):
        """Device holding the model parameters; batches are moved here before use."""
        return next(self.parameters()).device

    @staticmethod
    def _select_target(y_multiclass, y_binary, task):
        """Pick the label tensor for ``task`` ('binary' or anything else) as squeezed floats."""
        chosen = y_binary if task == 'binary' else y_multiclass
        return chosen.float().squeeze()

    def forward(self, X):
        """Return sigmoid outputs of shape (batch, output_size).

        A 2-D input (batch, features) is treated as a sequence of length 1.
        Only the output of the last time step is fed to the linear layer.
        """
        if X.dim() == 2:
            X = X.unsqueeze(1)
        # With no initial state passed, nn.LSTM starts from zero hidden and cell
        # states created on the input's own device and dtype, so no global
        # device is needed here.
        output, _ = self.lstm(X)
        return self.activation(self.fc(output[:, -1]))

    def fit(self, train_dataloader, test_dataloader, epochs, optimizer, loss_function, type):
        """Train for ``epochs`` epochs and evaluate on the test loader after each one.

        Both loaders must yield ``(X, y_multiclass, y_binary)`` batches.
        ``type == 'binary'`` trains against the binary labels; any other value
        uses the multiclass labels. One progress line is printed per epoch.

        Returns:
            (train_losses, test_losses): two lists holding the mean batch loss
            of every epoch.
        """
        train_losses = []
        test_losses = []
        model_device = self._model_device()

        for epoch in range(epochs):
            self.train()
            running_loss = 0.0
            for X_batch, y_batch_multiclass, y_batch_binary in train_dataloader:
                optimizer.zero_grad()
                y_pred = self(X_batch.to(model_device)).squeeze()
                y_batch = self._select_target(y_batch_multiclass, y_batch_binary, type).to(model_device)

                loss = loss_function(y_pred, y_batch)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            train_losses.append(running_loss / len(train_dataloader))

            self.eval()
            running_loss = 0.0
            with torch.no_grad():
                for X_batch, y_batch_multiclass, y_batch_binary in test_dataloader:
                    y_pred = self(X_batch.to(model_device)).squeeze()
                    y_batch = self._select_target(y_batch_multiclass, y_batch_binary, type).to(model_device)
                    running_loss += loss_function(y_pred, y_batch).item()
            test_losses.append(running_loss / len(test_dataloader))

            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_losses[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}")

        return train_losses, test_losses

    def predict(self, dataloader):
        """Return 0.0/1.0 predictions (outputs thresholded at 0.5) for every sample.

        The result is a float array of shape (n_samples, output_size), or an
        empty array when the loader yields no batches.
        """
        model_device = self._model_device()
        batch_predictions = []
        self.eval()
        with torch.no_grad():
            for X_batch, _, _ in dataloader:
                outputs = self(X_batch.to(model_device))
                batch_predictions.append((outputs > 0.5).float().cpu().numpy())
        if not batch_predictions:
            return np.array([])
        return np.concatenate(batch_predictions, axis=0)

In [87]:
def verify_parameters(y_test, y_pred_tab, parameters):
    """Tabulate accuracy and macro-averaged precision/recall/F1 per parameter setting.

    ``y_pred_tab[i]`` holds the predictions obtained with ``parameters[i]``.
    The returned DataFrame has one row per setting, labelled with
    ``str(parameters[i])``, and the columns 'Accuracy', 'Precision', 'Recall'
    and 'F1 Score'.
    """
    metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    results = pd.DataFrame(columns=metric_columns)

    for position, parameter in enumerate(parameters):
        y_pred = y_pred_tab[position]
        results.loc[str(parameter)] = [
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, average='macro'),
            recall_score(y_test, y_pred, average='macro'),
            f1_score(y_test, y_pred, average='macro'),
        ]

    return results

In [88]:
def losses_chart(train_losses, test_losses, title, ylim=(0, 2)):
    """Plot the train and test loss curves on one figure and show it.

    Parameters:
        train_losses: sequence of training losses, one value per epoch.
        test_losses: sequence of test losses, one value per epoch.
        title: figure title.
        ylim: ``(bottom, top)`` limits of the y axis. The default ``(0, 2)``
            is the range this chart has always used. Pass ``None`` to let
            matplotlib autoscale, which is useful when the losses are far
            below 1 and would otherwise sit flat on the x axis.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(train_losses, label='Train loss')
    ax.plot(test_losses, label='Test loss')
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()
    plt.show()

In [89]:
# Number of feature columns in the Sonata training frame.
input_size = X_train_sonata.shape[1]
input_size

10

In [92]:
# Seed torch before the model is built so that weight initialisation and the
# DataLoader shuffling start from the same state on every run.
torch.manual_seed(42)

# Binary classifier (label_binary: 0 = Benign, 1 = anything else): one sigmoid
# output trained with BCELoss. The input width comes from the training frame
# instead of a hard-coded 10.
lstm = LSTMNetwork(input_size=input_size, hidden_size=512, output_size=1, num_layers=1).to(device)
optimizer = optim.Adam(lstm.parameters(), lr=0.0001)

# fit returns the pair (train_losses, test_losses), one entry per epoch.
loss = lstm.fit(train_loader_sonata, test_loader_sonata, 20, optimizer, nn.BCELoss(), type='binary')

# predict returns one column per output unit; flatten it so it matches the
# one-dimensional label column passed to accuracy_score.
predictions = lstm.predict(test_loader_sonata)
accuracy_score_test = accuracy_score(Y_test_sonata['label_binary'], predictions.ravel())


Epoch 1/20 - Train Loss: 0.1135, Test Loss: 0.0332
Epoch 2/20 - Train Loss: 0.0173, Test Loss: 0.0125
Epoch 3/20 - Train Loss: 0.0085, Test Loss: 0.0072
Epoch 4/20 - Train Loss: 0.0054, Test Loss: 0.0048
Epoch 5/20 - Train Loss: 0.0038, Test Loss: 0.0035
Epoch 6/20 - Train Loss: 0.0029, Test Loss: 0.0027
Epoch 7/20 - Train Loss: 0.0022, Test Loss: 0.0022
Epoch 8/20 - Train Loss: 0.0017, Test Loss: 0.0017
Epoch 9/20 - Train Loss: 0.0014, Test Loss: 0.0015
Epoch 10/20 - Train Loss: 0.0011, Test Loss: 0.0013
Epoch 11/20 - Train Loss: 0.0008, Test Loss: 0.0011
Epoch 12/20 - Train Loss: 0.0007, Test Loss: 0.0009
Epoch 13/20 - Train Loss: 0.0005, Test Loss: 0.0009
Epoch 14/20 - Train Loss: 0.0004, Test Loss: 0.0016
Epoch 15/20 - Train Loss: 0.0004, Test Loss: 0.0015
Epoch 16/20 - Train Loss: 0.0003, Test Loss: 0.0014
Epoch 17/20 - Train Loss: 0.0003, Test Loss: 0.0014
Epoch 18/20 - Train Loss: 0.0002, Test Loss: 0.0014
Epoch 19/20 - Train Loss: 0.0002, Test Loss: 0.0013
Epoch 20/20 - Train 

In [93]:
# Accuracy of the binary predictions on the Sonata test split.
accuracy_score_test

0.9998878598996346