In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import Birch
from sklearn.metrics import confusion_matrix, classification_report, matthews_corrcoef, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE


In [24]:

def load_data_from_directory(directory_path):
    data_frames = []
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.csv'):
            file_path = os.path.join(directory_path, file_name)
            df = pd.read_csv(file_path)
            data_frames.append(df)
    return pd.concat(data_frames, ignore_index=True)

# set directory
set01_directory = './Documents/Research/can-train-and-test/set_01/'
set02_directory = './Documents/Research/can-train-and-test/set_01/'
set03_directory = './Documents/Research/can-train-and-test/set_01/'
set04_directory = './Documents/Research/can-train-and-test/set_01/'


train01_directory = os.path.join(set01_directory, 'train_01')
train02_directory = os.path.join(set02_directory, 'train_01')
train03_directory = os.path.join(set03_directory, 'train_01')
train04_directory = os.path.join(set04_directory, 'train_01')




In [3]:
df_train = load_data_from_directory(train01_directory)

# Preprocess the data
df_train['timestamp'] = pd.to_datetime(df_train['timestamp']).astype(np.int64) // 10**9
df_train['arbitration_id'] = df_train['arbitration_id'].apply(lambda x: int(x, 16))
df_train['data_field'] = df_train['data_field'].apply(lambda x: int(x, 16))

# Extract features and labels
X_train = df_train.drop(columns='attack')
y_train = df_train['attack']

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)


In [5]:
# Apply SMOTE for Handling Class Imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Train the BIRCH Model
birch_model = Birch(threshold=0.5, n_clusters=None)
birch_model.fit(X_train_resampled)


In [7]:
# Predict clusters on the training data
train_clusters = birch_model.predict(X_train_resampled)

# Map clusters to the most frequent attack label in each cluster
cluster_to_label = {}
for cluster in np.unique(train_clusters):
    mask = (train_clusters == cluster)
    cluster_to_label[cluster] = np.bincount(y_train_resampled[mask]).argmax()

# Assign predicted labels based on clusters
train_cluster_labels = np.array([cluster_to_label[cluster] for cluster in train_clusters])


In [8]:
# Evaluate Training Performance
conf_matrix_train = confusion_matrix(y_train_resampled, train_cluster_labels)
print(conf_matrix_train)
mcc_train = matthews_corrcoef(y_train_resampled, train_cluster_labels)
accuracy_train = accuracy_score(y_train_resampled, train_cluster_labels)
precision_train = precision_score(y_train_resampled, train_cluster_labels, zero_division=0)
recall_train = recall_score(y_train_resampled, train_cluster_labels, zero_division=0)
f1_train = f1_score(y_train_resampled, train_cluster_labels, zero_division=0)
informedness_train = recall_train - (1 - recall_train)
markedness_train = precision_train - (1 - precision_train)

print(f"Training Metrics (BIRCH):")
print(f"Accuracy: {accuracy_train * 100:.2f}%")
print(f"Precision: {precision_train * 100:.2f}%")
print(f"Recall: {recall_train * 100:.2f}%")
print(f"F1-Score: {f1_train * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_train:.4f}")
print(f"Informedness: {informedness_train:.4f}")
print(f"Markedness: {markedness_train:.4f}")

[[6998700 3604883]
 [1498067 9105516]]
Training Metrics (BIRCH):
Accuracy: 75.94%
Precision: 71.64%
Recall: 85.87%
F1-Score: 78.11%
Matthews Correlation Coefficient: 0.5293
Informedness: 0.7174
Markedness: 0.4328


In [9]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_01/test_01_known_vehicle_known_attack/')

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(lambda x: int(x, 16))
df_test['data_field'] = df_test['data_field'].apply(lambda x: int(x, 16))

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])


In [10]:
# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[3717759 1921631]
 [   6412   56868]]

Testing Metrics:
Accuracy: 66.19%
Precision: 98.75%
Recall: 66.19%
F1-Score: 78.59%
Matthews Correlation Coefficient: 12.28%
Informedness: 32.38%
Markedness: 97.50%


In [14]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_01/test_02_unknown_vehicle_known_attack/')

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(lambda x: int(x, 16))
df_test['data_field'] = df_test['data_field'].apply(lambda x: int(x, 16))

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[4605889 1677861]
 [    240  163927]]

Testing Metrics:
Accuracy: 73.97%
Precision: 97.68%
Recall: 73.97%
F1-Score: 82.85%
Matthews Correlation Coefficient: 25.51%
Informedness: 47.95%
Markedness: 95.35%


In [17]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_03/test_01_known_vehicle_unknown_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[5680148 2934302]
 [  18439    2386]]

Testing Metrics:
Accuracy: 65.81%
Precision: 99.44%
Recall: 65.81%
F1-Score: 79.18%
Matthews Correlation Coefficient: -2.34%
Informedness: 31.61%
Markedness: 98.87%


In [18]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_01/test_04_unknown_vehicle_unknown_attack/')

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(lambda x: int(x, 16))
df_test['data_field'] = df_test['data_field'].apply(lambda x: int(x, 16))

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[3717759 1921631]
 [   6412   56868]]

Testing Metrics:
Accuracy: 66.19%
Precision: 98.75%
Recall: 66.19%
F1-Score: 78.59%
Matthews Correlation Coefficient: 12.28%
Informedness: 32.38%
Markedness: 97.50%


# Set 02 - Train

In [26]:
# set 02
df_train = load_data_from_directory(train02_directory)

# Apply SMOTE for Handling Class Imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Train the BIRCH Model
birch_model = Birch(threshold=0.5, n_clusters=None)
birch_model.fit(X_train_resampled)

# Predict clusters on the training data
train_clusters = birch_model.predict(X_train_resampled)

# Map clusters to the most frequent attack label in each cluster
cluster_to_label = {}
for cluster in np.unique(train_clusters):
    mask = (train_clusters == cluster)
    cluster_to_label[cluster] = np.bincount(y_train_resampled[mask]).argmax()

# Assign predicted labels based on clusters
train_cluster_labels = np.array([cluster_to_label[cluster] for cluster in train_clusters])

# Evaluate Training Performance
conf_matrix_train = confusion_matrix(y_train_resampled, train_cluster_labels)
print(conf_matrix_train)
mcc_train = matthews_corrcoef(y_train_resampled, train_cluster_labels)
accuracy_train = accuracy_score(y_train_resampled, train_cluster_labels)
precision_train = precision_score(y_train_resampled, train_cluster_labels, zero_division=0)
recall_train = recall_score(y_train_resampled, train_cluster_labels, zero_division=0)
f1_train = f1_score(y_train_resampled, train_cluster_labels, zero_division=0)
informedness_train = recall_train - (1 - recall_train)
markedness_train = precision_train - (1 - precision_train)

print(f"Training Metrics (BIRCH):")
print(f"Accuracy: {accuracy_train * 100:.2f}%")
print(f"Precision: {precision_train * 100:.2f}%")
print(f"Recall: {recall_train * 100:.2f}%")
print(f"F1-Score: {f1_train * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_train:.4f}")
print(f"Informedness: {informedness_train:.4f}")
print(f"Markedness: {markedness_train:.4f}")

[[6998700 3604883]
 [1498067 9105516]]
Training Metrics (BIRCH):
Accuracy: 75.94%
Precision: 71.64%
Recall: 85.87%
F1-Score: 78.11%
Matthews Correlation Coefficient: 0.5293
Informedness: 0.7174
Markedness: 0.4328


# Set 02 - Train and Test 01

In [28]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_02/test_01_known_vehicle_known_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default
        
# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[10158617  3047694]
 [   11970     2274]]

Testing Metrics:
Accuracy: 76.86%
Precision: 99.77%
Recall: 76.86%
F1-Score: 86.82%
Matthews Correlation Coefficient: -0.55%
Informedness: 53.71%
Markedness: 99.55%


# Set 02 - Train and Test 02

In [29]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_02/test_02_unknown_vehicle_known_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default
        
# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics - set 02 and test 02:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[6636696 1788682]
 [  17791    2258]]

Testing Metrics - set 02 and test 02:
Accuracy: 78.61%
Precision: 99.50%
Recall: 78.61%
F1-Score: 87.81%
Matthews Correlation Coefficient: -1.19%
Informedness: 57.22%
Markedness: 98.99%


# Set 02 - Train and Test 03

In [30]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_03/test_03_known_vehicle_unknown_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics - set 02 and test 03:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[6848984 2580815]
 [  15649    2503]]

Testing Metrics - set 02 and test 03:
Accuracy: 72.52%
Precision: 99.58%
Recall: 72.52%
F1-Score: 83.90%
Matthews Correlation Coefficient: -1.33%
Informedness: 45.04%
Markedness: 99.16%


# Set 02 - Train and Test 04

In [31]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_02/test_04_unknown_vehicle_unknown_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[3766088 1005808]
 [  13439    4569]]

Testing Metrics:
Accuracy: 78.72%
Precision: 99.27%
Recall: 78.72%
F1-Score: 87.75%
Matthews Correlation Coefficient: 0.64%
Informedness: 57.44%
Markedness: 98.54%


# Set 03 - Train

In [32]:
# set 03
train_directory = './Documents/Research/can-train-and-test/set_03/train_01/'
df_train = load_data_from_directory(train_directory)

# Apply SMOTE for Handling Class Imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Train the BIRCH Model
birch_model = Birch(threshold=0.5, n_clusters=None)
birch_model.fit(X_train_resampled)

# Predict clusters on the training data
train_clusters = birch_model.predict(X_train_resampled)

# Map clusters to the most frequent attack label in each cluster
cluster_to_label = {}
for cluster in np.unique(train_clusters):
    mask = (train_clusters == cluster)
    cluster_to_label[cluster] = np.bincount(y_train_resampled[mask]).argmax()

# Assign predicted labels based on clusters
train_cluster_labels = np.array([cluster_to_label[cluster] for cluster in train_clusters])

# Evaluate Training Performance
conf_matrix_train = confusion_matrix(y_train_resampled, train_cluster_labels)
print(conf_matrix_train)
mcc_train = matthews_corrcoef(y_train_resampled, train_cluster_labels)
accuracy_train = accuracy_score(y_train_resampled, train_cluster_labels)
precision_train = precision_score(y_train_resampled, train_cluster_labels, zero_division=0)
recall_train = recall_score(y_train_resampled, train_cluster_labels, zero_division=0)
f1_train = f1_score(y_train_resampled, train_cluster_labels, zero_division=0)
informedness_train = recall_train - (1 - recall_train)
markedness_train = precision_train - (1 - precision_train)

print(f"Training Metrics (BIRCH):")
print(f"Accuracy: {accuracy_train * 100:.2f}%")
print(f"Precision: {precision_train * 100:.2f}%")
print(f"Recall: {recall_train * 100:.2f}%")
print(f"F1-Score: {f1_train * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_train:.4f}")
print(f"Informedness: {informedness_train:.4f}")
print(f"Markedness: {markedness_train:.4f}")

[[6998700 3604883]
 [1498067 9105516]]
Training Metrics (BIRCH):
Accuracy: 75.94%
Precision: 71.64%
Recall: 85.87%
F1-Score: 78.11%
Matthews Correlation Coefficient: 0.5293
Informedness: 0.7174
Markedness: 0.4328


# Set 03 - Train and Test 01

In [33]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_03/test_01_known_vehicle_known_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[6181437 2231849]
 [  54664  107841]]

Testing Metrics:
Accuracy: 73.34%
Precision: 97.33%
Recall: 73.34%
F1-Score: 82.96%
Matthews Correlation Coefficient: 12.19%
Informedness: 46.68%
Markedness: 94.66%


# Set 03 - Train and Test 02

In [34]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_03/test_02_unknown_vehicle_known_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[5287806 1410239]
 [ 108927   47262]]

Testing Metrics:
Accuracy: 77.84%
Precision: 95.82%
Recall: 77.84%
F1-Score: 85.58%
Matthews Correlation Coefficient: 3.36%
Informedness: 55.67%
Markedness: 91.65%


# Set 03 - Train and Test 03

In [35]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_03/test_03_known_vehicle_unknown_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[6848984 2580815]
 [  15649    2503]]

Testing Metrics:
Accuracy: 72.52%
Precision: 99.58%
Recall: 72.52%
F1-Score: 83.90%
Matthews Correlation Coefficient: -1.33%
Informedness: 45.04%
Markedness: 99.16%


# Set 03 - Train and Test 04

In [36]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_03/test_04_unknown_vehicle_unknown_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[5358809 1387262]
 [ 148609    1339]]

Testing Metrics:
Accuracy: 77.73%
Precision: 95.19%
Recall: 77.73%
F1-Score: 85.57%
Matthews Correlation Coefficient: -7.15%
Informedness: 55.46%
Markedness: 90.38%


# Set 04 - Train

In [37]:
# set 04
train_directory = './Documents/Research/can-train-and-test/set_04/train_01/'
df_train = load_data_from_directory(train_directory)

# Apply SMOTE for Handling Class Imbalance
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Train the BIRCH Model
birch_model = Birch(threshold=0.5, n_clusters=None)
birch_model.fit(X_train_resampled)

# Predict clusters on the training data
train_clusters = birch_model.predict(X_train_resampled)

# Map clusters to the most frequent attack label in each cluster
cluster_to_label = {}
for cluster in np.unique(train_clusters):
    mask = (train_clusters == cluster)
    cluster_to_label[cluster] = np.bincount(y_train_resampled[mask]).argmax()

# Assign predicted labels based on clusters
train_cluster_labels = np.array([cluster_to_label[cluster] for cluster in train_clusters])

# Evaluate Training Performance
conf_matrix_train = confusion_matrix(y_train_resampled, train_cluster_labels)
print(conf_matrix_train)
mcc_train = matthews_corrcoef(y_train_resampled, train_cluster_labels)
accuracy_train = accuracy_score(y_train_resampled, train_cluster_labels)
precision_train = precision_score(y_train_resampled, train_cluster_labels, zero_division=0)
recall_train = recall_score(y_train_resampled, train_cluster_labels, zero_division=0)
f1_train = f1_score(y_train_resampled, train_cluster_labels, zero_division=0)
informedness_train = recall_train - (1 - recall_train)
markedness_train = precision_train - (1 - precision_train)

print(f"Training Metrics (BIRCH):")
print(f"Accuracy: {accuracy_train * 100:.2f}%")
print(f"Precision: {precision_train * 100:.2f}%")
print(f"Recall: {recall_train * 100:.2f}%")
print(f"F1-Score: {f1_train * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_train:.4f}")
print(f"Informedness: {informedness_train:.4f}")
print(f"Markedness: {markedness_train:.4f}")

[[6998700 3604883]
 [1498067 9105516]]
Training Metrics (BIRCH):
Accuracy: 75.94%
Precision: 71.64%
Recall: 85.87%
F1-Score: 78.11%
Matthews Correlation Coefficient: 0.5293
Informedness: 0.7174
Markedness: 0.4328


# Set 04 - Train and Test 01

In [39]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_04/test_01_known_vehicle_known_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[5358809 1387262]
 [ 148609    1339]]

Testing Metrics:
Accuracy: 77.73%
Precision: 95.19%
Recall: 77.73%
F1-Score: 85.57%
Matthews Correlation Coefficient: -7.15%
Informedness: 55.46%
Markedness: 90.38%


# Set 04 - Train and Test 02

In [40]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_04/test_02_unknown_vehicle_known_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[13273070  4077785]
 [   51428     5811]]

Testing Metrics:
Accuracy: 76.28%
Precision: 99.29%
Recall: 76.28%
F1-Score: 86.26%
Matthews Correlation Coefficient: -1.80%
Informedness: 52.56%
Markedness: 98.57%


# Set 04 - Train and Test 03

In [41]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_04/test_03_known_vehicle_unknown_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[5287806 1410239]
 [ 108927   47262]]

Testing Metrics:
Accuracy: 77.84%
Precision: 95.82%
Recall: 77.84%
F1-Score: 85.58%
Matthews Correlation Coefficient: 3.36%
Informedness: 55.67%
Markedness: 91.65%


# Set 04 - Train and Test 04

In [42]:
df_test = load_data_from_directory('./Documents/Research/can-train-and-test/set_04/test_04_unknown_vehicle_unknown_attack/')

# Convert hexadecimal fields to integers
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values, or choose an appropriate default

# Preprocess the data
df_test['timestamp'] = pd.to_datetime(df_test['timestamp']).astype(np.int64) // 10**9
df_test['arbitration_id'] = df_test['arbitration_id'].apply(hex_to_int)
df_test['data_field'] = df_test['data_field'].apply(hex_to_int)

X_test = df_test.drop(columns='attack')
y_test = df_test['attack']

X_test_scaled = scaler.transform(X_test)

# Predict clusters on the test data
test_clusters = birch_model.predict(X_test_scaled)

# Assign predicted labels based on clusters
test_cluster_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in test_clusters])

# Evaluate Test Performance
conf_matrix_test = confusion_matrix(y_test, test_cluster_labels)
print(conf_matrix_test)
mcc_test = matthews_corrcoef(y_test, test_cluster_labels)
accuracy_test = accuracy_score(y_test, test_cluster_labels)
precision_test = precision_score(y_test, test_cluster_labels, average='weighted', zero_division=0)
recall_test = recall_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
f1_test = f1_score(y_test, test_cluster_labels,  average='weighted', zero_division=0)
informedness_test = recall_test - (1 - recall_test)
markedness_test = precision_test - (1 - precision_test)

print(f"\nTesting Metrics:")
print(f"Accuracy: {accuracy_test * 100:.2f}%")
print(f"Precision: {precision_test * 100:.2f}%")
print(f"Recall: {recall_test * 100:.2f}%")
print(f"F1-Score: {f1_test * 100:.2f}%")
print(f"Matthews Correlation Coefficient: {mcc_test * 100:.2f}%")
print(f"Informedness: {informedness_test * 100:.2f}%")
print(f"Markedness: {markedness_test * 100:.2f}%")

[[6129280 1833375]
 [ 144038   76933]]

Testing Metrics:
Accuracy: 75.84%
Precision: 95.17%
Recall: 75.84%
F1-Score: 83.98%
Matthews Correlation Coefficient: 4.52%
Informedness: 51.67%
Markedness: 90.35%
