In [1]:
import pandas as pd
import numpy as np
import os
import time
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, matthews_corrcoef, accuracy_score, precision_score, recall_score, f1_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import GridSearchCV, cross_val_score


In [2]:
# Function to Load Data
def load_data_from_directory(directory_path):
    data_frames = []
    for file_name in os.listdir(directory_path):
        if file_name.endswith('.csv'):
            file_path = os.path.join(directory_path, file_name)
            df = pd.read_csv(file_path)
            data_frames.append(df)
    return pd.concat(data_frames, ignore_index=True)

# Function to Preprocess Data
def hex_to_int(x):
    try:
        return int(str(x), 16)
    except ValueError:
        return 0  # Handle non-hexadecimal values
        
def preprocess_data(df):
    df['timestamp'] = pd.to_datetime(df['timestamp']).astype(np.int64) // 10**9
    df['arbitration_id'] = df['arbitration_id'].apply(hex_to_int)
    df['data_field'] = df['data_field'].apply(hex_to_int)
    return df

# Function to Extract Features and Labels
def extract_features_labels(df):
    X = df.drop(columns='attack')
    y = df['attack']
    return X, y

In [3]:
# Function to Train Gradient Boosting Model
def train_gradient_boosting(X_train, y_train):
    # Apply Random Under-Sampling for Handling Class Imbalance
    rus = RandomUnderSampler(random_state=42)
    X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

    # Standardize the Features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_resampled)

    # Hyperparameter tuning for Gradient Boosting
    param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10]
    }
    grid_search = GridSearchCV(GradientBoostingClassifier(random_state=42), param_grid, cv=5, scoring='f1_weighted')
    
    # Record training start time
    start_time = time.time()
    
    grid_search.fit(X_train_scaled, y_train_resampled)

    # Calculate training time
    training_time = time.time() - start_time
    
    best_gb = grid_search.best_estimator_
    return best_gb, scaler, training_time



In [4]:
# Function to Test Gradient Boosting Model
def test_gradient_boosting(gb, scaler, X_test, y_test):
    # Standardize the Features
    X_test_scaled = scaler.transform(X_test)

    # Record testing start time
    start_time = time.time()
    
    # Predict on Test Data
    y_test_pred = gb.predict(X_test_scaled)

    # Calculate testing time
    testing_time = time.time() - start_time
    
    # Evaluate Test Performance
    conf_matrix_test = confusion_matrix(y_test, y_test_pred)
    mcc_test = matthews_corrcoef(y_test, y_test_pred)
    accuracy_test = accuracy_score(y_test, y_test_pred)
    precision_test = precision_score(y_test, y_test_pred, average='weighted', zero_division=0)
    recall_test = recall_score(y_test, y_test_pred, average='weighted', zero_division=0)
    f1_test = f1_score(y_test, y_test_pred, average='weighted', zero_division=0)
    informedness_test = recall_test - (1 - recall_test)
    markedness_test = precision_test - (1 - precision_test)

    return {
        'conf_matrix': conf_matrix_test,
        'mcc': mcc_test,
        'accuracy': accuracy_test,
        'precision': precision_test,
        'recall': recall_test,
        'f1_score': f1_test,
        'informedness': informedness_test,
        'markedness': markedness_test,
        'testing_time': testing_time
    }


In [5]:
# Directories
train_directories = [
    './Documents/Research/can-train-and-test/set_01/train_01/'
]

test_directories = [
    './Documents/Research/can-train-and-test/set_01/test_01_known_vehicle_known_attack/',
    './Documents/Research/can-train-and-test/set_01/test_02_unknown_vehicle_known_attack/',
    './Documents/Research/can-train-and-test/set_01/test_03_known_vehicle_unknown_attack/',
    './Documents/Research/can-train-and-test/set_01/test_04_unknown_vehicle_unknown_attack/'
]


# Iterate through each training directory

for i, train_dir in enumerate(train_directories):

    # Load and preprocess training data
    df_train = load_data_from_directory(train_dir)
    df_train = preprocess_data(df_train)
    X_train, y_train = extract_features_labels(df_train)

    # Train the Gradient Boosting Model
    gb, scaler, training_time = train_gradient_boosting(X_train, y_train)

    test_metrics = test_gradient_boosting(gb, scaler, X_train, y_train)

    # Corresponding test directories for each training set
    corresponding_tests = test_directories[i*4:(i+1)*4]
    
    # Test the model on each corresponding test set
    for test_dir in corresponding_tests:
        
        # Load and preprocess testing data
        df_test = load_data_from_directory(test_dir)
        df_test = preprocess_data(df_test)
        X_test, y_test = extract_features_labels(df_test)

        # Test the Gradient Boosting Model
        test_metrics = test_gradient_boosting(gb, scaler, X_test, y_test)

        # Print Testing Metrics
        print(f"\n===============================================================================")
        print(f"\nTesting Metrics (Train: {train_dir} | Test: {test_dir}):")
        print(f"\n-----------------------------------------------------------------------")
        print(f"\nTraining Time: {training_time:.2f} seconds")
        print(f"Testing Time: {test_metrics['testing_time']:.2f} seconds")
        print(f"\n-----------------------------------------------------------------------")
        print(f"Accuracy: {test_metrics['accuracy'] * 100:.2f}%")
        print(f"Precision: {test_metrics['precision'] * 100:.2f}%")
        print(f"Recall: {test_metrics['recall'] * 100:.2f}%")
        print(f"F1-Score: {test_metrics['f1_score'] * 100:.2f}%")
        print(f"Matthews Correlation Coefficient: {test_metrics['mcc']:.4f}")
        print(f"Informedness: {test_metrics['informedness']:.4f}")
        print(f"Markedness: {test_metrics['markedness']:.4f}")



Testing Metrics (Train: ./Documents/Research/can-train-and-test/set_01/train_01/ | Test: ./Documents/Research/can-train-and-test/set_01/test_01_known_vehicle_known_attack/):

-----------------------------------------------------------------------

Training Time: 961.66 seconds
Testing Time: 11.05 seconds

-----------------------------------------------------------------------
Accuracy: 99.13%
Precision: 98.98%
Recall: 99.13%
F1-Score: 99.00%
Matthews Correlation Coefficient: 0.5083
Informedness: 0.9827
Markedness: 0.9797


Testing Metrics (Train: ./Documents/Research/can-train-and-test/set_01/train_01/ | Test: ./Documents/Research/can-train-and-test/set_01/test_02_unknown_vehicle_known_attack/):

-----------------------------------------------------------------------

Training Time: 961.66 seconds
Testing Time: 12.54 seconds

-----------------------------------------------------------------------
Accuracy: 92.61%
Precision: 94.99%
Recall: 92.61%
F1-Score: 93.78%
Matthews Correlation 

In [6]:
# Directories
train_directories = [
    './Documents/Research/can-train-and-test/set_02/train_01/'
]

test_directories = [
    './Documents/Research/can-train-and-test/set_02/test_01_known_vehicle_known_attack/',
    './Documents/Research/can-train-and-test/set_02/test_02_unknown_vehicle_known_attack/',
    './Documents/Research/can-train-and-test/set_02/test_03_known_vehicle_unknown_attack/',
    './Documents/Research/can-train-and-test/set_02/test_04_unknown_vehicle_unknown_attack/'
]

# Iterate through each training directory

for i, train_dir in enumerate(train_directories):
    
    # Load and preprocess training data
    df_train = load_data_from_directory(train_dir)
    df_train = preprocess_data(df_train)
    X_train, y_train = extract_features_labels(df_train)

    # Train the Gradient Boosting Model
    gb, scaler, training_time = train_gradient_boosting(X_train, y_train)

    test_metrics = test_gradient_boosting(gb, scaler, X_train, y_train)

    # Corresponding test directories for each training set
    corresponding_tests = test_directories[i*4:(i+1)*4]
    
    # Test the model on each corresponding test set
    for test_dir in corresponding_tests:
        print(f"\nTesting on data from: {test_dir}")
        
        # Load and preprocess testing data
        df_test = load_data_from_directory(test_dir)
        df_test = preprocess_data(df_test)
        X_test, y_test = extract_features_labels(df_test)

        # Test the Gradient Boosting Model
        test_metrics = test_gradient_boosting(gb, scaler, X_test, y_test)

        # Print Testing Metrics
        print(f"\n===============================================================================")
        print(f"\nTesting Metrics (Train: {train_dir} | Test: {test_dir}):")
        print(f"\n-----------------------------------------------------------------------")
        print(f"\nTraining Time: {training_time:.2f} seconds")
        print(f"Testing Time: {test_metrics['testing_time']:.2f} seconds")
        print(f"\n-----------------------------------------------------------------------")
        print(f"Accuracy: {test_metrics['accuracy'] * 100:.2f}%")
        print(f"Precision: {test_metrics['precision'] * 100:.2f}%")
        print(f"Recall: {test_metrics['recall'] * 100:.2f}%")
        print(f"F1-Score: {test_metrics['f1_score'] * 100:.2f}%")
        print(f"Matthews Correlation Coefficient: {test_metrics['mcc']:.4f}")
        print(f"Informedness: {test_metrics['informedness']:.4f}")
        print(f"Markedness: {test_metrics['markedness']:.4f}")


Testing on data from: ./Documents/Research/can-train-and-test/set_02/test_01_known_vehicle_known_attack/


Testing Metrics (Train: ./Documents/Research/can-train-and-test/set_02/train_01/ | Test: ./Documents/Research/can-train-and-test/set_02/test_01_known_vehicle_known_attack/):

-----------------------------------------------------------------------

Training Time: 17751.94 seconds
Testing Time: 20.17 seconds

-----------------------------------------------------------------------
Accuracy: 97.68%
Precision: 99.86%
Recall: 97.68%
F1-Score: 98.73%
Matthews Correlation Coefficient: 0.1417
Informedness: 0.9536
Markedness: 0.9972

Testing on data from: ./Documents/Research/can-train-and-test/set_02/test_02_unknown_vehicle_known_attack/


Testing Metrics (Train: ./Documents/Research/can-train-and-test/set_02/train_01/ | Test: ./Documents/Research/can-train-and-test/set_02/test_02_unknown_vehicle_known_attack/):

-----------------------------------------------------------------------

Tra

In [7]:
# Directories
train_directories = [
    './Documents/Research/can-train-and-test/set_03/train_01/'
]

test_directories = [
    './Documents/Research/can-train-and-test/set_03/test_01_known_vehicle_known_attack/',
    './Documents/Research/can-train-and-test/set_03/test_02_unknown_vehicle_known_attack/',
    './Documents/Research/can-train-and-test/set_03/test_03_known_vehicle_unknown_attack/',
    './Documents/Research/can-train-and-test/set_03/test_04_unknown_vehicle_unknown_attack/'
]

# Iterate through each training directory

for i, train_dir in enumerate(train_directories):

    # Load and preprocess training data
    df_train = load_data_from_directory(train_dir)
    df_train = preprocess_data(df_train)
    X_train, y_train = extract_features_labels(df_train)

    # Train the Gradient Boosting Model
    gb, scaler, training_time = train_gradient_boosting(X_train, y_train)

    test_metrics = test_gradient_boosting(gb, scaler, X_train, y_train)

    # Corresponding test directories for each training set
    corresponding_tests = test_directories[i*4:(i+1)*4]
    
    # Test the model on each corresponding test set
    for test_dir in corresponding_tests:
        print(f"\nTesting on data from: {test_dir}")
        
        # Load and preprocess testing data
        df_test = load_data_from_directory(test_dir)
        df_test = preprocess_data(df_test)
        X_test, y_test = extract_features_labels(df_test)

        # Test the Gradient Boosting Model
        test_metrics = test_gradient_boosting(gb, scaler, X_test, y_test)

        # Print Testing Metrics
        print(f"\n===============================================================================")
        print(f"\nTesting Metrics (Train: {train_dir} | Test: {test_dir}):")
        print(f"\n-----------------------------------------------------------------------")
        print(f"\nTraining Time: {training_time:.2f} seconds")
        print(f"Testing Time: {test_metrics['testing_time']:.2f} seconds")
        print(f"\n-----------------------------------------------------------------------")
        print(f"Accuracy: {test_metrics['accuracy'] * 100:.2f}%")
        print(f"Precision: {test_metrics['precision'] * 100:.2f}%")
        print(f"Recall: {test_metrics['recall'] * 100:.2f}%")
        print(f"F1-Score: {test_metrics['f1_score'] * 100:.2f}%")
        print(f"Matthews Correlation Coefficient: {test_metrics['mcc']:.4f}")
        print(f"Informedness: {test_metrics['informedness']:.4f}")
        print(f"Markedness: {test_metrics['markedness']:.4f}")


Testing on data from: ./Documents/Research/can-train-and-test/set_03/test_01_known_vehicle_known_attack/


Testing Metrics (Train: ./Documents/Research/can-train-and-test/set_03/train_01/ | Test: ./Documents/Research/can-train-and-test/set_03/test_01_known_vehicle_known_attack/):

-----------------------------------------------------------------------

Training Time: 6752.37 seconds
Testing Time: 19.81 seconds

-----------------------------------------------------------------------
Accuracy: 99.60%
Precision: 99.60%
Recall: 99.60%
F1-Score: 99.58%
Matthews Correlation Coefficient: 0.8869
Informedness: 0.9920
Markedness: 0.9921

Testing on data from: ./Documents/Research/can-train-and-test/set_03/test_02_unknown_vehicle_known_attack/


Testing Metrics (Train: ./Documents/Research/can-train-and-test/set_03/train_01/ | Test: ./Documents/Research/can-train-and-test/set_03/test_02_unknown_vehicle_known_attack/):

-----------------------------------------------------------------------

Trai

In [8]:
# Directories
train_directories = [
    './Documents/Research/can-train-and-test/set_04/train_01/'
]

test_directories = [
    './Documents/Research/can-train-and-test/set_04/test_01_known_vehicle_known_attack/',
    './Documents/Research/can-train-and-test/set_04/test_02_unknown_vehicle_known_attack/',
    './Documents/Research/can-train-and-test/set_04/test_03_known_vehicle_unknown_attack/',
    './Documents/Research/can-train-and-test/set_04/test_04_unknown_vehicle_unknown_attack/'
]

# Iterate through each training directory

for i, train_dir in enumerate(train_directories):
    
    # Load and preprocess training data
    df_train = load_data_from_directory(train_dir)
    df_train = preprocess_data(df_train)
    X_train, y_train = extract_features_labels(df_train)

    # Train the Gradient Boosting Model
    gb, scaler, training_time = train_gradient_boosting(X_train, y_train)

    test_metrics = test_gradient_boosting(gb, scaler, X_train, y_train)

    # Corresponding test directories for each training set
    corresponding_tests = test_directories[i*4:(i+1)*4]
    
    # Test the model on each corresponding test set
    for test_dir in corresponding_tests:
        print(f"\nTesting on data from: {test_dir}")
        
        # Load and preprocess testing data
        df_test = load_data_from_directory(test_dir)
        df_test = preprocess_data(df_test)
        X_test, y_test = extract_features_labels(df_test)

        # Test the Gradient Boosting Model
        test_metrics = test_gradient_boosting(gb, scaler, X_test, y_test)

        # Print Testing Metrics
        print(f"\n===============================================================================")
        print(f"\nTesting Metrics (Train: {train_dir} | Test: {test_dir}):")
        print(f"\n-----------------------------------------------------------------------")
        print(f"\nTraining Time: {training_time:.2f} seconds")
        print(f"Testing Time: {test_metrics['testing_time']:.2f} seconds")
        print(f"\n-----------------------------------------------------------------------")
        print(f"Accuracy: {test_metrics['accuracy'] * 100:.2f}%")
        print(f"Precision: {test_metrics['precision'] * 100:.2f}%")
        print(f"Recall: {test_metrics['recall'] * 100:.2f}%")
        print(f"F1-Score: {test_metrics['f1_score'] * 100:.2f}%")
        print(f"Matthews Correlation Coefficient: {test_metrics['mcc']:.4f}")
        print(f"Informedness: {test_metrics['informedness']:.4f}")
        print(f"Markedness: {test_metrics['markedness']:.4f}")


Testing on data from: ./Documents/Research/can-train-and-test/set_04/test_01_known_vehicle_known_attack/


Testing Metrics (Train: ./Documents/Research/can-train-and-test/set_04/train_01/ | Test: ./Documents/Research/can-train-and-test/set_04/test_01_known_vehicle_known_attack/):

-----------------------------------------------------------------------

Training Time: 1773.88 seconds
Testing Time: 38.51 seconds

-----------------------------------------------------------------------
Accuracy: 96.45%
Precision: 98.61%
Recall: 96.45%
F1-Score: 97.21%
Matthews Correlation Coefficient: 0.5980
Informedness: 0.9290
Markedness: 0.9722

Testing on data from: ./Documents/Research/can-train-and-test/set_04/test_02_unknown_vehicle_known_attack/


Testing Metrics (Train: ./Documents/Research/can-train-and-test/set_04/train_01/ | Test: ./Documents/Research/can-train-and-test/set_04/test_02_unknown_vehicle_known_attack/):

-----------------------------------------------------------------------

Trai

# How well the algorithm works in detecting each attack.

### We train the model in all set and 

In [None]:
# List of attack files to test
attack_files = {
    'DOS': ['DOS-3.csv', 'DOS-4.csv'],
    'force-neutral': ['force-neutral-3.csv', 'force-neutral-4'],
    'rpm': ['rpm-3', 'rpm-4'],
    'standstill': ['standstill-3', 'standstill-4'],
    'double_spoofing': ['double-spoofing-3', 'double-spoofing-4'],
    'fuzzing': ['fuzzing-3', 'fuzzing-4'],
    'interval': ['interval-3', 'interval-4'],
    'speed': ['speed-3', 'speed-4'],
    'systematic': ['systematic-3', 'systematic-4'],
    'triple': ['triple-3', 'triple-4']
}

# Function to load and combine attack files from a directory
def load_combine_attack_files(directory, attack_files):
    data_frames = []
    for file in attack_files:
        file_path = os.path.join(directory, file)
        df = load_data_from_file(file_path)
        data_frames.append(df)
    combined_df = pd.concat(data_frames, ignore_index=True)
    return combined_df

# Iterate through each attack type
for attack_name, files in attack_files.items():
    print(f"\nEvaluating metrics for attack type: {attack_name}")
    
    # Iterate through each training directory
    for train_dir in train_directories:
        
        # Benchmark loading and preprocessing training data
        start_time = time.time()
        df_train = load_data_from_directory(train_dir)
        df_train = preprocess_data(df_train)
        X_train, y_train = extract_features_labels(df_train)
        load_preprocess_train_time = time.time() - start_time

        # Benchmark training the Gradient Boosting Model
        start_time = time.time()
        gb, scaler, training_time = train_gradient_boosting(X_train, y_train)
        training_time = time.time() - start_time

        # Combined metrics storage for the current attack
        combined_metrics = {
            'accuracy': [],
            'precision': [],
            'recall': [],
            'f1_score': [],
            'mcc': [],
            'informedness': [],
            'markedness': [],
            'testing_time': []
        }

        # Test the model on each test directory
        for test_dir in test_directories:
            # Load and combine attack files
            df_test = load_combine_attack_files(test_dir, files)
            df_test = preprocess_data(df_test)
            X_test, y_test = extract_features_labels(df_test)

            # Benchmark testing the Gradient Boosting Model
            start_time = time.time()
            test_metrics = test_gradient_boosting(gb, scaler, X_test, y_test)
            testing_time = time.time() - start_time

            # Store combined metrics
            combined_metrics['accuracy'].append(test_metrics['accuracy'])
            combined_metrics['precision'].append(test_metrics['precision'])
            combined_metrics['recall'].append(test_metrics['recall'])
            combined_metrics['f1_score'].append(test_metrics['f1_score'])
            combined_metrics['mcc'].append(test_metrics['mcc'])
            combined_metrics['informedness'].append(test_metrics['informedness'])
            combined_metrics['markedness'].append(test_metrics['markedness'])
            combined_metrics['testing_time'].append(testing_time)

        # Compute average metrics for the current attack
        avg_metrics = {key: sum(values) / len(values) for key, values in combined_metrics.items()}

        # Print Combined Testing Metrics for the current attack
        print(f"\n===============================================================================")
        print(f"\nCombined Testing Metrics (Train: {train_dir} | {attack_name} Attack Test Files):")
        print(f"\n-----------------------------------------------------------------------")
        print(f"\nLoad and Preprocess Training Time: {load_preprocess_train_time:.2f} seconds")
        print(f"Training Time: {training_time:.2f} seconds")
        print(f"Average Load and Preprocess Testing Time: {avg_metrics['testing_time']:.2f} seconds")
        print(f"\n-----------------------------------------------------------------------")
        print(f"Accuracy: {avg_metrics['accuracy'] * 100:.2f}%")
        print(f"Precision: {avg_metrics['precision'] * 100:.2f}%")
        print(f"Recall: {avg_metrics['recall'] * 100:.2f}%")
        print(f"F1-Score: {avg_metrics['f1_score'] * 100:.2f}%")
        print(f"Matthews Correlation Coefficient: {avg_metrics['mcc']:.4f}")
        print(f"Informedness: {avg_metrics['informedness']:.4f}")
        print(f"Markedness: {avg_metrics['markedness']:.4f}")