### Citations
#### Dataset
“Stratosphere Laboratory. A labeled dataset with malicious and benign IoT network traffic. January 22th. Agustin Parmisano, Sebastian Garcia, Maria Jose Erquiaga. https://www.stratosphereips.org/datasets-iot23
#### Model advices
Christian Desrosiers, École de Technologie Supérieur (ETS), for proposing XGBoost model.
#### Notebook creation
Rémi Blier, École de Technologie Supérieur (ETS), for creating this notebook.
#### XGBoost model
Chen, T., & Guestrin, C. (2016). XGBoost: A Scalable Tree Boosting System. In Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (pp. 785–794). New York, NY, USA: ACM. https://doi.org/10.1145/2939672.2939785

### Import data

In [1]:
def find_csv_delimiter(file_path, max_lines=5):
    with open(file_path, 'r', newline='') as file:
        sample_lines = [file.readline().strip() for _ in range(max_lines)]

    delimiters = [',', ';', '\t', '|']  # Common delimiters to check

    best_delimiter = ','
    max_delimiter_count = 0

    for delimiter in delimiters:
        delimiter_count = sum(line.count(delimiter) for line in sample_lines)
        if delimiter_count > max_delimiter_count:
            best_delimiter = delimiter
            max_delimiter_count = delimiter_count

    return best_delimiter

import csv
import numpy as np
from sklearn.preprocessing import LabelEncoder

def extract_data_from_csv(file_path, delimiter=','):
    data = []  # Create a list to store the data

    # Open the CSV file for reading
    with open(file_path, mode='r', newline='') as file:
        # Create a CSV reader object with the pipe delimiter

        csv_reader = csv.reader(file, delimiter=delimiter)

        # Read the header row
        header = next(csv_reader)

        # Iterate through the rows in the CSV file
        for row in csv_reader:
            data.append(row)
    
    return data

data = []

import os
cpt = 0
directory = "archive"
print(os.listdir(directory))
for filename in os.listdir(directory):
    
        # for filename in filenames:
        if cpt < 2:
            file_path = os.path.join(directory, filename)
            delimiter = find_csv_delimiter(file_path)
            data += extract_data_from_csv(file_path, delimiter)
            print(os.path.join(directory, filename))
            cpt+=1
        else:
            break

 

# Convert your data to a NumPy array
data = np.array(data)

print(data.shape)

['CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv']
archive\CTU-IoT-Malware-Capture-1-1conn.log.labeled.csv
(1008748, 23)


In [2]:
print(data[0])

['1525879831.015811' 'CUmrqr4svHuSXJy5z7' '192.168.100.103' '51524'
 '65.127.233.163' '23' 'tcp' '-' '2.999051' '0' '0' 'S0' '-' '-' '0' 'S'
 '3' '180' '0' '0' '-' 'Malicious' 'PartOfAHorizontalPortScan']


In [3]:
# columns_to_remove = [0, 1, 2, 4, 12, 13, 14, 20, 22]
columns_to_remove = [0, 1, 7, 8, 11, 12, 13, 14, 15, 20, 22]
data = np.delete(data, columns_to_remove, axis=1)
print(data[0])
print(data.shape)
print(type(data))

['192.168.100.103' '51524' '65.127.233.163' '23' 'tcp' '0' '0' '3' '180'
 '0' '0' 'Malicious']
(1008748, 12)
<class 'numpy.ndarray'>


In [4]:
# Make Malicious = 1 and Benign = 0
for row in data:
    if row[-1] == 'Benign':
        row[-1] = 0
    else:
        row[-1] = 1

# Separating ip addresses as 4 numbers
def convert_ip_addresses(arr):
    new_data = []
    for i, item in enumerate(arr):
        if i == 0 or i == 2:
            ip_parts = item.split('.')
            new_data.extend(ip_parts if len(ip_parts) == 4 else ['0', '0', '0', '0'])
        else:
            new_data.append(item)

    return np.array(new_data)

new_data = []


for i in range(len(data)):
    new_data.append(convert_ip_addresses(data[i]))

data = np.vstack(new_data)
print(data.shape)

(1008748, 18)


### Process data

In [5]:
# Convert columns to int
new_data = []
for row in data:
    r = np.array([])
    for column in range(len(row)):
        if row[column] == '-':
            row[column] = 0
            r = np.append(r, 0)
        elif row[column] in ['tcp', 'udp', 'icmp']:
            r = np.append(r, row[column])
            continue
        else:
            try:
                # Attempt to convert the value to an integer
                r = np.append(r, row[column].astype(float))
            except ValueError:
                pass
    new_data.append(r)

data = np.vstack(new_data)
print(data[0])


['192.0' '168.0' '100.0' '103.0' '51524.0' '65.0' '127.0' '233.0' '163.0'
 '23.0' 'tcp' '0.0' '0.0' '3.0' '180.0' '0.0' '0.0' '1.0']


In [6]:
print(type(data[0][0]))

<class 'numpy.str_'>


In [7]:
import pandas as pd
import numpy as np

print(data[0])

# Assuming 'data' is your NumPy array
df = pd.DataFrame(data)

# columns_to_onehot = [0, 1, 2, 3, 7, 8]
columns_to_onehot = [10]

# for i in range(18):
#     columns_to_onehot.append(i+1)
#     print(type(data[0][i]))

columns_to_encode = df.columns[columns_to_onehot]



# Perform one-hot encoding
onehot_encoded = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)

# Display the result
print('After one-hot encoding features:')
print(onehot_encoded.values[0])
print(onehot_encoded.values[0].shape)
data = onehot_encoded.values


['192.0' '168.0' '100.0' '103.0' '51524.0' '65.0' '127.0' '233.0' '163.0'
 '23.0' 'tcp' '0.0' '0.0' '3.0' '180.0' '0.0' '0.0' '1.0']
After one-hot encoding features:
['192.0' '168.0' '100.0' '103.0' '51524.0' '65.0' '127.0' '233.0' '163.0'
 '23.0' '0.0' '0.0' '3.0' '180.0' '0.0' '0.0' '1.0' True False]
(19,)


In [8]:
# Check if any of the data contains strings
for row in data:
    for column in row:
        if isinstance(column, str):
            #Convert the value to a float, if possible
            try:
                column = float(column)
            except ValueError:
                pass

# Initialize an empty list to store preprocessed data
preprocessed_data = []

# Iterate through the rows in the data
for row in data:
    try:
        # Convert all columns to floats in this row
        float_row = [float(column) if column != '-' else 0.0 for column in row]
        preprocessed_data.append(float_row)
    except ValueError:
        print('Skipping row with non-convertible values:', row)

data = preprocessed_data

# Check if data contains strings
for row in data:
    for column in row:
        if isinstance(column, str):
            print('Error: String found in data: ', column)
            break

preprocessed_data = data

# Convert preprocessed_data to a normal Python list of lists
preprocessed_data = [list(row) for row in preprocessed_data]

# # Print the preprocessed data
# for row in preprocessed_data:
#     print(row)


### Separate data

In [9]:
import random
import numpy as np

# Separate data

# Define the split ratios for training, validation, and test datasets
train_ratio = 0.70  # 70% for training
val_ratio = 0.15   # 15% for validation
test_ratio = 0.15  # 15% for testing

train_val_indices = int((train_ratio + val_ratio) * len(preprocessed_data))

train_val_data = preprocessed_data[:train_val_indices]
test_data = preprocessed_data[train_val_indices:]

# Shuffle the data randomly
random.shuffle(train_val_data)
random.shuffle(test_data)

# Calculate the split points
total_records = len(train_val_data)
train_split = int(train_ratio * total_records)
val_split = int(val_ratio * total_records)

# Split the data into training, validation
train_data = train_val_data[:train_split]
val_data = train_val_data[train_split:]



In [10]:
train_labels = []
val_labels = []
test_labels = []

# Separate the labels from features vectors
train_temp = []
for row in train_data:
    newRow = []
    newRow = row[:len(row)-1]
    train_temp.append(newRow)
    train_labels.append(row[-1])
    
val_temp = []
for row in val_data:
    newRow = []
    newRow = row[:len(row)-1]
    val_temp.append(newRow)
    val_labels.append(row[-1])

test_temp = []
for row in test_data:
    newRow = []
    newRow = row[:len(row)-1]
    test_temp.append(newRow)
    test_labels.append(row[-1])

train_data = train_temp
val_data = val_temp
test_data = test_temp

In [11]:
# Convert your data to NumPy arrays
train_data = np.array(train_data)
train_labels = np.array(train_labels)

val_data = np.array(val_data)
val_labels = np.array(val_labels)

test_data = np.array(test_data)
test_labels = np.array(test_labels)

# Print rows and columns of the data
print(train_data.shape)
print(train_labels.shape)
print(val_data.shape)
print(val_labels.shape)


(600204, 18)
(600204,)
(257231, 18)
(257231,)


### Train model

In [12]:

# Check if train data contains strings
for row in train_data:
    for column in row:
        if isinstance(column, str):
            print('Error: String found in data: ', column)
            break

# Check if val data contains strings
for row in val_data:
    for column in row:
        if isinstance(column, str):
            print('Error: String found in data: ', column)
            break

In [13]:
import gc
gc.collect()

0

In [14]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define your XGBoost classifier and hyperparameter search space
xgb_model = XGBClassifier()
param_space = {
    'n_estimators': [100],
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
}

# Create a RandomizedSearchCV object
random_search = RandomizedSearchCV(
    xgb_model,
    param_space,
    n_iter=5,  # Adjust the number of iterations as needed
    scoring='accuracy',  # Use the appropriate scoring metric
    n_jobs=-1,  # Use all available CPU cores for parallel processing
    cv=5,  # Number of cross-validation folds
    random_state=42,  # Set a random seed for reproducibility
    verbose=3
)

# Perform hyperparameter optimization
random_search.fit(train_data, train_labels)

# Get the best hyperparameters and the best model
best_xgb_hps = random_search.best_params_
best_xgb_model = random_search.best_estimator_


Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [15]:
print(best_xgb_hps)

{'subsample': 0.8, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.8}


### Evaluate model on val set

In [16]:
from sklearn.metrics import classification_report, confusion_matrix
# Evaluate the Random Forest model on the validation data
sgboost_val_predictions = random_search.predict(val_data)
sgboost_val_accuracy = np.mean(sgboost_val_predictions == val_labels)
print("Validation Accuracy (Random Forest):", sgboost_val_accuracy)

# Calculate and print classification report and confusion matrix for Random Forest
sgboost_val_report = classification_report(val_labels, sgboost_val_predictions)
sgboost_val_confusion = confusion_matrix(val_labels, sgboost_val_predictions)
print("Validation Classification Report (XGBoost):")
print(sgboost_val_report)
print("Validation Confusion Matrix (XGBoost):")
print(sgboost_val_confusion)


Validation Accuracy (Random Forest): 1.0
Validation Classification Report (XGBoost):
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    154492
         1.0       1.00      1.00      1.00    102739

    accuracy                           1.00    257231
   macro avg       1.00      1.00      1.00    257231
weighted avg       1.00      1.00      1.00    257231

Validation Confusion Matrix (XGBoost):
[[154492      0]
 [     0 102739]]


### Evaluate model on test set

In [17]:
# Evaluate the Random Forest model on the test data
sgboost_test_predictions = random_search.predict(test_data)
sgboost_test_accuracy = np.mean(sgboost_test_predictions == test_labels)
print("Test Accuracy (SGBoost):", sgboost_test_accuracy)

# Calculate and print classification report and confusion matrix for Random Forest
sgboost_test_report = classification_report(test_labels, sgboost_test_predictions)
sgboost_test_confusion = confusion_matrix(test_labels, sgboost_test_predictions)
print("Test Classification Report (XGBoost):")
print(sgboost_test_report)
print("Test Confusion Matrix (XGBoost):")
print(sgboost_test_confusion)

Test Accuracy (SGBoost): 1.0
Test Classification Report (XGBoost):
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     85934
         1.0       1.00      1.00      1.00     65379

    accuracy                           1.00    151313
   macro avg       1.00      1.00      1.00    151313
weighted avg       1.00      1.00      1.00    151313

Test Confusion Matrix (XGBoost):
[[85934     0]
 [    0 65379]]


In [19]:
import joblib
joblib.dump(best_xgb_model, 'best_xgb_model.joblib')
# If you want to save the best hyperparameters as well
with open('best_xgb_hyperparameters.txt', 'w') as file:
    file.write(str(best_xgb_hps))


In [None]:
# Load the best XGBoost model from the file
loaded_xgb_model = joblib.load('best_xgb_model.joblib')

# If you want to load the best hyperparameters as well
with open('best_xgb_hyperparameters.txt', 'r') as file:
    loaded_xgb_hps = eval(file.read())