In [1]:
"""
This code does the following:
    1. Drops the unwanted columns which are object type and don't have a real impact on the model
    2. Combines 2 dataset captured over the span of 2 years
    3. 'attack category' column has 5 types of attack and 1 is benign category. Attacks are classified as either 1 or 2 and 
        benign is classified as 0.
    4. One hot encoder is used and columns are segregated based on attack types.
    5. Random forest model is run the get the accuracy. (Approx 80 features)

Results:
    We get almost perfect accuracy. This includes all the 80 features.
"""

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

df1 = pd.read_csv('Datasets\\ALLFLOWMETER_HIKARI2021.csv')
df2 = pd.read_csv('Datasets\\ALLFLOWMETER_HIKARI2022.csv')

#df1.drop('bwd_last_window_size', axis=1)

# Combining two IDS datasets acquired over 2 years to develop good model
combined_df = pd.concat([df1, df2], ignore_index=True)

# we remove the 'flow_duration' here too because it has no real impact on the analysis and the formats are very different from same
# synthetic dataset reproduced as shown in paper.
combined_df = combined_df.drop(['Unnamed: 0', 'uid', 'flow_duration','bwd_last_window_size'], axis=1)

combined_df['attack_category'] = combined_df['attack_category'].str.strip()
# Replacing similar values with correct version
combined_df['attack_category']=combined_df['attack_category'].replace('Brutefoce', 'Bruteforce')

attack_category_counts = combined_df['attack_category'].value_counts()
print("Attack categories:",attack_category_counts)

y = combined_df['Label']

X = combined_df.drop(['Label', 'originh', 'responh'], axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# Names of columns with missing values
cols_with_missing_values = [col for col in X_train.columns
                            if X_train[col].isnull().any()]

print("Columns with missings values:",cols_with_missing_values)
# With prelimnary analysis we found that there were no missing values in any 
# of the columns. Hence, it is a robust dataset.

#getting list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables")
print(object_cols)

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Removing the categorical columns (will be replacing with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

rfc_model = RandomForestClassifier(n_estimators=100, random_state=0)
rfc_model.fit(OH_X_train, y_train)

predictions = rfc_model.predict(OH_X_valid)

accuracy = accuracy_score(y_valid, predictions)

print("Accuracy: ", accuracy)
print("Classification Report:")
print(classification_report(y_valid, predictions))

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Attack categories: attack_category
Benign                 562335
Background             170151
Probing                 23388
Bruteforce-XML           8795
Bruteforce               7988
XMRIGCC CryptoMiner      7595
Name: count, dtype: int64
Columns with missings values: []
Categorical variables
['attack_category']
Accuracy:  1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    112433
           1       1.00      1.00      1.00      9572
           2       1.00      1.00      1.00     34046

    accuracy                           1.00    156051
   macro avg       1.00      1.00      1.00    156051
weighted avg       1.00      1.00      1.00    156051



In [6]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

xgb_model = XGBRegressor()
xgb_model.fit(OH_X_train, y_train)

xgb_predictions = xgb_model.predict(OH_X_valid)

predictions = xgb_model.predict(OH_X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))

Mean Absolute Error: 1.0234672764034752e-06


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

df1 = pd.read_csv('Datasets\\ALLFLOWMETER_HIKARI2021.csv')
df2 = pd.read_csv('Datasets\\ALLFLOWMETER_HIKARI2022.csv')

#df1.drop('bwd_last_window_size', axis=1)

# Combining two IDS datasets acquired over 2 years to develop good model
combined_df = pd.concat([df1, df2], ignore_index=True)

# we remove the 'flow_duration' here too because it has no real impact on the analysis and the formats are very different from same
# synthetic dataset reproduced as shown in paper.
combined_df = combined_df.drop(['Unnamed: 0', 'uid', 'flow_duration','bwd_last_window_size'], axis=1)

combined_df['attack_category'] = combined_df['attack_category'].str.strip()
# Replacing similar values with correct version
combined_df['attack_category']=combined_df['attack_category'].replace('Brutefoce', 'Bruteforce')


X = combined_df.drop(['Label', 'originh', 'responh'], axis=1)
y = combined_df['Label']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Removing the categorical columns (will be replacing with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

OH_X_train = pd.concat([X_train.drop(object_cols, axis=1), OH_cols_train], axis=1)
OH_X_valid = pd.concat([X_valid.drop(object_cols, axis=1), OH_cols_valid], axis=1)

OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

# K-Fold Cross-Validation with F1-score evaluation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # 5-fold stratified cross-validation
f1_scores = []
for train_index, test_index in kfold.split(OH_X_train, y_train):
  # Train on fold data
  fold_train_x, fold_train_y = OH_X_train.iloc[train_index], y_train.iloc[train_index]
  rfc_model = RandomForestClassifier(n_estimators=100, random_state=0)
  rfc_model.fit(fold_train_x, fold_train_y)

  # Make predictions on validation fold data
  fold_test_x, fold_test_y = OH_X_train.iloc[test_index], y_train.iloc[test_index]
  predictions = rfc_model.predict(fold_test_x)

  # Calculate F1-score
  f1 = f1_score(fold_test_y, predictions, average='weighted')
  f1_scores.append(f1)

# Print average F1-score across folds
print("Average F1-score (KFold):", np.mean(f1_scores))

# Confusion Matrix on validation set
y_pred = rfc_model.predict(OH_X_valid)
confusion_matrix = confusion_matrix(y_valid, y_pred)
print("Confusion Matrix:\n", confusion_matrix)

# Classification Report on validation set (using F1-score)
print("Classification Report:")
print(classification_report(y_valid, y_pred))


Average F1-score (KFold): 0.9999967957069185
Confusion Matrix:
 [[112468      0      0]
 [     0   9553      0]
 [     0      0  34030]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    112468
           1       1.00      1.00      1.00      9553
           2       1.00      1.00      1.00     34030

    accuracy                           1.00    156051
   macro avg       1.00      1.00      1.00    156051
weighted avg       1.00      1.00      1.00    156051



In [3]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix
"""
In the below code, we select only the top features that we got from chi-squared tests.
Only the columns that are making lot of impact are included.
Additionally, the columns named '0' to '5' are included because we get from One-Hot Encoder
and they are essential for predicting.
"""
top_features = ['fwd_iat.min','fwd_iat.max', 'fwd_iat.tot', 'fwd_iat.avg', 'fwd_iat.std',
                'bwd_iat.max', 'bwd_iat.tot', 'bwd_iat.avg', 'bwd_iat.std',
                'flow_iat.min', 'flow_iat.max', 'flow_iat.tot', 'flow_iat.avg',
                'flow_iat.std', 'payload_bytes_per_second',
                'bwd_bulk_rate', 'active.min', 'active.max', 'active.tot', 'active.avg',
                'active.std', 'idle.min', 'idle.max', 'idle.tot', 'idle.avg',
                'idle.std','0','1','2','3','4','5'] 

X_train_top_features = OH_X_train[top_features]
X_valid_top_features = OH_X_valid[top_features]

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # 5-fold stratified cross-validation
f1_scores = []
for train_index, test_index in kfold.split(X_train_top_features, y_train):
  # Train on fold data
  fold_train_x, fold_train_y = X_train_top_features.iloc[train_index], y_train.iloc[train_index]
  rfc_model = RandomForestClassifier(n_estimators=100, random_state=0)
  rfc_model.fit(fold_train_x, fold_train_y)

  # Make predictions on validation fold data
  fold_test_x, fold_test_y = X_train_top_features.iloc[test_index], y_train.iloc[test_index]
  predictions = rfc_model.predict(fold_test_x)

  # Calculate F1-score
  f1 = f1_score(fold_test_y, predictions, average='weighted')
  f1_scores.append(f1)

# Print average F1-score across folds
print("Average F1-score (KFold):", np.mean(f1_scores))

# Confusion Matrix on validation set
y_pred = rfc_model.predict(X_valid_top_features)
y_valid_np = y_valid.to_numpy()
confusion_matrix = confusion_matrix(y_valid_np, y_pred)
print("Confusion Matrix:\n", confusion_matrix)

# Classification Report on validation set (using F1-score)
print("Classification Report:")
print(classification_report(y_valid, y_pred))


Average F1-score (KFold): 1.0
Confusion Matrix:
 [[112468      0      0]
 [     0   9553      0]
 [     0      0  34030]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    112468
           1       1.00      1.00      1.00      9553
           2       1.00      1.00      1.00     34030

    accuracy                           1.00    156051
   macro avg       1.00      1.00      1.00    156051
weighted avg       1.00      1.00      1.00    156051



In [4]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

# Initialize SVM classifier with optimized hyperparameters
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=0)

# Train SVM model
svm_model.fit(X_train_top_features, y_train)

# Predict on validation set
y_pred_svm = svm_model.predict(X_valid_top_features)

# Evaluate performance
print("Support Vector Machine (SVM):")
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred_svm))
print("Classification Report:\n", classification_report(y_valid, y_pred_svm))


In [11]:
from sklearn.neural_network import MLPClassifier

mlp_model = MLPClassifier(hidden_layer_sizes=(200, 100), max_iter=1000, random_state=0)
mlp_model.fit(X_train_top_features, y_train)

y_pred_mlp = mlp_model.predict(X_valid_top_features)
accuracy = accuracy_score(y_valid, y_pred_mlp)
print("Multi-Layer Perceptron (MLP):")
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred_mlp))
print("Classification Report:\n", classification_report(y_valid, y_pred_mlp))

Multi-Layer Perceptron (MLP):
Confusion Matrix:
 [[112468      0      0]
 [  8792    761      0]
 [ 19774      0  14256]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89    112468
           1       1.00      0.08      0.15      9553
           2       1.00      0.42      0.59     34030

    accuracy                           0.82    156051
   macro avg       0.93      0.50      0.54    156051
weighted avg       0.85      0.82      0.78    156051



In [12]:
print(accuracy)

0.816944460464848


In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

# Define a range of k values
k_values = range(1, 21)  # You can adjust the range as needed

best_k = None
best_accuracy = 0
best_f1_score = 0

# Iterate over each value of k
for k in k_values:
    # Train KNN model
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train_top_features, y_train)

    # Predict on validation set
    y_pred_knn = knn_model.predict(X_valid_top_features)

    # Evaluate KNN performance
    accuracy = accuracy_score(y_valid, y_pred_knn)
    f1 = f1_score(y_valid, y_pred_knn, average='weighted')

    # Print performance metrics for each k value
    print(f"KNN with k={k}: Accuracy = {accuracy:.4f}, F1 Score = {f1:.4f}")

    # Update best_k if current model has higher accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k
        best_f1_score = f1

# Print the best k value and corresponding metrics
print(f"\nBest k value: {best_k}")
print(f"Best Accuracy: {best_accuracy:.4f}")
print(f"Best F1 Score: {best_f1_score:.4f}")


KNN with k=1: Accuracy = 0.8393, F1 Score = 0.8393
KNN with k=2: Accuracy = 0.8735, F1 Score = 0.8610
KNN with k=3: Accuracy = 0.8597, F1 Score = 0.8556
KNN with k=4: Accuracy = 0.8740, F1 Score = 0.8626
KNN with k=5: Accuracy = 0.8670, F1 Score = 0.8615
KNN with k=6: Accuracy = 0.8741, F1 Score = 0.8633
KNN with k=7: Accuracy = 0.8700, F1 Score = 0.8637
KNN with k=8: Accuracy = 0.8741, F1 Score = 0.8636
KNN with k=9: Accuracy = 0.8724, F1 Score = 0.8657
KNN with k=10: Accuracy = 0.8738, F1 Score = 0.8635
KNN with k=11: Accuracy = 0.8721, F1 Score = 0.8652
KNN with k=12: Accuracy = 0.8733, F1 Score = 0.8632
KNN with k=13: Accuracy = 0.8725, F1 Score = 0.8655
KNN with k=14: Accuracy = 0.8729, F1 Score = 0.8630
KNN with k=15: Accuracy = 0.8719, F1 Score = 0.8647
KNN with k=16: Accuracy = 0.8728, F1 Score = 0.8630
KNN with k=17: Accuracy = 0.8722, F1 Score = 0.8649
KNN with k=18: Accuracy = 0.8721, F1 Score = 0.8622
KNN with k=19: Accuracy = 0.8711, F1 Score = 0.8637
KNN with k=20: Accura

In [13]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier()

xgb_model.fit(X_train_top_features, y_train)

y_pred_xgb = xgb_model.predict(X_valid_top_features)

# Evaluate XGBoost performance
print("XGBoost Classifier:")
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred_xgb))
print("Classification Report:\n", classification_report(y_valid, y_pred_xgb))

XGBoost Classifier:
Confusion Matrix:
 [[112468      0      0]
 [     0   9553      0]
 [     0      0  34030]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    112468
           1       1.00      1.00      1.00      9553
           2       1.00      1.00      1.00     34030

    accuracy                           1.00    156051
   macro avg       1.00      1.00      1.00    156051
weighted avg       1.00      1.00      1.00    156051

