In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from utils.data_preprocessing import load_data, preprocess_data_2018
import os
import numpy as np

save_data = "save_data_2018/"


cicids2018_path_v1 = 'data/Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv'
cicids2018_path_v2 = 'data/Friday-23-02-2018_TrafficForML_CICFlowMeter.csv'
if(not os.path.isfile(save_data + "web_attacks.csv")):
    # Optionally, remove duplicate rows
    data_2018_v1 = load_data(cicids2018_path_v1)
    data_2018_v2 = load_data(cicids2018_path_v2)

    print("Headers match. Proceeding with merge.")
    merged_df = pd.concat([data_2018_v1, data_2018_v2], axis=0, ignore_index=True)
    merged_df = merged_df.drop_duplicates()
    merged_df.to_csv(save_data + "merge.csv", index=False)
    
    merge_data = load_data(save_data + "merge.csv")
    preprocess_data_2018(merge_data, save_data)

Headers match. Proceeding with merge.
['Benign' 'Brute Force -Web' 'Brute Force -XSS' 'SQL Injection']
Label
Benign              2090330
Brute Force -Web        611
Brute Force -XSS        230
SQL Injection            87
Name: count, dtype: int64
2090330
928


In [21]:
df = pd.read_csv(save_data + 'web_attacks.csv')
df['Label'] = df['Label'].apply(lambda x: 0 if x == 'Benign' else 1)


normal_df = df[df['Label'] == 0]
attack_df = df[df['Label'] != 0]

num_attack = len(attack_df)

num_normal = 2 * num_attack

In [22]:
if num_normal > len(normal_df):
    raise ValueError("num_normal exceeds the number of rows in normal_df.")
normal_df = normal_df.sample(n=num_normal, random_state=42)

# Ensure 'Label' column exists
if 'Label' not in normal_df.columns or 'Label' not in attack_df.columns:
    raise ValueError("Both DataFrames must have a 'Label' column.")

# Combine and shuffle the dataset
balanced_df = pd.concat([normal_df, attack_df]).sample(frac=1, random_state=42).reset_index(drop=True)

# Split the balanced dataset
if 'Label' not in balanced_df.columns:
    raise ValueError("The 'Label' column is missing from the balanced DataFrame.")
train_df, temp_df = train_test_split(balanced_df, test_size=0.4, random_state=42, stratify=balanced_df['Label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['Label'])

# Ensure enough data for balancing
train_normal_df = train_df[train_df['Label'] == 0]
train_attack_df = train_df[train_df['Label'] == 1]

if len(train_attack_df) == 0:
    raise ValueError("There are no attack instances in the training set.")

num_train_attack = len(train_attack_df)
num_train_normal = 2 * num_train_attack

# Ensure we don't try to sample more rows than are available
if len(train_normal_df) < num_train_normal:
    num_train_normal = len(train_normal_df)  # Adjust to available number of rows

train_normal_df = train_normal_df.sample(n=num_train_normal, random_state=42)
train_df = pd.concat([train_normal_df, train_attack_df]).sample(frac=1, random_state=42).reset_index(drop=True)

In [23]:
# Print the count of each class in the training, validation, and testing sets
for name, df in zip(["Training", "Validation", "Testing"], [train_df, val_df, test_df]):
    unique, counts = np.unique(df['Label'], return_counts=True)
    print(f"{name} set class distribution:", dict(zip(unique, counts)))

Training set class distribution: {0: 1113, 1: 557}
Validation set class distribution: {0: 371, 1: 186}
Testing set class distribution: {0: 372, 1: 185}


In [24]:
# Save the train, validation, and test sets to CSV files
train_df.to_csv(save_data +  'train_set.csv', index=False)
val_df.to_csv(save_data +  'val_set.csv', index=False)
test_df.to_csv(save_data + 'test_set.csv', index=False)

In [25]:
# Define feature columns (excluding the label column)
feature_cols = [col for col in df.columns if col != 'Label']

# Extract features and labels for training, validation, and testing sets
X_train = train_df[feature_cols]
y_train = train_df['Label']
X_val = val_df[feature_cols]
y_val = val_df['Label']
X_test = test_df[feature_cols]
y_test = test_df['Label']

In [26]:
# Define classifiers
from models.decision_tree import train_decision_tree
from models.random_forest import train_random_forest
from models.lda import train_lda
from models.naive_bayes import train_naive_bayes
from models.logistic_regression import train_logistic_regression
from models.knn import train_knn
from models.svm import train_svm
from models.extra_trees import train_extra_trees
from models.bagging import train_bagging
from models.mlp import train_mlp

classifiers = {
    'Decision Tree': train_decision_tree,
    'Random Forest': train_random_forest,
    'Linear Discriminant Analysis': train_lda,
    'Naive Bayes': train_naive_bayes,
    'Logistic Regression': train_logistic_regression,
    'K-Nearest Neighbors': train_knn,
    'Support Vector Machine': train_svm,
    'Extra Trees Classifier': train_extra_trees,
    'Bagging Classifier': train_bagging,
    'Multi-layer Perceptron': train_mlp
}

In [27]:
def model_10Classifier():
    results = []
    for name, train_function in classifiers.items():
        model, val_acc, val_pre, val_rec, val_f1, test_acc, test_pre, test_rec, test_f1 = train_function(X_train, y_train, X_val, y_val, X_test, y_test, save_data)
        results.append({
            'Model': name,
            'Validation Accuracy': val_acc,
            'Validation Precision': val_pre,
            'Validation Recall': val_rec,
            'Validation F1 Score': val_f1,
            "Test Accuracy": test_acc,
            "Test Precision": test_pre,
            "Test Recall": test_rec,
            "Test F1 Score": test_f1,
        })

    # Save results to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv(save_data + '10Classsifer_results_2018.csv', index=False)
    
model_10Classifier()

Decision Tree Validation - Accuracy: 0.9228007181328546 Precision: 0.8864864864864865 Recall: 0.8817204301075269 F1 Score: 0.8840970350404312
Random Forest Validation - Accuracy: 0.9371633752244165 Precision: 0.9171270718232044 Recall: 0.8924731182795699 F1 Score: 0.9046321525885559
LDA Validation - Accuracy: 0.9228007181328546 Precision: 0.8325581395348837 Recall: 0.9623655913978495 F1 Score: 0.8927680798004988
Naive Bayes Validation - Accuracy: 0.6732495511669659 Precision: 0.5056179775280899 Recall: 0.967741935483871 F1 Score: 0.6642066420664207


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Validation - Accuracy: 0.8132854578096947 Precision: 0.7808219178082192 Recall: 0.6129032258064516 F1 Score: 0.6867469879518072
KNN Validation - Accuracy: 0.9353680430879713 Precision: 0.9213483146067416 Recall: 0.8817204301075269 F1 Score: 0.9010989010989011
SVM Validation - Accuracy: 0.7917414721723519 Precision: 0.9861111111111112 Recall: 0.3817204301075269 F1 Score: 0.5503875968992248
Extra Trees Validation - Accuracy: 0.9371633752244165 Precision: 0.9171270718232044 Recall: 0.8924731182795699 F1 Score: 0.9046321525885559
MLP Validation - Accuracy: 0.7989228007181328 Precision: 0.7936507936507936 Recall: 0.5376344086021505 F1 Score: 0.6410256410256411


In [28]:
import pandas as pd
from sklearn.decomposition import PCA

# Assume the classifiers dictionary and train functions are already defined
# Define a function to apply PCA and train classifiers

def train_with_pca(X_train, y_train, X_val, y_val, X_test, y_test, n_components=20):
    X_combined = np.concatenate((X_train, X_val, X_test), axis=0)
    y_combined = np.concatenate((y_train, y_val, y_test), axis=0)

    pca = PCA(n_components=n_components)
    X_combined_pca = pca.fit_transform(X_combined)

    # Bước 3: Tách lại dữ liệu thành các tập train, validation và test
    X_train_pca = X_combined_pca[:len(X_train)]
    X_val_pca = X_combined_pca[len(X_train):len(X_train)+len(X_val)]
    X_test_pca = X_combined_pca[len(X_train)+len(X_val):]

    # Kiểm tra kích thước của các tập dữ liệu sau khi PCA
    print("X_train_pca shape:", X_train_pca.shape)
    print("X_val_pca shape:", X_val_pca.shape)
    print("X_test_pca shape:", X_test_pca.shape)
    
    results = []
    for name, train_function in classifiers.items():
        model, val_acc, val_pre, val_rec, val_f1, test_acc, test_pre, test_rec, test_f1 = train_function(X_train_pca, y_train, X_val_pca, y_val, X_test_pca, y_test, save_data)
        results.append({
            'Model': name,
            'Validation Accuracy': val_acc,
            'Validation Precision': val_pre,
            'Validation Recall': val_rec,
            'Validation F1 Score': val_f1,
            'Test Accuracy': test_acc,
            'Test Precision': test_pre,
            'Test Recall': test_rec,
            'Test F1 Score': test_f1
        })
    
    return results

# Call the function and save results
pca_results = train_with_pca(X_train, y_train, X_val, y_val, X_test, y_test)
pca_results_df = pd.DataFrame(pca_results)
pca_results_df.to_csv(save_data + 'pca_model_results_2018.csv', index=False)


X_train_pca shape: (1670, 20)
X_val_pca shape: (557, 20)
X_test_pca shape: (557, 20)
Decision Tree Validation - Accuracy: 0.9245960502692998 Precision: 0.8956043956043956 Recall: 0.8763440860215054 F1 Score: 0.8858695652173914
Random Forest Validation - Accuracy: 0.9317773788150808 Precision: 0.9204545454545454 Recall: 0.8709677419354839 F1 Score: 0.8950276243093923
LDA Validation - Accuracy: 0.7809694793536804 Precision: 0.8902439024390244 Recall: 0.3924731182795699 F1 Score: 0.5447761194029851
Naive Bayes Validation - Accuracy: 0.39676840215439857 Precision: 0.33766233766233766 Recall: 0.8387096774193549 F1 Score: 0.48148148148148145


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Validation - Accuracy: 0.8186714542190305 Precision: 0.8695652173913043 Recall: 0.5376344086021505 F1 Score: 0.6644518272425249
KNN Validation - Accuracy: 0.9299820466786356 Precision: 0.9248554913294798 Recall: 0.8602150537634409 F1 Score: 0.8913649025069638
SVM Validation - Accuracy: 0.7917414721723519 Precision: 0.9861111111111112 Recall: 0.3817204301075269 F1 Score: 0.5503875968992248
Extra Trees Validation - Accuracy: 0.9228007181328546 Precision: 0.9085714285714286 Recall: 0.8548387096774194 F1 Score: 0.8808864265927978
MLP Validation - Accuracy: 0.8599640933572711 Precision: 0.921875 Recall: 0.6344086021505376 F1 Score: 0.7515923566878981


In [30]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define a function to build and train a deep autoencoder
def build_autoencoder(input_dim):
    # 3 lớp ẩn
    encoding_dim1 = 50
    encoding_dim2 = 30
    # lớp trung gian
    encoding_dim3 = int(np.sqrt(input_dim)) + 1
    
    input_layer = Input(shape=(input_dim,))
    
    # Lớp mã hóa
    encoded = Dense(encoding_dim1, activation='relu')(input_layer)
    encoded = Dense(encoding_dim2, activation='relu')(encoded)
    encoded = Dense(encoding_dim3, activation='relu')(encoded)
    
    # Lớp giải mã
    decoded = Dense(encoding_dim2, activation='relu')(encoded)
    decoded = Dense(encoding_dim1, activation='relu')(decoded)
    decoded = Dense(input_dim, activation='sigmoid')(decoded)
    
    autoencoder = Model(input_layer, decoded)
    encoder = Model(input_layer, encoded)
    
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder, encoder

# Define a function to apply deep autoencoder and train classifiers
def train_with_autoencoder(X_train, y_train, X_val, y_val, X_test, y_test, epochs=50, batch_size=256):
    input_dim = X_train.shape[1]
    print(input_dim)
    autoencoder, encoder = build_autoencoder(input_dim)
    
    autoencoder.fit(X_train, X_train, epochs=epochs, batch_size=batch_size, shuffle=True, validation_data=(X_val, X_val), verbose=1)
    
    X_train_enc = encoder.predict(X_train)
    X_val_enc = encoder.predict(X_val)
    X_test_enc = encoder.predict(X_test)
    
    results = []
    for name, train_function in classifiers.items():
        model, val_acc, val_pre, val_rec, val_f1, test_acc, test_pre, test_rec, test_f1 = train_function(X_train_enc, y_train, X_val_enc, y_val, X_test_enc, y_test, save_data)
        results.append({
            'Model': name,
            'Validation Accuracy': val_acc,
            'Validation Precision': val_pre,
            'Validation Recall': val_rec,
            'Validation F1 Score': val_f1,
            'Test Accuracy': test_acc,
            'Test Precision': test_pre,
            'Test Recall': test_rec,
            'Test F1 Score': test_f1
        })
    
    return results

# Call the function and save results
autoencoder_results = train_with_autoencoder(X_train, y_train, X_val, y_val, X_test, y_test)
autoencoder_results_df = pd.DataFrame(autoencoder_results)
autoencoder_results_df.to_csv(save_data + 'autoencoder_model_results_2018.csv', index=False)


74
Epoch 1/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - loss: 124269300285440.0000 - val_loss: 130405491539968.0000
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 126587357888512.0000 - val_loss: 130405491539968.0000
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 134347122278400.0000 - val_loss: 130405491539968.0000
Epoch 4/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 133368599543808.0000 - val_loss: 130405491539968.0000
Epoch 5/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 125627634024448.0000 - val_loss: 130405491539968.0000
Epoch 6/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 128293365874688.0000 - val_loss: 130405491539968.0000
Epoch 7/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 115728724262912.0000 - val_loss: 13040549