In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from utils.data_preprocessing import load_data, preprocess_data
import os
import numpy as np

save_data = "save_data_2017/"
def load_data(file_path):
    return pd.read_csv(file_path, encoding='cp1252', engine='python')
cicids2017_path = 'data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv'

if(not os.path.isfile(save_data + "web_attacks.csv")):
    data_2017 = load_data(cicids2017_path)
    preprocess_data(data_2017, save_data)

(458968, 84)
(170366, 84)
168186
2180


In [16]:
df = pd.read_csv(save_data + 'web_attacks.csv')
df['Label'] = df['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)


normal_df = df[df['Label'] == 0]
attack_df = df[df['Label'] != 0]

num_attack = len(attack_df)

num_normal = 2 * num_attack

In [17]:
normal_df = normal_df.sample(n=num_normal, random_state=42)

# Combine and shuffle the dataset
balanced_df = pd.concat([normal_df, attack_df]).sample(frac=1, random_state=42).reset_index(drop=True)

# Split the balanced dataset into training (60%), validation (20%), and testing sets (20%)
train_df, temp_df = train_test_split(balanced_df, test_size=0.4, random_state=42, stratify=balanced_df['Label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['Label'])

# Within the training set, ensure normal instances are twice the number of attack instances
train_normal_df = train_df[train_df['Label'] == 0]
train_attack_df = train_df[train_df['Label'] == 1]
num_train_attack = len(train_attack_df)
num_train_normal = 2 * num_train_attack
train_normal_df = train_normal_df.sample(n=num_train_normal, random_state=42)
train_df = pd.concat([train_normal_df, train_attack_df]).sample(frac=1, random_state=42).reset_index(drop=True)

In [18]:
# Print the count of each class in the training, validation, and testing sets
for name, df in zip(["Training", "Validation", "Testing"], [train_df, val_df, test_df]):
    unique, counts = np.unique(df['Label'], return_counts=True)
    print(f"{name} set class distribution:", dict(zip(unique, counts)))

Training set class distribution: {0: 2616, 1: 1308}
Validation set class distribution: {0: 872, 1: 436}
Testing set class distribution: {0: 872, 1: 436}


In [19]:
# Save the train, validation, and test sets to CSV files
train_df.to_csv(save_data +  'train_set.csv', index=False)
val_df.to_csv(save_data +  'val_set.csv', index=False)
test_df.to_csv(save_data + 'test_set.csv', index=False)

In [20]:
# Define feature columns (excluding the label column)
feature_cols = [col for col in df.columns if col != 'Label']

# Extract features and labels for training, validation, and testing sets
X_train = train_df[feature_cols]
y_train = train_df['Label']
X_val = val_df[feature_cols]
y_val = val_df['Label']
X_test = test_df[feature_cols]
y_test = test_df['Label']

In [21]:
# Define classifiers
from models.decision_tree import train_decision_tree
from models.random_forest import train_random_forest
from models.lda import train_lda
from models.naive_bayes import train_naive_bayes
from models.logistic_regression import train_logistic_regression
from models.knn import train_knn
from models.svm import train_svm
from models.extra_trees import train_extra_trees
from models.bagging import train_bagging
from models.mlp import train_mlp

classifiers = {
    'Decision Tree': train_decision_tree,
    'Random Forest': train_random_forest,
    'Linear Discriminant Analysis': train_lda,
    'Naive Bayes': train_naive_bayes,
    'Logistic Regression': train_logistic_regression,
    'K-Nearest Neighbors': train_knn,
    'Support Vector Machine': train_svm,
    'Extra Trees Classifier': train_extra_trees,
    'Bagging Classifier': train_bagging,
    'Multi-layer Perceptron': train_mlp
}

In [22]:
def model_10Classifier():
    results = []
    for name, train_function in classifiers.items():
        model, val_acc, val_pre, val_rec, val_f1, test_acc, test_pre, test_rec, test_f1 = train_function(X_train, y_train, X_val, y_val, X_test, y_test, save_data)
        results.append({
            'Model': name,
            'Validation Accuracy': val_acc,
            'Validation Precision': val_pre,
            'Validation Recall': val_rec,
            'Validation F1 Score': val_f1,
            "Test Accuracy": test_acc,
            "Test Precision": test_pre,
            "Test Recall": test_rec,
            "Test F1 Score": test_f1,
        })

    # Save results to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv(save_data + '10Classsifer_results_2017.csv', index=False)
    
model_10Classifier()

Decision Tree Validation - Accuracy: 0.9770642201834863 Precision: 0.9613636363636363 Recall: 0.9701834862385321 F1 Score: 0.9657534246575342
Random Forest Validation - Accuracy: 0.9770642201834863 Precision: 0.9634703196347032 Recall: 0.9678899082568807 F1 Score: 0.965675057208238
LDA Validation - Accuracy: 0.9503058103975535 Precision: 0.9406175771971497 Recall: 0.908256880733945 F1 Score: 0.9241540256709452
Naive Bayes Validation - Accuracy: 0.8165137614678899 Precision: 0.6449704142011834 Recall: 1.0 F1 Score: 0.7841726618705036


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Validation - Accuracy: 0.9686544342507645 Precision: 0.9876543209876543 Recall: 0.9174311926605505 F1 Score: 0.9512485136741974
KNN Validation - Accuracy: 0.9640672782874617 Precision: 0.9237472766884531 Recall: 0.9724770642201835 F1 Score: 0.9474860335195531
SVM Validation - Accuracy: 0.941131498470948 Precision: 0.9126436781609195 Recall: 0.9105504587155964 F1 Score: 0.9115958668197475
Extra Trees Validation - Accuracy: 0.9793577981651376 Precision: 0.9658314350797267 Recall: 0.9724770642201835 F1 Score: 0.9691428571428572
MLP Validation - Accuracy: 0.9625382262996942 Precision: 0.9215686274509803 Recall: 0.9701834862385321 F1 Score: 0.9452513966480447


In [23]:
import pandas as pd
from sklearn.decomposition import PCA

# Assume the classifiers dictionary and train functions are already defined
# Define a function to apply PCA and train classifiers

def train_with_pca(X_train, y_train, X_val, y_val, X_test, y_test, n_components=20):
    X_combined = np.concatenate((X_train, X_val, X_test), axis=0)
    y_combined = np.concatenate((y_train, y_val, y_test), axis=0)

    pca = PCA(n_components=n_components)
    X_combined_pca = pca.fit_transform(X_combined)

    # Bước 3: Tách lại dữ liệu thành các tập train, validation và test
    X_train_pca = X_combined_pca[:len(X_train)]
    X_val_pca = X_combined_pca[len(X_train):len(X_train)+len(X_val)]
    X_test_pca = X_combined_pca[len(X_train)+len(X_val):]

    # Kiểm tra kích thước của các tập dữ liệu sau khi PCA
    print("X_train_pca shape:", X_train_pca.shape)
    print("X_val_pca shape:", X_val_pca.shape)
    print("X_test_pca shape:", X_test_pca.shape)
    
    results = []
    for name, train_function in classifiers.items():
        model, val_acc, val_pre, val_rec, val_f1, test_acc, test_pre, test_rec, test_f1 = train_function(X_train_pca, y_train, X_val_pca, y_val, X_test_pca, y_test, save_data)
        results.append({
            'Model': name,
            'Validation Accuracy': val_acc,
            'Validation Precision': val_pre,
            'Validation Recall': val_rec,
            'Validation F1 Score': val_f1,
            'Test Accuracy': test_acc,
            'Test Precision': test_pre,
            'Test Recall': test_rec,
            'Test F1 Score': test_f1
        })
    
    return results

# Call the function and save results
pca_results = train_with_pca(X_train, y_train, X_val, y_val, X_test, y_test)
pca_results_df = pd.DataFrame(pca_results)
pca_results_df.to_csv(save_data + 'pca_model_results_2017.csv', index=False)


X_train_pca shape: (3924, 20)
X_val_pca shape: (1308, 20)
X_test_pca shape: (1308, 20)
Decision Tree Validation - Accuracy: 0.9755351681957186 Precision: 0.9654377880184332 Recall: 0.9610091743119266 F1 Score: 0.9632183908045977
Random Forest Validation - Accuracy: 0.97782874617737 Precision: 0.967816091954023 Recall: 0.9655963302752294 F1 Score: 0.9667049368541906
LDA Validation - Accuracy: 0.9281345565749235 Precision: 0.9476439790575916 Recall: 0.8302752293577982 F1 Score: 0.8850855745721271
Naive Bayes Validation - Accuracy: 0.5680428134556575 Precision: 0.42981501632208924 Recall: 0.9059633027522935 F1 Score: 0.5830258302583026


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Validation - Accuracy: 0.9388379204892966 Precision: 0.908256880733945 Recall: 0.908256880733945 F1 Score: 0.908256880733945
KNN Validation - Accuracy: 0.9602446483180428 Precision: 0.9247787610619469 Recall: 0.9587155963302753 F1 Score: 0.9414414414414415
SVM Validation - Accuracy: 0.9418960244648318 Precision: 0.9147465437788018 Recall: 0.9105504587155964 F1 Score: 0.9126436781609195
Extra Trees Validation - Accuracy: 0.9785932721712538 Precision: 0.9700460829493087 Recall: 0.9655963302752294 F1 Score: 0.967816091954023
MLP Validation - Accuracy: 0.959480122324159 Precision: 0.9592326139088729 Recall: 0.9174311926605505 F1 Score: 0.9378663540445487


In [24]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define a function to build and train a deep autoencoder
def build_autoencoder(input_dim):
    # 3 lớp ẩn
    encoding_dim1 = 50
    encoding_dim2 = 30
    # lớp trung gian
    encoding_dim3 = int(np.sqrt(input_dim)) + 1
    
    input_layer = Input(shape=(input_dim,))
    
    # Lớp mã hóa
    encoded = Dense(encoding_dim1, activation='relu')(input_layer)
    encoded = Dense(encoding_dim2, activation='relu')(encoded)
    encoded = Dense(encoding_dim3, activation='relu')(encoded)
    
    # Lớp giải mã
    decoded = Dense(encoding_dim2, activation='relu')(encoded)
    decoded = Dense(encoding_dim1, activation='relu')(decoded)
    decoded = Dense(input_dim, activation='sigmoid')(decoded)
    
    autoencoder = Model(input_layer, decoded)
    encoder = Model(input_layer, encoded)
    
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder, encoder

# Define a function to apply deep autoencoder and train classifiers
def train_with_autoencoder(X_train, y_train, X_val, y_val, X_test, y_test,  epochs=50, batch_size=256):
    input_dim = X_train.shape[1]
    autoencoder, encoder = build_autoencoder(input_dim)
    
    autoencoder.fit(X_train, X_train, epochs=epochs, batch_size=batch_size, shuffle=True, validation_data=(X_val, X_val), verbose=1)
    
    X_train_enc = encoder.predict(X_train)
    X_val_enc = encoder.predict(X_val)
    X_test_enc = encoder.predict(X_test)
    
    results = []
    for name, train_function in classifiers.items():
        model, val_acc, val_pre, val_rec, val_f1, test_acc, test_pre, test_rec, test_f1 = train_function(X_train_enc, y_train, X_val_enc, y_val, X_test_enc, y_test, save_data)
        results.append({
            'Model': name,
            'Validation Accuracy': val_acc,
            'Validation Precision': val_pre,
            'Validation Recall': val_rec,
            'Validation F1 Score': val_f1,
            'Test Accuracy': test_acc,
            'Test Precision': test_pre,
            'Test Recall': test_rec,
            'Test F1 Score': test_f1
        })
    
    return results

# Call the function and save results
autoencoder_results = train_with_autoencoder(X_train, y_train, X_val, y_val, X_test, y_test)
autoencoder_results_df = pd.DataFrame(autoencoder_results)
autoencoder_results_df.to_csv(save_data + 'autoencoder_model_results_2017.csv', index=False)


Epoch 1/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - loss: 60702412767232.0000 - val_loss: 46646901604352.0000
Epoch 2/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 62191139028992.0000 - val_loss: 46646901604352.0000
Epoch 3/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 70830839037952.0000 - val_loss: 46646901604352.0000
Epoch 4/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 66795385913344.0000 - val_loss: 46646901604352.0000
Epoch 5/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 58237458055168.0000 - val_loss: 46646901604352.0000
Epoch 6/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 59354321518592.0000 - val_loss: 46646901604352.0000
Epoch 7/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 61542598967296.0000 - val_loss: 46646901604