In [None]:
import pandas as pd

import seaborn as sns
import time
import numpy as np
import matplotlib.pyplot as plt
import shap

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix,precision_score, recall_score, f1_score,accuracy_score

import tensorflow as tf
from tensorflow import keras
from keras import optimizers
from keras.optimizers import RMSprop
from keras.layers import Dense, Input, Dropout,LeakyReLU,BatchNormalization

In [None]:
# Read the feature-description table and use its 'Name' column as the column
# headers of the raw traffic records (the records file is read without a header row).
df_feature = pd.read_csv('NUSW-NB15_features.csv', encoding='cp1252')
df = pd.read_csv('UNSW-NB15_1.csv', header=None, names=df_feature['Name'])

In [None]:
# Rows whose service is recorded as '-' are relabelled with the string 'None'
no_service_rows = df['service'] == '-'
df.loc[no_service_rows, 'service'] = 'None'

In [None]:
# Fraction of missing values per column, most incomplete columns first
n_rows = len(df)
df.isna().sum().sort_values(ascending=False).div(n_rows)

In [None]:
# Keep only the first dot-separated component of each IP address in 'srcip'
# and 'dstip'. The vectorised .str accessor replaces the per-row lambda and
# passes missing values through as NaN instead of raising AttributeError.
for ip_col in ['srcip', 'dstip']:
    df[ip_col] = df[ip_col].str.split('.', n=1).str[0]

In [None]:
# One-hot encode the categorical columns; the originals are replaced by
# indicator columns in the returned frame.
categorical_cols = ['srcip', 'dstip', 'proto', 'state', 'service']
df = pd.get_dummies(df, columns=categorical_cols)

In [None]:
def draw_confusion_matrix(y_test,y_pred):
    """Plot the confusion matrix of binary predictions as an annotated heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels, encoded as 0 (normal) and 1 (anomaly).
    y_pred : array-like
        Predicted labels using the same encoding.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # Tick labels, listed in the same order as the class codes below
    labels = ['Normal','Anomaly']

    # Pin the class order to [0, 1] so the matrix is always 2x2 and lines up
    # with the tick labels, even when one class is absent from the inputs
    matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

    fig, ax = plt.subplots(figsize=(10,8))
    colors = ['blue','black']

    # Draw the heatmap with integer cell annotations
    sns.heatmap(matrix, xticklabels=labels, yticklabels=labels,
                cmap=colors, annot=True, fmt='d', ax=ax)

    # Title and axis labels
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()

# Logistic Regression

In [None]:
# Columns that are not used as model inputs (the target plus the columns excluded from the feature set)
non_feature_cols = ['Label', 'Stime', 'Ltime', 'sport', 'dsport', 'attack_cat']

X = df.drop(columns=non_feature_cols)
y = df['Label']

# Stratified 80/20 split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Timing run only: this pipeline is fitted just to measure how long training
# takes; the tuned model is produced by the grid search in the next cell.
start_time = time.time()

timing_steps = [
    ("scaler", StandardScaler()),                                   # standardise the features
    ("model", LogisticRegression(random_state=42, max_iter=5000)),  # logistic regression classifier
]
log_pipe = Pipeline(timing_steps)
log_pipe.fit(X_train, y_train)

end_time = time.time()

# Elapsed time in seconds
duration = end_time - start_time
print("Running Time：", duration, "s")


In [None]:
# Tune the regularisation strength C of the logistic-regression pipeline with a grid search

pipeline_steps = [
    ("scaler", StandardScaler()),                                   # standardise the features
    ("model", LogisticRegression(random_state=42, max_iter=5000)),  # logistic regression classifier
]
log_pipe = Pipeline(pipeline_steps)

param_grid = {'model__C': [0.01, 0.1, 1, 10]}
scoring_metrics = ["accuracy", "f1", "recall", "precision"]

# 3-fold cross-validation (cv=3); the best configuration is selected by
# accuracy and refitted, and any fit error is raised instead of scored.
grid_search = GridSearchCV(
    estimator=log_pipe,
    param_grid=param_grid,
    scoring=scoring_metrics,
    cv=3,
    refit="accuracy",
    return_train_score=True,
    error_score='raise',
)

# Run the search on the training data
grid_search.fit(X_train, y_train)

# Report the selected parameters and their mean cross-validated accuracy
print("Best Parameter: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)


In [None]:
# Tabulate the cross-validation results, best mean test accuracy first
log_rs_df = pd.DataFrame(grid_search.cv_results_)

summary_cols = [
    'param_model__C',
    "mean_train_accuracy",
    "std_train_accuracy",
    "mean_test_accuracy",
    "std_test_accuracy",
]
ranked_results = log_rs_df.sort_values("mean_test_accuracy", ascending=False)
ranked_results[summary_cols].head()

In [None]:
# Predict test-set labels with the estimator that the grid search refitted
# using the best parameters
y_pred = grid_search.best_estimator_.predict(X_test)

In [None]:
# Plot the confusion matrix of the logistic-regression predictions
draw_confusion_matrix(y_test=y_test, y_pred=y_pred)

In [None]:
# Per-class precision / recall / F1 on the test set, printed to 4 decimals
report_text = classification_report(y_test, y_pred, digits=4)
print(report_text)

# Autoencoder

In [None]:
# Timing run only: the autoencoder trained in this cell is a throw-away model
# used to measure the running time; the model used for the results is rebuilt
# in the following cells.

# Record the start time
start_time = time.time()

# Extract the normal data (Label == 0)
normal_data = df[df['Label'] == 0]

# Create a StandardScaler object for feature scaling
scaler = StandardScaler()


X_nor = normal_data.drop(columns = ['Label','Stime', 'Ltime','sport','dsport','attack_cat'],axis = 1)
y_nor = normal_data['Label']

# Split normal data into training, validation, and test sets
# NOTE(review): 10% is held out here, while the data-preparation cell below
# holds out 20% (test_size=0.2) — confirm which split the timing should use.
X_train, X_valid_test, y_train, y_valid_test = train_test_split(X_nor, y_nor, test_size=0.1,random_state=42)
X_valid_nor, X_test_nor, y_valid_nor, y_test_nor = train_test_split(X_valid_test, y_valid_test, test_size=0.5,random_state=42)

# Fit the scaler on the training split only, then scale that split
X_train = scaler.fit_transform(X_train)

# Scale the normal validation split with the same scaler and validate on it.
# The original cell passed X_valid to fit(), but X_valid is only created by a
# later cell, so a fresh-kernel Restart & Run All failed here with NameError.
X_valid_timing = scaler.transform(X_valid_nor)

# Dimensionality of the input data
input_dim = X_train.shape[1]

# Dimensionality of the encoded representation
encoding_dim = 7

# Define the input layer
input_layer = keras.Input(shape=(input_dim,))

# Define the encoding layer
encoded = Dense(encoding_dim, activation='relu')(input_layer)

# Define the decoding layer. The targets are standardised features (signed,
# unbounded), so the output must be linear; a softmax output is confined to
# (0, 1) with all units summing to 1 and cannot reconstruct such values.
decoded = Dense(input_dim, activation='linear')(encoded)

# Create the autoencoder model
autoencoder = keras.Model(input_layer, decoded)
autoencoder.summary()

# Set the batch size for training
batch_size = 1280

# Number of training epochs
epochs = 20

# Compile the autoencoder model with optimizer, loss function, and metrics
autoencoder.compile(optimizer='adam', loss='mean_squared_error',metrics=['mae','accuracy'])

# Train the autoencoder to reproduce its own input
history = autoencoder.fit(X_train, X_train,
                verbose = 1,
                epochs=epochs,
                batch_size=batch_size,
                shuffle=True,
                validation_data=(X_valid_timing, X_valid_timing))

end_time = time.time()

# Calculate the running time
duration = end_time - start_time

# Print the running time
print("Running time：", duration, "s")

In [None]:
# Build the training, validation and test sets for the autoencoder

non_feature_cols = ['Label', 'Stime', 'Ltime', 'sport', 'dsport', 'attack_cat']

# --- Normal records (Label == 0) ---
normal_data = df[df['Label'] == 0]
X_nor = normal_data.drop(columns=non_feature_cols)
y_nor = normal_data['Label']

# 80% of the normal records for training; the remaining 20% is halved into
# validation and test parts
X_train, X_valid_test, y_train, y_valid_test = train_test_split(
    X_nor, y_nor, test_size=0.2, random_state=42)
X_valid_nor, X_test_nor, y_valid_nor, y_test_nor = train_test_split(
    X_valid_test, y_valid_test, test_size=0.5, random_state=42)

# --- Anomalous records (Label == 1) ---
abnormal_data = df[df['Label'] == 1]
X_ab = abnormal_data.drop(columns=non_feature_cols)
y_ab = abnormal_data['Label']

# Anomalies are never trained on: half go to validation, half to test
X_valid_ab, X_test_ab, y_valid_ab, y_test_ab = train_test_split(
    X_ab, y_ab, test_size=0.5, random_state=42)

# Validation and test sets contain normal and anomalous records together
X_valid = pd.concat([X_valid_nor, X_valid_ab])
X_test = pd.concat([X_test_nor, X_test_ab])
y_valid = pd.concat([y_valid_nor, y_valid_ab])
y_test = pd.concat([y_test_nor, y_test_ab])

# Fit the scaler on the normal training records only, then apply the same
# scaling to the validation and test sets
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)



In [None]:
# Dimensionality of the input data
input_dim = X_train.shape[1]

# Dimensionality of the encoded (bottleneck) representation
encoding_dim = 7

# Define the input layer
input_layer = keras.Input(shape=(input_dim,))

# Define the encoding layer
encoded = Dense(encoding_dim, activation='relu')(input_layer)

# Define the decoding layer. The reconstruction targets are StandardScaler
# outputs (signed, unbounded values), so the output layer must be linear.
# The previous softmax output is confined to (0, 1) with all units summing
# to 1, which cannot reproduce standardised features under an MSE loss.
decoded = Dense(input_dim, activation='linear')(encoded)

# Create the autoencoder model
autoencoder = keras.Model(input_layer, decoded)
autoencoder.summary()

In [None]:
# Training hyper-parameters
batch_size = 1280
epochs = 20

# Mean squared reconstruction error is the loss; MAE and accuracy are tracked as metrics
autoencoder.compile(
    loss='mean_squared_error',
    optimizer='adam',
    metrics=['mae', 'accuracy'],
)

# The autoencoder is trained to reproduce its own input, so the inputs double
# as the targets for both the training and the validation data
history = autoencoder.fit(
    x=X_train,
    y=X_train,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    validation_data=(X_valid, X_valid),
    verbose=1,
)

In [None]:
# MAE recorded by Keras for each epoch of training
train_mae = history.history['mae']
valid_mae = history.history['val_mae']

# Derive the x-axis from the history itself instead of hard-coding 1..20, so
# the plot keeps working if the number of epochs changes, and avoid rebinding
# `epochs` (the integer epoch count used by the training cell)
epoch_axis = range(1, len(train_mae) + 1)

# Plot MAE versus epoch for both curves
fig, ax = plt.subplots()
ax.plot(epoch_axis, train_mae, 'b', label='Training MAE')
ax.plot(epoch_axis, valid_mae, 'r', label='Validation MAE')
ax.set_title('Training and Validation MAE')
ax.set_xlabel('Epochs')
ax.set_ylabel('MAE')
ax.legend()
plt.show()


In [None]:
# Candidate thresholds on the reconstruction error
threshold = np.linspace(0,20,40)

acc = []
rec = []

# The reconstruction and its error do not depend on the threshold, so compute
# them once instead of re-running the model for every candidate value
X_test_recon = autoencoder.predict(X_test)

# Per-sample reconstruction error (Euclidean norm over the feature axis)
y_dist = np.linalg.norm(X_test - X_test_recon, axis = -1)

# NOTE(review): thresholds are scored on the test set that is also used for
# the final report; consider sweeping on X_valid / y_valid instead.
for t in threshold:

    # Samples whose error reaches the threshold are flagged as anomalies (1)
    y_pred = np.where(y_dist>=t,1,0)

    # Store accuracy and recall for this threshold
    acc.append(accuracy_score(y_test,y_pred))
    rec.append(recall_score(y_test,y_pred))

In [None]:
# Plot accuracy and recall as functions of the threshold
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(threshold, acc, c='y', label='Acc')
ax.plot(threshold, rec, c='b', label='Recall')
ax.set_xlabel('threshold')
ax.set_ylabel('classification score')
ax.legend()
plt.show()

In [None]:
 # Find the index with the maximum value of the sum of 'rec' and 'acc'
i = np.argmax(np.array(rec)+np.array(acc))

# Get the threshold value corresponding to the index
t = threshold[i]

# Reconstruct the test set with the autoencoder
y_pred = autoencoder.predict(X_test)

# Per-sample reconstruction error (Euclidean distance between input and output);
# y_dist stays an array after this cell
y_dist = np.linalg.norm(X_test - y_pred, axis = -1)

# Label a sample 1 (anomaly) when its error reaches the threshold, else 0.
# The vectorised comparison replaces a Python loop that rebound `y_dist` to a
# scalar and filled a list named `error`, which clashed with the `error()`
# function defined for the SHAP analysis.
y_label = (y_dist >= t).astype(int).tolist()

In [None]:
# Per-class precision / recall / F1 of the thresholded autoencoder labels, to 4 decimals
ae_report_text = classification_report(y_test, y_label, digits=4)
print(ae_report_text)

# SHAP

In [None]:
def error(data):
    """Return the autoencoder's reconstruction error for each row of ``data``.

    Parameters
    ----------
    data : array-like
        Rows to reconstruct with the module-level ``autoencoder``.

    Returns
    -------
    The Euclidean norm, taken over the last axis, of the difference between
    ``data`` and its reconstruction.
    """
    reconstruction = autoencoder.predict(data)
    residual = data - reconstruction
    return np.linalg.norm(residual, axis=-1)

In [None]:
shap.initjs()

# Model inputs for the whole dataset (target and excluded columns removed)
feature_frame = df.drop(columns=['Label', 'Stime', 'Ltime', 'sport', 'dsport', 'attack_cat'])
feature_cols = feature_frame.columns

# Apply the scaler that was fitted on the normal training data. Calling
# fit_transform here would refit it on the full dataset, so the samples would
# be scaled differently from the data the autoencoder was trained on, and the
# fitted scaler would be overwritten.
X = pd.DataFrame(scaler.transform(feature_frame), columns=feature_cols, index=feature_frame.index)

# Select 100 data points to explain; the seed keeps the selection reproducible,
# because later cells refer to fixed positions within this sample
X_sample = X.sample(100, random_state=42)

# Give the scaled training data the same named columns
X_train = pd.DataFrame(X_train, columns=feature_cols)

# Background data for SHAP, drawn from the training set
data = shap.sample(X_train, 500)

# Explain the reconstruction-error function with a KernelExplainer
explainer = shap.KernelExplainer(error,data)

# Compute SHAP values for X_sample using the explainer
shap_values = explainer.shap_values(X_sample)

In [None]:
# Get the list of feature names
feature_names = X_train.columns.tolist()

# Reconstruct the sampled rows with the autoencoder
y_pred = autoencoder.predict(X_sample)

# Per-sample reconstruction error
y_dist = np.linalg.norm(X_sample - y_pred, axis = -1)

# Positions (within X_sample) flagged as anomalies. The selected threshold `t`
# is used directly with `>=`, matching the rule used for classification, and
# `threshold` is left untouched so it still holds the swept threshold array.
anomalies_detected = np.where(y_dist >= t)[0]

# Fail with a clear message instead of an IndexError when nothing is flagged
if anomalies_detected.size == 0:
    raise ValueError("No row of X_sample reaches the anomaly threshold; nothing to plot")

# Visualise the third detected anomaly, or the last one if fewer than three were found
anomaly_index = anomalies_detected[min(2, anomalies_detected.size - 1)]

# Generate the SHAP force plot for the selected anomaly
shap.force_plot(explainer.expected_value, shap_values[anomaly_index], X_sample.iloc[anomaly_index],feature_names)

In [None]:
# SHAP force plot for the sampled row at position 6
# NOTE(review): this row is presented as a normal example, but nothing here
# checks that — confirm it is below the threshold for the sample in use.
normal_index = 6
shap.force_plot(
    explainer.expected_value,
    shap_values[normal_index],
    X_sample.iloc[normal_index],
    feature_names,
)

In [None]:
# Dot summary of the SHAP values for the sampled rows, limited to 10 features
shap.summary_plot(
    shap_values,
    X_sample,
    plot_type="dot",
    max_display=10,
    feature_names=feature_names,
)

In [None]:
# Bar summary of the SHAP values for the sampled rows, limited to 10 features
shap.summary_plot(
    shap_values,
    X_sample,
    plot_type="bar",
    max_display=10,
    feature_names=feature_names,
)