In [None]:
## Import packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.metrics import ConfusionMatrixDisplay, f1_score, precision_score, recall_score, accuracy_score
from xgboost import XGBClassifier
from matplotlib import pyplot


# Disable pandas' display truncation: show every column and every row
# whenever a DataFrame is rendered in this notebook.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Data Engineering Pipeline - Extract, Transform, Load (ETL) 
3. Data integration (extract) 


In [None]:
def extract():
    """Read the twelve raw source files and stack them into one DataFrame.

    The sources are read from the current working directory in three
    groups (CSV, line-delimited JSON, Parquet) and appended in that order.

    Returns:
        pandas.DataFrame: all rows from every source, with a fresh
        0..n-1 index (``ignore_index=True``).
    """
    # CSV sources form the base of the combined frame.
    csv_frames = [
        pd.read_csv(name)
        for name in ("ids_0.csv", "ids_1.csv", "ids_2.csv")
    ]
    combined = pd.concat(csv_frames, ignore_index=True)

    # JSON sources are line-delimited (one record per line).
    json_frames = [
        pd.read_json(name, lines=True)
        for name in ("ids_3.json", "ids_4.json", "ids_7.json", "ids_9.json", "ids_10.json")
    ]
    combined = pd.concat([combined, *json_frames], ignore_index=True)

    # Parquet sources are appended last.
    parquet_frames = [
        pd.read_parquet(name)
        for name in ("ids_5.parquet", "ids_6.parquet", "ids_8.parquet", "ids_11.parquet")
    ]
    combined = pd.concat([combined, *parquet_frames], ignore_index=True)

    return combined

# Run the extract step. 'data' now holds the combined, still-raw records
# from all twelve source files (3 CSV, 5 JSON, 4 Parquet).
data = extract()

4. Data Transformation (Transform) 


In [None]:
# No data type conversion is required at this stage.
# Discard every row containing at least one missing value (138 rows in the
# original run). The shape is printed before and after to show the effect.
print(data.shape)
data = data.dropna(axis=0, how='any')
print(data.shape)


5. Data Storage (Load) 

In [None]:
# Persist the combined data to 'dataSet.csv'; the row index is not written.
data.to_csv(path_or_buf='dataSet.csv', index=False)

6. Reading Data 

In [None]:
# Read the stored CSV back into the 'data' DataFrame.
data = pd.read_csv(filepath_or_buffer='dataSet.csv')


7. Exploratory Data Analysis 

In [None]:
# Report the dimensions of the dataset (rows, columns).
print("Shape of dataset: ",data.shape)
# Count missing cells across the whole frame. Rows with missing values were
# already dropped in step 4.
print("Missing data: ",data.isnull().sum().sum())

In [None]:
# Per-column summary statistics of the data as loaded.
# The values are NOT standardized yet, so scales differ between columns.
data.describe()

In [None]:
# Univariate analysis: look at how often each destination port appears in a
# random sample of 30 rows.
# Changing the sample size can alter the resulting graph considerably.
# random_state pins the sample so the same rows are drawn on every run.
# Note the leading space in the column name: headers have not been cleaned yet.
sample = data.loc[:, ' Destination Port'].sample(n=30, random_state=22)
sample

In [None]:
# Histogram of the sampled destination ports.
sample.plot.hist(xlabel='Port Number')
# Observation from the original run: this sample is mostly port 80.

### Machine Learning Pipeline 
8. Data Preprocessing 


In [None]:
# Normalise the feature names so they are easier to reference later:
#   1. strip leading/trailing whitespace,
#   2. replace inner spaces with underscores,
#   3. remove the literal text ".1" (presumably the suffix attached to a
#      repeated column header -- the next step drops the resulting duplicate).
# regex=False is passed explicitly: the default of this flag differs between
# pandas versions, and under regex matching "." matches ANY character, so
# ".1" would also remove an unrelated character that happens to precede a "1".
print("Original: ",data.columns[2])
data.columns = data.columns.str.strip()
data.columns = data.columns.str.replace(" " , "_", regex=False)
data.columns = data.columns.str.replace(".1" , "", regex=False)
print("Modified: ",data.columns[2])


In [None]:
print(data.shape)

# Remove rows that are exact duplicates of an earlier row
# (15,434 rows in the original run).
data = data.drop_duplicates(keep='first')

# Keep only the first column for each repeated column name
# (1 column removed in the original run).
data = data.loc[:, ~data.columns.duplicated(keep='first')].copy()

# Exclude rows labelled 'Heartbleed' (11 rows in the original run).
data = data.loc[data['Label'] != 'Heartbleed']

print(data.shape)

In [None]:
# Separate the target from the features.
# 'attack' becomes the y side of the train/test split. .copy() gives it its
# own data, so the label encoding performed later assigns into an independent
# frame instead of a slice of 'data' (which raises SettingWithCopyWarning).
attack = data[['Label']].copy()
print("Attack shape: ",attack.shape)

# Remove 'Label' from the feature frame. A new frame is bound to 'data'
# rather than mutating in place, so no other reference to the previous frame
# is silently changed.
data = data.drop(columns='Label')
print("Data shape: ",data.shape)

In [None]:
print(data.shape)

# Keep only the features (columns) whose variance is at least 0.05
# (3 columns removed in the original run). Near-constant features carry
# little information for the training model.
data = data.loc[:, data.var(axis=0).ge(0.05)]

print(data.shape)

In [None]:
# Encoding data.
print("Original")
print(attack.sample(5, random_state=2))

# Results will be binary. 0 = Benign connection. 1 = DoS attack.
label_codes = {
    'BENIGN': 0,
    'DoS Hulk': 1,
    'DoS GoldenEye': 1,
    'DoS Slowhttptest': 1,
}
encoded = attack['Label']
for pattern, code in label_codes.items():
    # Same pattern-based replacement as before, applied one label at a time.
    encoded = encoded.replace(pattern, code, regex=True)

# Fail early if any label was not covered by the mapping: a column mixing
# strings and integers cannot be used as a classification target.
unmapped = encoded[~encoded.isin([0, 1])].unique()
if len(unmapped) > 0:
    raise ValueError("Unmapped labels left after encoding: {}".format(list(unmapped)))

# Cast explicitly to an integer dtype (replace() may leave the column as
# 'object', depending on the pandas version) and build a new frame instead
# of assigning into a possible slice.
attack = attack.assign(Label=encoded.astype('int64'))

print("\nModified")
print(attack.sample(5, random_state=2))

In [None]:
# 7-2. MORE EXPLORATORY ANALYSIS
# Bivariate analysis: correlations.

# Pairwise Pearson correlation between all features, rendered as a table
# whose cell background is coloured on a fixed -1 .. +1 scale. It shows how
# strongly each pair of features is linearly related.
data.corr(method='pearson').style.background_gradient(cmap='RdBu', vmin=-1, vmax=1)

In [None]:
# Correlation visualization: a highly correlated pair. The two features
# increase together.
data.plot(kind='scatter', x='Flow_Duration', y='Fwd_IAT_Total', alpha=0.6, color='purple')

In [None]:
# Correlation visualization: a weakly correlated pair. The two features
# increase independently of each other.
data.plot(kind='scatter', x='Flow_Duration', y='Init_Win_bytes_forward', alpha=0.6, color='purple')

In [None]:
# Splitting the data.

# Convert the feature frame and the label frame to numpy arrays.
data, attack = data.to_numpy(), attack.to_numpy()

# 75% of the rows go to training and 25% to testing; random_state fixes the
# shuffle so the split is reproducible.
# X holds the features, y holds the labels.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    attack,
    test_size=0.25,
    random_state=6969,
)

# 'attack' had a single column, so the y arrays come back as (n, 1).
# Flatten them to 1-D; passing column vectors to the estimators triggers a warning.
y_train, y_test = (labels.flatten() for labels in (y_train, y_test))

9. Feature Engineering 

In [None]:
# Standardize the data.
# The scaler learns its mean / standard deviation from the TRAINING set only.
# The test set is then transformed with those same parameters: calling fit()
# on the test set would both leak test statistics into preprocessing and
# return the scaler object itself instead of the scaled array.
scaler = StandardScaler()
X_train_standard = scaler.fit_transform(X_train)
X_test_standard = scaler.transform(X_test)

In [None]:
# Sanity-check the standardization: per column, the mean should be close to 0
# and the standard deviation (std) close to 1.
print(pd.DataFrame(data=X_train_standard).describe())

In [None]:
# Feature selection: an XGBoost classifier is fitted only to rank the features
# by importance.
# Inputs are the standardized training features (60 columns in the original
# run) and the training labels (1 column). fit() returns the estimator itself.
xgbC = XGBClassifier().fit(X_train_standard, y_train)
xgbC.feature_importances_

In [None]:
# Bar chart of the importance score of every feature (x = feature position,
# height = importance), to see at a glance which features matter.
pyplot.bar(np.arange(xgbC.feature_importances_.size), height=xgbC.feature_importances_)
pyplot.show()

In [None]:
# Only the features that have an importance higher than .01 are kept.

# SelectFromModel reuses the already-fitted xgbC (prefit=True) to decide which
# columns to keep.
selection = SelectFromModel(xgbC,threshold=.01, prefit=True)

# Select from the STANDARDIZED training data -- the same representation xgbC
# was fitted on -- so the downstream models are trained on scaled features.
# (Original run: 60 features reduced to 5.)
X_train_selected = selection.transform(X_train_standard)
print("Xtrain_selected shape: ",X_train_selected.shape)

# Apply the same scaling + selection to the test set. The scaler is applied
# here directly so the test features are guaranteed to be a scaled array.
X_test_selected = selection.transform(scaler.transform(X_test))
print("Xtest_selected shape: ",X_test_selected.shape)

# In goes standardized data, out comes the selected subset of columns.
# The '*_selected' arrays are the data used to train and evaluate the models.

10. Processed Data Loading

In [None]:
# Persist the selected training and test features to their own CSV files
# (same idea as step 5). They are not read back: the in-memory arrays are
# used directly by the following cells.
pd.DataFrame(data=X_train_selected).to_csv(path_or_buf='CleanTrain.csv', index=False)
pd.DataFrame(data=X_test_selected).to_csv(path_or_buf='CleanTest.csv', index=False)

11. Model Selection and Training

In [None]:
# Before, XGBClassifier was used only to rank features and discard the
# unimportant ones. Now it is used as a model to train and predict.

# XGBC MODEL (Our Best Performing Model)

# Training the model.
# A fresh classifier is created instead of refitting the instance used for
# feature selection: 'selection' was built with prefit=True and keeps using
# that fitted estimator, so refitting it on the reduced feature set would
# leave 'selection' with importances for the wrong number of columns (and
# re-running the selection cell would then fail).
xgbC = XGBClassifier()
xgbC.fit(X_train_selected , y_train)
# Make a prediction on the data we trained on.
X_train_selected_pred = xgbC.predict(X_train_selected)
# Make a prediction on the test data (not seen during training).
X_test_selected_pred = xgbC.predict(X_test_selected)

# Scores from the first prediction. (Train data)
train_acc_perc = accuracy_score(y_train, X_train_selected_pred)
train_f1score_perc = f1_score(y_train, X_train_selected_pred)
train_precision_perc = precision_score(y_train, X_train_selected_pred)
train_recall_perc = recall_score(y_train, X_train_selected_pred)
print('Train XGBC Model')
print('Accuracy: {:.3f}'.format(train_acc_perc))
print('Precision: {:.3f}'.format(train_precision_perc))
print('Recall: {:.3f}'.format(train_recall_perc))
print('F1-score: {:.3f}'.format(train_f1score_perc))

# Scores from the second prediction. (Test data)
test_acc_perc = accuracy_score(y_test, X_test_selected_pred)
test_f1score_perc = f1_score(y_test, X_test_selected_pred)
test_precision_perc = precision_score(y_test, X_test_selected_pred)
test_recall_perc = recall_score(y_test, X_test_selected_pred)
print('\nTest XGBC Model')
print('Accuracy: {:.3f}'.format(test_acc_perc))
print('Precision: {:.3f}'.format(test_precision_perc))
print('Recall: {:.3f}'.format(test_recall_perc))
print('F1-score: {:.3f}'.format(test_f1score_perc))

# The score functions are imported from sklearn.metrics.

In [None]:
# The confusion matrix summarises the predictions at a glance.
# Rows are the actual class, columns the predicted class:
#   [Actual No  / Predicted No ]  [Actual No  / Predicted Yes]
#   [Actual Yes / Predicted No ]  [Actual Yes / Predicted Yes]
# i.e. the main diagonal holds correct predictions, the other two cells errors.

# Train and test rows are stacked together, so the four counts add up to the
# whole dataset as it was before splitting.
ConfusionMatrixDisplay.from_estimator(
    estimator=xgbC,
    X=np.vstack((X_train_selected, X_test_selected)),
    y=np.hstack((y_train, y_test)),
)

In [None]:
#PERCEPTRON MODEL

# Build and train the perceptron (fit() returns the estimator itself).
perceptron = Perceptron(eta0=0.001, random_state=1).fit(X_train_selected, y_train)

# Predict on the data the model was trained on ...
X_train_selected_pred = perceptron.predict(X_train_selected)
# ... and on the held-out test data.
X_test_selected_pred = perceptron.predict(X_test_selected)

# Scores for the training-set predictions.
train_acc_perc, train_f1score_perc, train_precision_perc, train_recall_perc = (
    metric(y_train, X_train_selected_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)
print('\n'.join((
    'Train Perceptron Model',
    'Accuracy: {:.3f}'.format(train_acc_perc),
    'Precision: {:.3f}'.format(train_precision_perc),
    'Recall: {:.3f}'.format(train_recall_perc),
    'F1-score: {:.3f}'.format(train_f1score_perc),
)))

# Scores for the test-set predictions.
test_acc_perc, test_f1score_perc, test_precision_perc, test_recall_perc = (
    metric(y_test, X_test_selected_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)
print('\n'.join((
    '\nTest Perceptron Model',
    'Accuracy: {:.3f}'.format(test_acc_perc),
    'Precision: {:.3f}'.format(test_precision_perc),
    'Recall: {:.3f}'.format(test_recall_perc),
    'F1-score: {:.3f}'.format(test_f1score_perc),
)))

In [None]:
# The confusion matrix summarises the predictions at a glance.
# Rows are the actual class, columns the predicted class:
#   [Actual No  / Predicted No ]  [Actual No  / Predicted Yes]
#   [Actual Yes / Predicted No ]  [Actual Yes / Predicted Yes]
# i.e. the main diagonal holds correct predictions, the other two cells errors.

# Train and test rows are stacked together, so the four counts add up to the
# whole dataset as it was before splitting.
ConfusionMatrixDisplay.from_estimator(
    estimator=perceptron,
    X=np.vstack((X_train_selected, X_test_selected)),
    y=np.hstack((y_train, y_test)),
)

In [None]:
#ADABOOST MODEL

# Build and train AdaBoost with a Perceptron as its base estimator. The base
# estimator is a new Perceptron configured with the same eta0 / random_state
# as the standalone perceptron model.
adaboost_perc = AdaBoostClassifier(
    estimator=Perceptron(eta0=0.001, random_state=1),
    n_estimators=100,
    learning_rate=0.001,
    random_state=1,
    algorithm='SAMME',
)
adaboost_perc.fit(X_train_selected, y_train)

# Predict on the data the model was trained on ...
X_train_selected_pred = adaboost_perc.predict(X_train_selected)
# ... and on the held-out test data.
X_test_selected_pred = adaboost_perc.predict(X_test_selected)

# Scores for the training-set predictions.
train_acc_perc, train_f1score_perc, train_precision_perc, train_recall_perc = (
    metric(y_train, X_train_selected_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)
print('\n'.join((
    'Train ADABoost Model',
    'Accuracy: {:.3f}'.format(train_acc_perc),
    'Precision: {:.3f}'.format(train_precision_perc),
    'Recall: {:.3f}'.format(train_recall_perc),
    'F1-score: {:.3f}'.format(train_f1score_perc),
)))

# Scores for the test-set predictions.
test_acc_perc, test_f1score_perc, test_precision_perc, test_recall_perc = (
    metric(y_test, X_test_selected_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)
print('\n'.join((
    '\nTest ADABoost Model',
    'Accuracy: {:.3f}'.format(test_acc_perc),
    'Precision: {:.3f}'.format(test_precision_perc),
    'Recall: {:.3f}'.format(test_recall_perc),
    'F1-score: {:.3f}'.format(test_f1score_perc),
)))

In [None]:
# The confusion matrix summarises the predictions at a glance.
# Rows are the actual class, columns the predicted class:
#   [Actual No  / Predicted No ]  [Actual No  / Predicted Yes]
#   [Actual Yes / Predicted No ]  [Actual Yes / Predicted Yes]
# i.e. the main diagonal holds correct predictions, the other two cells errors.

# Train and test rows are stacked together, so the four counts add up to the
# whole dataset as it was before splitting.
ConfusionMatrixDisplay.from_estimator(
    estimator=adaboost_perc,
    X=np.vstack((X_train_selected, X_test_selected)),
    y=np.hstack((y_train, y_test)),
)

In [None]:
#LOGISTIC REGRESSION MODEL

# Build and train a logistic regression with default settings
# (fit() returns the estimator itself).
lr = LogisticRegression().fit(X_train_selected, y_train)

# Predict on the data the model was trained on ...
X_train_selected_pred = lr.predict(X_train_selected)
# ... and on the held-out test data.
X_test_selected_pred = lr.predict(X_test_selected)

# Scores for the training-set predictions.
train_acc_perc, train_f1score_perc, train_precision_perc, train_recall_perc = (
    metric(y_train, X_train_selected_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)
print('\n'.join((
    'Train Logistic Regression Model',
    'Accuracy: {:.3f}'.format(train_acc_perc),
    'Precision: {:.3f}'.format(train_precision_perc),
    'Recall: {:.3f}'.format(train_recall_perc),
    'F1-score: {:.3f}'.format(train_f1score_perc),
)))

# Scores for the test-set predictions.
test_acc_perc, test_f1score_perc, test_precision_perc, test_recall_perc = (
    metric(y_test, X_test_selected_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)
print('\n'.join((
    '\nTest Logistic Regression Model',
    'Accuracy: {:.3f}'.format(test_acc_perc),
    'Precision: {:.3f}'.format(test_precision_perc),
    'Recall: {:.3f}'.format(test_recall_perc),
    'F1-score: {:.3f}'.format(test_f1score_perc),
)))


In [None]:
# The confusion matrix summarises the predictions at a glance.
# Rows are the actual class, columns the predicted class:
#   [Actual No  / Predicted No ]  [Actual No  / Predicted Yes]
#   [Actual Yes / Predicted No ]  [Actual Yes / Predicted Yes]
# i.e. the main diagonal holds correct predictions, the other two cells errors.

# Train and test rows are stacked together, so the four counts add up to the
# whole dataset as it was before splitting.
ConfusionMatrixDisplay.from_estimator(
    estimator=lr,
    X=np.vstack((X_train_selected, X_test_selected)),
    y=np.hstack((y_train, y_test)),
)

In [None]:
#MAJORITYVOTING MODEL

# Build and train a hard-voting ensemble over the perceptron, AdaBoost and
# logistic-regression estimators. Those three objects must already exist,
# i.e. their cells must have been run first.
hard_majorityvote = VotingClassifier(
    estimators=[
        ('perceptron', perceptron),
        ('adaboost_perc', adaboost_perc),
        ('logistic regression', lr),
    ],
    voting='hard',
)
hard_majorityvote.fit(X_train_selected, y_train)

# Predict on the data the model was trained on ...
X_train_selected_pred = hard_majorityvote.predict(X_train_selected)
# ... and on the held-out test data.
X_test_selected_pred = hard_majorityvote.predict(X_test_selected)

# Scores for the training-set predictions.
train_acc_perc, train_f1score_perc, train_precision_perc, train_recall_perc = (
    metric(y_train, X_train_selected_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)
print('\n'.join((
    'Train Mayority Voting Model',
    'Accuracy: {:.3f}'.format(train_acc_perc),
    'Precision: {:.3f}'.format(train_precision_perc),
    'Recall: {:.3f}'.format(train_recall_perc),
    'F1-score: {:.3f}'.format(train_f1score_perc),
)))

# Scores for the test-set predictions.
test_acc_perc, test_f1score_perc, test_precision_perc, test_recall_perc = (
    metric(y_test, X_test_selected_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)
print('\n'.join((
    '\nTest Mayority Voting Model',
    'Accuracy: {:.3f}'.format(test_acc_perc),
    'Precision: {:.3f}'.format(test_precision_perc),
    'Recall: {:.3f}'.format(test_recall_perc),
    'F1-score: {:.3f}'.format(test_f1score_perc),
)))

In [None]:
# The confusion matrix summarises the predictions at a glance.
# Rows are the actual class, columns the predicted class:
#   [Actual No  / Predicted No ]  [Actual No  / Predicted Yes]
#   [Actual Yes / Predicted No ]  [Actual Yes / Predicted Yes]
# i.e. the main diagonal holds correct predictions, the other two cells errors.

# Train and test rows are stacked together, so the four counts add up to the
# whole dataset as it was before splitting.
ConfusionMatrixDisplay.from_estimator(
    estimator=hard_majorityvote,
    X=np.vstack((X_train_selected, X_test_selected)),
    y=np.hstack((y_train, y_test)),
)