In [2]:
# Import libraries

from collections import Counter
import csv
import pandas as pd
import numpy as np
from sklearn import metrics, linear_model, tree, svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
import time

In [3]:
# Dataset path
DATASET_PATH = "C:\\Users\\Marek\\PycharmProjects\\DP\\venv\\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv"

# Load dataset
flow_data_headers = ["Destination Port", "Flow Duration", "Total Fwd Packets", "Total Backward Packets","Total Length of Fwd Packets", "Total Length of Bwd Packets", "Fwd Packet Length Max", "Fwd Packet Length Min", "Fwd Packet Length Mean", "Fwd Packet Length Std","Bwd Packet Length Max", "Bwd Packet Length Min", "Bwd Packet Length Mean", "Bwd Packet Length Std","Flow Bytes/s", "Flow Packets/s", "Flow IAT Mean", "Flow IAT Std", "Flow IAT Max", "Flow IAT Min","Fwd IAT Total", "Fwd IAT Mean", "Fwd IAT Std", "Fwd IAT Max", "Fwd IAT Min","Bwd IAT Total", "Bwd IAT Mean", "Bwd IAT Std", "Bwd IAT Max", "Bwd IAT Min","Fwd PSH Flags", "Bwd PSH Flags", "Fwd URG Flags", "Bwd URG Flags", "Fwd Header Length", "Bwd Header Length","Fwd Packets/s", "Bwd Packets/s", "Min Packet Length", "Max Packet Length", "Packet Length Mean", "Packet Length Std", "Packet Length Variance","FIN Flag Count", "SYN Flag Count", "RST Flag Count", "PSH Flag Count", "ACK Flag Count", "URG Flag Count", "CWE Flag Count", "ECE Flag Count", "Down/Up Ratio", "Average Packet Size", "Avg Fwd Segment Size", "Avg Bwd Segment Size", "Fwd Header Lengthtwo","Fwd Avg Bytes/Bulk", "Fwd Avg Packets/Bulk", "Fwd Avg Bulk Rate", "Bwd Avg Bytes/Bulk", "Bwd Avg Packets/Bulk","Bwd Avg Bulk Rate","Subflow Fwd Packets", "Subflow Fwd Bytes", "Subflow Bwd Packets", "Subflow Bwd Bytes","Init_Win_bytes_forward", "Init_Win_bytes_backward", "act_data_pkt_fwd", "min_seg_size_forward","Active Mean", "Active Std", "Active Max", "Active Min","Idle Mean", "Idle Std", "Idle Max", "Idle Min", "Label"]
flow_data = pd.read_csv(DATASET_PATH, names=flow_data_headers, skiprows=1).replace('Infinity',np.inf)
# Purge records with infinity or NaN values
flow_data.replace([np.inf, -np.inf], np.nan, inplace=True)
flow_data.dropna(inplace=True)

In [4]:
# Check to see labels contained in the dataset
print(set(flow_data["Label"]))

{'PortScan', 'BENIGN'}


In [5]:
# Basic data information
print ("Number of observations ::", len(flow_data.index))
print ("Number of columns ::", len(flow_data.columns))
print ("Headers ::", flow_data.columns.values)

Number of observations :: 286096
Number of columns :: 79
Headers :: ['Destination Port' 'Flow Duration' 'Total Fwd Packets'
 'Total Backward Packets' 'Total Length of Fwd Packets'
 'Total Length of Bwd Packets' 'Fwd Packet Length Max'
 'Fwd Packet Length Min' 'Fwd Packet Length Mean' 'Fwd Packet Length Std'
 'Bwd Packet Length Max' 'Bwd Packet Length Min' 'Bwd Packet Length Mean'
 'Bwd Packet Length Std' 'Flow Bytes/s' 'Flow Packets/s' 'Flow IAT Mean'
 'Flow IAT Std' 'Flow IAT Max' 'Flow IAT Min' 'Fwd IAT Total'
 'Fwd IAT Mean' 'Fwd IAT Std' 'Fwd IAT Max' 'Fwd IAT Min' 'Bwd IAT Total'
 'Bwd IAT Mean' 'Bwd IAT Std' 'Bwd IAT Max' 'Bwd IAT Min' 'Fwd PSH Flags'
 'Bwd PSH Flags' 'Fwd URG Flags' 'Bwd URG Flags' 'Fwd Header Length'
 'Bwd Header Length' 'Fwd Packets/s' 'Bwd Packets/s' 'Min Packet Length'
 'Max Packet Length' 'Packet Length Mean' 'Packet Length Std'
 'Packet Length Variance' 'FIN Flag Count' 'SYN Flag Count'
 'RST Flag Count' 'PSH Flag Count' 'ACK Flag Count' 'URG Flag Count'
 

In [6]:
print (flow_data['Label'].value_counts())

PortScan    158804
BENIGN      127292
Name: Label, dtype: int64


In [129]:
# Dataset path
DATASET_PATH_TEST = "C:\\Users\\Marek\\PycharmProjects\\DP\\venv\\dataset\\csv\\raw\\raw_sS.csv"

test_flow_data_headers = ["Timestamp", "FlowID", "In Port", "L4 Protocol","L4 Dest", "Total Fwd Packets", "Total Backward Packets", "Total Length of Fwd Packets", "Total Length of Bwd Packets", "Fwd IAT Mean","Fwd IAT Std", "Fwd IAT Max", "Fwd IAT Min", "Bwd IAT Mean","Bwd IAT Std", "Bwd IAT Max", "Bwd IAT Min", "Fwd PSH Flags", "Bwd PSH Flags", "Fwd URG Flags", "Bwd URG Flags", "Average Packet Size", "Avg Fwd Segment Size", "Avg Bwd Segment Size"]
test_flow_data = pd.read_csv(DATASET_PATH_TEST, names=test_flow_data_headers, skiprows=1)

In [130]:
print (test_flow_data['In Port'].value_counts())

1    200
3     51
2     51
Name: In Port, dtype: int64


In [131]:
def assign_label(row):
    if row['In Port'] == 1:
        return 'PortScan'
    else:
        return 'BENIGN'

test_flow_data['Label'] = test_flow_data.apply(lambda row: assign_label(row), axis=1) 

In [132]:
print (test_flow_data['Label'].value_counts())

PortScan    200
BENIGN      102
Name: Label, dtype: int64


In [133]:
test_flow_data.drop(test_flow_data.columns[range(5)], axis=1, inplace=True)
test_flow_data.to_csv('processed_sS.csv', index=False)

In [12]:
# Basic data information
print ("Number of observations ::", len(test_flow_data.index))
print ("Number of columns ::", len(test_flow_data.columns))
print ("Headers ::", test_flow_data.columns.values)

Number of observations :: 302
Number of columns :: 20
Headers :: ['Total Fwd Packets' 'Total Backward Packets'
 'Total Length of Fwd Packets' 'Total Length of Bwd Packets'
 'Fwd IAT Mean' 'Fwd IAT Std' 'Fwd IAT Max' 'Fwd IAT Min' 'Bwd IAT Mean'
 'Bwd IAT Std' 'Bwd IAT Max' 'Bwd IAT Min' 'Fwd PSH Flags' 'Bwd PSH Flags'
 'Fwd URG Flags' 'Bwd URG Flags' 'Average Packet Size'
 'Avg Fwd Segment Size' 'Avg Bwd Segment Size' 'Label']


In [83]:
basic_features = ['Total Fwd Packets', 'Total Backward Packets',
 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Label']

flow_data_basic = flow_data[basic_features]

In [84]:
# Basic data information
print ("Number of observations ::", len(flow_data_basic.index))
print ("Number of columns ::", len(flow_data_basic.columns))
print ("Headers ::", flow_data_basic.columns.values)

Number of observations :: 286096
Number of columns :: 5
Headers :: ['Total Fwd Packets' 'Total Backward Packets'
 'Total Length of Fwd Packets' 'Total Length of Bwd Packets' 'Label']


In [85]:
train_x, test_x, train_y, test_y = train_test_split(flow_data_selected[selected_features[:-1]],
                                                    flow_data_selected[selected_features[-1]], 
                                                    train_size=0.67, 
                                                    test_size=0.33, random_state=1)

In [106]:
rfc = RandomForestClassifier(n_estimators=10)

start1 = time.process_time()
rfc.fit(train_x, train_y)

print('Time to train RFC: ' + str(time.process_time() - start1) + 's')

trafficLabels = ['BENIGN', 'PortScan']

# Print out the results
print(metrics.confusion_matrix(test_y, rfc.predict(test_x), labels=trafficLabels))
print("Random forest Test Accuracy :: ", metrics.accuracy_score(test_y, rfc.predict(test_x)))
print("Random forest Test Sensitivity :: ", metrics.recall_score(test_y, rfc.predict(test_x), pos_label="PortScan"))
print("Random forest Test Specificity :: ", metrics.recall_score(test_y, rfc.predict(test_x), pos_label="BENIGN"))
print("Random forest Test Precision :: ", metrics.precision_score(test_y, rfc.predict(test_x), pos_label="PortScan"))
print("Random forest F1 Score :: ", metrics.f1_score(test_y, rfc.predict(test_x), average=None, labels=trafficLabels))


Time to train RFC: 1.296875s
[[41880    30]
 [    5 52497]]
Random forest Test Accuracy ::  0.9996292844129984
Random forest Test Sensitivity ::  0.9999047655327417
Random forest Test Specificity ::  0.9992841803865425
Random forest Test Precision ::  0.9994288651550631
Random forest F1 Score ::  [0.99958231 0.99966676]


In [105]:
# Train GradientBooster
gbc = GradientBoostingClassifier(n_estimators=10)

start4 = time.process_time()
gbc.fit(train_x, train_y)

trafficLabels = ['BENIGN', 'PortScan']

print('Time to train GBC: ' + str(time.process_time() - start4) + 's')

# Print out the results
print(metrics.confusion_matrix(test_y, gbc.predict(test_x), labels=trafficLabels))
print("GBC Test Accuracy :: ", metrics.accuracy_score(test_y, gbc.predict(test_x)))
print("GBC Test Sensitivity :: ", metrics.recall_score(test_y, gbc.predict(test_x), pos_label="PortScan"))
print("GBC Test Specificity :: ", metrics.recall_score(test_y, gbc.predict(test_x), pos_label="BENIGN"))
print("GBC Test Precision :: ", metrics.precision_score(test_y, gbc.predict(test_x), pos_label="PortScan"))
print("GBC Test F1 Score :: ", metrics.f1_score(test_y, gbc.predict(test_x), average=None, labels=trafficLabels))


Time to train GBC: 2.21875s
[[41856    54]
 [  210 52292]]
GBC Test Accuracy ::  0.9972037452866161
GBC Test Sensitivity ::  0.9960001523751476
GBC Test Specificity ::  0.9987115246957766
GBC Test Precision ::  0.9989684025522485
GBC F1 Score ::  [0.99685624 0.99748207]


In [107]:
# Train Decision tree classifier
dt = tree.DecisionTreeClassifier(criterion = "gini", splitter = 'random', max_leaf_nodes = 10, min_samples_leaf = 5, max_depth= 5)

start2 = time.process_time()
dt.fit(train_x, train_y)

print('Time to train DT: ' + str(time.process_time() - start2) + 's')

# Print out the results
print(metrics.confusion_matrix(test_y, dt.predict(test_x), labels=trafficLabels))
print("DT Test Accuracy :: ", metrics.accuracy_score(test_y, dt.predict(test_x)))
print("DT Test Sensitivity :: ", metrics.recall_score(test_y, dt.predict(test_x), pos_label="PortScan"))
print("DT Test Specificity :: ", metrics.recall_score(test_y, dt.predict(test_x), pos_label="BENIGN"))
print("DT Test Precision :: ", metrics.precision_score(test_y, dt.predict(test_x), pos_label="PortScan"))
print("DT Test F1 Score :: ", metrics.f1_score(test_y, dt.predict(test_x), average=None, labels=trafficLabels))


Time to train DT: 0.40625s
[[33304  8606]
 [   97 52405]]
DT Test Accuracy ::  0.9078189213235606
DT Test Sensitivity ::  0.9981524513351873
DT Test Specificity ::  0.7946552135528514
DT Test Precision ::  0.8589434692104703
DT Test F1 Score ::  [0.88443919 0.92333037]


In [117]:
selected_features = ['Total Fwd Packets', 'Total Backward Packets',
 'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Mean',
 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'PSH Flag Count',
 'Fwd URG Flags', 'Bwd URG Flags', 'URG Flag Count', 'Average Packet Size',
 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Label']

flow_data_selected = flow_data[selected_features]

In [118]:
# Basic data information
print ("Number of observations ::", len(flow_data_selected.index))
print ("Number of columns ::", len(flow_data_selected.columns))
print ("Headers ::", flow_data_selected.columns.values)

Number of observations :: 286096
Number of columns :: 22
Headers :: ['Total Fwd Packets' 'Total Backward Packets'
 'Total Length of Fwd Packets' 'Total Length of Bwd Packets'
 'Fwd IAT Mean' 'Fwd IAT Std' 'Fwd IAT Max' 'Fwd IAT Min' 'Bwd IAT Mean'
 'Bwd IAT Std' 'Bwd IAT Max' 'Bwd IAT Min' 'Fwd PSH Flags' 'Bwd PSH Flags'
 'PSH Flag Count' 'Fwd URG Flags' 'Bwd URG Flags' 'URG Flag Count'
 'Average Packet Size' 'Avg Fwd Segment Size' 'Avg Bwd Segment Size'
 'Label']


In [119]:
train_x, test_x, train_y, test_y = train_test_split(flow_data_selected[selected_features[:-1]],
                                                    flow_data_selected[selected_features[-1]], 
                                                    train_size=0.67, 
                                                    test_size=0.33, random_state=1)

In [120]:
rfc = RandomForestClassifier(n_estimators=10)

start1 = time.process_time()
rfc.fit(train_x, train_y)

print('Time to train RFC: ' + str(time.process_time() - start1) + 's')

trafficLabels = ['BENIGN', 'PortScan']

# Print out the results
print(metrics.confusion_matrix(test_y, rfc.predict(test_x), labels=trafficLabels))
print("Random forest Test Accuracy :: ", metrics.accuracy_score(test_y, rfc.predict(test_x)))
print("Random forest Test Sensitivity :: ", metrics.recall_score(test_y, rfc.predict(test_x), pos_label="PortScan"))
print("Random forest Test Specificity :: ", metrics.recall_score(test_y, rfc.predict(test_x), pos_label="BENIGN"))
print("Random forest Test Precision :: ", metrics.precision_score(test_y, rfc.predict(test_x), pos_label="PortScan"))
print("Random forest F1 Score :: ", metrics.f1_score(test_y, rfc.predict(test_x), average=None, labels=trafficLabels))


Time to train RFC: 1.296875s
[[41889    21]
 [    5 52497]]
Random forest Test Accuracy ::  0.9997246112782273
Random forest Test Sensitivity ::  0.9999047655327417
Random forest Test Specificity ::  0.9994989262705798
Random forest Test Precision ::  0.9996001370958528
Random forest F1 Score ::  [0.99968975 0.99975243]


In [121]:
# Train GradientBooster
gbc = GradientBoostingClassifier(n_estimators=10)

start4 = time.process_time()
gbc.fit(train_x, train_y)

trafficLabels = ['BENIGN', 'PortScan']

print('Time to train GBC: ' + str(time.process_time() - start4) + 's')

# Print out the results
print(metrics.confusion_matrix(test_y, gbc.predict(test_x), labels=trafficLabels))
print("GBC Test Accuracy :: ", metrics.accuracy_score(test_y, gbc.predict(test_x)))
print("GBC Test Sensitivity :: ", metrics.recall_score(test_y, gbc.predict(test_x), pos_label="PortScan"))
print("GBC Test Specificity :: ", metrics.recall_score(test_y, gbc.predict(test_x), pos_label="BENIGN"))
print("GBC Test Precision :: ", metrics.precision_score(test_y, gbc.predict(test_x), pos_label="PortScan"))
print("GBC Test F1 Score :: ", metrics.f1_score(test_y, gbc.predict(test_x), average=None, labels=trafficLabels))


Time to train GBC: 2.3125s
[[41848    62]
 [   61 52441]]
GBC Test Accuracy ::  0.9986971995085371
GBC Test Sensitivity ::  0.9988381394994477
GBC Test Specificity ::  0.9985206394655214
GBC Test Precision ::  0.9988191150981849
GBC Test F1 Score ::  [0.99853255 0.99882863]


In [122]:
# Train Decision tree classifier
dt = tree.DecisionTreeClassifier(criterion = "gini", splitter = 'random', max_leaf_nodes = 10, min_samples_leaf = 5, max_depth= 5)

start2 = time.process_time()
dt.fit(train_x, train_y)

print('Time to train DT: ' + str(time.process_time() - start2) + 's')

# Print out the results
print(metrics.confusion_matrix(test_y, dt.predict(test_x), labels=trafficLabels))
print("DT Test Accuracy :: ", metrics.accuracy_score(test_y, dt.predict(test_x)))
print("DT Test Sensitivity :: ", metrics.recall_score(test_y, dt.predict(test_x), pos_label="PortScan"))
print("DT Test Specificity :: ", metrics.recall_score(test_y, dt.predict(test_x), pos_label="BENIGN"))
print("DT Test Precision :: ", metrics.precision_score(test_y, dt.predict(test_x), pos_label="PortScan"))
print("DT Test F1 Score :: ", metrics.f1_score(test_y, dt.predict(test_x), average=None, labels=trafficLabels))


Time to train DT: 0.359375s
[[41648   262]
 [  358 52144]]
DT Test Accuracy ::  0.9934330381731136
DT Test Sensitivity ::  0.9931812121442992
DT Test Specificity ::  0.9937485087091387
DT Test Precision ::  0.9950005724535359
DT Test F1 Score ::  [0.99261166 0.99409006]


In [92]:
errors = pd.DataFrame(columns=['Total Fwd Packets', 'Total Backward Packets',
 'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Mean',
 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
 'Fwd URG Flags', 'Bwd URG Flags', 'Average Packet Size',
 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Label'])

output_row = 0

for row_index, (prediction, label) in enumerate(zip (rfc.predict(test_x), test_y)):
    if prediction != label:
        row = test_x.iloc[row_index].values
        row = np.append(row, test_y.iloc[row_index])
        errors.loc[output_row] = row
        output_row += 1
        
errors.to_csv('errors.csv', index=False)


In [123]:
importance = rfc.feature_importances_
# Summarize feature importance, according to Random forest
for i,v in sorted(enumerate(importance), key=lambda x:x[1], reverse=True):
	print('Feature: %s, Score: %.5f' % (selected_features[i],v))

Feature: Avg Fwd Segment Size, Score: 0.20519
Feature: Total Length of Fwd Packets, Score: 0.19516
Feature: Average Packet Size, Score: 0.17197
Feature: Total Fwd Packets, Score: 0.12493
Feature: Total Length of Bwd Packets, Score: 0.11594
Feature: PSH Flag Count, Score: 0.07763
Feature: Fwd IAT Max, Score: 0.06383
Feature: Avg Bwd Segment Size, Score: 0.01441
Feature: Fwd IAT Mean, Score: 0.00894
Feature: URG Flag Count, Score: 0.00759
Feature: Fwd IAT Min, Score: 0.00685
Feature: Bwd IAT Std, Score: 0.00274
Feature: Bwd IAT Max, Score: 0.00157
Feature: Total Backward Packets, Score: 0.00141
Feature: Fwd IAT Std, Score: 0.00089
Feature: Bwd IAT Min, Score: 0.00077
Feature: Bwd IAT Mean, Score: 0.00016
Feature: Fwd PSH Flags, Score: 0.00000
Feature: Bwd PSH Flags, Score: 0.00000
Feature: Fwd URG Flags, Score: 0.00000
Feature: Bwd URG Flags, Score: 0.00000


In [19]:
train_x, test_x, train_y, test_y = train_test_split(test_flow_data[selected_features[:-1]],
                                                    test_flow_data[selected_features[-1]], 
                                                    train_size=0.67, 
                                                    test_size=0.33, random_state=1)

In [20]:
rfc = RandomForestClassifier(n_estimators=10)

start1 = time.process_time()
rfc.fit(train_x, train_y)

print('Time to train RFC: ' + str(time.process_time() - start1) + 's')

Time to train RFC: 0.015625s


In [21]:
trafficLabels = ['BENIGN', 'PortScan']

# Print out the results
print(metrics.confusion_matrix(test_y, rfc.predict(test_x), labels=trafficLabels))
print("Random forest Test Accuracy :: ", metrics.accuracy_score(test_y, rfc.predict(test_x)))
print("Random forest F1 Score :: ", metrics.f1_score(test_y, rfc.predict(test_x), average=None, labels=trafficLabels))


[[29  0]
 [ 0 71]]
Random forest Test Accuracy ::  1.0
Random forest F1 Score ::  [1. 1.]


In [22]:
importance = rfc.feature_importances_
# Summarize feature importance, according to Random forest
for i,v in sorted(enumerate(importance), key=lambda x:x[1], reverse=True):
	print('Feature: %s, Score: %.5f' % (selected_features[i],v))

Feature: Total Length of Fwd Packets, Score: 0.42855
Feature: Fwd IAT Std, Score: 0.19744
Feature: Total Fwd Packets, Score: 0.10000
Feature: Avg Fwd Segment Size, Score: 0.10000
Feature: Average Packet Size, Score: 0.06914
Feature: Total Length of Bwd Packets, Score: 0.05047
Feature: Avg Bwd Segment Size, Score: 0.04195
Feature: Bwd IAT Max, Score: 0.01245
Feature: Total Backward Packets, Score: 0.00000
Feature: Fwd IAT Mean, Score: 0.00000
Feature: Fwd IAT Max, Score: 0.00000
Feature: Fwd IAT Min, Score: 0.00000
Feature: Bwd IAT Mean, Score: 0.00000
Feature: Bwd IAT Std, Score: 0.00000
Feature: Bwd IAT Min, Score: 0.00000
Feature: Fwd PSH Flags, Score: 0.00000
Feature: Bwd PSH Flags, Score: 0.00000
Feature: Fwd URG Flags, Score: 0.00000
Feature: Bwd URG Flags, Score: 0.00000


In [86]:
train_x = flow_data[selected_features[:-1]]
train_y = flow_data[selected_features[-1]]

test_x = test_flow_data[selected_features[:-1]]
test_y = test_flow_data[selected_features[-1]]

In [90]:
rfc = RandomForestClassifier(n_estimators=10)

start1 = time.process_time()
rfc.fit(train_x, train_y)

print('Time to train RFC: ' + str(time.process_time() - start1) + 's')

Time to train RFC: 1.546875s


In [26]:
importance = rfc.feature_importances_
# Summarize feature importance, according to Random forest
for i,v in sorted(enumerate(importance), key=lambda x:x[1], reverse=True):
	print('Feature: %s, Score: %.5f' % (selected_features[i],v))

Feature: Total Length of Fwd Packets, Score: 0.18770
Feature: Avg Fwd Segment Size, Score: 0.16970
Feature: Average Packet Size, Score: 0.15304
Feature: Total Fwd Packets, Score: 0.13227
Feature: Avg Bwd Segment Size, Score: 0.09311
Feature: Fwd IAT Mean, Score: 0.06955
Feature: Fwd IAT Max, Score: 0.06449
Feature: Fwd IAT Min, Score: 0.06307
Feature: Total Length of Bwd Packets, Score: 0.05415
Feature: Fwd IAT Std, Score: 0.00714
Feature: Total Backward Packets, Score: 0.00243
Feature: Bwd IAT Mean, Score: 0.00158
Feature: Bwd IAT Min, Score: 0.00083
Feature: Bwd IAT Max, Score: 0.00050
Feature: Bwd IAT Std, Score: 0.00042
Feature: Fwd PSH Flags, Score: 0.00002
Feature: Bwd PSH Flags, Score: 0.00000
Feature: Fwd URG Flags, Score: 0.00000
Feature: Bwd URG Flags, Score: 0.00000


In [91]:
trafficLabels = ['BENIGN', 'PortScan']

# Print out the results
print(metrics.confusion_matrix(test_y, rfc.predict(test_x), labels=trafficLabels))
print("Random forest Test Accuracy :: ", metrics.accuracy_score(test_y, rfc.predict(test_x)))
print("Random forest F1 Score :: ", metrics.f1_score(test_y, rfc.predict(test_x), average=None, labels=trafficLabels))
print(test_x['Total Length of Fwd Packets'][1])

[[102   0]
 [200   0]]
Random forest Test Accuracy ::  0.33774834437086093
Random forest F1 Score ::  [0.5049505 0.       ]
58


  'precision', 'predicted', average, warn_for)


In [80]:
standardsc = StandardScaler()
mmsc = MinMaxScaler()

rfc = RandomForestClassifier(n_estimators=100)

start1 = time.process_time()
rfc.fit(standardsc.fit_transform(train_x), train_y)

print('Time to train RFC: ' + str(time.process_time() - start1) + 's')

Time to train RFC: 12.34375s


In [82]:
trafficLabels = ['BENIGN', 'PortScan']

# Print out the results
print(metrics.confusion_matrix(test_y, rfc.predict(standardsc.fit_transform(test_x)), labels=trafficLabels))
print("Random forest Test Accuracy :: ", metrics.accuracy_score(test_y, rfc.predict(standardsc.fit_transform(test_x))))
print("Random forest F1 Score :: ", metrics.f1_score(test_y, rfc.predict(standardsc.fit_transform(test_x)), average=None, labels=trafficLabels))


[[102   0]
 [200   0]]
Random forest Test Accuracy ::  0.33774834437086093
Random forest F1 Score ::  [0.5049505 0.       ]


  'precision', 'predicted', average, warn_for)


In [81]:
importance = rfc.feature_importances_
# Summarize feature importance, according to Random forest
for i,v in sorted(enumerate(importance), key=lambda x:x[1], reverse=True):
	print('Feature: %s, Score: %.5f' % (selected_features[i],v))

Feature: Total Length of Fwd Packets, Score: 0.25640
Feature: Average Packet Size, Score: 0.16411
Feature: Avg Fwd Segment Size, Score: 0.14966
Feature: Fwd IAT Max, Score: 0.10712
Feature: Total Length of Bwd Packets, Score: 0.08540
Feature: Avg Bwd Segment Size, Score: 0.08359
Feature: Fwd IAT Min, Score: 0.06299
Feature: Total Fwd Packets, Score: 0.05847
Feature: Total Backward Packets, Score: 0.01763
Feature: Fwd IAT Std, Score: 0.00520
Feature: Fwd IAT Mean, Score: 0.00386
Feature: Bwd IAT Min, Score: 0.00172
Feature: Bwd IAT Max, Score: 0.00158
Feature: Bwd IAT Std, Score: 0.00126
Feature: Bwd IAT Mean, Score: 0.00096
Feature: Fwd PSH Flags, Score: 0.00004
Feature: Bwd PSH Flags, Score: 0.00000
Feature: Fwd URG Flags, Score: 0.00000
Feature: Bwd URG Flags, Score: 0.00000
