In [58]:
from sklearn import svm
from timeit import default_timer as timer
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
import sklearn.feature_selection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [62]:
data = pd.read_csv('classification/data.csv', index_col=0)
data

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.121478,tcp,-,FIN,6,4,258,172,74.087490,252,...,1,1,0,0,0,1,1,0,Normal,0
2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,62,...,1,2,0,0,0,1,6,0,Normal,0
3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,62,...,1,3,0,0,0,2,6,0,Normal,0
4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,62,...,1,3,1,1,0,2,1,0,Normal,0
5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,254,...,1,40,0,0,0,2,39,0,Normal,0
6,0.380537,tcp,-,FIN,10,6,534,268,39.417980,254,...,1,40,0,0,0,2,39,0,Normal,0
7,0.637109,tcp,-,FIN,10,8,534,354,26.683033,254,...,1,40,0,0,0,1,39,0,Normal,0
8,0.521584,tcp,-,FIN,10,8,534,354,32.593026,254,...,1,40,0,0,0,3,39,0,Normal,0
9,0.542905,tcp,-,FIN,10,8,534,354,31.313031,254,...,1,40,0,0,0,3,39,0,Normal,0
10,0.258687,tcp,-,FIN,10,6,534,268,57.985135,254,...,1,40,0,0,0,3,39,0,Normal,0


In [63]:
# Convert categorical attributes to numerical
dataNum = data
dataNum.proto = pd.Categorical(data.proto).codes
dataNum.service = pd.Categorical(data.service).codes
dataNum.state = pd.Categorical(data.state).codes
dataNumLabels = dataNum.label
dataNum = dataNum.drop(['label','attack_cat'],axis=1)

In [87]:
# Normalazing original numerical attributes to range [0.0;1.0]
dataScaled = dataNum
for col in dataNum.select_dtypes(exclude='O').columns:
    if col not in ['proto','service','state']:
        scaler = MinMaxScaler()
        scaledValues = scaler.fit_transform(dataNum[col].values.reshape(-1,1))
        dataScaled[col] = scaledValues
dataScaled.shape

(175341, 42)

In [89]:
X = dataNum.values
XScaled = dataScaled.values
Y = dataNumLabels.values
assert X.shape==XScaled.shape

In [95]:
XReduced = PCA(n_components=20).fit_transform(X)
XScaledReduced = PCA(n_components=20).fit_transform(XScaled)
XScaled10 = PCA(n_components=10).fit_transform(XScaled)
XScaledReduced.shape

(175341, 20)

In [71]:
# SVM - KFold
def kfold_test(name,dataset,results):
    svmKF = KFold(n_splits=3, shuffle=True)
    for train_index, test_index in svmKF.split(dataset):
        X_train, X_test = dataset[train_index], dataset[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        svmClassificator = svm.SVC(gamma='scale',kernel='rbf',C=100000,max_iter=5000)
        start = timer()
        svmClassificator.fit(X_train, Y_train)
        end = timer()
        fitTime = end-start
        accuracy =  svmClassificator.score(X_test, Y_test)
        predicted = svmClassificator.predict(X_test)
        f1 = f1_score(Y_test, predicted)
        results.append({
            "Name": name,
            "Accuracy": accuracy,
            "F1-Score": f1,
            "Fit time [s]": fitTime
        })
        print("Done it")

results = []
# kfold_test("X 43 Components", X, results)
# kfold_test("Normalized X 43 Components", XScaled, results)
# kfold_test("X 20 Components", XReduced, results)
kfold_test("Normalized X 20 Components", XScaledReduced, results)
results = pd.DataFrame(results)
results

Done it
Done it
Done it


Unnamed: 0,Accuracy,F1-Score,Fit time [s],Name
0,0.601708,0.709814,30.680969,Normalized X 20 Components
1,0.554879,0.594539,31.009869,Normalized X 20 Components
2,0.369052,0.391217,33.799084,Normalized X 20 Components


In [69]:
# Decision trees - KFold
def decision_tree_kfold_test(name,dataset,results):
    svmKF = KFold(n_splits=3, shuffle=True)
    for train_index, test_index in svmKF.split(dataset):
        
        X_train, X_test = dataset[train_index], dataset[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        decTree = DecisionTreeClassifier(min_samples_leaf=10)
        start = timer()
        decTree.fit(X_train, Y_train)
        end = timer()
        fitCalcTime = end - start
        accuracy =  decTree.score(X_test, Y_test)
        predicted = decTree.predict(X_test)
        f1 = f1_score(Y_test, predicted)
        results.append({
            "Name": name,
            "Accuracy": accuracy,
            "F1-Score": f1,
            "Fit time [s]": fitCalcTime
        })
        # print(f"Done with iteration for {name}")
tree_results = []
decision_tree_kfold_test("X 20 Components", XReduced, tree_results)
decision_tree_kfold_test("Normalized X 20 Components", XScaledReduced, tree_results)
decision_tree_kfold_test("X 42 Components", X, tree_results)
decision_tree_kfold_test("Normalized X 42 Components", XScaled, tree_results)
tree_results = pd.DataFrame(tree_results)
tree_results

Unnamed: 0,Accuracy,F1-Score,Fit time [s],Name
0,0.938423,0.955057,1.956738,X 20 Components
1,0.937875,0.954402,1.847205,X 20 Components
2,0.938029,0.954476,1.992122,X 20 Components
3,0.93779,0.954382,1.892053,Normalized X 20 Components
4,0.939552,0.955787,1.896772,Normalized X 20 Components
5,0.936079,0.953144,1.877124,Normalized X 20 Components
6,0.946738,0.960843,1.24236,X 42 Components
7,0.945506,0.960085,1.25402,X 42 Components
8,0.947457,0.961569,1.238442,X 42 Components
9,0.945267,0.959908,1.336543,Normalized X 42 Components


In [91]:
def test_nn(trainData, trainGT, testData, testGT, layers,initial_learning_rate, activation_fn='tanh', iters=15000):
    nnModel = MLPClassifier(  hidden_layer_sizes=layers,
                            learning_rate_init=initial_learning_rate,
                            activation=activation_fn,
                            solver='adam',
                            verbose=False,
                            max_iter=iters)
    start = timer()
    trainedModel = nnModel.fit(trainData, trainGT)
    end = timer()
    time = end-start
    # probs = trainedModel.predict_proba(testData)
    score = trainedModel.score(testData, testGT)
    predictedLabels = trainedModel.predict(testData)
    f1 = f1_score(testGT, predictedLabels)
    return (score,f1,time)

In [97]:
def nn_kfold_test(name,dataset,results):
    svmKF = KFold(n_splits=3, shuffle=True)
    for train_index, test_index in svmKF.split(dataset):
        X_train, X_test = dataset[train_index], dataset[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        accuracy,f1,fitTime = test_nn(X_train, Y_train, X_test, Y_test, (8,5), 0.025,'tanh', 500)
        results.append({
            "Name": name,
            "Accuracy": accuracy,
            "F1-Score": f1,
            "Fit time [s]": fitTime
        })
        print(f"Done with iteration for {name}")

In [98]:
# Decision trees - KFold

nn_results = []
# nn_kfold_test("X 20 Components", XReduced, nn_results)
nn_kfold_test("Normalized X 20 Components", XScaledReduced, nn_results)
nn_kfold_test("Normalized X 10 Components", XScaled10, nn_results)
# nn_kfold_test("X 42 Components", X, nn_results)
# nn_kfold_test("Normalized X 42 Components", XScaled, nn_results)
nn_results = pd.DataFrame(nn_results)
nn_results


Done with iteration for Normalized X 20 Components
Done with iteration for Normalized X 20 Components
Done with iteration for Normalized X 20 Components
Done with iteration for Normalized X 10 Components
Done with iteration for Normalized X 10 Components
Done with iteration for Normalized X 10 Components


Unnamed: 0,Accuracy,F1-Score,Fit time [s],Name
0,0.933666,0.953439,12.055757,Normalized X 20 Components
1,0.93921,0.957035,19.196889,Normalized X 20 Components
2,0.937584,0.955703,22.410926,Normalized X 20 Components
3,0.935993,0.955137,20.895949,Normalized X 10 Components
4,0.935138,0.954011,41.33986,Normalized X 10 Components
5,0.936199,0.954483,27.384588,Normalized X 10 Components
