In [1]:
from sklearn import svm
from timeit import default_timer as timer
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
import sklearn.feature_selection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('classification/data.csv', index_col=0)
df

Unnamed: 0_level_0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.121478,tcp,-,FIN,6,4,258,172,74.087490,252,...,1,1,0,0,0,1,1,0,Normal,0
2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,62,...,1,2,0,0,0,1,6,0,Normal,0
3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,62,...,1,3,0,0,0,2,6,0,Normal,0
4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,62,...,1,3,1,1,0,2,1,0,Normal,0
5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,254,...,1,40,0,0,0,2,39,0,Normal,0
6,0.380537,tcp,-,FIN,10,6,534,268,39.417980,254,...,1,40,0,0,0,2,39,0,Normal,0
7,0.637109,tcp,-,FIN,10,8,534,354,26.683033,254,...,1,40,0,0,0,1,39,0,Normal,0
8,0.521584,tcp,-,FIN,10,8,534,354,32.593026,254,...,1,40,0,0,0,3,39,0,Normal,0
9,0.542905,tcp,-,FIN,10,8,534,354,31.313031,254,...,1,40,0,0,0,3,39,0,Normal,0
10,0.258687,tcp,-,FIN,10,6,534,268,57.985135,254,...,1,40,0,0,0,3,39,0,Normal,0


In [3]:
# Get GT labels
dfLabels = df.label.values

In [4]:
# Drop GT columns from dataset.
dfNum = df.drop(['label','attack_cat'],axis=1)
# Convert categorical attributes to numerical
dfNum.proto = pd.Categorical(df.proto).codes
dfNum.service = pd.Categorical(df.service).codes
dfNum.state = pd.Categorical(df.state).codes

In [5]:
# Normalize only numerical data
dfNumNorm = dfNum
scaler = MinMaxScaler()
for col in dfNumNorm.select_dtypes(exclude='O').columns:
    # Skip originally categorial columns
    if col not in ['proto','service','state']:
        scaledValues = scaler.fit_transform(dfNum[col].values.reshape(-1,1))
        dfNumNorm[col] = scaledValues
dfNumNorm.shape

(175341, 42)

In [6]:
dfNumNormAll = dfNumNorm
scaler = MinMaxScaler()
# Normaliza also categorial data
for col in ['proto','service','state']:
    scaledValues = scaler.fit_transform(dfNum[col].values.reshape(-1,1))
    dfNumNormAll[col] = scaledValues
dfNumNormAll.shape

(175341, 42)

In [7]:
# Create normalized numerical dataset without categorial columns
dfNumOnly = dfNumNorm.drop(['proto','service','state'],axis=1)

In [8]:
dfNumOnly10Feat= PCA(n_components=10).fit_transform(dfNumOnly)
dfNumOnly20Feat= PCA(n_components=20).fit_transform(dfNumOnly)

In [9]:
# Dataset recapitulation
#dfNum numerical data without GT
print(f"dfNum {dfNum.shape} numerical data without labels")
print(f"dfNumNorm {dfNumNorm.shape} normalized numerical data without labels, converted categorial columns are not normalized")
print(f"dfNumNormAll {dfNumNormAll.shape} numerical data without labels, converted categorial columns are normalized")
print(f"dfNumOnly {dfNumOnly.shape} only numerical data withou categorial columns")
print(f"dfNumOnly10Feat {dfNumOnly10Feat.shape} only numerical data withou categorial columns, reduced to 10 features")
print(f"dfNumOnly20Feat {dfNumOnly20Feat.shape} only numerical data withou categorial columns, reduced to 20 features")

dfNum (175341, 42) numerical data without labels
dfNumNorm (175341, 42) normalized numerical data without labels, converted categorial columns are not normalized
dfNumNormAll (175341, 42) numerical data without labels, converted categorial columns are normalized
dfNumOnly (175341, 39) only numerical data withou categorial columns
dfNumOnly10Feat (175341, 10) only numerical data withou categorial columns, reduced to 10 features
dfNumOnly20Feat (175341, 20) only numerical data withou categorial columns, reduced to 20 features


In [10]:
def k_fold_test(classifierName,datasetName,dataset, labels, classifier, k=3, shuffleDataset=True):
    kFold = KFold(n_splits=k, shuffle=shuffleDataset)
    kFoldResult = []
    kFoldIndex = 0
    for train_index, test_index in kFold.split(dataset):
        X_train, X_test = dataset[train_index], dataset[test_index]
        Y_train, Y_test = labels[train_index], labels[test_index]
        start = timer()
        classifier.fit(X_train, Y_train)
        fitTime = timer() - start
        
        start = timer()
        predictedLabels = classifier.predict(X_test)
        predictionTime = timer() - start

        predictionProbs = classifier.predict_proba(X_test)
        rocAucScore = roc_auc_score(Y_test, predictionProbs[:,1])
        
        score = float(sum([p[0] == p[1] for p in zip(predictedLabels, Y_test)])) / float(len(Y_test))
        f1Score = f1_score(Y_test, predictedLabels)

        kFoldResult.append({
            "Dataset": datasetName,
            "Classifier": classifierName,
            "KFoldIndex": kFoldIndex,
            "FitTime": fitTime,
            "PredictionTime": predictionTime,
            "Precision": score,
            "F1Score": f1Score,
            "RocAucScore": rocAucScore
          })
        kFoldIndex += 1
    print(f"Finished classification for {classifierName}-{datasetName}")
    return kFoldResult

In [36]:
# Decision tree test
decisionTreeClassification = []
decisionTreeClassifier = DecisionTreeClassifier(min_samples_leaf=100)
decisionTreeClassification += k_fold_test("DecTree MSF=100","dfNum",dataset=dfNum.values, labels=dfLabels, classifier=decisionTreeClassifier,k=3)
decisionTreeClassification += k_fold_test("DecTree MSF=100","dfNumNorm",dataset=dfNumNorm.values, labels=dfLabels, classifier=decisionTreeClassifier,k=3)
decisionTreeClassification += k_fold_test("DecTree MSF=100","dfNumNormAll",dataset=dfNumNormAll.values, labels=dfLabels, classifier=decisionTreeClassifier,k=3)
decisionTreeClassification += k_fold_test("DecTree MSF=100","dfNumOnly",dataset=dfNumOnly.values, labels=dfLabels, classifier=decisionTreeClassifier,k=3)
decisionTreeClassification += k_fold_test("DecTree MSF=100","dfNumOnly10Feat",dataset=dfNumOnly10Feat, labels=dfLabels, classifier=decisionTreeClassifier,k=3)
decisionTreeClassification += k_fold_test("DecTree MSF=100","dfNumOnly20Feat",dataset=dfNumOnly20Feat, labels=dfLabels, classifier=decisionTreeClassifier,k=3)

Finished classification for DecTree MSF=100-dfNum
Finished classification for DecTree MSF=100-dfNumNorm
Finished classification for DecTree MSF=100-dfNumNormAll
Finished classification for DecTree MSF=100-dfNumOnly
Finished classification for DecTree MSF=100-dfNumOnly10Feat
Finished classification for DecTree MSF=100-dfNumOnly20Feat


In [38]:
decisionTreeDf = pd.DataFrame(decisionTreeClassification).set_index("Dataset")
decisionTreeDf = decisionTreeDf[['Classifier','KFoldIndex','FitTime','PredictionTime','Precision','F1Score','RocAucScore']]
decisionTreeDf

Unnamed: 0_level_0,Classifier,KFoldIndex,FitTime,PredictionTime,Precision,F1Score,RocAucScore
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dfNum,DecTree MSF=100,0,1.075743,0.011567,0.947765,0.962071,0.99002
dfNum,DecTree MSF=100,1,1.101148,0.011524,0.945694,0.960377,0.98897
dfNum,DecTree MSF=100,2,1.049477,0.011698,0.948586,0.962762,0.989704
dfNumNorm,DecTree MSF=100,0,1.114521,0.010796,0.946858,0.961363,0.989167
dfNumNorm,DecTree MSF=100,1,1.031433,0.011641,0.946841,0.961317,0.989526
dfNumNorm,DecTree MSF=100,2,1.050809,0.01182,0.946105,0.960879,0.989558
dfNumNormAll,DecTree MSF=100,0,1.059332,0.011243,0.947097,0.961631,0.988972
dfNumNormAll,DecTree MSF=100,1,1.103978,0.012318,0.947953,0.962111,0.989949
dfNumNormAll,DecTree MSF=100,2,1.080923,0.010902,0.946858,0.961212,0.989309
dfNumOnly,DecTree MSF=100,0,1.069072,0.01043,0.946858,0.961327,0.989317


In [39]:
decisionTreeDf.to_csv("classification/DecisionTreeCache.csv")

In [40]:
# Neural networks test
MLPClassification = []
nnClassifier = MLPClassifier(hidden_layer_sizes=(5,5),learning_rate_init=0.05, activation='tanh',solver='adam',max_iter=1000)
MLPClassification += k_fold_test("MLPC_tanh_(5,5)","dfNum",dataset=dfNum.values, labels=dfLabels, classifier=nnClassifier,k=3)
MLPClassification += k_fold_test("MLPC_tanh_(5,5)","dfNumNorm",dataset=dfNumNorm.values, labels=dfLabels, classifier=nnClassifier,k=3)
MLPClassification += k_fold_test("MLPC_tanh_(5,5)","dfNumNormAll",dataset=dfNumNormAll.values, labels=dfLabels, classifier=nnClassifier,k=3)
MLPClassification += k_fold_test("MLPC_tanh_(5,5)","dfNumOnly",dataset=dfNumOnly.values, labels=dfLabels, classifier=nnClassifier,k=3)
MLPClassification += k_fold_test("MLPC_tanh_(5,5)","dfNumOnly10Feat",dataset=dfNumOnly10Feat, labels=dfLabels, classifier=nnClassifier,k=3)
MLPClassification += k_fold_test("MLPC_tanh_(5,5)","dfNumOnly20Feat",dataset=dfNumOnly20Feat, labels=dfLabels, classifier=nnClassifier,k=3)

Finished classification for MLPC_tanh_(5,5)-dfNum
Finished classification for MLPC_tanh_(5,5)-dfNumNorm
Finished classification for MLPC_tanh_(5,5)-dfNumNormAll
Finished classification for MLPC_tanh_(5,5)-dfNumOnly
Finished classification for MLPC_tanh_(5,5)-dfNumOnly10Feat
Finished classification for MLPC_tanh_(5,5)-dfNumOnly20Feat


In [42]:
# Neural networks test
nnClassifier2 = MLPClassifier(hidden_layer_sizes=(5,5),learning_rate_init=0.05, activation='relu',solver='adam',max_iter=1000)
MLPClassification += k_fold_test("MLPC_relu_(5,5)","dfNumOnly10Feat",dataset=dfNumOnly10Feat, labels=dfLabels, classifier=nnClassifier2,k=3)
MLPClassification += k_fold_test("MLPC_relu_(5,5)","dfNumOnly20Feat",dataset=dfNumOnly20Feat, labels=dfLabels, classifier=nnClassifier2,k=3)

Finished classification for MLPC_relu_(5,5)-dfNumOnly10Feat
Finished classification for MLPC_relu_(5,5)-dfNumOnly20Feat


In [43]:
MLPCDf = pd.DataFrame(MLPClassification).set_index("Dataset")
MLPCDf = MLPCDf[['Classifier','KFoldIndex','FitTime','PredictionTime','Precision','F1Score','RocAucScore']]
MLPCDf

Unnamed: 0_level_0,Classifier,KFoldIndex,FitTime,PredictionTime,Precision,F1Score,RocAucScore
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dfNum,"MLPC_tanh_(5,5)",0,5.739113,0.012943,0.938115,0.954492,0.985528
dfNum,"MLPC_tanh_(5,5)",1,14.195107,0.010571,0.936096,0.954661,0.986799
dfNum,"MLPC_tanh_(5,5)",2,10.313619,0.012569,0.936507,0.955388,0.985927
dfNumNorm,"MLPC_tanh_(5,5)",0,6.305877,0.013022,0.937105,0.953196,0.986342
dfNumNorm,"MLPC_tanh_(5,5)",1,6.352732,0.010516,0.934659,0.954326,0.984498
dfNumNorm,"MLPC_tanh_(5,5)",2,13.320381,0.010589,0.934779,0.951717,0.98539
dfNumNormAll,"MLPC_tanh_(5,5)",0,15.123451,0.011176,0.935702,0.952734,0.985468
dfNumNormAll,"MLPC_tanh_(5,5)",1,12.209073,0.011845,0.939141,0.955952,0.98638
dfNumNormAll,"MLPC_tanh_(5,5)",2,12.150476,0.014718,0.939484,0.955994,0.985527
dfNumOnly,"MLPC_tanh_(5,5)",0,17.834378,0.013629,0.934796,0.954196,0.985772


In [44]:
MLPCDf.to_csv("classification/mplc.csv")

In [11]:
# Neural networks test
SVMClassification = []
svmClassifier = svm.SVC(gamma='scale',kernel='rbf',C=100000,max_iter=1200,probability=True)

SVMClassification += k_fold_test("SVM_rbf_1200iters","dfNum",dataset=dfNum.values, labels=dfLabels, classifier=svmClassifier ,k=3)
SVMClassification += k_fold_test("SVM_rbf_1200iters","dfNumNorm",dataset=dfNumNorm.values, labels=dfLabels, classifier=svmClassifier ,k=3)
SVMClassification += k_fold_test("SVM_rbf_1200iters","dfNumNormAll",dataset=dfNumNormAll.values, labels=dfLabels, classifier=svmClassifier ,k=3)
SVMClassification += k_fold_test("SVM_rbf_1200iters","dfNumOnly",dataset=dfNumOnly.values, labels=dfLabels, classifier=svmClassifier ,k=3)
SVMClassification += k_fold_test("SVM_rbf_1200iters","dfNumOnly10Feat",dataset=dfNumOnly10Feat, labels=dfLabels, classifier=svmClassifier ,k=3)
SVMClassification += k_fold_test("SVM_rbf_1200iters","dfNumOnly20Feat",dataset=dfNumOnly20Feat, labels=dfLabels, classifier=svmClassifier ,k=3)

Finished classification for SVM_rbf_1200iters-dfNum
Finished classification for SVM_rbf_1200iters-dfNumNorm
Finished classification for SVM_rbf_1200iters-dfNumNormAll
Finished classification for SVM_rbf_1200iters-dfNumOnly
Finished classification for SVM_rbf_1200iters-dfNumOnly10Feat
Finished classification for SVM_rbf_1200iters-dfNumOnly20Feat


In [12]:
SVMdf = pd.DataFrame(SVMClassification).set_index("Dataset")
SVMdf = SVMdf[['Classifier','KFoldIndex','FitTime','PredictionTime','Precision','F1Score','RocAucScore']]
SVMdf

Unnamed: 0_level_0,Classifier,KFoldIndex,FitTime,PredictionTime,Precision,F1Score,RocAucScore
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
dfNum,SVM_rbf_1200iters,0,121.958534,2.287595,0.547573,0.515119,0.789618
dfNum,SVM_rbf_1200iters,1,121.80888,3.042531,0.623231,0.659091,0.792921
dfNum,SVM_rbf_1200iters,2,123.715567,3.13411,0.699488,0.726503,0.918362
dfNumNorm,SVM_rbf_1200iters,0,112.377424,2.738135,0.719558,0.747469,0.165702
dfNumNorm,SVM_rbf_1200iters,1,121.046128,2.80107,0.772717,0.813553,0.908808
dfNumNorm,SVM_rbf_1200iters,2,120.474517,2.660627,0.758499,0.804547,0.128011
dfNumNormAll,SVM_rbf_1200iters,0,128.014413,2.980967,0.514227,0.473149,0.795432
dfNumNormAll,SVM_rbf_1200iters,1,149.902981,2.765719,0.495492,0.425328,0.783079
dfNumNormAll,SVM_rbf_1200iters,2,143.527901,2.604083,0.78004,0.825312,0.177641
dfNumOnly,SVM_rbf_1200iters,0,131.636339,2.212968,0.355775,0.109921,0.892905


In [13]:
SVMdf.to_csv("classification/svm.csv")