In [1]:
import numpy as np
import pandas as pd
import scipy
import os
import math
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA,qPCA
from sklearn import metrics
from statsmodels.distributions.empirical_distribution import ECDF 
import dc_stat_think as dcst
from sklearn.QuantumUtility.Utility import *
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder,RobustScaler, QuantileTransformer
import warnings
warnings.filterwarnings("ignore")

# Load data

Load 'df_new.csv' for the dataset that is unbalanced towards anomalies, or 'df_DDoSPaper.csv' for the dataset that is unbalanced towards normal samples. Load 'df_Darknet.csv' for the Darknet data.

In [2]:
# Read the selected CSV file into a DataFrame.
dataset='df_new.csv'
df_new = pd.read_csv(dataset)

# Strip surrounding whitespace from the column headers before any lookup by name.
df_new.columns = df_new.columns.str.strip()
# Discard rows that have no 'Flow ID'.
df_new = df_new.dropna(subset=['Flow ID'])

# The literal string 'Infinity' becomes -1 before the two rate columns are
# converted to numeric dtype.
df_new = df_new.replace('Infinity', -1)
for rate_column in ("Flow Bytes/s", "Flow Packets/s"):
    df_new[rate_column] = pd.to_numeric(df_new[rate_column])
# Remaining +/-inf and NaN values are replaced by the same -1 value.
df_new = df_new.replace([np.inf, -np.inf, np.nan], -1)

# Integer-encode every object-dtype column. 'Label' is excluded from the
# encoding for every dataset except the Darknet one.
string_features = df_new.select_dtypes(include=['object']).columns.tolist()
if dataset!='df_Darknet.csv':
    string_features.remove('Label')
le = LabelEncoder()
df_new[string_features] = df_new[string_features].apply(le.fit_transform)

# Collect the zero-variance feature columns: those the variance filter
# (threshold=0) does not retain.
df_copy = df_new.drop(columns = 'Label')
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(df_copy)
retained_columns = set(df_copy.columns[constant_filter.get_support()])
constant_columns = [name for name in df_copy.columns if name not in retained_columns]


In [3]:
# Columns removed before modelling: a fixed list of flow-identifier columns
# plus the constant columns. The Darknet CSV uses different column names.
if dataset!='df_Darknet.csv':
    categorical_columns_train = [
        'Flow ID', 'Source IP', 'Source Port',
        'Destination IP', 'Destination Port', 'Timestamp',
    ]
else:
    categorical_columns_train = [
        'Flow ID', 'Src IP', 'Src Port',
        'Dst IP', 'Dst Port','Label.1','Timestamp',
    ]

columns_to_drop_train = [*categorical_columns_train, *constant_columns]


In [4]:
# Remove the listed columns, then display the resulting frame.
df_new = df_new.drop(columns_to_drop_train, axis='columns')
df_new

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,6.0,1.0,2.0,0.0,12.0,0.0,6.0,6.0,6.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,6.0,1119.0,9.0,6.0,3160.0,3060.0,1565.0,0.0,351.111111,688.214982,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,6.0,953.0,7.0,4.0,2814.0,2836.0,1398.0,0.0,402.000000,680.402822,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,6.0,0.0,2.0,0.0,12.0,0.0,6.0,6.0,6.000000,0.000000,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,6.0,77216.0,19.0,11.0,3412.0,6654.0,1293.0,0.0,179.578947,400.318806,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
806551,6.0,41.0,1.0,1.0,2.0,6.0,2.0,2.0,2.000000,0.000000,...,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,PortScan
806552,6.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,DoS Hulk
806553,6.0,86760513.0,10.0,6.0,383.0,11595.0,377.0,0.0,38.300000,119.021987,...,20.0,1001.0,0.0,1001.0,1001.0,85800000.0,0.0,85800000.0,85800000.0,DoS Hulk
806554,6.0,7092885.0,5.0,0.0,30.0,0.0,6.0,6.0,6.000000,0.000000,...,20.0,5000.0,0.0,5000.0,5000.0,7087885.0,0.0,7087885.0,7087885.0,DDoS


In [5]:
# Per-dataset experiment settings.
#   LEN_TRAIN / LEN_VALIDATION : row counts used to slice the data
#   n_quantils                 : n_quantiles passed to QuantileTransformer
#   threshold                  : cut-off compared against the reconstruction loss
#   n_components               : number of components passed to qPCA
#   negative_label             : value of 'Label' treated as the negative class
#   tr                         : feature transformer applied to the whole frame
if dataset=='df_DDoSPaper.csv':
    LEN_TRAIN = 158022
    LEN_VALIDATION = 60000
    n_quantils = 24
    threshold = .06632108379654125
    n_components = 32
    negative_label='BENIGN'
    tr = QuantileTransformer(n_quantiles=n_quantils, random_state=0)
elif dataset=='df_new.csv':
    LEN_TRAIN = 50000
    LEN_VALIDATION = 226966
    n_quantils = 751
    threshold = .4259394035517815
    n_components = 12
    tr = QuantileTransformer(n_quantiles=n_quantils, random_state=0)
    negative_label='BENIGN'
elif dataset=='df_Darknet.csv':
    LEN_TRAIN = 50000
    LEN_VALIDATION = 30000
    # StandardScaler takes no quantile count; reset the name so a value left
    # over from a previous run with another dataset cannot linger.
    n_quantils = None
    threshold = .4438816547139376
    n_components = 35
    tr = StandardScaler()
    negative_label=0
else:
    # Fail here with a clear message instead of a NameError on `tr` or
    # `threshold` in a later cell.
    raise ValueError(
        "Unknown dataset %r: expected 'df_DDoSPaper.csv', 'df_new.csv' or 'df_Darknet.csv'"
        % (dataset,)
    )

In [6]:
# Transform every feature column (all columns except 'Label').
# NOTE(review): the transformer is fitted on all rows, including the rows that
# are sliced off as `test` below.
x = tr.fit_transform(df_new.drop('Label', axis='columns'))

# The first LEN_TRAIN rows form the training set; the test set starts after
# the following LEN_VALIDATION rows.
train, test = x[:LEN_TRAIN], x[LEN_TRAIN+LEN_VALIDATION:]

In [7]:
# Fit qPCA on the training rows.
qPca = qPCA(svd_solver="full", name='PCA',n_components=n_components).fit(train, theta_estimate=False, eps_theta=0, p=n_components,
                                   estimate_all=False, delta=0, eps=0, true_tomography=True,
                                   eta=0, norm='L2', condition_number_est=False, spectral_norm_est=False)

# Anomaly score per test row: squared error between the row and its
# reconstruction (transform followed by inverse_transform).
transform_X = qPca.transform(test)
inverse_X = qPca.inverse_transform(transform_X)
loss = np.sum((test - inverse_X) ** 2, axis=1)

# Ground-truth labels of the test rows, positionally aligned with `loss`.
test_labels = df_new['Label'].iloc[LEN_TRAIN+LEN_VALIDATION:]

# Rows scoring above the threshold are predicted as attacks; the rest as normal.
# Both comparisons are kept explicit so a NaN score lands in neither group.
attack_prediction = test_labels[loss > threshold].value_counts()
normal_prediction = test_labels[loss <= threshold].value_counts()

# Confusion-matrix counts, with attack as the positive class. `.get(..., 0)`
# avoids a KeyError when the negative label is missing from one group, e.g.
# when there are no false positives at all.
total_predicted_attack = attack_prediction.sum()
FP = attack_prediction.get(negative_label, 0)
TP = total_predicted_attack - FP

total_predicted_negative = normal_prediction.sum()

TN = normal_prediction.get(negative_label, 0)
FN = total_predicted_negative -TN


In [8]:
# Detection metrics derived from the confusion-matrix counts.
actual_positives = TP+FN
predicted_positives = TP+FP
all_samples = TP+FP+TN+FN

recall = TP/actual_positives
precision = TP/predicted_positives
accuracy = (TP+TN)/all_samples
# F1 is the harmonic mean of recall and precision.
f1 = 2/(1/recall + 1/precision)

'recall:',recall,'precision:', precision,'f1_score:', f1,'accuracy:', accuracy

('recall:',
 0.9591852973638954,
 'precision:',
 0.9193791240423365,
 'f1_score:',
 0.9388604707253835,
 'accuracy:',
 0.9080987178760929)

# Cumulative score distribution of CICIDS

Run these cells only if you have previously used 'df_new.csv', since it contains all the types of attacks.

In [9]:
# Map each test-row index label to its position in `loss`.
test_index = df_new.index[LEN_VALIDATION+LEN_TRAIN:]
indexes_to_map = np.arange(len(loss))
tmp_dict = dict(zip(test_index, indexes_to_map))

In [10]:
# Group the anomaly scores of the test rows by their ground-truth label.
# The test-row labels are sliced once, not twice per label, and each group is
# selected with a positional boolean mask instead of a per-row dictionary lookup.
test_labels = df_new['Label'].iloc[LEN_VALIDATION+LEN_TRAIN:]
labels = test_labels.unique()
associative_dict = {
    label: loss[(test_labels == label).to_numpy()]
    for label in labels
}
associative_dict

{'BENIGN': array([0.52694146, 0.46466027, 0.13417754, ..., 0.0524765 , 0.46957525,
        0.75291922]),
 'DDoS': array([0.85180086, 1.0103266 , 0.60814429, ..., 1.67281267, 1.77423902,
        0.48463835]),
 'DoS Hulk': array([1.66993674, 0.28569289, 0.49663163, ..., 2.62609684, 0.46488207,
        1.65714346]),
 'PortScan': array([2.03930068, 1.45048085, 2.05877285, ..., 2.03261439, 2.02081194,
        1.5337037 ]),
 'DoS Slowhttptest': array([2.64200413, 2.64036473, 2.64021346, ..., 0.58595504, 2.64242351,
        2.63937942]),
 'SSH-Patator': array([0.4631772 , 0.15905994, 0.14084616, ..., 0.17570221, 0.47050297,
        0.45906637]),
 'FTP-Patator': array([0.5091876 , 0.50819404, 0.49517072, ..., 2.46214054, 0.50960652,
        0.52007704]),
 'Bot': array([0.28281214, 0.13366567, 0.14104591, ..., 0.47332362, 0.17088459,
        1.87900388]),
 'DoS GoldenEye': array([0.34759773, 1.70014924, 1.74119209, ..., 4.1407184 , 1.02391708,
        0.58580485]),
 'DoS slowloris': array([1.41

In [11]:
# Cumulative anomaly score distribution for attacks type.
# One PDF per label is written under <cwd>/CICIDS2017/AnomalyThreshold.
cwd = os.getcwd()
output_dir = os.path.join(cwd,'CICIDS2017/AnomalyThreshold')
# savefig does not create missing directories, so create the target first.
os.makedirs(output_dir, exist_ok=True)
for label in associative_dict:
    # Dedicated names keep the ECDF arrays from overwriting the global `x`
    # feature matrix.
    score_values, cumulative_fraction = dcst.ecdf(associative_dict[label])
    fig, ax = plt.subplots()
    ax.plot(score_values, cumulative_fraction, linestyle='dotted', lw = 2,marker='o')
    # Red vertical line marks the decision threshold.
    ax.vlines(x = threshold, ymin=0,ymax=1 ,colors='r')
    ax.set_xlabel('anomaly score')
    ax.set_ylabel('percentile')
    fig.savefig(os.path.join(output_dir,label+'.pdf'))
    # Close each figure once saved so open figures do not accumulate in the loop.
    # NOTE(review): closed figures are no longer rendered inline at cell end.
    plt.close(fig)