# Machine Learning for Intrusion Detection System with KDD99 dataset

## Version Check

In [1]:
import pandas as pd
import numpy as np
import sys
import sklearn
print(pd.__version__)
print(np.__version__)
print(sys.version)
print(sklearn.__version__)

0.23.4
1.15.4
3.5.2 (default, Nov 23 2017, 16:37:01) 
[GCC 5.4.0 20160609]
0.20.1


## Load the Dataset

In [2]:
# attach the column names to the dataset
col_names = ["duration",
             "protocol_type",
             "service","flag",
             "src_bytes",             
             "dst_bytes",
             "land",
             "wrong_fragment",
             "urgent","hot",
             "num_failed_logins",
             "logged_in",
             "num_compromised",
             "root_shell",
             "su_attempted",
             "num_root",
             "num_file_creations",
             "num_shells",
             "num_access_files",
             "num_outbound_cmds",
             "is_host_login",
             "is_guest_login",
             "count",
             "srv_count",
             "serror_rate",
             "srv_serror_rate",
             "rerror_rate",
             "srv_rerror_rate",
             "same_srv_rate",
             "diff_srv_rate",
             "srv_diff_host_rate",
             "dst_host_count",
             "dst_host_srv_count",
             "dst_host_same_srv_rate",
             "dst_host_diff_srv_rate",
             "dst_host_same_src_port_rate",
             "dst_host_srv_diff_host_rate",
             "dst_host_serror_rate",
             "dst_host_srv_serror_rate",
             "dst_host_rerror_rate",
             "dst_host_srv_rerror_rate",
             "label"]

# Load dataset
df = pd.read_csv("KDDTrain+.csv", header=None, names=col_names)
df_test = pd.read_csv("KDDTest+.csv", header=None, names=col_names)

# shape, this gives the dimensions of the dataset
print('Dimensions of the Training Set:', df.shape)
print('Dimensions of the Test Set:', df_test.shape)

Dimensions of the Training Set: (125973, 42)
Dimensions of the Test Set: (22544, 42)


## Sample view of the training dataset

In [3]:
# first ten rows
df.head(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
5,0,tcp,private,REJ,0,0,0,0,0,0,...,19,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune
6,0,tcp,private,S0,0,0,0,0,0,0,...,9,0.04,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
7,0,tcp,private,S0,0,0,0,0,0,0,...,15,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0,neptune
8,0,tcp,remote_job,S0,0,0,0,0,0,0,...,23,0.09,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
9,0,tcp,private,S0,0,0,0,0,0,0,...,13,0.05,0.06,0.0,0.0,1.0,1.0,0.0,0.0,neptune


## Statistical Summary

In [4]:
df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,...,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0,125973.0
mean,287.14465,45566.74,19779.11,0.000198,0.022687,0.000111,0.204409,0.001222,0.395736,0.27925,...,182.148945,115.653005,0.521242,0.082951,0.148379,0.032542,0.284452,0.278485,0.118832,0.12024
std,2604.51531,5870331.0,4021269.0,0.014086,0.25353,0.014366,2.149968,0.045239,0.48901,23.942042,...,99.206213,110.702741,0.448949,0.188922,0.308997,0.112564,0.444784,0.445669,0.306557,0.319459
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,82.0,10.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,63.0,0.51,0.02,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,276.0,516.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,255.0,1.0,0.07,0.06,0.02,1.0,1.0,0.0,0.0
max,42908.0,1379964000.0,1309937000.0,1.0,3.0,3.0,77.0,5.0,1.0,7479.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Label Distribution of Training and Test set

In [5]:
print('Label distribution Training set:')
print(df['label'].value_counts())
print()
print('Label distribution Test set:')
print(df_test['label'].value_counts())

Label distribution Training set:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: label, dtype: int64

Label distribution Test set:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178


# Step 1: Data preprocessing:

## Identify categorical features

In [6]:
# Training Set:
# colums that are categorical and not binary yet: protocol_type (column 2), service (column 3), flag (column 4).
# explore categorical features
print('Training Set:')
for col_name in df.columns:
    if df[col_name].dtypes == 'object' :
        unique_cat = len(df[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

# Distribution of 'protocol_type'
print()
print('Distribution of categories in protocol_type:')
print(df['protocol_type'].value_counts().sort_values(ascending=False).head())
        
# Distribution of 'service'
print()
print('Distribution of categories in service:')
print(df['service'].value_counts().sort_values(ascending=False).head())

# Distribution of 'flag'
print()
print('Distribution of categories in flag:')
print(df['flag'].value_counts().sort_values(ascending=False).head())

Training Set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories
Feature 'label' has 23 categories

Distribution of categories in protocol_type:
tcp     102689
udp      14993
icmp      8291
Name: protocol_type, dtype: int64

Distribution of categories in service:
http        40338
private     21853
domain_u     9043
smtp         7313
ftp_data     6860
Name: service, dtype: int64

Distribution of categories in flag:
SF      74945
S0      34851
REJ     11233
RSTR     2421
RSTO     1562
Name: flag, dtype: int64


In [7]:
# Test Set:
print('Test Set:')
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        unique_cat = len(df_test[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

Test Set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 64 categories
Feature 'flag' has 11 categories
Feature 'label' has 38 categories


## LabelEncoder

### Insert categorical features into a 2D numpy array

In [8]:
from sklearn.preprocessing import LabelEncoder

# insert code to get a list of categorical columns into a variable, categorical_columns
categorical_columns=['protocol_type', 'service', 'flag'] 

# Get the categorical values into a 2D numpy array
df_categorical_values = df[categorical_columns]
testdf_categorical_values = df_test[categorical_columns]

df_categorical_values.head(10)

Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF
5,tcp,private,REJ
6,tcp,private,S0
7,tcp,private,S0
8,tcp,remote_job,S0
9,tcp,private,S0


### Transform categorical features into numbers using LabelEncoder()

In [9]:
# training set
df_categorical_values_enc=df_categorical_values.apply(LabelEncoder().fit_transform)
print(df_categorical_values_enc.head(10))

# test set
testdf_categorical_values_enc=testdf_categorical_values.apply(LabelEncoder().fit_transform)

   protocol_type  service  flag
0              1       20     9
1              2       44     9
2              1       49     5
3              1       24     9
4              1       24     9
5              1       49     1
6              1       49     5
7              1       49     5
8              1       51     5
9              1       49     5


In [10]:
dumcols = ["protocol_type_enc", "service_enc", "flag_enc"]

# trainging set
df_cat_data = pd.DataFrame(np.array(df_categorical_values_enc), columns=dumcols)

# test set
testdf_cat_data = pd.DataFrame(np.array(testdf_categorical_values_enc), columns=dumcols)

df_cat_data.head(10)

Unnamed: 0,protocol_type_enc,service_enc,flag_enc
0,1,20,9
1,2,44,9
2,1,49,5
3,1,24,9
4,1,24,9
5,1,49,1
6,1,49,5
7,1,49,5
8,1,51,5
9,1,49,5


### Join encoded categorical dataframe with the non-categorical dataframe

In [11]:
# training data
newdf=df.join(df_cat_data)
newdf.drop('flag', axis=1, inplace=True)
newdf.drop('protocol_type', axis=1, inplace=True)
newdf.drop('service', axis=1, inplace=True)

# test data
newdf_test=df_test.join(testdf_cat_data)
newdf_test.drop('flag', axis=1, inplace=True)
newdf_test.drop('protocol_type', axis=1, inplace=True)
newdf_test.drop('service', axis=1, inplace=True)

# Result 
print(newdf.shape)
print(newdf_test.shape)
newdf.head(10)

(125973, 42)
(22544, 42)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,protocol_type_enc,service_enc,flag_enc
0,0,491,0,0,0,0,0,0,0,0,...,0.17,0.0,0.0,0.0,0.05,0.0,normal,1,20,9
1,0,146,0,0,0,0,0,0,0,0,...,0.88,0.0,0.0,0.0,0.0,0.0,normal,2,44,9
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,neptune,1,49,5
3,0,232,8153,0,0,0,0,0,1,0,...,0.03,0.04,0.03,0.01,0.0,0.01,normal,1,24,9
4,0,199,420,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,normal,1,24,9
5,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,1.0,neptune,1,49,1
6,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,neptune,1,49,5
7,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,neptune,1,49,5
8,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,neptune,1,51,5
9,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,neptune,1,49,5


# Select categories of DoS attack from Dataset
## Rename every attack label: 0=normal, 1-6=DoS, 1=Neptune, 2=Back, 3=Land, 4=Pod, 5=Smurf, 6=Teardrop and other is 7
## Replace labels column with new labels column

In [12]:
# take label column
labeldf=newdf['label']
labeldf_test=newdf_test['label']
print(labeldf.head(10))

# change the label column
newlabeldf=labeldf.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 2, 'land': 3, 'pod': 4, 'smurf': 5, 'teardrop': 6,'mailbomb': 7, 'apache2': 7, 'processtable': 7, 'udpstorm': 7, 'worm': 7,
                           'ipsweep' : 7,'nmap' : 7,'portsweep' : 7,'satan' : 7,'mscan' : 7,'saint' : 7,
                           'ftp_write': 7,'guess_passwd': 7,'imap': 7,'multihop': 7,'phf': 7,'spy': 7,'warezclient': 7,'warezmaster': 7,'sendmail': 7,'named': 7,'snmpgetattack': 7,'snmpguess': 7,'xlock': 7,'xsnoop': 7,'httptunnel': 7,
                           'buffer_overflow': 7,'loadmodule': 7,'perl': 7,'rootkit': 7,'ps': 7,'sqlattack': 7,'xterm': 7})
newlabeldf_test=labeldf_test.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 2, 'land': 3, 'pod': 4, 'smurf': 5, 'teardrop': 6,'mailbomb': 7, 'apache2': 7, 'processtable': 7, 'udpstorm': 7, 'worm': 7,
                           'ipsweep' : 7,'nmap' : 7,'portsweep' : 7,'satan' : 7,'mscan' : 7,'saint' : 7,
                           'ftp_write': 7,'guess_passwd': 7,'imap': 7,'multihop': 7,'phf': 7,'spy': 7,'warezclient': 7,'warezmaster': 7,'sendmail': 7,'named': 7,'snmpgetattack': 7,'snmpguess': 7,'xlock': 7,'xsnoop': 7,'httptunnel': 7,
                           'buffer_overflow': 7,'loadmodule': 7,'perl': 7,'rootkit': 7,'ps': 7,'sqlattack': 7,'xterm': 7})
# put the new label column back
newdf['label'] = newlabeldf
newdf_test['label'] = newlabeldf_test
print(newdf['label'].head(10))

0     normal
1     normal
2    neptune
3     normal
4     normal
5    neptune
6    neptune
7    neptune
8    neptune
9    neptune
Name: label, dtype: object
0    0
1    0
2    1
3    0
4    0
5    1
6    1
7    1
8    1
9    1
Name: label, dtype: int64


In [13]:
to_drop_Normal = [1,2,3,4,5,6,7]
to_drop_DoS = [7]
to_drop_Neptune = [2,3,4,5,6,7]
to_drop_Back = [1,3,4,5,6,7]
to_drop_Land = [1,2,4,5,6,7]
to_drop_Pod = [1,2,3,5,6,7]
to_drop_Smurf = [1,2,3,4,6,7]
to_drop_Teardrop = [1,2,3,4,5,7]

# training data
Normal_df = newdf[~newdf['label'].isin(to_drop_Normal)]
DoS_df = newdf[~newdf['label'].isin(to_drop_DoS)]

Neptune_df = newdf[~newdf['label'].isin(to_drop_Neptune)]
Back_df = newdf[~newdf['label'].isin(to_drop_Back)]
Land_df = newdf[~newdf['label'].isin(to_drop_Land)]
Pod_df = newdf[~newdf['label'].isin(to_drop_Pod)]
Smurf_df = newdf[~newdf['label'].isin(to_drop_Smurf)]
Teardrop_df = newdf[~newdf['label'].isin(to_drop_Teardrop)]

# test data
Normal_df_test = newdf_test[~newdf_test['label'].isin(to_drop_Normal)]
DoS_df_test = newdf_test[~newdf_test['label'].isin(to_drop_DoS)]

Neptune_df_test = newdf_test[~newdf_test['label'].isin(to_drop_Neptune)]
Back_df_test = newdf_test[~newdf_test['label'].isin(to_drop_Back)]
Land_df_test = newdf_test[~newdf_test['label'].isin(to_drop_Land)]
Pod_df_test = newdf_test[~newdf_test['label'].isin(to_drop_Pod)]
Smurf_df_test = newdf_test[~newdf_test['label'].isin(to_drop_Smurf)]
Teardrop_df_test = newdf_test[~newdf_test['label'].isin(to_drop_Teardrop)]

print('Train:')
print('Dimensions of Normal:', Normal_df.shape)
print('Dimensions of DoS:', DoS_df.shape)
print('Dimensions of Neptune:', Neptune_df.shape)
print('Dimensions of Back:', Back_df.shape)
print('Dimensions of Land:', Land_df.shape)
print('Dimensions of Pod:', Pod_df.shape)
print('Dimensions of Smurf:', Smurf_df.shape)
print('Dimensions of Teardrop:', Teardrop_df.shape)
print()
print('Test:')
print('Dimensions of Normal:', Normal_df_test.shape)
print('Dimensions of DoS:', DoS_df_test.shape)
print('Dimensions of Neptune:', Neptune_df_test.shape)
print('Dimensions of Back:', Back_df_test.shape)
print('Dimensions of Land:', Land_df_test.shape)
print('Dimensions of Pod:', Pod_df_test.shape)
print('Dimensions of Smurf:', Smurf_df_test.shape)
print('Dimensions of Teardrop:', Teardrop_df_test.shape)

DoS_df.head(10)

Train:
Dimensions of Normal: (67343, 42)
Dimensions of DoS: (113270, 42)
Dimensions of Neptune: (108557, 42)
Dimensions of Back: (68299, 42)
Dimensions of Land: (67361, 42)
Dimensions of Pod: (67544, 42)
Dimensions of Smurf: (69989, 42)
Dimensions of Teardrop: (68235, 42)

Test:
Dimensions of Normal: (9711, 42)
Dimensions of DoS: (15452, 42)
Dimensions of Neptune: (14368, 42)
Dimensions of Back: (10070, 42)
Dimensions of Land: (9718, 42)
Dimensions of Pod: (9752, 42)
Dimensions of Smurf: (10376, 42)
Dimensions of Teardrop: (9723, 42)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,protocol_type_enc,service_enc,flag_enc
0,0,491,0,0,0,0,0,0,0,0,...,0.17,0.0,0.0,0.0,0.05,0.0,0,1,20,9
1,0,146,0,0,0,0,0,0,0,0,...,0.88,0.0,0.0,0.0,0.0,0.0,0,2,44,9
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,1,1,49,5
3,0,232,8153,0,0,0,0,0,1,0,...,0.03,0.04,0.03,0.01,0.0,0.01,0,1,24,9
4,0,199,420,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1,24,9
5,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,1.0,1,1,49,1
6,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,1,1,49,5
7,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,1,1,49,5
8,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,1,1,51,5
9,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,1.0,0.0,0.0,1,1,49,5


# Step 2: Feature Scaling:
## Split dataframe into feature set and target output
Assign X as a dataframe of feautures and Y as a series of outcome variables

In [14]:
# training set
X_Normal = Normal_df.drop('label',1)
Y_Normal = Normal_df['label']
X_DoS = DoS_df.drop('label',1)
Y_DoS = DoS_df['label'].replace({2:1, 3:1, 4:1, 5:1, 6:1})
X_Neptune = Neptune_df.drop('label',1)
Y_Neptune = Neptune_df['label']
X_Back = Back_df.drop('label',1)
Y_Back = Back_df['label'].replace({2:1})
X_Land = Land_df.drop('label',1)
Y_Land = Land_df['label'].replace({3:1})
X_Pod = Pod_df.drop('label',1)
Y_Pod = Pod_df['label'].replace({4:1})
X_Smurf = Smurf_df.drop('label',1)
Y_Smurf = Smurf_df['label'].replace({5:1})
X_Teardrop = Teardrop_df.drop('label',1)
Y_Teardrop = Teardrop_df['label'].replace({6:1})

# test set
X_Normal_test = Normal_df_test.drop('label',1)
Y_Normal_test = Normal_df_test['label']
X_DoS_test = DoS_df_test.drop('label',1)
Y_DoS_test = DoS_df_test['label'].replace({2:1, 3:1, 4:1, 5:1, 6:1})
X_Neptune_test = Neptune_df_test.drop('label',1)
Y_Neptune_test = Neptune_df_test['label']
X_Back_test = Back_df_test.drop('label',1)
Y_Back_test = Back_df_test['label'].replace({2:1})
X_Land_test = Land_df_test.drop('label',1)
Y_Land_test = Land_df_test['label'].replace({3:1})
X_Pod_test = Pod_df_test.drop('label',1)
Y_Pod_test = Pod_df_test['label'].replace({4:1})
X_Smurf_test = Smurf_df_test.drop('label',1)
Y_Smurf_test = Smurf_df_test['label'].replace({5:1})
X_Teardrop_test = Teardrop_df_test.drop('label',1)
Y_Teardrop_test = Teardrop_df_test['label'].replace({6:1})


Y_DoS[1:10]
#Smurf_df['label'].head(10)

1    0
2    1
3    0
4    0
5    1
6    1
7    1
8    1
9    1
Name: label, dtype: int64

## Use StandardScaler() to scale the dataframes

In [15]:
from sklearn import preprocessing
# training set
scaler0 = preprocessing.StandardScaler().fit(X_DoS)
X_DoS=scaler0.transform(X_DoS)
scaler1 = preprocessing.StandardScaler().fit(X_Neptune)
X_Neptune=scaler1.transform(X_Neptune)
scaler2 = preprocessing.StandardScaler().fit(X_Back)
X_Back=scaler2.transform(X_Back)
scaler3 = preprocessing.StandardScaler().fit(X_Land)
X_Land=scaler3.transform(X_Land)
scaler4 = preprocessing.StandardScaler().fit(X_Pod)
X_Pod=scaler4.transform(X_Pod)
scaler5 = preprocessing.StandardScaler().fit(X_Smurf)
X_Smurf=scaler5.transform(X_Smurf)
scaler6 = preprocessing.StandardScaler().fit(X_Teardrop)
X_Teardrop=scaler6.transform(X_Teardrop)

# test data
scaler10 = preprocessing.StandardScaler().fit(X_DoS_test)
X_DoS_test=scaler10.transform(X_DoS_test)
scaler11 = preprocessing.StandardScaler().fit(X_Neptune_test)
X_Neptune_test=scaler11.transform(X_Neptune_test)
scaler12 = preprocessing.StandardScaler().fit(X_Back_test)
X_Back_test=scaler12.transform(X_Back_test)
scaler13 = preprocessing.StandardScaler().fit(X_Land_test)
X_Land_test=scaler13.transform(X_Land_test)
scaler14 = preprocessing.StandardScaler().fit(X_Pod_test)
X_Pod_test=scaler14.transform(X_Pod_test)
scaler15 = preprocessing.StandardScaler().fit(X_Smurf_test)
X_Smurf_test=scaler15.transform(X_Smurf_test)
scaler16 = preprocessing.StandardScaler().fit(X_Teardrop_test)
X_Teardrop_test=scaler16.transform(X_Teardrop_test)



  return self.partial_fit(X, y)
  after removing the cwd from sys.path.
  return self.partial_fit(X, y)
  
  return self.partial_fit(X, y)
  
  return self.partial_fit(X, y)
  # Remove the CWD from sys.path while we load stuff.
  return self.partial_fit(X, y)
  if sys.path[0] == '':
  return self.partial_fit(X, y)
  
  return self.partial_fit(X, y)
  app.launch_new_instance()
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


### Feature after scaling

In [16]:
print(X_DoS.std(axis=0))
X_DoS[1:5]

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


array([[-9.93194524e-02, -2.52393320e-02, -5.23087470e-02,
        -1.48580045e-02, -9.44133763e-02, -6.64411495e-03,
        -8.55959191e-02, -2.15171149e-02, -8.70242969e-01,
        -1.22491853e-02, -3.47989093e-02, -2.56356488e-02,
        -1.30071692e-02, -2.63597259e-02, -1.81631753e-02,
        -4.27709864e-02,  0.00000000e+00, -2.97128607e-03,
        -8.81312159e-02, -6.62916206e-01, -3.77093040e-01,
        -6.77468066e-01, -6.74869545e-01, -3.12759485e-01,
        -3.15134925e-01, -1.30153294e+00,  8.80517558e-01,
        -3.50012854e-01,  7.13518867e-01, -1.09870257e+00,
        -1.19773419e+00,  5.15441238e+00,  3.38271803e+00,
        -2.83192321e-01, -6.79073200e-01, -6.66814438e-01,
        -3.28182176e-01, -3.18156129e-01,  2.39067437e+00,
         8.01856893e-01,  7.49220711e-01],
       [-9.93194524e-02, -2.56920747e-02, -5.23087470e-02,
        -1.48580045e-02, -9.44133763e-02, -6.64411495e-03,
        -8.55959191e-02, -2.15171149e-02, -8.70242969e-01,
        -1.22

# Step 3: Build the model:
## The model for DoS classifier

In [17]:
from sklearn.neural_network import MLPClassifier

clf_DoS = MLPClassifier(random_state=0)
clf_DoS.fit(X_DoS, Y_DoS)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=0, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

## Neptune detector

In [18]:
clf_Neptune = MLPClassifier(random_state=0)
clf_Neptune.fit(X_Neptune, Y_Neptune)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=0, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

## Other detector

In [19]:
clf_Back = MLPClassifier(random_state=0)
clf_Land = MLPClassifier(random_state=0)
clf_Pod = MLPClassifier(random_state=0)
clf_Smurf = MLPClassifier(random_state=0)
clf_Teardrop = MLPClassifier(random_state=0)

clf_Back.fit(X_Back, Y_Back)
clf_Land.fit(X_Land, Y_Land)
clf_Pod.fit(X_Pod, Y_Pod)
clf_Smurf.fit(X_Smurf, Y_Smurf)
clf_Teardrop.fit(X_Teardrop, Y_Teardrop)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=0, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

# Step 4: Prediction & Evaluation (Validation):

## Prediction

### DoS classifier

In [20]:
# Apply the classifier we trained to the test data (which it has never seen before)
clf_DoS.predict(X_DoS_test)

array([1, 1, 0, ..., 0, 1, 0])

In [21]:
# View the predicted probabilities of the first 10 observations
clf_DoS.predict_proba(X_DoS_test)[0:10]

array([[3.14629160e-06, 9.99996854e-01],
       [9.39909004e-05, 9.99906009e-01],
       [9.99999674e-01, 3.26012505e-07],
       [9.99999990e-01, 1.04416360e-08],
       [1.00000000e+00, 5.10990251e-16],
       [9.99999992e-01, 7.96520020e-09],
       [1.00000000e+00, 7.37515502e-12],
       [1.72583099e-04, 9.99827417e-01],
       [1.53210777e-14, 1.00000000e+00],
       [1.00000000e+00, 1.06161961e-31]])

In [22]:
Y_DoS_pred=clf_DoS.predict(X_DoS_test)
# Create confusion matrix
pd.crosstab(Y_DoS_test, Y_DoS_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,1
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9617,94
1,21,5720


### Neptune detector

In [23]:
Y_Neptune_pred=clf_Neptune.predict(X_Neptune_test)
# Create confusion matrix
pd.crosstab(Y_Neptune_test, Y_Neptune_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,1
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9693,18
1,5,4652


### Back

In [24]:
Y_Back_pred=clf_Back.predict(X_Back_test)
# Create confusion matrix
pd.crosstab(Y_Back_test, Y_Back_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,1
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9702,9
1,21,338


### Land

In [25]:
Y_Land_pred=clf_Land.predict(X_Land_test)
# Create confusion matrix
pd.crosstab(Y_Land_test, Y_Land_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0
Actual attacks,Unnamed: 1_level_1
0,9711
1,7


### Pod

In [26]:
Y_Pod_pred=clf_Pod.predict(X_Pod_test)
# Create confusion matrix
pd.crosstab(Y_Pod_test, Y_Pod_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,1
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9674,37
1,35,6


### Smurf

In [27]:
Y_Smurf_pred=clf_Smurf.predict(X_Smurf_test)
# Create confusion matrix
pd.crosstab(Y_Smurf_test, Y_Smurf_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,1
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9711,0
1,0,665


### Teardrop

In [28]:
Y_Teardrop_pred=clf_Teardrop.predict(X_Teardrop_test)
# Create confusion matrix
pd.crosstab(Y_Teardrop_test, Y_Teardrop_pred, rownames=['Actual attacks'], colnames=['Predicted attacks'])

Predicted attacks,0,1
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9670,41
1,0,12


## Cross Validation: Accuracy, Precision, Recall, F-measure

### DoS Classifier

In [29]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
accuracy = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_DoS, X_DoS_test, Y_DoS_test, cv=10, scoring='f1')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99780 (+/- 0.00353)
Precision: 0.99704 (+/- 0.00641)
Recall: 0.99704 (+/- 0.00414)
F-measure: 0.99704 (+/- 0.00475)


### Neptune

In [30]:
accuracy = cross_val_score(clf_Neptune, X_Neptune_test, Y_Neptune_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_Neptune, X_Neptune_test, Y_Neptune_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_Neptune, X_Neptune_test, Y_Neptune_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_Neptune, X_Neptune_test, Y_Neptune_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99937 (+/- 0.00131)
Precision: 0.99926 (+/- 0.00186)
Recall: 0.99931 (+/- 0.00143)
F-measure: 0.99929 (+/- 0.00150)


### Back

In [31]:
accuracy = cross_val_score(clf_Back, X_Back_test, Y_Back_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_Back, X_Back_test, Y_Back_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_Back, X_Back_test, Y_Back_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_Back, X_Back_test, Y_Back_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99921 (+/- 0.00173)
Precision: 0.99045 (+/- 0.01755)
Recall: 0.99825 (+/- 0.00879)
F-measure: 0.99430 (+/- 0.01240)


### Smurf

In [32]:
accuracy = cross_val_score(clf_Smurf, X_Smurf_test, Y_Smurf_test, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
precision = cross_val_score(clf_Smurf, X_Smurf_test, Y_Smurf_test, cv=10, scoring='precision_macro')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
recall = cross_val_score(clf_Smurf, X_Smurf_test, Y_Smurf_test, cv=10, scoring='recall_macro')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))
f = cross_val_score(clf_Smurf, X_Smurf_test, Y_Smurf_test, cv=10, scoring='f1_macro')
print("F-measure: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

Accuracy: 0.99971 (+/- 0.00088)
Precision: 0.99847 (+/- 0.00588)
Recall: 0.99914 (+/- 0.00450)
F-measure: 0.99879 (+/- 0.00368)
