In [1]:
#import all the required packages
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.preprocessing import LabelEncoder

#create a list with all the variable names - 42 of them
names = [
"duration",
"protocol_type",
"service",
"flag",
"src_bytes",
"dst_bytes",
"land",
"wrong_fragment",
"urgent",
"hot",
"num_failed_logins",
"logged_in",
"num_compromised",
"root_shell",
"su_attempted",
"num_root",
"num_file_creations",
"num_shells",
"num_access_files",
"num_outbound_cmds",
"is_host_login",
"is_guest_login",
"count",
"srv_count",
"serror_rate",
"srv_serror_rate",
"rerror_rate",
"srv_rerror_rate",
"same_srv_rate",
"diff_srv_rate",
"srv_diff_host_rate",
"dst_host_count",
"dst_host_srv_count",
"dst_host_same_srv_rate",
"dst_host_diff_srv_rate",
"dst_host_same_src_port_rate",
"dst_host_srv_diff_host_rate",
"dst_host_serror_rate",
"dst_host_srv_serror_rate",
"dst_host_rerror_rate",
"dst_host_srv_rerror_rate",
"connection_type"
]

#load the kddcup.data_10_percent_corrected dataset to build the model
#this dataset has 10 percent of the overall data - approximately 500000 records of the 5 million

kddcup_99 = pd.read_csv('kddcup.data_10_percent_corrected', sep=",", names = names)

#check the first 50 records to ensure that the dataset has been imported correctly
kddcup_99.head(50)
kddcup_99.describe()


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,...,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0,494021.0
mean,47.979302,3025.61,868.5324,4.5e-05,0.006433,1.4e-05,0.034519,0.000152,0.148247,0.010212,...,232.470778,188.66567,0.75378,0.030906,0.601935,0.006684,0.176754,0.176443,0.058118,0.057412
std,707.746472,988218.1,33040.0,0.006673,0.134805,0.00551,0.782103,0.01552,0.355345,1.798326,...,64.74538,106.040437,0.410781,0.109259,0.481309,0.042133,0.380593,0.380919,0.23059,0.23014
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,46.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,58329.0,693375600.0,5155468.0,1.0,3.0,3.0,30.0,5.0,1.0,884.0,...,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [2]:

#check for missing values
null_data = kddcup_99[kddcup_99.isnull().any(axis=1)]

#check the datatypes of all the variables
kddcup_99.dtypes

#the 3 variables, protocol_type, service and flag are strings
#these variables can either be deleted (using the command in the next line) or convereted to numeric data type by encoding numbers for each of the values
##kddcup_99 = kddcup_99.drop(kddcup_99.columns[[1, 2, 3]], axis=1)

#check the count of the values for each of the 3 variables
kddcup_99['protocol_type'].value_counts()
kddcup_99['service'].value_counts()
kddcup_99['flag'].value_counts()



SF        378440
S0         87007
REJ        26875
RSTR         903
RSTO         579
SH           107
S1            57
S2            24
RSTOS0        11
S3            10
OTH            8
Name: flag, dtype: int64

In [3]:

#encode the values as numbers - for example, the 3 protocol types are numbered as 0, 1 and 2
label_E = LabelEncoder()
kddcup_99['protocol_type'] = label_E.fit_transform(kddcup_99['protocol_type'])
kddcup_99['service'] = label_E.fit_transform(kddcup_99['service'])
kddcup_99['flag'] = label_E.fit_transform(kddcup_99['flag'])


In [4]:
#check the description and datatypes for the dataset and also check the value count for the 3 values after encoding
kddcup_99.describe()
kddcup_99.dtypes
kddcup_99['protocol_type'].value_counts()
kddcup_99['service'].value_counts()
kddcup_99['flag'].value_counts()


9     378440
5      87007
1      26875
4        903
2        579
10       107
6         57
7         24
3         11
8         10
0          8
Name: flag, dtype: int64

In [5]:

#check for missing values
null_data = kddcup_99[kddcup_99.isnull().any(axis=1)]

#check the datatypes of all the variables
kddcup_99.dtypes

#the 3 variables, protocol_type, service and flag are strings
#these variables can either be deleted (using the command in the next line) or convereted to numeric data type by encoding numbers for each of the values
##kddcup_99 = kddcup_99.drop(kddcup_99.columns[[1, 2, 3]], axis=1)

#check the count of the values for each of the 3 variables
kddcup_99['protocol_type'].value_counts()
kddcup_99['service'].value_counts()
kddcup_99['flag'].value_counts()

#encode the values as numbers - for example, the 3 protocol types are numbered as 0, 1 and 2
label_E = LabelEncoder()
kddcup_99['protocol_type'] = label_E.fit_transform(kddcup_99['protocol_type'])
kddcup_99['service'] = label_E.fit_transform(kddcup_99['service'])
kddcup_99['flag'] = label_E.fit_transform(kddcup_99['flag'])

#check the description and datatypes for the dataset and also check the value count for the 3 values after encoding
kddcup_99.describe()
kddcup_99.dtypes
kddcup_99['protocol_type'].value_counts()
kddcup_99['service'].value_counts()
kddcup_99['flag'].value_counts()

#feature selection
#to reduce the number of features without impacting the model, 
#first check the variance of all the features, if there is minimal variance, then such features can be ignored
kdd_var = np.var(kddcup_99)

#delete the features which have minimal variance (<0.05)
kddcup_99 = kddcup_99.drop(kddcup_99.columns[[7, 10, 13, 16, 17, 18, 19, 20, 21, 29, 30, 34]], axis=1)

#next, calculate the correlation between the features
#if two features have high correlation, then one of those features can be retained instead of both
kdd_corr = kddcup_99.corr()
kddcup_99 = kddcup_99.drop(kddcup_99.columns[[12, 19]], axis=1)


In [6]:
#the target variable contains details of the connection type,
#i.e., if the connection type is normal, it is a good connection, 
#the rest of the connection types are bad connections or attacks
kddcup_99['connection_type'].value_counts()


smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: connection_type, dtype: int64

In [7]:
#rename the connection type as either good or bad
kddcup_99.loc[kddcup_99.connection_type != 'normal.', 'connection_type'] = 'bad'
kddcup_99.loc[kddcup_99.connection_type == 'normal.', 'connection_type'] = 'good'

#Build predictive models on this final dataset

#Naive Bayes Model

#define the predictor and target variables
X = kddcup_99.ix[:,:27]
y = kddcup_99.ix[:,27:]

#split the data into train and test datasets (70 - 30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#fit the model and predict the target variable for the test dataset
gnb = GaussianNB()
y_pred_gnb = gnb.fit(X_train, y_train).predict(X_test)

#create the confusion matrix for the actual target value in the test data against the predicted value
cnf_matrix_gnb = confusion_matrix(y_test, y_pred_gnb)

#check the accuracy score of the model
print("Validation score: ",accuracy_score(y_test, y_pred_gnb))


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # Remove the CWD from sys.path while we load stuff.
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # This is added back by InteractiveShellApp.init_path()
  y = column_or_1d(y, warn=True)


Validation score:  0.9798659982322022


#using second algorithm for training of the dataset, and improving accuracy

In [8]:
#Decision Tree Model

#split the data into train and test datasets (70 - 30) and define the predictor and target variables
train_DT, test_DT = train_test_split(kddcup_99, test_size = 0.3)
X = train_DT.iloc[:,:27]
y = train_DT.iloc[:,27:]
x_test = test_DT.ix[:,:27]
y_test = test_DT.ix[:,27:]

#fit the model and predict the target variable for the test dataset
model = tree.DecisionTreeClassifier(criterion='gini')
model.fit(X, y)

#Predict Output
predicted = model.predict(x_test)

#check the model score
model.score(X, y)
print("Validation score: ",accuracy_score(y_test, predicted))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Validation score:  0.9996693813382633
