# This notebook contains the train/validation split exercise I did to create the model
### **Background set-up notes**
#### I load in a file that I generated based on the data set given and added column names.
#### I export the data into a pickle format for quick load from a .csv
#### Code for these steps can be found here

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time 
import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier

## File name variables to control data/model import and export

train_file_name = 'train_df.pkl'
test_file_name = 'test'
model_file_name = train_file_name + '_model.sav'

pd.set_option('display.max_columns',None)

#### Load the original data from CSV; add column headers and save data for re-use
#### Export the data to a pickle format for faster subsequent loads

In [2]:
# train_df = pd.read_csv(train_file, names=['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 
#                                           'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 
#                                           'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
#                                           'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 
#                                           'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 
#                                           'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 
#                                           'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 
#                                           'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 
#                                           'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 
#                                           'dst_host_srv_rerror_rate','category'])

In [3]:
#train_df.to_csv('train_df.csv')
#train_df.to_pickle('train_df.pkl')

### Get and load the data into a DF

In [4]:
train_df_orig = pd.read_pickle(train_file_name)
train_df_orig.shape

(4898431, 42)

In [5]:
#### Save off a clean copy of imported data frame

In [6]:
train_df = train_df_orig.copy()

In [7]:
train_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,category
0,0,tcp,http,SF,215,45076,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,162,4528,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,236,1228,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,233,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.


In [8]:
train_df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0,4898431.0
mean,48.34243,1834.621,1093.623,5.716116e-06,0.0006487792,7.961733e-06,0.01243766,3.205108e-05,0.143529,0.008088304,6.81851e-05,3.674646e-05,0.01293496,0.001188748,7.430951e-05,0.001021143,0.0,4.08294e-07,0.0008351654,334.9734,295.2671,0.1779703,0.178037,0.05766509,0.0577301,0.7898842,0.02117961,0.0282608,232.9811,189.2142,0.7537132,0.03071111,0.605052,0.006464107,0.1780911,0.1778859,0.0579278,0.05765941
std,723.3298,941431.1,645012.3,0.002390833,0.04285434,0.007215084,0.4689782,0.007299408,0.3506116,3.856481,0.008257146,0.008082432,3.938075,0.1241857,0.00873759,0.03551048,0.0,0.0006389788,0.02888716,211.9908,245.9927,0.3818756,0.3822541,0.2322529,0.2326604,0.3892958,0.08271458,0.1405596,64.02094,105.9128,0.411186,0.1085432,0.4809877,0.04125978,0.3818382,0.3821774,0.2309428,0.2309777
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,121.0,10.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,49.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,510.0,510.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,511.0,511.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,58329.0,1379964000.0,1309937000.0,1.0,3.0,14.0,77.0,5.0,1.0,7479.0,1.0,2.0,7468.0,43.0,2.0,9.0,0.0,1.0,1.0,511.0,511.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Show every column with its non-null count. `show_counts` replaces the `null_counts`
# keyword, which was deprecated in pandas 1.2 and removed in pandas 2.0.
train_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898431 entries, 0 to 4898430
Data columns (total 42 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   duration                     4898431 non-null  int64  
 1   protocol_type                4898431 non-null  object 
 2   service                      4898431 non-null  object 
 3   flag                         4898431 non-null  object 
 4   src_bytes                    4898431 non-null  int64  
 5   dst_bytes                    4898431 non-null  int64  
 6   land                         4898431 non-null  int64  
 7   wrong_fragment               4898431 non-null  int64  
 8   urgent                       4898431 non-null  int64  
 9   hot                          4898431 non-null  int64  
 10  num_failed_logins            4898431 non-null  int64  
 11  logged_in                    4898431 non-null  int64  
 12  num_compromised              4898431 non-n

### Create overarching groups of attacks and prep data for modelling

In [10]:
DOS = ['back.','land.','neptune.','pod.','smurf.','teardrop.']
R2L = ['ftp_write.','guess_passwd.','imap.','multihop.','phf.','spy.','warezclient.','warezmaster.']
U2R = ['buffer_overflow.', 'loadmodule.','perl.','rootkit.']
probing = ['ipsweep.','nmap.','portsweep.','satan.']
normal = 'normal.'

In [11]:
def get_group(x):
    """Map a raw `category` label to its integer attack-type code.

    Codes: 0 = normal, 1 = probing, 2 = DOS, 3 = U2R, 4 = R2L.
    The label lists (`R2L`, `U2R`, `DOS`, `probing`) and the `normal`
    label are read from the notebook's global scope.

    Returns the string 'No Match' for any label that belongs to none of
    the groups, so an unexpected label makes the resulting column mixed-type.
    """
    # Groups are tested in the order R2L, U2R, DOS, probing.
    for code, members in ((4, R2L), (3, U2R), (2, DOS), (1, probing)):
        if x in members:
            return code
    return 0 if x == normal else 'No Match'

#### Add column with mapping to various Attack Types
#### Will build the model based on these Attack Types

In [12]:
train_df['attack_type'] = train_df['category'].apply(get_group)

In [13]:
train_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,category,attack_type
0,0,tcp,http,SF,215,45076,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal.,0
1,0,tcp,http,SF,162,4528,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal.,0
2,0,tcp,http,SF,236,1228,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal.,0
3,0,tcp,http,SF,233,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal.,0
4,0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal.,0


#### Found that "num_outbound_cmds" is always 0
#### Since it's constant, it won't help with prediction, so it is dropped below.

In [14]:
train_df.num_outbound_cmds.value_counts()

0    4898431
Name: num_outbound_cmds, dtype: int64

#### Remove initial columns I won't use in the model

In [15]:
# Drop the raw label (replaced by attack_type) and the constant num_outbound_cmds column.
train_df = train_df.drop(columns=['category', 'num_outbound_cmds'])

#### Encode categorical features

In [16]:
cat_feats = ['protocol_type','flag','service' ]

In [17]:
final_data_df = pd.get_dummies(train_df,columns=cat_feats,drop_first=True)

In [18]:
final_data_df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,protocol_type_tcp,protocol_type_udp,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,service_csnet_ns,service_ctf,service_daytime,service_discard,service_domain,service_domain_u,service_echo,service_eco_i,service_ecr_i,service_efs,service_exec,service_finger,service_ftp,service_ftp_data,service_gopher,service_harvest,service_hostnames,service_http,service_http_2784,service_http_443,service_http_8001,service_imap4,service_iso_tsap,service_klogin,service_kshell,service_ldap,service_link,service_login,service_mtp,service_name,service_netbios_dgm,service_netbios_ns,service_netbios_ssn,service_netstat,service_nnsp,service_nntp,service_ntp_u,service_other,service_pm_dump,service_pop_2,service_pop_3,service_printer,service_private,service_red_i,service_remote_job,service_rje,service_shell,service_smtp,service_sql_net,service_ssh,service_sunrpc,service_supdup,service_systat,service_telnet,service_tftp_u,service_tim_i,service_time,service_urh_i,service_urp_i,service_uucp,service_uucp_path,service_vmnet,service_whois
0,0,215,45076,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,162,4528,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,236,1228,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,233,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
final_data_df.shape

(4898431, 119)

### Create train and validation sets for initial training/test

In [20]:
X = final_data_df.drop(['attack_type'],axis=1)
y = final_data_df['attack_type']

In [21]:
# Hold out 20% of the rows for validation; the fixed random_state makes the shuffle repeatable.
# NOTE(review): no `stratify` argument is given, so per-class proportions in each split are left to chance.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)



In [22]:
X_train.shape, X_valid.shape

((3918744, 118), (979687, 118))

In [23]:
y_train.shape, y_valid.shape

((3918744,), (979687,))

### Check for correlated features
#### I didn't end up using this feature selection technique as it did not significantly improve the performance of the model.
#### Leaving the code in for reference, if I want to try it again at a later stage

In [24]:
# # find and remove correlated features
# def correlation(dataset, threshold):
#     col_corr = set()  # Set of all the names of correlated columns
#     corr_matrix = dataset.corr()
#     for i in range(len(corr_matrix.columns)):
#         for j in range(i):
#             if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
#                 colname = corr_matrix.columns[i]  # getting the name of column
#                 col_corr.add(colname)
#     return col_corr

# corr_features = correlation(X_train, 0.90)
# print('correlated features: ', len(set(corr_features)) )

In [25]:
# corr_features

In [26]:
# corrmat = X_train.corr()

# # we can make a heatmap with the package seaborn
# # and customise the colours of searborn's heatmap
# cmap = sns.diverging_palette(220, 20, as_cmap=True)

# # some more parameters for the figure
# fig, ax = plt.subplots()
# fig.set_size_inches(11,11)

# # and now plot the correlation matrix
# sns.heatmap(corrmat, cmap=cmap)

In [27]:
# X_train.drop(labels=corr_features, axis=1, inplace=True)
# X_valid.drop(labels=corr_features, axis=1, inplace=True)

### Select features by importance using a random forest

In [28]:
# Wrap a 50-tree random forest in SelectFromModel; no threshold is passed,
# so the selector's default importance threshold decides which features are kept.
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=50, random_state=10))

# Fit on the training split only, and report how long the fit takes.
tic = time.perf_counter()
sel_.fit(X_train, y_train)
toc = time.perf_counter()
print(f"Time to fit feature selection: {toc - tic:0.4f} seconds")

# Names of the columns the selector kept (get_support() is a boolean mask over X_train's columns).
selected_cols = X_train.columns[sel_.get_support()]

# Reduce both splits to the kept features and wrap the results back into
# DataFrames carrying the kept column names.
# Note: X_test_rf holds the *validation* split (X_valid), not a separate test set.
X_train_rf = pd.DataFrame(sel_.transform(X_train), columns=selected_cols)
X_test_rf = pd.DataFrame(sel_.transform(X_valid), columns=selected_cols)


Time to fit feature selection: 457.2876 seconds


In [29]:
X_train_rf.head()

Unnamed: 0,duration,src_bytes,dst_bytes,logged_in,count,srv_count,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_srv_serror_rate,protocol_type_tcp,protocol_type_udp,flag_SF,service_ecr_i,service_http
0,0.0,0.0,0.0,0.0,141.0,17.0,0.12,0.06,0.0,255.0,18.0,0.07,0.07,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,0.0,1032.0,0.0,0.0,511.0,511.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,1032.0,0.0,0.0,511.0,511.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,520.0,0.0,0.0,444.0,444.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,1032.0,0.0,0.0,511.0,511.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [30]:
X_test_rf.head()

Unnamed: 0,duration,src_bytes,dst_bytes,logged_in,count,srv_count,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_srv_serror_rate,protocol_type_tcp,protocol_type_udp,flag_SF,service_ecr_i,service_http
0,0.0,1032.0,0.0,0.0,511.0,511.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,1032.0,0.0,0.0,511.0,511.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,146.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,255.0,1.0,0.0,0.76,0.98,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,1032.0,0.0,0.0,511.0,511.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,269.0,11.0,0.04,0.05,0.0,255.0,11.0,0.04,0.05,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


#### List of features recommended below; there are 21

In [31]:
X_test_rf.columns

Index(['duration', 'src_bytes', 'dst_bytes', 'logged_in', 'count', 'srv_count',
       'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
       'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_srv_serror_rate',
       'protocol_type_tcp', 'protocol_type_udp', 'flag_SF', 'service_ecr_i',
       'service_http'],
      dtype='object')

In [32]:
len(X_train.columns[(sel_.get_support())])

21

In [33]:
# Train the final classifier on the selected features only; the timer covers
# both constructing the estimator and fitting it.
# NOTE(review): max_depth=4 keeps every tree shallow — confirm this is intended.
tic = time.perf_counter()
rfc = RandomForestClassifier(
    n_estimators=200,
    max_depth=4,
    random_state=10,
)
rfc.fit(X_train_rf, y_train)
toc = time.perf_counter()
print(f"Time to train classifier {toc - tic:0.4f} seconds")

Time to train classifier 795.8987 seconds


## Save model to disk for re-use

In [34]:
# Persist the fitted classifier. The context manager closes the file even if
# pickling raises, which the manual open()/close() pair did not guarantee.
with open(model_file_name, 'wb') as w_file:
    pickle.dump(rfc, w_file)

In [35]:
y_pred_test = rfc.predict(X_test_rf)

In [36]:
X_test_rf.shape
#y_pred_test

(979687, 21)

In [37]:
X_test_rf.head()

Unnamed: 0,duration,src_bytes,dst_bytes,logged_in,count,srv_count,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_srv_serror_rate,protocol_type_tcp,protocol_type_udp,flag_SF,service_ecr_i,service_http
0,0.0,1032.0,0.0,0.0,511.0,511.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,1032.0,0.0,0.0,511.0,511.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,146.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,255.0,1.0,0.0,0.76,0.98,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,1032.0,0.0,0.0,511.0,511.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,269.0,11.0,0.04,0.05,0.0,255.0,11.0,0.04,0.05,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


## Evaluating the model
#### The model performed quite well, with F1 scores of 99% and 100% for normal and DOS, respectively.
#### It was reasonably good at classifying probing attacks, with an 87% F1 score.
#### The model did not perform well at all for the U2R and R2L attacks: none of them were detected (0% recall), and both classes are extremely rare in the data.
#### Possible next steps: gather more data to balance the classes, or perhaps create separate models for the rare attack types.
- normal = 0
- probing = 1
- DOS = 2
- U2R = 3
- R2L = 4

In [38]:
print(confusion_matrix(y_valid,y_pred_test))

[[194012      0     43      0      0]
 [  1664   6274    267      0      0]
 [   658      0 776537      0      0]
 [    11      0      0      0      0]
 [   221      0      0      0      0]]


In [39]:
print(classification_report(y_valid, y_pred_test))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99    194055
           1       1.00      0.76      0.87      8205
           2       1.00      1.00      1.00    777195
           3       0.00      0.00      0.00        11
           4       0.00      0.00      0.00       221

    accuracy                           1.00    979687
   macro avg       0.60      0.55      0.57    979687
weighted avg       1.00      1.00      1.00    979687



## Test the exported model to confirm it was saved appropriately
#### Do this by comparing confusion matrix and report numbers

In [40]:
# Reload the pickled classifier from disk. The context manager closes the file
# even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code — only load model files you created yourself.
with open(model_file_name, 'rb') as i_file:
    loaded_model = pickle.load(i_file)

In [41]:
loaded_model_pred_test = loaded_model.predict(X_test_rf)

In [42]:
print(confusion_matrix(y_valid,loaded_model_pred_test))

[[194012      0     43      0      0]
 [  1664   6274    267      0      0]
 [   658      0 776537      0      0]
 [    11      0      0      0      0]
 [   221      0      0      0      0]]


In [43]:
print(classification_report(y_valid, loaded_model_pred_test))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99    194055
           1       1.00      0.76      0.87      8205
           2       1.00      1.00      1.00    777195
           3       0.00      0.00      0.00        11
           4       0.00      0.00      0.00       221

    accuracy                           1.00    979687
   macro avg       0.60      0.55      0.57    979687
weighted avg       1.00      1.00      1.00    979687

