## This notebook classifies the test data given against the model created at the link below:
#### [https://github.com/sweagle07/attack_classification/blob/main/AttackClassifier_train_valid_set.ipynb]

#### **To execute this notebook with new data, update the variable in the cell below to the data to classify file:**
#### test_file_name = 'test'

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time 
import pickle

from sklearn.metrics import classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier

## File name variables to control data/model import and export
## it expects a csv without headers similar to the sample data given

test_file_name = 'test'

In [2]:
train_file_name = 'train_df.pkl'
model_file_name = train_file_name + '_model.sav'

pd.set_option('display.max_columns',None)

### Read test data into data frame and add column headers

In [3]:
test_df = pd.read_csv(test_file_name, names=['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 
                                           'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 
                                           'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
                                           'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 
                                           'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 
                                           'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 
                                           'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 
                                           'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 
                                           'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 
                                           'dst_host_srv_rerror_rate','category'])

In [4]:
test_df.shape

(311029, 42)

#### Save off a clean copy of imported test data

In [5]:
test_df_orig = test_df.copy()

#### Take a quick look at test data to see if it's what I am expecting
#### No surprises so far

In [6]:
test_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,category
0,0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
1,0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
2,0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.
3,0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,snmpgetattack.
4,0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.


In [7]:
test_df.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0,311029.0
mean,17.902736,1731.702,747.9937,2.9e-05,0.000762,5.1e-05,0.014677,0.002363,0.172476,0.011243,0.000199,2.3e-05,0.008359,0.000958,8.4e-05,0.000772,0.0,3.9e-05,0.002424,269.247019,235.580039,0.059215,0.059193,0.142585,0.142248,0.815654,0.024447,0.025349,235.282681,199.193914,0.793494,0.024953,0.547919,0.004566,0.058764,0.058791,0.142659,0.141693
std,407.6444,127656.7,16120.18,0.005379,0.040367,0.009821,0.312068,0.04999,0.377794,1.958325,0.014117,0.005947,2.165196,0.193119,0.01293,0.029453,0.0,0.006211,0.049177,219.834412,239.308028,0.233873,0.234818,0.347564,0.348233,0.371605,0.107061,0.125231,60.913298,100.30647,0.38709,0.096003,0.491963,0.035773,0.231296,0.232997,0.34438,0.346573
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,105.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,7.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,244.0,0.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,520.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,212.0,126.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1032.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,511.0,511.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255.0,255.0,1.0,0.01,1.0,0.0,0.0,0.0,0.0,0.0
max,57715.0,62825650.0,5203179.0,1.0,3.0,3.0,101.0,4.0,1.0,796.0,1.0,2.0,878.0,100.0,5.0,4.0,0.0,1.0,1.0,511.0,511.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,255.0,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311029 entries, 0 to 311028
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     311029 non-null  int64  
 1   protocol_type                311029 non-null  object 
 2   service                      311029 non-null  object 
 3   flag                         311029 non-null  object 
 4   src_bytes                    311029 non-null  int64  
 5   dst_bytes                    311029 non-null  int64  
 6   land                         311029 non-null  int64  
 7   wrong_fragment               311029 non-null  int64  
 8   urgent                       311029 non-null  int64  
 9   hot                          311029 non-null  int64  
 10  num_failed_logins            311029 non-null  int64  
 11  logged_in                    311029 non-null  int64  
 12  num_compromised              311029 non-null  int64  
 13 

## Perform the same transformation steps as done in modelling phase

### Create overarching groups of attacks and prep data for classification

In [9]:
DOS = ['back.','land.','neptune.','pod.','smurf.','teardrop.']
R2L = ['ftp_write.','guess_passwd.','imap.','multihop.','phf.','spy.','warezclient.','warezmaster.']
U2R = ['buffer_overflow.', 'loadmodule.','perl.','rootkit.']
probing = ['ipsweep.','nmap.','portsweep.','satan.']
normal = 'normal.'

In [10]:
def get_group(x):
    if x in R2L:
        return 4
    elif x in U2R:
        return 3
    elif x in DOS:
        return 2
    elif x in probing:
        return 1
    elif x == normal:
        return 0
    else: return 10

#### Add column with mapping to various Attack Types
#### Attack Type is the target feature for prediction

In [11]:
test_df['attack_type'] = test_df['category'].apply(get_group)

In [12]:
test_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,category,attack_type
0,0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.,0
1,0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.,0
2,0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal.,0
3,0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,snmpgetattack.,10
4,0,udp,private,SF,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,snmpgetattack.,10


#### Found that "num_outbound_cmds" is always 0 in train set, also 0s in test data
#### Since it's constant, won't help with prediction,

In [13]:
test_df.num_outbound_cmds.value_counts()

0    311029
Name: num_outbound_cmds, dtype: int64

#### Remove initial columns I removed during model creation

In [14]:
test_df.drop(columns=['category','num_outbound_cmds'], inplace=True)

In [15]:
test_df.shape

(311029, 41)

#### Encode catagorical features 

In [16]:
cat_feats = ['protocol_type','flag','service' ]

In [17]:
test_df_encoded = pd.get_dummies(test_df,columns=cat_feats,drop_first=True)

In [18]:
test_df_encoded.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,protocol_type_tcp,protocol_type_udp,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH,service_X11,service_Z39_50,service_auth,service_bgp,service_courier,service_csnet_ns,service_ctf,service_daytime,service_discard,service_domain,service_domain_u,service_echo,service_eco_i,service_ecr_i,service_efs,service_exec,service_finger,service_ftp,service_ftp_data,service_gopher,service_hostnames,service_http,service_http_443,service_icmp,service_imap4,service_iso_tsap,service_klogin,service_kshell,service_ldap,service_link,service_login,service_mtp,service_name,service_netbios_dgm,service_netbios_ns,service_netbios_ssn,service_netstat,service_nnsp,service_nntp,service_ntp_u,service_other,service_pm_dump,service_pop_2,service_pop_3,service_printer,service_private,service_remote_job,service_rje,service_shell,service_smtp,service_sql_net,service_ssh,service_sunrpc,service_supdup,service_systat,service_telnet,service_tftp_u,service_tim_i,service_time,service_urp_i,service_uucp,service_uucp_path,service_vmnet,service_whois
0,0,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,10,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,105,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,255,254,1.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,10,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
test_df_encoded.shape

(311029, 114)

In [20]:
test_df_final = pd.DataFrame(test_df_encoded, columns=['duration', 'src_bytes', 'dst_bytes', 'logged_in', 'count', 'srv_count',
        'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
        'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_srv_serror_rate',
        'protocol_type_tcp', 'protocol_type_udp', 'flag_SF', 'service_ecr_i',
        'service_http','attack_type'])

In [21]:
test_df_final.shape

(311029, 22)

#### Separate data for model fitting

In [22]:
X = test_df_final.drop(['attack_type'],axis=1)
y = test_df_final['attack_type']

### Load in model previously created from training data

In [23]:
model_file_name

'train_df.pkl_model.sav'

In [24]:
i_file = open(model_file_name,'rb')
loaded_model = pickle.load(i_file)
i_file.close()

In [25]:
loaded_model_pred_test = loaded_model.predict(X)

In [26]:
#loaded_model.feature_importances_

## Evaluating the model

#### The model performed a bit worse on the normal and DOS classes
#### normal went from 99% to 83% and DOS from 100% to 82%
#### the probing classification actually improved from a F1 score if 87% to 100%!
#### There is still the same problem with low amount of data for U2R and R2L, so no change there
#### I do see there are new type of attacks that my model also does not recognize (class = 10)

#### **Notes from test/validation run below**
#### The model performed quite well with an F1 score of 99% and 100% for normal and DOS, respecitively
#### it was pretty good at classifying probing attacks with 87% F1 score.
#### The model did not perform well at all for the U2R and R2L attacks. 
#### More data to balance the tree, or perhaps create separate models
normal= 0
probing = 1
DOS = 2
U2R = 3
R2L = 4
unrecognized = 10

In [27]:
print(confusion_matrix(y,loaded_model_pred_test))

[[ 60460     40     93      0      0      0]
 [   251   2082     44      0      0      0]
 [  1211      0 222087      0      0      0]
 [    39      0      0      0      0      0]
 [  5993      0      0      0      0      0]
 [ 17892    594    243      0      0      0]]


In [28]:
print(classification_report(y, loaded_model_pred_test))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.70      1.00      0.83     60593
           1       0.77      0.88      0.82      2377
           2       1.00      0.99      1.00    223298
           3       0.00      0.00      0.00        39
           4       0.00      0.00      0.00      5993
          10       0.00      0.00      0.00     18729

    accuracy                           0.92    311029
   macro avg       0.41      0.48      0.44    311029
weighted avg       0.86      0.92      0.88    311029

