## Research Paper : Cost-Sensitive Intrusion Detection Through Attack Severity Prediction

--- 
### Feature Engineering 

In [203]:
### Libararies we will be using 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 


In [204]:
### Dataset we will be using 
data = pd.read_csv('../Datasets/final_dataset.csv')
data.drop(columns=['Unnamed: 0'], inplace = True)
data.head(5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,146,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,232,8153,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0,199,420,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


### Lets see the structure of our dataset



In [205]:
print(f"The shape of our dataset is {data.shape}, with {data.shape[0]} as total number of samples and {data.shape[1]} as total nubmer of features ")

The shape of our dataset is (148517, 65), with 148517 as total number of samples and 65 as total nubmer of features 


### Quick Stats

### Lets check the total number of null values, missing values and the overview of the data 

In [206]:
print(data.isnull().sum())

duration          0
src_bytes         0
dst_bytes         0
land              0
wrong_fragment    0
                 ..
flag_S1           0
flag_S2           0
flag_S3           0
flag_SF           0
flag_SH           0
Length: 65, dtype: int64


In [207]:
## Overview of the data 
print(data.describe)

<bound method NDFrame.describe of         duration  src_bytes  dst_bytes  land  wrong_fragment  urgent  hot  \
0              0        491          0     0               0       0    0   
1              0        146          0     0               0       0    0   
2              0          0          0     0               0       0    0   
3              0        232       8153     0               0       0    0   
4              0        199        420     0               0       0    0   
...          ...        ...        ...   ...             ...     ...  ...   
148512         0        794        333     0               0       0    0   
148513         0        317        938     0               0       0    0   
148514         0      54540       8314     0               0       0    2   
148515         0         42         42     0               0       0    0   
148516         0          0          0     0               0       0    0   

        num_failed_logins  logged_in  num

### Feature Engineering 


### bytes_ratio : the ration of source bytes to the destination bytes 

In [208]:
data['bytes_ratio'] = data['src_bytes'] / (data['dst_bytes'] + 1) ## Added 1 to avoid zero division error
data['total_bytes'] = data['src_bytes'] + data['dst_bytes'] ## Total bytes in the file transfer


### Checking skewness of all the numerical features !

In [209]:
### selecting only the numerical columns leaving the binary columns
only_numeric_data = data.select_dtypes('int64', 'float64')

### Out of this features lets compute the features which are non-binary 

non_binary_features = []
for col in only_numeric_data.columns:
    unique_values = data[col].unique()
    if len(unique_values) > 2:
        non_binary_features.append(col)
    elif set(unique_values) != {0,1}:
        non_binary_features.append(col)


In [210]:
### Checking the skewness of the non-binary features
non_binary_skew = data[non_binary_features].skew()
highly_skewed_columns = data[non_binary_features].columns[non_binary_skew >= 1]

for col in highly_skewed_columns:
    data[col] = np.log1p(data[col]) ## Here log1p is used to avoid the log(0) = Inf error

print(data[non_binary_features].skew())

duration               4.794398
src_bytes              0.331625
dst_bytes              0.450255
wrong_fragment        11.693862
urgent                97.787925
hot                    8.955275
num_failed_logins     16.825191
num_compromised       26.778624
su_attempted          43.760561
num_root              27.550960
num_file_creations    33.187829
num_shells            53.306905
num_access_files      23.439835
count                  0.171397
srv_count              0.944197
dst_host_count        -0.870274
dst_host_srv_count     0.219302
level                 -2.898413
total_bytes            0.122327
dtype: float64
