### Here we will get our dataset ready for the model by doing feature engineering.

In [31]:
### Libraries we will be using 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

In [25]:
### Load the dataset we will be using and discard the 'Unnamed: 0' column
### (presumably a row index written out by an earlier save -- TODO confirm).
data = pd.read_csv('../Datasets/required.csv').drop(columns=['Unnamed: 0'])
data.head(5)


Unnamed: 0,Source Port,Destination Port,Packet Length,Packet Type,Malware Indicators,Anomaly Scores,Alerts/Warnings,Attack Signature,Month_of_received,hour_of_the_day,...,traffic_HTTP,Attack_Type_DDoS,Attack_Type_Intrusion,Attack_Type_Malware,Severity_High,Severity_Low,Severity_Medium,Segments_Segment A,Segments_Segment B,Segments_Segment C
0,31225,17616,503,1,1,28.67,1,0,5,6,...,1,0,0,1,0,1,0,1,0,0
1,17245,48166,1174,1,1,51.5,1,1,8,7,...,1,0,0,1,0,1,0,0,1,0
2,16811,53600,306,0,1,87.42,0,0,11,8,...,1,1,0,0,0,1,0,0,0,1
3,20018,32534,385,1,0,15.79,0,0,7,10,...,1,0,0,1,0,0,1,0,1,0
4,6131,26646,1462,1,0,0.52,0,0,7,13,...,0,1,0,0,0,1,0,0,0,1


In [26]:
### Let's report the shape of the dataset: rows are samples, columns are features.

print(f"The shape of the dataset is {data.shape}. The number of samples in the dataset is {data.shape[0]} and the number of features in the dataset are {data.shape[1]}")

The shape of the dataset is (40000, 36). The number of samples in the dataset is 40000 and the number of features in the dataset are 36


In [27]:
### Let's check whether any features still have the `object` dtype (pandas' usual dtype for
### string/categorical data). An empty result means no such column is left.
print(data.select_dtypes('object'))


Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[40000 rows x 0 columns]


In [28]:
### Let's see the dtype of every feature in the dataset.
print(data.dtypes)

Source Port                int64
Destination Port           int64
Packet Length              int64
Packet Type                int64
Malware Indicators         int64
Anomaly Scores           float64
Attack Signature           int64
Month_of_received          int64
hour_of_the_day            int64
day_of_the_week            int64
is_weekend                 int64
src_ip_1                   int64
src_ip_2                   int64
src_ip_3                   int64
src_ip_4                   int64
dest_ip_1                  int64
dest_ip_2                  int64
dest_ip_3                  int64
dest_ip_4                  int64
Is_Private                 int64
protocol_ICMP              int64
protocol_TCP               int64
protocol_UDP               int64
traffic_DNS                int64
traffic_FTP                int64
traffic_HTTP               int64
Attack_Type_DDoS           int64
Attack_Type_Intrusion      int64
Attack_Type_Malware        int64
Severity_High              int64
Severity_L

In [29]:
### Let's look at our target variable ('Malware Indicators') and count the samples per class
### to see whether there is any class imbalance.
print(data['Malware Indicators'].value_counts())


Malware Indicators
1    20000
0    20000
Name: count, dtype: int64


### Let's check the mean, median and standard deviation of the features in the dataset



In [44]:
### Before looking at the variance of the non-binary columns, split the features
### into binary (0/1) columns and everything else.
def features_divider(dataset):
    """Partition the columns of ``dataset`` into binary and non-binary features.

    A column is treated as binary only when it holds exactly two distinct
    values and those values are 0 and 1. Every other column (constant columns,
    columns with more than two distinct values, text columns, ...) is reported
    as continuous.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple[list, list]
        ``(binary_features, continous_features)``: the column labels of each
        group, in the order they appear in ``dataset``.
    """
    binary_features = []
    continous_features = []
    # items() yields one (label, Series) pair per column, so duplicated column
    # labels are still inspected one column at a time.
    for column, column_values in dataset.items():
        # Series.unique() does not sort the values, so unlike np.unique it does
        # not raise TypeError on object columns that mix strings and numbers.
        unique_values = column_values.unique()
        if len(unique_values) == 2 and set(unique_values) <= {0, 1}:
            binary_features.append(column)
        else:
            continous_features.append(column)
    return binary_features, continous_features

# Split the columns of `data` into the two groups and report how many columns fall in each.
# NOTE(review): `contionus_features` is a misspelling of "continuous"; the name is left
# unchanged here because other cells may refer to it.
binary_features, contionus_features = features_divider(data)
print(f"The total binary features in the dataset are {len(binary_features)}")
print(f"The total continous features in the dataset are {len(contionus_features)}")


The total binary features in the dataset are 21
The total continous features in the dataset are 15


In [None]:
### 

In [30]:
### Our dataset is evenly balanced, which means that it is well suited for model evaluation.
### Save it to '../Datasets/featured.csv'.
### NOTE(review): to_csv writes the row index by default, so the saved file gets an extra
### unnamed first column (pandas reads it back as 'Unnamed: 0'). Consider index=False once
### the downstream readers of this file have been checked.
data.to_csv('../Datasets/featured.csv')