### Here we make our dataset ready for the model by doing feature engineering.

In [310]:
### Libraries we will be using 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

In [311]:
### Load the dataset used throughout this notebook.
## The 'Unnamed: 0' column (presumably an index saved by an earlier export) is discarded right after reading.
data = pd.read_csv('../Datasets/required.csv').drop(columns=['Unnamed: 0'])
data.head(5)


Unnamed: 0,Source Port,Destination Port,Packet Length,Packet Type,Malware Indicators,Anomaly Scores,Alerts/Warnings,Attack Signature,Month_of_received,hour_of_the_day,...,traffic_HTTP,Attack_Type_DDoS,Attack_Type_Intrusion,Attack_Type_Malware,Severity_High,Severity_Low,Severity_Medium,Segments_Segment A,Segments_Segment B,Segments_Segment C
0,31225,17616,503,1,1,28.67,1,0,5,6,...,1,0,0,1,0,1,0,1,0,0
1,17245,48166,1174,1,1,51.5,1,1,8,7,...,1,0,0,1,0,1,0,0,1,0
2,16811,53600,306,0,1,87.42,0,0,11,8,...,1,1,0,0,0,1,0,0,0,1
3,20018,32534,385,1,0,15.79,0,0,7,10,...,1,0,0,1,0,0,1,0,1,0
4,6131,26646,1462,1,0,0.52,0,0,7,13,...,0,1,0,0,0,1,0,0,0,1


In [312]:
### Let's see the total number of samples and features in our dataset.
## data.shape is (rows, columns): shape[0] is reported as the sample count and shape[1] as the feature count.

print(f"The shape of the dataset is {data.shape}. The number of samples in the dataset is {data.shape[0]} and the number of features in the dataset are {data.shape[1]}")

The shape of the dataset is (40000, 36). The number of samples in the dataset is 40000 and the number of features in the dataset are 36


In [313]:
### Let's check whether any features in our dataset still have the 'object' dtype.
## select_dtypes('object') keeps only object-typed columns; an empty result means none remain.
print(data.select_dtypes('object'))


Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]

[40000 rows x 0 columns]


In [314]:
### Let's see the dtypes of all the features
## Printing data.dtypes lists every column name alongside its dtype.
print(data.dtypes)

Source Port                int64
Destination Port           int64
Packet Length              int64
Packet Type                int64
Malware Indicators         int64
Anomaly Scores           float64
Attack Signature           int64
Month_of_received          int64
hour_of_the_day            int64
day_of_the_week            int64
is_weekend                 int64
src_ip_1                   int64
src_ip_2                   int64
src_ip_3                   int64
src_ip_4                   int64
dest_ip_1                  int64
dest_ip_2                  int64
dest_ip_3                  int64
dest_ip_4                  int64
Is_Private                 int64
protocol_ICMP              int64
protocol_TCP               int64
protocol_UDP               int64
traffic_DNS                int64
traffic_FTP                int64
traffic_HTTP               int64
Attack_Type_DDoS           int64
Attack_Type_Intrusion      int64
Attack_Type_Malware        int64
Severity_High              int64
Severity_L

In [315]:
### Let's look at our target variable and check whether it has any class imbalance
## value_counts() gives the number of rows for each distinct value of 'Malware Indicators'.
print(data['Malware Indicators'].value_counts())


Malware Indicators
1    20000
0    20000
Name: count, dtype: int64


### Let's check the mean, median and standard deviation of the features used in the dataset



In [316]:
### Split the columns into binary (0/1) and non-binary ones so each group can be analysed separately
def features_divider(dataset):
    """Partition the columns of ``dataset`` into binary and non-binary names.

    A column counts as binary only when it holds exactly two distinct values
    and both of them belong to {0, 1}. Every other column, including a
    constant column of all 0s or all 1s, is placed in the second list.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are to be classified.

    Returns
    -------
    tuple[list, list]
        ``(binary_features, continous_features)``, each in the original
        column order.
    """
    binary_features, continous_features = [], []
    for name in dataset.columns:
        distinct_values = np.unique(dataset[name])
        is_binary = len(distinct_values) == 2 and set(distinct_values).issubset({0, 1})
        # Route the column name to whichever list matches the verdict.
        (binary_features if is_binary else continous_features).append(name)
    return binary_features, continous_features

## Classify every column of `data` as binary (0/1) or non-binary, then report how many fall in each group.
binary_features, continous_features = features_divider(data)
print(f"The total binary features in the dataset are {len(binary_features)}")
print(f"The total continous features in the dataset are {len(continous_features)}")


The total binary features in the dataset are 21
The total continous features in the dataset are 15


In [317]:
### Let's evaluate the variance of the dataset's continuous (non-binary) features:
## Compute the per-column variance once, then tabulate it from largest to smallest.
feature_variances = data[continous_features].var()
continous_dataset = pd.DataFrame(
    {
        'columns': feature_variances.index,
        'variance_score': feature_variances.values,
    }
).sort_values(by='variance_score', ascending=False)
print(continous_dataset)

              columns  variance_score
1    Destination Port    3.450183e+08
0         Source Port    3.444894e+08
2       Packet Length    1.730928e+05
8            src_ip_2    5.511680e+03
14          dest_ip_4    5.485751e+03
9            src_ip_3    5.453238e+03
12          dest_ip_2    5.440258e+03
10           src_ip_4    5.437606e+03
13          dest_ip_3    5.406360e+03
7            src_ip_1    4.164699e+03
11          dest_ip_1    4.163768e+03
3      Anomaly Scores    8.325301e+02
5     hour_of_the_day    4.787424e+01
4   Month_of_received    1.110875e+01
6     day_of_the_week    4.018082e+00


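### Looking at the variance of the continuous features, we observe that all of the non-binary variables vary substantially across the dataset; none of them is constant or near-constant. Note that these variances are on very different scales (e.g. port numbers vs. day of the week), so they are not directly comparable, and a high variance alone does not guarantee that a feature is predictive of the target. It does indicate that these features are not degenerate and carry enough variation for the model to potentially learn meaningful patterns and make decisions.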

In [318]:
## Remove the eight per-octet IP columns (four for the source address, four for the destination address).
source_octet_columns = ['src_ip_1', 'src_ip_2', 'src_ip_3', 'src_ip_4']
destination_octet_columns = ['dest_ip_1', 'dest_ip_2', 'dest_ip_3', 'dest_ip_4']
columns_to_drop = source_octet_columns + destination_octet_columns
data = data.drop(columns=columns_to_drop)


### The raw IP addresses of both sender and receiver were not in a usable format, so each address was split into its four octets for a single purpose: determining whether the given IP address is private or public. Once that had been determined, the octet columns were removed so that the model does not overfit to specific addresses and generalizes to new, unseen data and real-life scenarios.

In [319]:
### For the binary features, let's check how balanced the two classes are.
## Each binary column lands in exactly one of four buckets, based on the share of 1s it contains.
no_variance_columns = []        # share of 1s is exactly 0 or exactly 1
extreme_imbalance_columns = []  # share of 1s below 1% or above 99%
semi_varied_data = []           # share of 1s below 5% or above 95%
has_variation = []              # share of 1s between 5% and 95%

row_count = len(data)
for feature in binary_features:
    # The column only holds 0s and 1s, so its sum is the number of 1s.
    share_of_ones = data[feature].sum() / row_count
    if share_of_ones in (0, 1):
        target_bucket = no_variance_columns
    elif share_of_ones > 0.99 or share_of_ones < 0.01:
        target_bucket = extreme_imbalance_columns
    elif share_of_ones > 0.95 or share_of_ones < 0.05:
        target_bucket = semi_varied_data
    else:
        target_bucket = has_variation
    target_bucket.append(feature)



### The 'Is_Private' column is retained despite its imbalanced distribution because it captures essential network context that may be relevant for malware detection. Even with few private IPs in this dataset, this feature could help identify unusual internal-external communication patterns.

In [320]:
## Replace each calendar/time column with a sin/cos pair computed over its cycle length.
## Each entry: (source column, cycle length, name of the sin column, name of the cos column)
cyclic_encodings = [
    ('hour_of_the_day', 24, 'hour_sin', 'hour_cos'),
    ('day_of_the_week', 7, 'sin_weekly_day', 'cos_weekly_day'),
    ('Month_of_received', 12, 'monthly_sin', 'monthly_cos'),
]

for source_column, cycle_length, sin_column, cos_column in cyclic_encodings:
    angle = (2 * np.pi * data[source_column]) / cycle_length
    data[sin_column] = np.sin(angle)
    data[cos_column] = np.cos(angle)

## The original columns are no longer needed once their sin/cos pairs exist.
data = data.drop(columns=[source_column for source_column, _, _, _ in cyclic_encodings])

### The Euclidean distance between hours expressed in the 24-hour format is misleading because, in linear space, the distance between the 23rd hour and the 0th hour is 23, whereas in reality they are just one hour apart.

### To solve this issue we have converted the hour of the day $h$ into the pair $(\sin(2\pi h/24), \cos(2\pi h/24))$. The sin-cos pair places the hours on a circle, so 23:00 and 00:00 become neighbours, and it enables models to learn periodic patterns.

### The same encoding has been applied to the weekly and monthly data as well.

In [321]:
### The next important feature is port-based attack signatures!

### Attackers target specific ports because they correspond to specific services. For example, port 21 is for the File Transfer Protocol and 80/443 are for HTTP/HTTPS. The port number therefore also tells us what type of service the malicious IP is targeting.

### Let's work out which port category each destination port falls in.

## Inclusive (low, high) bounds of each port category.
port_category_bounds = {
    'Well_Known_Ports': (0, 1023),
    'Registered_Ports': (1024, 49151),
    'Ephemeral_Ports': (49152, 65535),
}

destination_port = data['Destination Port']
for category_column, (lowest_port, highest_port) in port_category_bounds.items():
    # between() is inclusive on both ends, matching the >= / <= comparisons it replaces.
    data[category_column] = destination_port.between(lowest_port, highest_port).astype('int64')


data = data.drop(columns=['Well_Known_Ports'])


### Here we have further separated the Destination Port column into three categories:
### 1. Well-Known Ports (0-1023), such as FTP (21) and HTTP/HTTPS (80/443)
### 2. Registered Ports (1024-49151), used by registered applications
### 3. Ephemeral Ports (49152-65535), used for temporary (short-lived) connections

### We also dropped the Well_Known_Ports column because it has no variance.



In [322]:
### Let's check whether the destination ports appear in a list of ports commonly targeted by malware activity
## Mapping of port number -> label for that port.
most_targeted_malware_ports = {
    4444: 'meterpreter',
    31337: 'back_orifice',
    666: 'doom',
    1337: 'leet_hacking',
    12345: 'netbus',
    27374: 'subseven',
    54321: 'bo2k',
    12346: 'netbus_tcp',
    20034: 'netbus_pro',
    27444: 'trinoo',
    27665: 'trinoo',
    31335: 'trinoo',
    33270: 'trinoo',
    33568: 'trinoo',
    34555: 'trinoo',
    35555: 'trinoo',
    5000: 'upnp',
    5554: 'sasser',
    9996: 'sasser',
    9898: 'dabber',
    9988: 'spybot',
    17300: 'kuang2',
    30100: 'netsphere',
    60000: 'deepthroat',
    65000: 'devil',
}

## Flag rows whose destination port is one of the keys above (1 = listed, 0 = not listed).
destination_is_listed = data['Destination Port'].isin(list(most_targeted_malware_ports))
data['Is_Malware_Port'] = destination_is_listed.astype(int)
## Is_Malware_Port was observed to have extremely low variance



In [323]:
### Let's see at what time of the day the attack took place
### We divide the hours of the day into the following four windows:

## The hour is recovered from the (hour_sin, hour_cos) pair: atan2(sin, cos) is the angle on the
## 24-hour clock, so angle * 24 / (2*pi) gives the hour back.
## Thresholding sin/cos directly (the previous approach) selected roughly 03-06, 07-12, 15-18 and
## 21-23, which did not match the windows named below.
hour_angle = np.arctan2(data['hour_sin'], data['hour_cos']) % (2 * np.pi)   # angle in [0, 2*pi)
recovered_hour = np.rint(hour_angle * 24 / (2 * np.pi)) % 24                # rint absorbs float error; % 24 wraps 24 -> 0

## Each window is half-open: the lower hour is included, the upper hour is excluded.
data['is_morning'] = ((recovered_hour >= 9) & (recovered_hour < 12)).astype(int)      # 9AM-12PM
data['is_afternoon'] = ((recovered_hour >= 12) & (recovered_hour < 15)).astype(int)   # 12PM-3PM
data['is_evening'] = ((recovered_hour >= 18) & (recovered_hour < 21)).astype(int)     # 6PM-9PM
data['is_night'] = ((recovered_hour >= 0) & (recovered_hour < 3)).astype(int)         # 12AM-3AM




### The hours encoded by hour_sin and hour_cos have been further grouped into parts of the day, so that attacks carried out during a specific part of the day can be distinguished.

In [324]:
### Let's flag unusually small and unusually large packets based on the packet length

packet_length = data['Packet Length']
tiny_packet_limit = 128      # lengths strictly below this are flagged as tiny
large_packet_limit = 1200    # lengths at or above this are flagged as large

data['Tiny_Packet'] = (packet_length < tiny_packet_limit).astype(int)
data['Large_Packet'] = (packet_length >= large_packet_limit).astype(int)




In [325]:
### Our dataset is evenly balanced, which means that this dataset is well suited for model evaluation
## Save the engineered features to disk.
## NOTE(review): to_csv is called without index=False, so the row index is written as an extra unnamed
## first column - confirm that downstream readers expect (and drop) it.
data.to_csv('../Datasets/featured.csv')