In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Run everything relative to the project root so the relative "data/..." and
# "output/..." paths used by the read/write cells resolve.
# The CYBER_SECURITY_PROJECT_DIR environment variable overrides the location;
# when it is unset, the original author's directory is used unchanged.
PROJECT_DIR = os.environ.get(
    "CYBER_SECURITY_PROJECT_DIR",
    r"D:\Analytixlabs\Internship\Project 4\Cyber Security",
)
os.chdir(PROJECT_DIR)

## Concatenating the datasets

### Importing the datasets

In [3]:
# Load the eight per-session capture CSVs (Monday through Friday afternoon).
# Paths are relative to the current working directory; each frame is bound to
# its own name, in the same order as the path list.
(
    Monday_WorkingHours,
    Tuesday_WorkingHours,
    Wednesday_WorkingHours,
    Thursday_WorkingHours_Morning,
    Thursday_WorkingHours_Afternoon,
    Friday_WorkingHours_Morning,
    Friday_WorkingHours_Afternoon_1,
    Friday_WorkingHours_Afternoon_2,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/Monday-WorkingHours.csv",
        "data/Tuesday-WorkingHours.csv",
        "data/Wednesday-WorkingHours.csv",
        "data/Thursday-WorkingHours-Morning.csv",
        "data/Thursday-WorkingHours-Afternoon.csv",
        "data/Friday-WorkingHours-Morning.csv",
        "data/Friday-WorkingHours-Afternoon-1.csv",
        "data/Friday-WorkingHours-Afternoon-2.csv",
    )
)

In [4]:
# Stack the eight per-session frames row-wise into a single frame.
# pd.concat already returns a new object, so no trailing .copy() is needed (it
# would only duplicate the whole frame in memory).  ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each source file's row labels.
network_data = pd.concat(
    [
        Monday_WorkingHours,
        Tuesday_WorkingHours,
        Wednesday_WorkingHours,
        Thursday_WorkingHours_Morning,
        Thursday_WorkingHours_Afternoon,
        Friday_WorkingHours_Morning,
        Friday_WorkingHours_Afternoon_1,
        Friday_WorkingHours_Afternoon_2,
    ],
    axis=0,
    ignore_index=True,
)

In [5]:
# Sanity check: (rows, columns) of the combined frame.
network_data.shape

(2830743, 79)

## Correcting the column names

In [6]:
# Normalise the headers in one pass: trim surrounding whitespace, turn inner
# spaces into underscores, then lower-case everything.
network_data.columns = (
    network_data.columns
    .str.strip()
    .str.replace(" ", "_")
    .str.lower()
)

In [7]:
# Old -> new names for the headers that still contain "/" or a ".1" suffix after
# the basic normalisation.  Insertion order is: per-second rates, the two
# one-off columns, then the per-bulk averages.
# Note that 'fwd_header_length.1' is mapped onto the already existing name
# 'fwd_header_length', so the rename leaves two columns with that name.
cols_to_rename = {
    # "<stem>/s" -> "<stem>_per_sec"
    **{
        f"{stem}/s": f"{stem}_per_sec"
        for stem in ("flow_bytes", "flow_packets", "fwd_packets", "bwd_packets")
    },
    "down/up_ratio": "down_up_ratio",
    "fwd_header_length.1": "fwd_header_length",
    # "<dir>_avg_<unit>/bulk" -> "<dir>_avg_<unit>_per_bulk"
    **{
        f"{direction}_avg_{unit}/bulk": f"{direction}_avg_{unit}_per_bulk"
        for direction in ("fwd", "bwd")
        for unit in ("bytes", "packets")
    },
}

In [8]:
# Apply the explicit old -> new header mapping; columns not listed keep their name.
network_data = network_data.rename(columns=cols_to_rename)

In [9]:
# Inspect the column names after normalisation and renaming.
network_data.columns

Index(['destination_port', 'flow_duration', 'total_fwd_packets',
       'total_backward_packets', 'total_length_of_fwd_packets',
       'total_length_of_bwd_packets', 'fwd_packet_length_max',
       'fwd_packet_length_min', 'fwd_packet_length_mean',
       'fwd_packet_length_std', 'bwd_packet_length_max',
       'bwd_packet_length_min', 'bwd_packet_length_mean',
       'bwd_packet_length_std', 'flow_bytes_per_sec', 'flow_packets_per_sec',
       'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min',
       'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max',
       'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 'bwd_iat_std',
       'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags',
       'fwd_urg_flags', 'bwd_urg_flags', 'fwd_header_length',
       'bwd_header_length', 'fwd_packets_per_sec', 'bwd_packets_per_sec',
       'min_packet_length', 'max_packet_length', 'packet_length_mean',
       'packet_length_std', 'packet_length_variance', 'fin_flag_cou

## Correcting the labels and creating the binary and multiclass labels

In [10]:
# List the distinct raw values of the label column before they are normalised.
network_data['label'].unique()

array(['BENIGN', 'FTP-Patator', 'SSH-Patator', 'DoS slowloris',
       'DoS Slowhttptest', 'DoS Hulk', 'DoS GoldenEye', 'Heartbleed',
       'Web Attack � Brute Force', 'Web Attack � XSS',
       'Web Attack � Sql Injection', 'Infiltration', 'Bot', 'DDoS',
       'PortScan'], dtype=object)

In [11]:
# Raw label -> snake_case label, kept side by side so a pair cannot drift apart.
# NOTE(review): the three 'Web Attack' keys contain U+FFFD (the replacement
# character), matching the values shown by the label listing; keep them
# byte-identical or the comparisons below will silently stop matching.
label_renames = {
    'BENIGN': 'benign',
    'FTP-Patator': 'ftp_patator',
    'SSH-Patator': 'ssh_patator',
    'DoS slowloris': 'dos_slow_loris',
    'DoS Slowhttptest': 'dos_slow_http_test',
    'DoS Hulk': 'dos_hulk',
    'DoS GoldenEye': 'dos_golden_eye',
    'Heartbleed': 'heartbleed',
    'Web Attack � Brute Force': 'web_attack_brute_force',
    'Web Attack � XSS': 'web_attack_xss',
    'Web Attack � Sql Injection': 'web_attack_sql_injection',
    'Infiltration': 'infiltration',
    'Bot': 'bot',
    'DDoS': 'ddos',
    'PortScan': 'portscan',
}

# One boolean mask per raw label, in mapping order ...
conditions = [network_data['label'] == raw_label for raw_label in label_renames]

# ... and the replacement value for each mask, in the same order.
choices = list(label_renames.values())

In [12]:
# Swap each raw label for its snake_case name; a value matching none of the
# conditions keeps its current text through `default`.
network_data['label'] = np.select(conditions, choices, default=network_data['label'])

## Binary Labels

In [13]:
# Binary target: 0 where the label is 'benign', 1 for every other value.
network_data['attack'] = np.where(network_data['label'] == 'benign', 0, 1)

## Multiclass Labels

In [14]:
# Coarse attack family -> the fine-grained labels it absorbs.  Labels that are
# not listed in any family are not covered by these conditions.
attack_families = {
    'benign': ['benign'],
    'brute_force': ['ftp_patator', 'ssh_patator', 'web_attack_brute_force'],
    'dos_ddos': ['dos_slow_loris', 'dos_slow_http_test', 'dos_hulk', 'dos_golden_eye', 'ddos'],
    'web_attack': ['web_attack_xss', 'web_attack_sql_injection'],
}

# One membership mask per family, in mapping order ...
conditions_1 = [
    network_data['label'].isin(members)
    for members in attack_families.values()
]

# ... and the family name assigned where the corresponding mask is True.
choices_1 = list(attack_families)

In [15]:
# Multiclass target: the family name where a family condition matches,
# otherwise the row's own label is carried over through `default`.
network_data['attack_type'] = np.select(conditions_1, choices_1, default=network_data['label'])

## Removing the duplicate column

In [16]:
# When a column name occurs more than once, keep only its last occurrence.
is_last_occurrence = ~network_data.columns.duplicated(keep="last")
network_data = network_data.loc[:, is_last_occurrence]

## Handling null, duplicated, and infinite value entries

In [17]:
# Total number of missing cells across the whole frame.
network_data.isna().sum().sum()

1358

In [20]:
# Missing-value count per column, largest first.
null_counts = network_data.isna().sum()
pd.DataFrame(null_counts).sort_values(by=0, ascending=False)

Unnamed: 0,0
flow_bytes_per_sec,1358
destination_port,0
bwd_avg_bulk_rate,0
bwd_avg_bytes_per_bulk,0
fwd_avg_bulk_rate,0
...,...
bwd_iat_mean,0
bwd_iat_total,0
fwd_iat_min,0
fwd_iat_max,0


In [22]:
# Number of rows that repeat an earlier row across all columns.
network_data.duplicated().sum()

308381

In [19]:
# Total number of cells holding +inf or -inf across the whole frame.
network_data.isin([np.inf, -np.inf]).sum().sum()

4376

In [26]:
# Count of +inf / -inf cells per column, largest first.
inf_counts = network_data.isin([np.inf, -np.inf]).sum()
pd.DataFrame(inf_counts).sort_values(by=0, ascending=False)

Unnamed: 0,0
flow_packets_per_sec,2867
flow_bytes_per_sec,1509
destination_port,0
avg_fwd_segment_size,0
bwd_avg_bytes_per_bulk,0
...,...
bwd_iat_std,0
bwd_iat_mean,0
bwd_iat_total,0
fwd_iat_min,0


### Replace infinite values with null

In [30]:
# Turn +inf / -inf into NaN so they are counted as missing values from here on.
network_data = network_data.replace([np.inf, -np.inf], np.nan)

### Drop the duplicates

In [32]:
# Keep the first occurrence of each fully identical row and discard the repeats.
network_data = network_data.drop_duplicates()

In [33]:
# (rows, columns) after the duplicate rows have been dropped.
network_data.shape

(2522362, 80)

In [34]:
# Renumber the rows 0..n-1; the previous index is discarded, not kept as a column.
network_data = network_data.reset_index(drop=True)

## Saving the final dataset as csv and parquet files

In [63]:
# Persist the cleaned frame as CSV.  The target directory is created first so a
# run from a fresh checkout does not fail on a missing "output" folder.
# NOTE(review): the index is written as an unnamed first column (to_csv default);
# pass index=False if downstream readers do not expect it — confirm before changing.
os.makedirs('output', exist_ok=True)
network_data.to_csv(r'output/network_data.csv')

In [64]:
# Persist the same frame in Parquet format under output/.
network_data.to_parquet(r'output/network_data.parquet')