In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import networkx as nx

# Notebook-wide pandas display settings, applied in one pass.
_DISPLAY_OPTIONS = {
    'display.max_rows': None,             # no row limit when rendering
    'display.max_columns': None,          # no column limit when rendering
    'display.width': None,                # let pandas pick the output width
    'display.colheader_justify': 'left',  # left-justify column headers
    'display.max_colwidth': None,         # never truncate cell contents
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

  from pandas.core import (


In [37]:
# Locations of the raw input CSVs. Each path is written as directory + file name
# (adjacent literals are joined by Python at compile time), and every character
# is kept exactly as it is on disk, including the trailing space in the
# "TrafficLabelling " directory name.
ddos2018_csv = (
    "/Users/emmalim/Desktop/CIC-IDS2018/Processed Traffic Data for ML Algorithms/"
    "Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv"
)
ddos2017_csv = (
    "/Users/emmalim/Desktop/TrafficLabelling /"
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
)
dos2017_csv = (
    "/Users/emmalim/Desktop/TrafficLabelling /"
    "Wednesday-workingHours.pcap_ISCX.csv"
)
botnet2017_csv = (
    "/Users/emmalim/Desktop/TrafficLabelling /"
    "Friday-WorkingHours-Morning.pcap_ISCX.csv"
)

In [40]:
# Read every raw CSV with pandas' default settings; the frames are unpacked in
# the same order as the path tuple.
ddos2018_df, ddos2017_df, dos2017_df, botnet2017_df = [
    pd.read_csv(_csv_path)
    for _csv_path in (ddos2018_csv, ddos2017_csv, dos2017_csv, botnet2017_csv)
]

## Data Cleaning

In [41]:
# Trim leading/trailing whitespace from the column names of all four raw frames.
for _raw_frame in (ddos2017_df, ddos2018_df, dos2017_df, botnet2017_df):
    _raw_frame.columns = _raw_frame.columns.str.strip()

In [42]:
# Remove "Fwd Header Length.1" from the three 2017 frames.
# NOTE(review): the ".1" suffix looks like pandas' renaming of a repeated CSV
# header - confirm against the raw files.
# errors="ignore" turns the drop into a no-op for a frame without the column.
_REDUNDANT_COLUMN = "Fwd Header Length.1"
ddos2017_df, dos2017_df, botnet2017_df = (
    _frame.drop(columns=[_REDUNDANT_COLUMN], errors="ignore")
    for _frame in (ddos2017_df, dos2017_df, botnet2017_df)
)

In [27]:
# Translation table from the abbreviated column names of the 2018 frame to the
# longer names the rest of the notebook uses. Columns absent from the table are
# left untouched by the rename below.
column_mapping = {
    # --- flow endpoints ---
    "Src IP": "Source IP",
    "Src Port": "Source Port",
    "Dst IP": "Destination IP",
    "Dst Port": "Destination Port",

    # --- packet counts and byte totals ---
    "Tot Fwd Pkts": "Total Fwd Packets",
    "Tot Bwd Pkts": "Total Backward Packets",
    "TotLen Fwd Pkts": "Total Length of Fwd Packets",
    "TotLen Bwd Pkts": "Total Length of Bwd Packets",

    # --- per-direction packet length statistics ---
    "Fwd Pkt Len Max": "Fwd Packet Length Max",
    "Fwd Pkt Len Min": "Fwd Packet Length Min",
    "Fwd Pkt Len Mean": "Fwd Packet Length Mean",
    "Fwd Pkt Len Std": "Fwd Packet Length Std",
    "Bwd Pkt Len Max": "Bwd Packet Length Max",
    "Bwd Pkt Len Min": "Bwd Packet Length Min",
    "Bwd Pkt Len Mean": "Bwd Packet Length Mean",
    "Bwd Pkt Len Std": "Bwd Packet Length Std",

    # --- rates, inter-arrival totals, header lengths ---
    "Flow Byts/s": "Flow Bytes/s",
    "Flow Pkts/s": "Flow Packets/s",
    "Fwd IAT Tot": "Fwd IAT Total",
    "Bwd IAT Tot": "Bwd IAT Total",
    "Fwd Header Len": "Fwd Header Length",
    "Bwd Header Len": "Bwd Header Length",
    "Fwd Pkts/s": "Fwd Packets/s",
    "Bwd Pkts/s": "Bwd Packets/s",

    # --- overall packet length statistics ---
    "Pkt Len Min": "Min Packet Length",
    "Pkt Len Max": "Max Packet Length",
    "Pkt Len Mean": "Packet Length Mean",
    "Pkt Len Std": "Packet Length Std",
    "Pkt Len Var": "Packet Length Variance",

    # --- TCP flag counters ---
    "FIN Flag Cnt": "FIN Flag Count",
    "SYN Flag Cnt": "SYN Flag Count",
    "RST Flag Cnt": "RST Flag Count",
    "PSH Flag Cnt": "PSH Flag Count",
    "ACK Flag Cnt": "ACK Flag Count",
    "URG Flag Cnt": "URG Flag Count",
    "ECE Flag Cnt": "ECE Flag Count",
    "CWE Flag Count": "CWE Flag Count",  # maps to itself, so the rename leaves it as is

    # --- averages, window sizes, segment sizes ---
    "Pkt Size Avg": "Average Packet Size",
    "Fwd Seg Size Avg": "Avg Fwd Segment Size",
    "Bwd Seg Size Avg": "Avg Bwd Segment Size",
    "Init Fwd Win Byts": "Init_Win_bytes_forward",
    "Init Bwd Win Byts": "Init_Win_bytes_backward",
    "Fwd Act Data Pkts": "act_data_pkt_fwd",
    "Fwd Seg Size Min": "min_seg_size_forward",

    # --- bulk statistics ---
    "Fwd Byts/b Avg": "Fwd Avg Bytes/Bulk",
    "Fwd Pkts/b Avg": "Fwd Avg Packets/Bulk",
    "Fwd Blk Rate Avg": "Fwd Avg Bulk Rate",
    "Bwd Byts/b Avg": "Bwd Avg Bytes/Bulk",
    "Bwd Pkts/b Avg": "Bwd Avg Packets/Bulk",
    "Bwd Blk Rate Avg": "Bwd Avg Bulk Rate",

    # --- subflow statistics ---
    "Subflow Fwd Pkts": "Subflow Fwd Packets",
    "Subflow Fwd Byts": "Subflow Fwd Bytes",
    "Subflow Bwd Pkts": "Subflow Bwd Packets",
    "Subflow Bwd Byts": "Subflow Bwd Bytes",
}

# Relabel the 2018 frame's columns in place.
ddos2018_df.rename(mapper=column_mapping, axis="columns", inplace=True)


In [None]:
# Parse the timestamps once, then store hour and minute as helper columns
# on the 2018 frame (vectorized via the .dt accessor).
_parsed_timestamps = pd.to_datetime(ddos2018_df['Timestamp'])
ddos2018_df['Timestamp'] = _parsed_timestamps
ddos2018_df['hour'] = _parsed_timestamps.dt.hour
ddos2018_df['minute'] = _parsed_timestamps.dt.minute

# Bounds of the time window, compared at minute resolution (both ends inclusive)
start_hour, start_minute = 10, 0
end_hour, end_minute = 11, 30

# Keep rows at or after 10:00. Comparing minutes since midnight is the same test
# as "later hour, or same hour and minute >= start", because minute is 0-59.
filtered_df = ddos2018_df.loc[
    lambda frame: frame['hour'] * 60 + frame['minute'] >= start_hour * 60 + start_minute
]

# Of those, keep rows at or before 11:30.
filtered_ddos2018_df = filtered_df.loc[
    lambda frame: frame['hour'] * 60 + frame['minute'] <= end_hour * 60 + end_minute
]

## Save to CSV

In [None]:
# Save the time-window-filtered 2018 dataframe (row index is not written)
filtered_ddos2018_df.to_csv('/Users/emmalim/Desktop/DSA4263/ddos2018_cleaned.csv', index=False)

In [52]:
# Write each cleaned 2017 frame to its own CSV, without the row index.
for _cleaned_frame, _destination in (
    (ddos2017_df, '/Users/emmalim/Desktop/DSA4263/ddos2017_cleaned.csv'),
    (dos2017_df, '/Users/emmalim/Desktop/DSA4263/dos2017_cleaned.csv'),
    (botnet2017_df, '/Users/emmalim/Desktop/DSA4263/botnet2017_cleaned.csv'),
):
    _cleaned_frame.to_csv(_destination, index=False)

In [54]:
# Row count per Label value in the 2017 DDoS frame
_ddos2017_label_counts = ddos2017_df['Label'].value_counts()
print(_ddos2017_label_counts)

Label
DDoS      128027
BENIGN     97718
Name: count, dtype: int64


## Merge datasets

In [80]:
# Reload the four cleaned CSVs; frames are unpacked in the order the paths are listed.
(
    ddos2018_df_cleaned,
    ddos2017_df_cleaned,
    dos2017_df_cleaned,
    botnet2017_df_cleaned,
) = map(
    pd.read_csv,
    (
        '/Users/emmalim/Desktop/DSA4263/ddos2018_cleaned.csv',
        '/Users/emmalim/Desktop/DSA4263/ddos2017_cleaned.csv',
        '/Users/emmalim/Desktop/DSA4263/dos2017_cleaned.csv',
        '/Users/emmalim/Desktop/DSA4263/botnet2017_cleaned.csv',
    ),
)

In [81]:
# Print the Label distribution of every cleaned frame, with a dashed rule
# between consecutive reports (none before the first or after the last).
_label_reports = (
    ("2018 DDoS: ", ddos2018_df_cleaned),
    ("2017 DDoS: ", ddos2017_df_cleaned),
    ("2017 DoS: ", dos2017_df_cleaned),
    ("2017 BotNet: ", botnet2017_df_cleaned),
)
for _position, (_heading, _frame) in enumerate(_label_reports):
    if _position:
        print("------------------")
    print(_heading)
    print(_frame['Label'].value_counts())

2018 DDoS: 
Label
Benign                    1343290
DDoS attacks-LOIC-HTTP     575394
Name: count, dtype: int64
------------------
2017 DDoS: 
Label
DDoS      128027
BENIGN     97718
Name: count, dtype: int64
------------------
2017 DoS: 
Label
BENIGN              440031
DoS Hulk            231073
DoS GoldenEye        10293
DoS slowloris         5796
DoS Slowhttptest      5499
Heartbleed              11
Name: count, dtype: int64
------------------
2017 BotNet: 
Label
BENIGN    189067
Bot         1966
Name: count, dtype: int64


In [82]:

# The 2018 frame keeps all of its rows; only the 'hour' and 'minute' helper
# columns are removed from it. errors="ignore" makes this cell safe to re-run:
# without it, a second execution raises KeyError because the columns are
# already gone from the rebound frame.
ddos2018_df_cleaned = ddos2018_df_cleaned.drop(columns=['hour', 'minute'], errors="ignore")

# From each 2017 frame keep only the rows whose Label, after stripping
# surrounding whitespace, is not exactly "BENIGN" (the comparison is case-sensitive).
attack_ddos_2017 = ddos2017_df_cleaned[ddos2017_df_cleaned["Label"].str.strip() != "BENIGN"]
attack_dos_2017 = dos2017_df_cleaned[dos2017_df_cleaned["Label"].str.strip() != "BENIGN"]
attack_botnet_2017 = botnet2017_df_cleaned[botnet2017_df_cleaned["Label"].str.strip() != "BENIGN"]

In [83]:
# Display the Label counts that remain in the botnet attack subset
attack_botnet_2017.loc[:, 'Label'].value_counts()

Label
Bot    1966
Name: count, dtype: int64

In [84]:
# Stack the 2018 frame and the three 2017 attack-only frames row-wise.
# ignore_index=True gives the result a fresh 0..n-1 index instead of keeping
# each source frame's own index values.
merged_df = pd.concat(
    [ddos2018_df_cleaned, attack_ddos_2017, attack_dos_2017, attack_botnet_2017],
    axis=0,
    ignore_index=True
)

# Label distribution of the merged frame
print(merged_df['Label'].value_counts())

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
merged_df.info()

Label
Benign                    1343290
DDoS attacks-LOIC-HTTP     575394
DoS Hulk                   231073
DDoS                       128027
DoS GoldenEye               10293
DoS slowloris                5796
DoS Slowhttptest             5499
Bot                          1966
Heartbleed                     11
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2301349 entries, 0 to 2301348
Data columns (total 84 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   Flow ID                      object 
 1   Source IP                    object 
 2   Source Port                  int64  
 3   Destination IP               object 
 4   Destination Port             int64  
 5   Protocol                     int64  
 6   Timestamp                    object 
 7   Flow Duration                int64  
 8   Total Fwd Packets            int64  
 9   Total Backward Packets       int64  
 10  Total Length of Fwd Packets  float64
 11

In [85]:
# Write the merged frame to disk without the row index
merged_df.to_csv(
    path_or_buf='/Users/emmalim/Desktop/DSA4263/final_data.csv',
    index=False,
)