In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [25]:
dfs = []

for dirname, _, filenames in os.walk('/home/wahba/Documents/cicid/cicids2017/original/csv/TrafficLabelling/'):
    for filename in filenames:
        if filename.endswith('.csv'):
            filepath = os.path.join(dirname, filename)
            df = pd.read_csv(filepath, encoding='latin1')
            dfs.append(df)

  df = pd.read_csv(filepath, encoding='latin1')


In [26]:
# Data dimensions of each individual dataset
for i, data in enumerate(dfs, start=1):
    rows, cols = data.shape
    print(f'df{i} -> {rows} rows, {cols} columns')



df1 -> 458968 rows, 85 columns
df2 -> 692703 rows, 85 columns
df3 -> 288602 rows, 85 columns
df4 -> 286467 rows, 85 columns
df5 -> 225745 rows, 85 columns
df6 -> 445909 rows, 85 columns
df7 -> 191033 rows, 85 columns
df8 -> 529918 rows, 85 columns


In [27]:
data = pd.concat(dfs, ignore_index=True)
for df in dfs:
    del df  # Free memory

In [28]:
# Removal of leading/trailing whitespace
col_names = {col: col.strip() for col in data.columns}
data.rename(columns = col_names, inplace = True)

In [29]:
data.columns

Index(['Flow ID', 'Source IP', 'Source Port', 'Destination IP',
       'Destination Port', 'Protocol', 'Timestamp', 'Flow Duration',
       'Total Fwd Packets', 'Total Backward Packets',
       'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
  

In [30]:
data['flow_key'] = (
    data['Source IP'].astype(str) + ':' +
    data['Source Port'].astype(str) + '-' +
    data['Destination IP'].astype(str) + ':' +
    data['Destination Port'].astype(str) + '-' +
    data['Protocol'].astype(str)
    )

In [31]:
# Mapping the attacks to the new group
group_mapping = {
    'BENIGN': 'Normal Traffic',
    'DoS Hulk': 'DoS',
    'DDoS': 'DDoS',
    'PortScan': 'Port Scanning',
    'DoS GoldenEye': 'DoS',
    'FTP-Patator': 'Brute Force',
    'DoS slowloris': 'DoS',
    'DoS Slowhttptest': 'DoS',
    'SSH-Patator': 'Brute Force',
    'Bot': 'Bots',
    'Web Attack � Brute Force': 'Web Attacks',
    'Web Attack � XSS': 'Web Attacks',
    'Infiltration': 'Infiltration',
    'Web Attack � Sql Injection': 'Web Attacks',
    'Heartbleed': 'Miscellaneous'
}

# Map to new group column
data['Attack Type'] = data['Label'].map(group_mapping)



In [32]:
data['Attack Type'].value_counts()

Attack Type
Normal Traffic    2273097
DoS                252661
Port Scanning      158930
DDoS               128027
Brute Force         13835
Bots                 1966
Infiltration           36
Miscellaneous          11
Name: count, dtype: int64

In [36]:
data['flow_key'] = (
    data['Source IP'].astype(str) + ':' +
    data['Source Port'].astype(str) + '-' +
    data['Destination IP'].astype(str) + ':' +
    data['Destination Port'].astype(str) + '-' +
    data['Protocol'].astype(str)
    )

## Check overview of IP Address involved in each Attack Type

In [41]:
attack_types = ['Normal Traffic', 'DoS', 'DDoS', 'Port Scanning', 'Brute Force', 'Bots', 'Web Attacks', 'Infiltration', 'Miscellaneous']

for attack_type in attack_types:
    print(attack_type)
    print(data[data['Attack Type'] == attack_type]['Source IP'].value_counts())

Normal Traffic
Source IP
192.168.10.3      298990
192.168.10.8      214973
192.168.10.5      169930
192.168.10.9      152310
192.168.10.12     146250
                   ...  
146.20.128.117         1
54.208.87.104          1
165.254.34.233         1
52.218.17.3            1
159.127.42.62          1
Name: count, Length: 17005, dtype: int64
DoS
Source IP
172.16.0.1    252661
Name: count, dtype: int64
DDoS
Source IP
172.16.0.1       128024
192.168.10.50         3
Name: count, dtype: int64
Port Scanning
Source IP
172.16.0.1    158930
Name: count, dtype: int64
Brute Force
Source IP
172.16.0.1    13835
Name: count, dtype: int64
Bots
Source IP
205.174.165.73    705
192.168.10.15     371
192.168.10.8      271
192.168.10.9      226
192.168.10.14     209
192.168.10.5      180
192.168.10.12       2
192.168.10.17       2
Name: count, dtype: int64
Web Attacks
Series([], Name: count, dtype: int64)
Infiltration
Source IP
192.168.10.8    36
Name: count, dtype: int64
Miscellaneous
Source IP
172.16.0.1 

## Check if normal traffic includes 172.16.0.1

In [None]:
print(len(data[(data['Attack Type'] == 'Normal Traffic') & (data['Source IP'] == '172.16.0.1')]))
print(len(data[(data['Attack Type'] == 'Normal Traffic') & (data['Destination IP'] == '172.16.0.1')]))

3606
48295


: 