In [1]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

In [2]:
# get data file names
path =r'dataset'
filenames = glob.glob(path + "\*.csv")

dfs = []
for filename in filenames:
    dfs.append(pd.read_csv(filename))

# Concatenate all data into one DataFrame
data_df = pd.concat(dfs)
data_df.columns

Index(['ACK Flag Cnt', 'Active Max', 'Active Mean', 'Active Min', 'Active Std',
       'Bwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Header Len', 'Bwd IAT Max',
       'Bwd IAT Mean', 'Bwd IAT Min', 'Bwd IAT Std', 'Bwd IAT Tot',
       'Bwd PSH Flags', 'Bwd Pkt Len Max', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Min', 'Bwd Pkt Len Std', 'Bwd Pkts/b Avg', 'Bwd Pkts/s',
       'Bwd Seg Size Avg', 'Bwd URG Flags', 'CWE Flag Count', 'Down/Up Ratio',
       'Dst IP', 'Dst Port', 'ECE Flag Cnt', 'FIN Flag Cnt', 'Flow Byts/s',
       'Flow Duration', 'Flow IAT Max', 'Flow IAT Mean', 'Flow IAT Min',
       'Flow IAT Std', 'Flow ID', 'Flow Pkts/s', 'Fwd Act Data Pkts',
       'Fwd Blk Rate Avg', 'Fwd Byts/b Avg', 'Fwd Header Len', 'Fwd IAT Max',
       'Fwd IAT Mean', 'Fwd IAT Min', 'Fwd IAT Std', 'Fwd IAT Tot',
       'Fwd PSH Flags', 'Fwd Pkt Len Max', 'Fwd Pkt Len Mean',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Std', 'Fwd Pkts/b Avg', 'Fwd Pkts/s',
       'Fwd Seg Size Avg', 'Fwd Seg Size Min', 'Fw

In [3]:
data_df = data_df.drop(['Timestamp', 'Flow ID', 'Src IP', 'Dst IP', 'Protocol', 'Src Port'], axis=1)
data_df.head()
data_df['Label'].value_counts()

Benign                      13484708
DDOS attack-HOIC              686012
DDoS attacks-LOIC-HTTP        576191
DoS attacks-Hulk              461912
Bot                           286191
FTP-BruteForce                193360
SSH-Bruteforce                187589
Infilteration                 161934
DoS attacks-SlowHTTPTest      139890
DoS attacks-GoldenEye          41508
DoS attacks-Slowloris          10990
DDOS attack-LOIC-UDP            1730
Brute Force -Web                 611
Brute Force -XSS                 230
SQL Injection                     87
Label                             59
Name: Label, dtype: int64

In [4]:
le = preprocessing.LabelEncoder()
data_df['Label'] = le.fit_transform(data_df['Label'].values)
data_df['Label'].value_counts()

0     13484708
4       686012
6       576191
8       461912
1       286191
11      193360
15      187589
12      161934
9       139890
7        41508
10       10990
5         1730
2          611
3          230
14          87
13          59
Name: Label, dtype: int64

In [5]:
# We still have 2 features that are still "text" with the word "Infinity"
data_df = data_df.replace('Infinity', 'inf')
for column in data_df.columns:
    data_df[column] = pd.to_numeric(data_df[column], errors='coerce')
#data_df['Flow Pkts/s'] = pd.to_numeric(data_df['Flow Pkts/s'], errors='coerce')
data_df.dtypes

ACK Flag Cnt         float64
Active Max           float64
Active Mean          float64
Active Min           float64
Active Std           float64
Bwd Blk Rate Avg     float64
Bwd Byts/b Avg       float64
Bwd Header Len       float64
Bwd IAT Max          float64
Bwd IAT Mean         float64
Bwd IAT Min          float64
Bwd IAT Std          float64
Bwd IAT Tot          float64
Bwd PSH Flags        float64
Bwd Pkt Len Max      float64
Bwd Pkt Len Mean     float64
Bwd Pkt Len Min      float64
Bwd Pkt Len Std      float64
Bwd Pkts/b Avg       float64
Bwd Pkts/s           float64
Bwd Seg Size Avg     float64
Bwd URG Flags        float64
CWE Flag Count       float64
Down/Up Ratio        float64
Dst Port             float64
ECE Flag Cnt         float64
FIN Flag Cnt         float64
Flow Byts/s          float64
Flow Duration        float64
Flow IAT Max         float64
                      ...   
Fwd Pkts/b Avg       float64
Fwd Pkts/s           float64
Fwd Seg Size Avg     float64
Fwd Seg Size M

In [6]:
data_df = data_df[~data_df.isin([np.nan, np.inf, -np.inf]).any(1)]
data_df = data_df.fillna(0)
print(data_df['Flow Pkts/s'].loc[(~np.isfinite(data_df['Flow Pkts/s'])) & data_df['Flow Pkts/s'].notnull()])
print(data_df['Flow Byts/s'].loc[(~np.isfinite(data_df['Flow Byts/s'])) & data_df['Flow Byts/s'].notnull()])
print(data_df['Label'].value_counts())

Series([], Name: Flow Pkts/s, dtype: float64)
Series([], Name: Flow Byts/s, dtype: float64)
0     13390249
4       686012
6       576191
8       461912
1       286191
11      193354
15      187589
12      160639
9       139890
7        41508
10       10990
5         1730
2          611
3          230
14          87
13          59
Name: Label, dtype: int64


In [7]:
data_df.head()
data_df.to_pickle('processed_data_df.pkl')