In [11]:
import os
import json
import numpy as np
import pandas as pd
from pprint import pprint
from itertools import combinations, product

#sklearn
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, MinMaxScaler
from sklearn.metrics import precision_recall_curve, auc, roc_curve, recall_score, precision_score, f1_score

#graph
import seaborn as sns
import matplotlib.pyplot as plt

In [12]:
# Directory containing the weekday-named CSV files, and the files to load.
dataset_csv_path = './'
csv_file_names = [
    'friday.csv',
    'monday.csv',
    'thursday.csv',
    'tuesday.csv',
    'wednesday.csv',
]

# Resolve every file name against the dataset directory.
complete_paths = [os.path.join(dataset_csv_path, file_name)
                  for file_name in csv_file_names]

# Read the files one after another and stack them into a single frame;
# ignore_index gives the result a fresh 0..n-1 row index.
df = pd.concat((pd.read_csv(csv_path) for csv_path in complete_paths),
               ignore_index = True)

FileNotFoundError: [Errno 2] No such file or directory: './friday.csv'

In [None]:
def clean_df(df):
    """Clean a DataFrame in place and return it.

    Steps, in order:
      1. Strip surrounding whitespace from the column names.
      2. Clamp negative values in numeric columns to 0.
      3. Drop columns that hold a single distinct value.
      4. Replace +/-inf with NaN and drop every row containing a NaN.
      5. Drop duplicate rows.
      6. For every pair of columns with identical contents, drop the later one.

    A progress line is printed after each stage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place; the row index is not reset
        after rows are removed.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Remove the space before each feature names
    df.columns = df.columns.str.strip()
    print('dataset shape', df.shape)

    # This set of feature should have >= 0 values.
    # Clamped columns are written back by name: mutating the object returned
    # by the private `_get_numeric_data()` only reaches `df` when pandas
    # hands out a view, which is not the case under Copy-on-Write.
    # Because this runs before the inf handling below, -inf becomes 0 here
    # and is therefore never turned into NaN.
    for col, dtype in df.dtypes.items():
        if not pd.api.types.is_numeric_dtype(dtype):
            continue
        if pd.api.types.is_bool_dtype(dtype):
            # Booleans can never be negative.
            continue
        if (df[col] < 0).any():
            df[col] = df[col].clip(lower = 0)

    # NaN counts as a value here, so an all-NaN column is also dropped.
    zero_variance_cols = [col for col in df.columns
                          if df[col].nunique(dropna = False) == 1]
    df.drop(zero_variance_cols, axis = 1, inplace = True)
    print('zero variance columns', zero_variance_cols, 'dropped')
    print('shape after removing zero variance columns:', df.shape)

    df.replace([np.inf, -np.inf], np.nan, inplace = True)
    print(df.isna().any(axis = 1).sum(), 'rows dropped')
    df.dropna(inplace = True)
    print('shape after removing nan:', df.shape)

    # Drop duplicate rows
    df.drop_duplicates(inplace = True)
    print('shape after dropping duplicates:', df.shape)

    # Compare every pair of columns; the first column of a pair is kept.
    column_pairs = [(i, j) for i, j in combinations(df.columns, 2)
                    if df[i].equals(df[j])]
    # A column can be the later member of several pairs; list it only once.
    ide_cols = list(dict.fromkeys(later for _, later in column_pairs))
    df.drop(ide_cols, axis = 1, inplace = True)
    print('columns which have identical values', column_pairs, 'dropped')
    print('shape after removing identical value columns:', df.shape)
    return df
# Clean the concatenated frame. clean_df edits `df` in place and returns the
# same object, so this rebinding does not create a second frame.
df = clean_df(df)

dataset shape (2099976, 91)
zero variance columns [] dropped
shape after removing zero variance columns: (2099976, 91)
5 rows dropped
shape after removing nan: (2099971, 91)
shape after dropping duplicates: (2099971, 91)
columns which have identical values [] dropped
shape after removing identical value columns: (2099971, 91)


In [None]:
# Columns removed from the cleaned frame when building improved_df.
dropping_cols = ['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 
                 'Dst Port', 'Timestamp']
# clean_df returns the very frame it was given, so an in-place drop on its
# result would also strip these columns from `df` and make this cell raise a
# KeyError when it is re-run. `drop(columns=...)` builds a new frame instead,
# leaving `df` intact and the cell safe to execute repeatedly.
improved_df = clean_df(df).drop(columns = dropping_cols)
improved_df['Label'].value_counts()

dataset shape (2099971, 91)
zero variance columns [] dropped
shape after removing zero variance columns: (2099971, 91)
0 rows dropped
shape after removing nan: (2099971, 91)
shape after dropping duplicates: (2099971, 91)
columns which have identical values [] dropped
shape after removing identical value columns: (2099971, 91)


Label
BENIGN                                    1582561
Portscan                                   159066
DoS Hulk                                   158468
DDoS                                        95144
Infiltration - Portscan                     71767
DoS GoldenEye                                7567
Botnet - Attempted                           4067
FTP-Patator                                  3972
DoS Slowloris                                3859
DoS Slowhttptest - Attempted                 3368
SSH-Patator                                  2961
DoS Slowloris - Attempted                    1847
DoS Slowhttptest                             1740
Web Attack - Brute Force - Attempted         1292
Botnet                                        736
Web Attack - XSS - Attempted                  655
DoS Hulk - Attempted                          581
DoS GoldenEye - Attempted                      80
Web Attack - Brute Force                       73
Infiltration - Attempted                    

In [None]:
# Number of rows per value of the 'Label' column, most frequent first.
improved_df['Label'].value_counts()

Label
BENIGN                                    1582561
Portscan                                   159066
DoS Hulk                                   158468
DDoS                                        95144
Infiltration - Portscan                     71767
DoS GoldenEye                                7567
Botnet - Attempted                           4067
FTP-Patator                                  3972
DoS Slowloris                                3859
DoS Slowhttptest - Attempted                 3368
SSH-Patator                                  2961
DoS Slowloris - Attempted                    1847
DoS Slowhttptest                             1740
Web Attack - Brute Force - Attempted         1292
Botnet                                        736
Web Attack - XSS - Attempted                  655
DoS Hulk - Attempted                          581
DoS GoldenEye - Attempted                      80
Web Attack - Brute Force                       73
Infiltration - Attempted                    

In [None]:
# Number of rows per value of the 'Attempted Category' column.
improved_df['Attempted Category'].value_counts()

Attempted Category
0    2092837
1       4067
6       2804
5        138
4         75
3         27
2         23
Name: count, dtype: int64

In [None]:
# Display the frame; the notebook renders a truncated preview of its first
# and last rows and columns rather than the full table.
improved_df

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,ICMP Code,ICMP Type,Total TCP Flow Time,Label,Attempted Category
0,6,112740690,32,16,6448,1152,403,0,201.500000,204.724205,...,343,1.610540e+07,4.988048e+05,16399772,15375229,0,0,112740690,BENIGN,0
1,6,112740560,32,16,6448,5056,403,0,201.500000,204.724205,...,285,1.610543e+07,4.987937e+05,16399782,15375263,0,0,112740560,BENIGN,0
2,0,113757377,545,0,0,0,0,0,0.000000,0.000000,...,19,1.221036e+07,6.935824e+06,20757030,5504997,0,0,0,BENIGN,0
3,17,91997219,388,0,37151,0,227,37,95.750000,55.785320,...,16,1.319764e+07,5.826905e+06,19776791,5817470,0,0,0,BENIGN,0
4,17,66966070,6,6,288,288,48,48,48.000000,0.000000,...,1968172,6.497443e+07,0.000000e+00,64974431,64974431,0,0,0,BENIGN,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2099971,6,5074745,7,6,582,1204,582,0,83.142857,219.975323,...,0,0.000000e+00,0.000000e+00,0,0,0,0,5074745,BENIGN,0
2099972,17,209,2,2,70,230,35,35,35.000000,0.000000,...,0,0.000000e+00,0.000000e+00,0,0,0,0,0,BENIGN,0
2099973,6,116281383,24,21,699,5411,322,0,29.125000,79.231808,...,23012,9.639210e+06,1.321437e+06,10024910,5443180,0,0,116281383,BENIGN,0
2099974,17,149,2,2,72,104,36,36,36.000000,0.000000,...,0,0.000000e+00,0.000000e+00,0,0,0,0,0,BENIGN,0


In [None]:
# Count missing values per column.
improved_df.isnull().sum() 

Protocol                      0
Flow Duration                 0
Total Fwd Packet              0
Total Bwd packets             0
Total Length of Fwd Packet    0
                             ..
ICMP Code                     0
ICMP Type                     0
Total TCP Flow Time           0
Label                         0
Attempted Category            0
Length: 84, dtype: int64

In [None]:
# clean_df already removed duplicate full rows; dropping the identifier
# columns afterwards can make rows identical again, so de-duplicate once more.
# This modifies improved_df in place and needs the cell that defines
# improved_df to have been run in the current kernel.
improved_df.drop_duplicates(inplace=True)

NameError: name 'improved_df' is not defined

In [None]:
# Count the rows of df that repeat an earlier row (first occurrences are not
# counted).
df.duplicated().sum()