In [2]:
import pandas as pd
import numpy as np
import glob
import os

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from fast_ml.model_development import train_valid_test_split
from skimpy import clean_columns

# Absolute path of a "data" directory under the current working directory.
# NOTE(review): none of the visible cells read DATA_DIR (the load and export
# cells use their own absolute paths) — confirm whether it is still needed.
DATA_DIR  = os.path.join(os.path.abspath("."), "data")

### Raw Dataset

In [3]:
# Root directory of the CICIDS2017 data. Defaults to the original absolute
# location; set the CICIDS2017_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
data_path = os.environ.get('CICIDS2017_DATA_DIR', '/home/wendyunji/MLAC/data/cicids2017-data')

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the row order of the concatenated frame reproducible between runs.
raw_dir = os.path.join(data_path, 'raw')
filenames = sorted(glob.glob(os.path.join(raw_dir, '*.csv')))
if not filenames:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {raw_dir}")
datasets = [pd.read_csv(filename) for filename in filenames]

# Stack all raw files into one frame with a fresh 0..n-1 index.
data = pd.concat(datasets, axis=0, ignore_index=True)
# Normalise the column names with skimpy (presumably to snake_case, as the
# names printed by the following cells suggest).
data = clean_columns(data)

In [4]:
# Preview the first rows of the concatenated, column-cleaned raw frame.
data.head()

Unnamed: 0,destination_port,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,...,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [5]:
# Number of rows in the raw concatenated data.
len(data)

2830743

In [6]:
# Snapshot the raw column names so a later cell can report which ones the
# preprocessing step removed.
raw_col = list(data.columns)
print(len(raw_col))
print(raw_col)

79
['destination_port', 'flow_duration', 'total_fwd_packets', 'total_backward_packets', 'total_length_of_fwd_packets', 'total_length_of_bwd_packets', 'fwd_packet_length_max', 'fwd_packet_length_min', 'fwd_packet_length_mean', 'fwd_packet_length_std', 'bwd_packet_length_max', 'bwd_packet_length_min', 'bwd_packet_length_mean', 'bwd_packet_length_std', 'flow_bytes_s', 'flow_packets_s', 'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min', 'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max', 'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags', 'fwd_urg_flags', 'bwd_urg_flags', 'fwd_header_length', 'bwd_header_length', 'fwd_packets_s', 'bwd_packets_s', 'min_packet_length', 'max_packet_length', 'packet_length_mean', 'packet_length_std', 'packet_length_variance', 'fin_flag_count', 'syn_flag_count', 'rst_flag_count', 'psh_flag_count', 'ack_flag_count', 'urg_flag_count', 'cwe_flag_count', 'ece_flag_co

In [7]:
# Per-label row counts in the raw data.
data['label'].value_counts()

BENIGN                        2273097
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: label, dtype: int64

### Preprocessing

In [8]:
# 1. Remove duplicated rows and renumber the index.
# NOTE(review): keep=False discards EVERY copy of a duplicated row rather than
# keeping one representative — confirm this is intended.
data.drop_duplicates(inplace=True, keep=False, ignore_index=True)

# 2. Treat +/-inf as missing, then drop every row with a missing value.
# (A single dropna after the replacement removes the same rows as dropping
# NaNs both before and after it.)
data.replace([-np.inf, np.inf], np.nan, inplace=True)
data.dropna(axis=0, how="any", inplace=True)

# 3. Drop near-constant numeric features (standard deviation below threshold).
threshold = 0.01
data_std = data.std(numeric_only=True)
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
constant_features = [column for column, std in data_std.items() if std < threshold]
data.drop(labels=constant_features, axis=1, inplace=True)

# 4. Drop one feature of every highly correlated pair.
threshold = 0.98
# Restrict to numeric columns explicitly: on pandas >= 2.0 DataFrame.corr()
# no longer skips the string 'label' column silently and raises instead.
data_corr = data.select_dtypes(include="number").corr()
# Mask the upper triangle and the diagonal so each pair is inspected once and
# a column is never compared with itself.
mask = np.triu(np.ones_like(data_corr, dtype=bool))
tri_df = data_corr.mask(mask)
# Only positive correlations above the threshold are considered.
correlated_features = [c for c in tri_df.columns if (tri_df[c] > threshold).any()]
data.drop(labels=correlated_features, axis=1, inplace=True)

data.head()

Unnamed: 0,destination_port,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_std,bwd_packet_length_min,bwd_packet_length_std,flow_bytes_s,flow_iat_mean,flow_iat_std,flow_iat_min,...,act_data_pkt_fwd,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_std,idle_max,idle_min,label
0,54865,6,6,0.0,0,0.0,4000000.0,3.0,0.0,3,...,1,20,0.0,0.0,0,0,0.0,0,0,BENIGN
1,55054,6,6,0.0,6,0.0,110091.7,109.0,0.0,109,...,0,20,0.0,0.0,0,0,0.0,0,0,BENIGN
2,55055,6,6,0.0,6,0.0,230769.2,52.0,0.0,52,...,0,20,0.0,0.0,0,0,0.0,0,0,BENIGN
3,46236,6,6,0.0,6,0.0,352941.2,34.0,0.0,34,...,0,20,0.0,0.0,0,0,0.0,0,0,BENIGN
4,54863,6,6,0.0,0,0.0,4000000.0,3.0,0.0,3,...,1,20,0.0,0.0,0,0,0.0,0,0,BENIGN


In [9]:
# Number of rows left after removing duplicated, NaN and infinite rows.
len(data)

2425727

In [10]:
# Columns remaining after the near-constant and correlated features were dropped.
data.columns

Index(['destination_port', 'fwd_packet_length_max', 'fwd_packet_length_min',
       'fwd_packet_length_std', 'bwd_packet_length_min',
       'bwd_packet_length_std', 'flow_bytes_s', 'flow_iat_mean',
       'flow_iat_std', 'flow_iat_min', 'fwd_iat_total', 'fwd_iat_mean',
       'fwd_iat_std', 'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean',
       'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min', 'bwd_header_length',
       'fwd_packets_s', 'bwd_packets_s', 'min_packet_length',
       'packet_length_std', 'packet_length_variance', 'fin_flag_count',
       'syn_flag_count', 'psh_flag_count', 'ack_flag_count', 'urg_flag_count',
       'ece_flag_count', 'down_up_ratio', 'average_packet_size',
       'avg_fwd_segment_size', 'avg_bwd_segment_size', 'fwd_header_length_1',
       'subflow_fwd_bytes', 'subflow_bwd_bytes', 'init_win_bytes_forward',
       'init_win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward',
       'active_mean', 'active_std', 'active_max', 'active_min', 'idle_std',
 

In [11]:
# Column names after preprocessing.
pro_col = list(data.columns)

# Raw columns that preprocessing removed (a set, so the order is arbitrary).
dropped_cols = set(raw_col) - set(pro_col)
list(dropped_cols)

['bwd_packet_length_max',
 'total_length_of_bwd_packets',
 'bwd_avg_bytes_bulk',
 'rst_flag_count',
 'fwd_psh_flags',
 'bwd_urg_flags',
 'bwd_psh_flags',
 'idle_mean',
 'bwd_avg_packets_bulk',
 'fwd_packet_length_mean',
 'flow_duration',
 'flow_iat_max',
 'total_backward_packets',
 'fwd_iat_max',
 'subflow_fwd_packets',
 'total_length_of_fwd_packets',
 'fwd_avg_packets_bulk',
 'subflow_bwd_packets',
 'bwd_packet_length_mean',
 'max_packet_length',
 'fwd_avg_bulk_rate',
 'fwd_header_length',
 'cwe_flag_count',
 'total_fwd_packets',
 'bwd_avg_bulk_rate',
 'packet_length_mean',
 'fwd_urg_flags',
 'fwd_avg_bytes_bulk',
 'flow_packets_s']

In [12]:
# Per-label row counts after preprocessing.
data['label'].value_counts()

BENIGN                        2035505
DoS Hulk                       171509
DDoS                           128005
PortScan                        57305
DoS GoldenEye                   10279
FTP-Patator                      5480
DoS slowloris                    5289
DoS Slowhttptest                 5176
SSH-Patator                      3071
Bot                              1943
Web Attack � Brute Force         1445
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: label, dtype: int64

In [13]:
# Remove the labels that have too few rows.
rare_labels = ['Infiltration', 'Heartbleed']
rare_rows = data.index[data['label'].isin(rare_labels)]
data = data.drop(index=rare_rows)
data['label'].value_counts()

BENIGN                        2035505
DoS Hulk                       171509
DDoS                           128005
PortScan                        57305
DoS GoldenEye                   10279
FTP-Patator                      5480
DoS slowloris                    5289
DoS Slowhttptest                 5176
SSH-Patator                      3071
Bot                              1943
Web Attack � Brute Force         1445
Web Attack � XSS                  652
Web Attack � Sql Injection         21
Name: label, dtype: int64

In [14]:
# Mapping from raw label to the class name used from here on: BENIGN is
# renamed to Normal, the three Web Attack variants collapse into a single
# class, and every other attack keeps its own name.
_kept_as_is = [
    'DoS Hulk',
    'DDoS',
    'PortScan',
    'DoS GoldenEye',
    'FTP-Patator',
    'DoS slowloris',
    'DoS Slowhttptest',
    'SSH-Patator',
    'Bot',
]
_web_attacks = [
    'Web Attack � Brute Force',
    'Web Attack � XSS',
    'Web Attack � Sql Injection',
]

attack_group = {'BENIGN': 'Normal'}
attack_group.update((name, name) for name in _kept_as_is)
attack_group.update((name, 'Web Attack') for name in _web_attacks)

# Look each label up by key so an unmapped label raises KeyError instead of
# silently turning into NaN.
data['label'] = data['label'].map(lambda raw_label: attack_group[raw_label])

In [15]:
# Per-class row counts after the labels were regrouped.
data['label'].value_counts()

Normal              2035505
DoS Hulk             171509
DDoS                 128005
PortScan              57305
DoS GoldenEye         10279
FTP-Patator            5480
DoS slowloris          5289
DoS Slowhttptest       5176
SSH-Patator            3071
Web Attack             2118
Bot                    1943
Name: label, dtype: int64

### Scale

In [16]:
# Total number of columns, label included.
print(len(data.columns))

# Columns whose dtype is not numeric.
categorical_features = data.select_dtypes(exclude=["number"]).columns
print(len(categorical_features), categorical_features, sep="\n")

# Columns whose dtype is anything other than object.
numeric_features = data.select_dtypes(exclude=[object]).columns
print(len(numeric_features), numeric_features, sep="\n")

50
1
Index(['label'], dtype='object')
49
Index(['destination_port', 'fwd_packet_length_max', 'fwd_packet_length_min',
       'fwd_packet_length_std', 'bwd_packet_length_min',
       'bwd_packet_length_std', 'flow_bytes_s', 'flow_iat_mean',
       'flow_iat_std', 'flow_iat_min', 'fwd_iat_total', 'fwd_iat_mean',
       'fwd_iat_std', 'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean',
       'bwd_iat_std', 'bwd_iat_max', 'bwd_iat_min', 'bwd_header_length',
       'fwd_packets_s', 'bwd_packets_s', 'min_packet_length',
       'packet_length_std', 'packet_length_variance', 'fin_flag_count',
       'syn_flag_count', 'psh_flag_count', 'ack_flag_count', 'urg_flag_count',
       'ece_flag_count', 'down_up_ratio', 'average_packet_size',
       'avg_fwd_segment_size', 'avg_bwd_segment_size', 'fwd_header_length_1',
       'subflow_fwd_bytes', 'subflow_bwd_bytes', 'init_win_bytes_forward',
       'init_win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward',
       'active_mean', 'active_std', 

In [17]:
# Separate the target column from the features.
y_data = data['label']
x_data = data.drop(labels=['label'], axis=1)

# Take the column names from the frame that is actually transformed, so the
# names always line up with the transformed array.
columns = x_data.columns.tolist()

# QuantileTransformer subsamples rows when estimating the quantiles; fixing
# random_state makes the scaled values reproducible across runs.
# NOTE(review): the transformer is fitted on all rows. If a train/test split
# happens downstream, consider fitting on the training portion only.
scaler = QuantileTransformer(random_state=42)
x_data = pd.DataFrame(scaler.fit_transform(x_data), columns=columns)
x_data.head()

Unnamed: 0,destination_port,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_std,bwd_packet_length_min,bwd_packet_length_std,flow_bytes_s,flow_iat_mean,flow_iat_std,flow_iat_min,...,init_win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_std,idle_max,idle_min
0,0.939267,0.20971,0.576076,0.0,0.0,0.0,0.971471,0.024024,0.0,0.344344,...,0.0,0.479479,0.261762,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.941026,0.20971,0.576076,0.0,0.582583,0.0,0.772129,0.25951,0.0,0.79029,...,0.808809,0.0,0.261762,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.941036,0.20971,0.576076,0.0,0.582583,0.0,0.813512,0.125125,0.0,0.707708,...,0.808809,0.0,0.261762,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.869984,0.20971,0.576076,0.0,0.582583,0.0,0.841184,0.088589,0.0,0.624625,...,0.837337,0.0,0.261762,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.93925,0.20971,0.576076,0.0,0.0,0.0,0.971471,0.024024,0.0,0.344344,...,0.0,0.479479,0.261762,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Encode the class names as integer codes; `le` keeps the fitted encoder.
le = LabelEncoder()
encoded_labels = le.fit_transform(y_data)

# Wrap the codes in a single-column frame and show the rows per encoded class.
y_data = pd.DataFrame({"label": encoded_labels})
y_data.value_counts()

label
7        2035505
3         171509
1         128005
8          57305
2          10279
6           5480
5           5289
4           5176
9           3071
10          2118
0           1943
dtype: int64

In [20]:
# Write the scaled features and encoded labels next to the raw data, reusing
# data_path from the load cell instead of repeating the absolute path.
output_dir = os.path.join(data_path, 'new')
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
x_data.to_csv(os.path.join(output_dir, 'cicids_xdata.csv'), index=False)
y_data.to_csv(os.path.join(output_dir, 'cicids_ydata.csv'), index=False)