In [9]:
import sys, time, os
import pandas as pd, numpy as np, tensorflow as tf
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearn import preprocessing
from tqdm import tqdm
import category_encoders as ce
from collections import Counter
import tensorflow as tf
import pandas_profiling
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')
rcParams.update({'figure.autolayout': True, 'axes.titlepad': 20})
rcParams['figure.figsize'] = 20,8
import h5py
import codecs

# 1 - pre-process raw data

In [10]:
def correct_unicode(filename, max_lines=170367):
    """
        description     : writes a cleaned copy of a CSV file next to the original, named
                          '<name>_corrected.csv'. Bytes that are not valid UTF-8 are dropped
                          and only the first `max_lines` lines are kept.
        parameters
            filename    : str
                          path of the CSV file to clean
            max_lines   : int or None
                          number of leading lines to copy; None copies every line.
                          NOTE(review): the default 170367 is presumably the last useful
                          line of the Thursday morning capture - confirm against the file.
    """
    import codecs
    from itertools import islice

    corrected_name = filename.split('.csv')[0] + '_corrected.csv'

    # Both handles are context-managed so neither leaks if reading or writing fails.
    # The output is written as UTF-8 with newline translation disabled, so the copy does not
    # depend on the platform's default encoding or line-ending conventions.
    with codecs.open(filename, 'r', encoding='utf-8', errors='ignore') as raw_file, \
            open(corrected_name, 'w', encoding='utf-8', newline='') as corrected_file:
        # stream line by line instead of loading the whole file with readlines()
        for line in islice(raw_file, max_lines):
            corrected_file.write(line)
    
correct_unicode("data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv")

In [11]:
# CSV files that are loaded and merged into one dataset, in this order.
# The Thursday morning entry is the '_corrected' copy written by correct_unicode above.
dataset_files = [
        'data/Monday-WorkingHours.pcap_ISCX.csv',
        'data/Tuesday-WorkingHours.pcap_ISCX.csv',
        'data/Wednesday-workingHours.pcap_ISCX.csv',
        'data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX_corrected.csv',
        'data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
        'data/Friday-WorkingHours-Morning.pcap_ISCX.csv',
        'data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
        'data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
]

In [12]:
class PrepareData():
    """
        description     : loads labelled flow data from a pickle, a single data file or a list of
                          data files, cleans it and keeps the features in `data` and the labels in `Y`.
                          Further methods encode, scale, split and save the result.
    """

    # label names; the position of a name in this sequence is its numeric class index
    ALL_LABELS = ('BENIGN', 'DoS Hulk', 'SSH-Patator', 'PortScan', 'DoS GoldenEye',
                  'DDoS', 'Heartbleed', 'Web Attack  Brute Force', 'FTP-Patator',
                  'Web Attack  XSS', 'DoS slowloris', 'Infiltration', 'Bot',
                  'Web Attack  Sql Injection', 'DoS Slowhttptest')

    def __init__(self, source):
        """
            description     : builds the dataset from the given source
            parameters
                source      : str or list of str
                              - a path containing '.pickle': a frame saved by save_dataset is read back
                              - any other str: a single data file is read and cleaned
                              - a list (or tuple) of paths: every file is processed on its own,
                                then all of them are merged and all-zero columns are removed
            raises
                TypeError   : if source is neither a str nor a list/tuple
        """
        self.all_labels = list(self.ALL_LABELS)
        # always keep the source: the progress messages of most methods print it
        self.source     = source
        self.data       = None
        self.Y          = None

        if isinstance(source, str) and '.pickle' in source:
            sys.stdout.write('\rfound previously created pickle file')
            # NOTE(review): unpickling can execute arbitrary code - only load pickles you created yourself
            data      = pd.read_pickle(source)
            self.Y    = data['label']
            self.data = data.drop(['label'], axis=1)
            sys.stdout.write('\rpickle read completed')

        elif isinstance(source, str):
            self.read()
            self.correct_column_names()

            # remove object columns (source_ip / destination_ip are kept and encoded later)
            self.data = self.data.drop(['flow_id', 'timestamp'], axis=1)

            self.change_labels_into_numbers()
            self.to_numeric()
            self.drop_nan()

            self.Y    = self.data['label']
            self.data = self.data.drop(['label'], axis=1)

            self.data.columns = [c.replace(' ', '_') for c in self.data.columns]

        elif isinstance(source, (list, tuple)):
            self.datasets = []
            self.Ys       = []
            for filename in tqdm(source):
                # process every file exactly once and keep both its features and its labels
                subset = PrepareData(filename)
                self.datasets.append(subset.data)
                self.Ys.append(subset.Y)
                sys.stdout.write('\rprocess completed!')
            self.merge_sets()
            self.remove_allzeroes()

        else:
            raise TypeError('source must be a path (str) or a list of paths, got {}'.format(type(source).__name__))

        ##########################################################################################

    def read(self):
        """
            description     : reads the file at self.source into self.data; files whose extension is
                              'csv' are read as CSV, everything else is read as a pickle
        """
        sys.stdout.write('\r{}: file is being read...'.format(self.source))
        if self.source.split(".")[-1] == "csv":
            self.data = pd.read_csv(self.source)
        else:
            self.data = pd.read_pickle(self.source)
        ##########################################################################################

    def correct_column_names(self):
        """
            description     : strips column names that start with a space (also collapsing double
                              spaces in them), then lower-cases every name and replaces spaces with '_'
        """
        def tidy(name):
            if name.startswith(' '):
                name = name.strip().replace('  ', ' ')
            return name.lower().replace(' ', '_')

        self.data.columns = [tidy(c) for c in self.data.columns]

        sys.stdout.write('\r{}: columns renamed'.format(self.source))
        ##########################################################################################

    def change_labels_into_numbers(self):
        """
            description     : turns labels into numbers starting from 0 to N-1, following the order of
                              self.all_labels; values that are not known labels are left untouched
        """
        label_to_index = {label: index for index, label in enumerate(self.all_labels)}
        # single pass over the column instead of one full pass per label
        self.data['label'] = self.data['label'].map(lambda value: label_to_index.get(value, value))
        sys.stdout.write('\r{}: labels reindexed'.format(self.source))
        ##########################################################################################

    def to_numeric(self):
        """
            description     : try to change str values into float. Else, make them NaN so we drop them later.
                              The 'destination_ip' and 'source_ip' columns are skipped.
        """
        for i in self.data.columns:
            if i not in ['destination_ip', 'source_ip']:
                self.data[i] = pd.to_numeric(self.data[i], errors='coerce')
        sys.stdout.write('\r{}: values changed into float'.format(self.source))
        ##########################################################################################

    def drop_nan(self):
        """
            description     : converts -/+ inf values to NaN and removes the rows including any NaN values
        """
        # replace() returns a new frame, so the result has to be assigned back;
        # otherwise the infinite values survive the dropna() below
        self.data = self.data.replace([np.inf, -np.inf], np.nan)

        # drop rows including NaN values
        before_row_count = self.data.shape[0]
        self.data = self.data.dropna()
        after_row_count = self.data.shape[0]

        sys.stdout.write("\r{}: dropped {} rows".format(self.source, before_row_count - after_row_count))
        ##########################################################################################

    def change_nan(self, into='mean'):
        """
            description     : change the NaN values of every numeric column into the given option
            parameters
                into        : str
                              'zero', or any row label of DataFrame.describe() such as
                              'min', 'max' or 'mean'; the statistic is computed per column
            raises
                KeyError    : if `into` is neither 'zero' nor a describe() statistic
        """
        numeric_part = self.data.select_dtypes(include=[np.number])

        # describe() refuses frames without columns, so skip when nothing is numeric
        if numeric_part.shape[1] > 0:
            if into == 'zero':
                fill_values = pd.Series(0, index=numeric_part.columns)
            else:
                fill_values = numeric_part.describe().loc[into]
            # a Series passed to fillna is matched against the column labels
            self.data = self.data.fillna(fill_values)

        sys.stdout.write('\r{}: NaN vlaues changed into {}'.format(self.source, into))
        ##########################################################################################

    def drop_non_float_columns(self, columns):
        """
            description     : drops given non-float(able) columns
            parameters
                columns     : list of string
                              column names that are known to have non-floatable entities
        """
        self.data = self.data.drop(columns, axis=1)

        sys.stdout.write('\r{}: non-float(able) columns deleted'.format(self.source))
        ##########################################################################################

    def cat_2_OHE(self, columns):
        """
            description     : turn given categorical columns into one-hot numerical columns
            parameters
                columns     : list of string
                              column names that are known to have categorial entities
        """
        self.data = pd.get_dummies(self.data, columns=columns)
        sys.stdout.write('\r{}: categorical values changed into OHE'.format(self.source))
        ##########################################################################################

    def merge_sets(self):
        """
            description     : merges all subdatasets (and their labels) into one; the original row
                              indices of the subsets are kept as they are
        """
        self.data     = pd.concat(self.datasets)
        self.Y        = pd.concat(self.Ys)
        # unassign these
        self.datasets = None
        self.Ys       = None

        sys.stdout.write('\rall subsets merged')
        ##########################################################################################

    def remove_allzeroes(self):
        """
            description     : removes columns that include only zero values
        """
        allzeroes = [c for c in self.data.columns
                     if self.data[c].min() == 0 and self.data[c].max() == 0]
        self.data = self.data.drop(allzeroes, axis=1)

        sys.stdout.write('\r {} columns including only zeroes removed: {}'.format(len(allzeroes), allzeroes))
        ##########################################################################################

    def cat_to_num(self, columns):
        """
            description     : turns categorical columns into numeral ones with a binary encoder
            parameters
                columns     : list of string
                              names of the columns to encode
        """
        encoder = ce.BinaryEncoder(cols = columns)
        encoder.fit(self.data, self.Y)

        self.data = encoder.transform(self.data)

        # drop the first column when it is constant
        first_column = self.data.iloc[:, 0].to_numpy()
        if (first_column.max() - first_column.min() == 0):
            self.data = self.data.drop([self.data.columns[0]], axis=1)

        sys.stdout.write('\rcategorical columns converted into numerical')
        ##########################################################################################

    def _transform_splits(self, transform):
        """
            description     : replaces X_train, X_val (only if it exists) and X_test with transform(split)
            raises
                AttributeError : if train_test_split has not created the splits yet
        """
        if not hasattr(self, 'X_train') or not hasattr(self, 'X_test'):
            raise AttributeError('X_train / X_test are missing: call train_test_split() first')
        for name in ('X_train', 'X_val', 'X_test'):
            # X_val only exists when train_test_split was called with add_val=True
            if hasattr(self, name):
                setattr(self, name, transform(getattr(self, name)))

    def normalize_dataset(self):
        """
            description     : rescales every row (sample) of the dataset to unit norm using
                              sklearn's preprocessing.normalize with its default settings.
                              This is not a zero-mean / unit-variance standardization.
        """
        column_names      = self.data.columns
        values_normalized = preprocessing.normalize(self.data.values)
        self.data         = pd.DataFrame(values_normalized, columns=column_names)

        sys.stdout.write('\rdataset normalized')
        ##########################################################################################

    def normalize(self):
        """
            description     : rescales every row (sample) of the train / val / test splits to unit
                              norm; the val split is skipped when it does not exist
        """
        self._transform_splits(preprocessing.normalize)

        sys.stdout.write('\rtrain-test normalized')
        ##########################################################################################

    def standardize_dataset(self):
        """
            description     : standardizes dataset (StandardScaler fitted on the whole dataset)
        """
        column_names        = self.data.columns
        scaler              = preprocessing.StandardScaler()
        values_standardized = scaler.fit_transform(self.data.values)
        self.data           = pd.DataFrame(values_standardized, columns=column_names)

        sys.stdout.write('\rdataset standardized')
        ##########################################################################################

    def standardize(self):
        """
            description     : standardizes train-test; the val split is skipped when it does not exist
        """
        # NOTE(review): every split gets its own freshly fitted scaler, so the splits are not on a
        # common scale; fitting on X_train only and transforming the others is the usual practice.
        self._transform_splits(lambda split: preprocessing.StandardScaler().fit_transform(split))

        sys.stdout.write('\rtrain-test standardized')
        ##########################################################################################

    def robustscale_dataset(self):
        """
            description     : scales dataset to overcome some outliers (RobustScaler on the whole dataset)
        """
        column_names        = self.data.columns
        scaler              = preprocessing.RobustScaler()
        values_scaled       = scaler.fit_transform(self.data.values)
        self.data           = pd.DataFrame(values_scaled, columns=column_names)

        sys.stdout.write('\rdataset robust scaled')
        ##########################################################################################

    def robustscale(self):
        """
            description     : scales train-test to overcome some outliers; the val split is skipped
                              when it does not exist
        """
        # NOTE(review): a separate scaler is fitted per split (see standardize)
        self._transform_splits(lambda split: preprocessing.RobustScaler().fit_transform(split))

        sys.stdout.write('\rtrain-test robust scaled')
        ##########################################################################################

    def scale_dataset(self, minval=0, maxval=1):
        """
            description     : scales every column of the dataset into [minval, maxval]
            parameters
                minval      : number, lower bound of the target range
                maxval      : number, upper bound of the target range
        """
        column_names   = self.data.columns
        scaler         = preprocessing.MinMaxScaler(feature_range=(minval, maxval), copy=False)
        values_scaled  = scaler.fit_transform(self.data)
        self.data      = pd.DataFrame(values_scaled, columns=column_names)

        sys.stdout.write('\rdataset scaled')
        ##########################################################################################

    def scale(self):
        """
            description     : scales train-test into (0, 1]; the lower bound is the smallest positive
                              float, so scaled values never reach exactly zero. The val split is
                              skipped when it does not exist.
        """
        # NOTE(review): a separate scaler is fitted per split (see standardize)
        def minmax(split):
            scaler = preprocessing.MinMaxScaler(feature_range=(np.nextafter(0, 1), 1), copy=False)
            return scaler.fit_transform(split)

        self._transform_splits(minmax)

        sys.stdout.write('\rtrain-test scaled')
        ##########################################################################################

    def train_test_split(self, test_ratio=0.3, add_val=False):
        """
            description     : splits data / Y into X_train, X_test, y_train, y_test (numpy arrays) with
                              a fixed random_state of 42 and prints the size of every split
            parameters
                test_ratio  : float
                              fraction of the rows that goes into the test split
                add_val     : bool
                              when True, 15% of the training split is moved into X_val / y_val
        """
        # inside this method the bare name refers to the module-level sklearn function, not to this method
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.data.values, self.Y, test_size=test_ratio, random_state=42)
        if add_val:
            self.X_train, self.X_val, self.y_train, self.y_val   = train_test_split(self.X_train, self.y_train, test_size=0.15, random_state=42)

        print("""

            train : {}
            val   : {}
            test  : {}
        """.format(self.X_train.shape[0], self.X_val.shape[0] if add_val else 0, self.X_test.shape[0]))

        self.y_train = self.y_train.values
        self.y_test  = self.y_test.values

        if add_val:
            self.y_val   = self.y_val.values

    def save_dataset(self, outname="generated_dataset", onlydataset=False, add_val=False, h5_path='sets/cicids_raw.h5'):
        """
            description     : saves either the whole dataset as a pickle or the train/test splits as HDF5
            parameters
                outname     : str
                              the dataset (features plus a 'label' column) is written to '<outname>.pickle'
                              when onlydataset is False
                onlydataset : bool
                              when True nothing is pickled; X_train, X_test, y_train and y_test are
                              written to h5_path instead
                add_val     : bool
                              also write X_val and y_val to the HDF5 file
                h5_path     : str
                              target HDF5 file; its directory is created when missing
        """
        if not onlydataset:
            dataset = self.data.copy()
            dataset['label'] = self.Y.values
            dataset.to_pickle('{}.pickle'.format(outname))
            sys.stdout.write('\rdataset saved as {}.pickle'.format(outname))
            return

        split_names = ['X_train', 'X_test', 'y_train', 'y_test']
        if add_val:
            split_names += ['X_val', 'y_val']

        # saving stays best-effort, but a skipped save is reported instead of passing silently
        missing = [name for name in split_names if not hasattr(self, name)]
        if missing:
            sys.stdout.write('\rsplits not saved, missing {}: call train_test_split first'.format(', '.join(missing)))
            return

        try:
            target_dir = os.path.dirname(h5_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            # the context manager closes the file even when a write fails
            with h5py.File(h5_path, 'w') as h5f:
                for name in split_names:
                    h5f.create_dataset(name, data=getattr(self, name))
        except (OSError, TypeError, ValueError) as error:
            sys.stdout.write('\rsplits not saved to {}: {}'.format(h5_path, error))
        ##########################################################################################
        
#import pickle
#pickle.dump(source, open('source_object.pickle','wb'))

In [13]:
source = PrepareData(dataset_files)

  0%|          | 0/8 [00:00<?, ?it/s]

data/Monday-WorkingHours.pcap_ISCX.csv: values changed into float

 12%|█▎        | 1/8 [00:24<02:48, 24.04s/it]

data/Tuesday-WorkingHours.pcap_ISCX.csv: values changed into float

 25%|██▌       | 2/8 [00:43<02:15, 22.64s/it]

data/Wednesday-workingHours.pcap_ISCX.csv: dropped 1297 rows

 38%|███▊      | 3/8 [01:13<02:04, 24.80s/it]

data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX_corrected.csv: values changed into float

 50%|█████     | 4/8 [01:21<01:19, 19.90s/it]

data/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv: values changed into float

 62%|██████▎   | 5/8 [01:35<00:54, 18.01s/it]

data/Friday-WorkingHours-Morning.pcap_ISCX.csv: values changed into float

 75%|███████▌  | 6/8 [01:45<00:31, 15.52s/it]

data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv: values changed into float

 88%|████████▊ | 7/8 [01:58<00:14, 14.95s/it]

data/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv: values changed into float

100%|██████████| 8/8 [02:10<00:00, 16.36s/it]

process completed!




 8 columns including only zeroes removed: ['bwd_psh_flags', 'bwd_urg_flags', 'fwd_avg_bytes/bulk', 'fwd_avg_packets/bulk', 'fwd_avg_bulk_rate', 'bwd_avg_bytes/bulk', 'bwd_avg_packets/bulk', 'bwd_avg_bulk_rate']

In [10]:
source.data['destination_ip']

0          192.168.10.5
1          192.168.10.5
2          192.168.10.5
3          192.168.10.5
4         192.168.10.14
              ...      
225740    192.168.10.15
225741    192.168.10.15
225742    192.168.10.15
225743    192.168.10.15
225744    192.168.10.15
Name: destination_ip, Length: 2827876, dtype: object

In [11]:
source.cat_to_num(['source_port', 'destination_port', 'protocol', 'source_ip', 'destination_ip'])

categorical columns converted into numerical

In [None]:
# NOTE(review): this cell has no execution count. The next cell's output lists source_port_0 and
# destination_port_0, so presumably the binary encoder already replaced the raw port columns and
# this drop would fail on missing labels - confirm before running.
source.data.drop(['source_port', 'destination_port'], axis=1, inplace=True)

In [15]:
# print the name of every column whose maximum value is zero
for i in source.data.columns:
    if source.data[i].max() == 0:
        print(i)

source_port_0
destination_ip_0
destination_port_0
protocol_0


In [20]:
# remove allzero new binary columns
allzero_binaries = ['destination_port_0', 'protocol_0', 'destination_ip_0', 'source_port_0']
source.data.drop(allzero_binaries, axis=1, inplace=True)

In [21]:
source.data.shape

(2827876, 133)

## According to the analysis, the following features are highly correlated:
- fwd_header_length.1
- fwd_packet_length_mean
- syn_flag_count
- total_length_of_fwd_packets

In [22]:
# drop highly correlated features
# NOTE(review): the markdown list above also names total_length_of_fwd_packets, which is not
# in this list and is therefore kept - confirm that this is intended.
highly_correlated = ["fwd_header_length.1", "fwd_packet_length_mean", "syn_flag_count"]
source.data.drop(highly_correlated, axis=1, inplace=True)

In [23]:
source.save_dataset(outname='raw_data', onlydataset=False)

dataset saved as raw_data.pickle

In [24]:
source.data.describe()

Unnamed: 0,source_ip_1,source_ip_2,source_ip_3,source_ip_4,source_ip_5,source_ip_6,source_ip_7,source_ip_8,source_ip_9,source_ip_10,...,act_data_pkt_fwd,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min
count,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,...,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0
mean,0.0006793084,0.02158228,0.2236884,0.2322828,0.2452848,0.05164901,0.1828659,0.3362506,0.3203623,0.2556679,...,5.423519,-2744.494,81634.0,41175.82,153337.8,58354.92,8324468.0,504354.8,8704568.0,7928061.0
std,0.0260547,0.1453151,0.4167157,0.4222885,0.430256,0.2213174,0.3865566,0.4724259,0.4666159,0.436236,...,636.7482,1085539.0,648923.4,393578.7,1026333.0,577381.8,23640570.0,4605289.0,24377660.0,23373900.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-536870700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,2.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,213557.0,138.0,110000000.0,74200000.0,110000000.0,110000000.0,120000000.0,76900000.0,120000000.0,120000000.0


# start from here for raw data!

In [6]:
source = PrepareData('raw_data.pickle')

pickle read completed

In [7]:
source.data.describe()

Unnamed: 0,source_ip_1,source_ip_2,source_ip_3,source_ip_4,source_ip_5,source_ip_6,source_ip_7,source_ip_8,source_ip_9,source_ip_10,...,act_data_pkt_fwd,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min
count,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,...,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0,2827876.0
mean,0.0006793084,0.02158228,0.2236884,0.2322828,0.2452848,0.05164901,0.1828659,0.3362506,0.3203623,0.2556679,...,5.423519,-2744.494,81634.0,41175.82,153337.8,58354.92,8324468.0,504354.8,8704568.0,7928061.0
std,0.0260547,0.1453151,0.4167157,0.4222885,0.430256,0.2213174,0.3865566,0.4724259,0.4666159,0.436236,...,636.7482,1085539.0,648923.4,393578.7,1026333.0,577381.8,23640570.0,4605289.0,24377660.0,23373900.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-536870700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,2.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,213557.0,138.0,110000000.0,74200000.0,110000000.0,110000000.0,120000000.0,76900000.0,120000000.0,120000000.0


In [25]:
source.scale_dataset(minval=0, maxval=1)

dataset scaled

In [28]:
# NOTE(review): onlydataset=True stores the X_train / X_test / y_train / y_test splits, but no
# train_test_split() call is visible before this cell - confirm the splits exist at this point,
# otherwise nothing useful is written.
source.save_dataset(outname='raw_data', onlydataset=True)