In [1]:
import numpy as np
import pandas as pd
import os
from os import walk
from sklearn import preprocessing

In [2]:
# Collect the path of every file found (recursively) under csv_data/.
file_list = []
for (dir_path, dir_names, file_names) in walk('csv_data/'):
    file_list.extend(os.path.join(dir_path, filename) for filename in file_names)

# column names of 24 features
COLUMN_NAMES = ['duration', 'service', 'src_bytes', 'dest_bytes', 'count', 'same_srv_rate',
                'serror_rate', 'srv_serror_rate', 'dst_host_count', 'dst_host_srv_count',
                'dst_host_same_src_port_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
                'flag', 'ids_detection', 'malware_detection', 'ashula_detection', 'label', 'src_ip_add',
                'src_port_num', 'dst_ip_add', 'dst_port_num', 'start_time', 'protocol']

# Read the first five files into a list and concatenate once at the end.
# Growing a DataFrame with df.append() inside the loop re-copies all rows on
# every iteration, and DataFrame.append no longer exists in pandas >= 2.0.
# NOTE(review): file_list is not sorted, so which five files are read depends
# on the directory listing order -- confirm whether a fixed selection is wanted.
frames = []
for file in file_list[0:5]:
    # the python engine was used to support mixed data types
    frames.append(pd.read_csv(filepath_or_buffer=file, names=COLUMN_NAMES, engine='python'))
    print('Appending {}'.format(file))

# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers. With no input files, fall back to an
# empty frame that still carries the expected columns.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=COLUMN_NAMES)

Appending csv_data/04/20130420.csv
Appending csv_data/04/20130411.csv
Appending csv_data/04/20130419.csv
Appending csv_data/04/20130427.csv
Appending csv_data/04/20130426.csv


In [3]:
# Checking for NaN values
print('Current DataFrame shape: {}'.format(df.shape))

# drop rows with NaN values
# Rebind df to the filtered frame. Assigning the dropna() result back into
# df[COLUMN_NAMES] re-aligns it on the existing index, so the dropped rows
# would come back as all-NaN rows (or the assignment fails when index labels
# are duplicated) and the row count would never change.
df = df.dropna(axis=0, how='any', subset=COLUMN_NAMES).reset_index(drop=True)
print('DataFrame shape after NaN values removal: {}'.format(df.shape))

def _detection_flag(value):
    """Return 0 when a detection field is zero, otherwise 1.

    Both the string '0' and a numeric zero count as "no detection". A column
    whose values are all 0 in a CSV file is parsed as integers, and an integer
    0 compared against the string '0' is never equal, so a plain `!= '0'` test
    would mark every such row as a detection.
    """
    return 0 if (str(value) == '0' or value == 0) else 1


def _clock_to_hours(time):
    """Convert an 'HH:MM:SS' string into fractional hours (split once per value)."""
    parts = time.split(':')
    return int(parts[0]) + (int(parts[1]) * (1 / 60)) + (int(parts[2]) * (1 / 3600))


#  Replace string values to integers
for detection_column in ('malware_detection', 'ashula_detection', 'ids_detection'):
    df[detection_column] = df[detection_column].apply(_detection_flag)

# replace -1 & -2 (attack) with 1, and 1 (no attack) with 0
df['label'] = df['label'].isin([-1, -2]).astype(int)

#  make time data continuous
df['start_time'] = df['start_time'].apply(_clock_to_hours)

# continuous features that get rescaled
COLUMN_TO_STANDARDIZE = [
    'duration', 'src_bytes', 'dest_bytes', 'count',
    'same_srv_rate', 'serror_rate', 'srv_serror_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_src_port_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'src_port_num',
    'dst_port_num', 'start_time',
]

# categorical data
COLUMN_TO_INDEX = [
    'ashula_detection', 'dst_ip_add', 'flag', 'ids_detection', 'label',
    'malware_detection', 'protocol', 'service', 'src_ip_add',
]

# index categorical data: every column is fitted on its own, so the integer
# codes of one column are independent of the others
encoded_columns = {
    name: preprocessing.LabelEncoder().fit_transform(df[name])
    for name in COLUMN_TO_INDEX
}
df[COLUMN_TO_INDEX] = pd.DataFrame(encoded_columns, index=df.index)

# standardize features
scaler = preprocessing.StandardScaler()
scaled_values = scaler.fit_transform(df[COLUMN_TO_STANDARDIZE])
df[COLUMN_TO_STANDARDIZE] = scaled_values

# print('DataFrame shape before split: {}'.format(df.shape))
# df = np.array_split(df, 24)
# print('DataFrame shape after split: {}'.format(len(df)))

Current DataFrame shape: (778812, 24)
DataFrame shape after NaN values removal: (778812, 24)


In [4]:
# decile binning
# 'service' and 'flag' are binned together with the standardized columns.
# The membership check keeps COLUMN_TO_STANDARDIZE free of duplicate entries
# when this cell is executed more than once (a bare append grows the list on
# every run).
for extra_column in ('service', 'flag'):
    if extra_column not in COLUMN_TO_STANDARDIZE:
        COLUMN_TO_STANDARDIZE.append(extra_column)

# The IP address columns are not part of the saved array.
df = df.drop(labels=['dst_ip_add', 'src_ip_add'], axis=1)
for column in COLUMN_TO_STANDARDIZE:
    # duplicates='drop' merges identical quantile edges, so a column can end
    # up with fewer than 10 bins (see the printed max values).
    df[column] = pd.qcut(df[column], 10, labels=False, duplicates='drop')
    print('min : {}, max : {}'.format(df[column].min(), df[column].max()))

# Persist the processed frame as a NumPy array (values only, no column names).
data = np.array(df)
np.save(file='processed_data.npy', arr=data)


min : 0, max : 7
min : 0, max : 4
min : 0, max : 4
min : 0, max : 3
min : 0, max : 1
min : 0, max : 1
min : 0, max : 3
min : 0, max : 5
min : 0, max : 7
min : 0, max : 0
min : 0, max : 2
min : 0, max : 2
min : 0, max : 9
min : 0, max : 6
min : 0, max : 9
min : 0, max : 2
min : 0, max : 3
