In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Random state passed to train_test_split so the train/validation split is repeatable.
seed = 0

# Specific attack labels grouped under the four general attack families.
Probe = [
    'ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan',
]

DoS = [
    'apache2', 'back', 'land', 'mailbomb', 'neptune',
    'pod', 'processtable', 'smurf', 'teardrop', 'udpstorm',
]

U2R = [
    'buffer_overflow', 'httptunnel', 'loadmodule', 'perl',
    'ps', 'rootkit', 'sqlattack', 'xterm',
]

R2L = [
    'ftp_write', 'guess_passwd', 'imap', 'multihop', 'named',
    'phf', 'sendmail', 'snmpgetattack', 'snmpguess', 'spy',
    'warezclient', 'warezmaster', 'worm', 'xlock', 'xsnoop',
]

## Import data and assign feature name to each column:

In [2]:
def _load_kdd_csv(path, column_names):
    """Read a header-less CSV, name its columns and remove dots from the labels.

    Parameters
    ----------
    path : str
        Location of the CSV file (read with ``header=None``).
    column_names : list of str
        One name per column; the last one is expected to be ``'label'``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with every '.' removed from the ``label`` column.
    """
    frame = pd.read_csv(path, header=None)
    frame.columns = column_names
    # Vectorised equivalent of calling str.replace('.', '') on every row.
    frame['label'] = frame['label'].str.replace('.', '', regex=False)
    return frame


# Column names are the text before the first ':' on each line of the names file.
with open('kddcup.names') as f:
    next(f)  # the first line is not a feature definition (presumably the class list)
    # Blank lines are ignored so a trailing newline cannot add an empty column name.
    columns = [line.split(':')[0] for line in f if line.strip()]
columns.append('label')

train_data = _load_kdd_csv('../data/kddcup.data', columns)
test_data = _load_kdd_csv('../data/corrected', columns)

# Train and test rows are stacked so both get the same encoded columns later;
# `boundary` is the row position at which the test rows start.
# Both frames keep their own row labels, so labels repeat across that boundary.
boundary = train_data.shape[0]
whole_data = pd.concat([train_data, test_data], axis=0)

In [None]:
#data_type = []
#with open('kddcup.names') as f:
#    next(f)
#    for line in f:
#        feature = line.split(': ')[1]
#        data_type.append(feature[:-2])

In [None]:
#train_data = whole_data.iloc[:boundary]
#idx_symb = []
#idx_cont = []
#for idx, data_type in enumerate(data_type):
#    if data_type == 'symbolic':
#        idx_symb.append(idx)
#    else:
#        idx_cont.append(idx)

## One-hot encoding:

In [3]:
# Columns handed to pd.get_dummies and moved to the front of the frame.
categorical = [
    'protocol_type', 'service', 'flag',
    'land', 'logged_in', 'is_host_login', 'is_guest_login',
]

whole_data_categorical = whole_data[categorical].copy()

# Without an explicit `columns=` argument get_dummies only expands object/category
# columns; columns that are already numeric are returned unchanged
# (presumably the 0/1 flag columns here -- TODO confirm their dtypes).
whole_data_categorical_T = pd.get_dummies(whole_data_categorical)

# Encoded columns first, then every remaining column (including 'label').
whole_data = pd.concat(
    [whole_data_categorical_T, whole_data.drop(columns=categorical)],
    axis=1,
)

## Split data into training and test set:

In [5]:
# Undo the earlier stacking: the first `boundary` rows are the training file,
# the remaining rows are the test file.
train_data, test_data = whole_data.iloc[:boundary], whole_data.iloc[boundary:]

X_test = test_data.drop(columns='label')
y_test = test_data['label']

# Hold out 20% of the training rows for validation, stratified on the label.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data.drop(columns='label'),
    train_data['label'],
    stratify=train_data['label'],
    test_size=0.2,
    random_state=seed,
)

## Category-wise outlier detection

In [18]:
# One entry per processed label: the index labels of that label's flagged rows.
instances_as_outliers = []

# Only labels with more than this many rows are screened
# (a single row has no defined standard deviation).
process_thresh = 1

labels = train_data['label'].unique()
for label in labels:
    label_data = train_data.loc[train_data['label'] == label]
    if label_data.shape[0] > process_thresh:
        label_data = label_data.drop(columns='label')

        # mean()/std() give the same statistics describe() reports, without
        # also computing quantiles that are never used.
        upper_bound = label_data.mean() + 3 * label_data.std()

        # A row is an outlier when ANY feature exceeds its per-label upper bound.
        # NOTE(review): only the upper tail is tested; values far below the mean
        # are never flagged -- confirm this is intended.
        exceeds_bound = label_data.gt(upper_bound, axis='columns').any(axis=1)
        instances_as_outliers.append(label_data.index[exceeds_bound])

# Merge the per-label results into one list of unique index labels.
outlier_idx = list(set().union(*instances_as_outliers))

# NOTE(review): train_data_clean is not referenced by the later cells visible in
# this notebook; the X_train / y_train that get pickled come from the earlier,
# uncleaned split.
train_data_clean = train_data.drop(index=outlier_idx)

## Label distribution in Outliers

In [19]:
# outlier_idx holds index *labels* (they were collected from `.index`), so the
# rows are selected by label with .loc; positional .iloc is only equivalent
# while the index happens to match row positions.
outliers = train_data.loc[outlier_idx]
outliers['label'].value_counts()

normal             439586
neptune             74634
satan                1805
smurf                1778
portsweep            1658
ipsweep               975
nmap                  289
back                  284
warezclient           167
teardrop              114
pod                    39
guess_passwd            8
buffer_overflow         7
land                    4
imap                    3
warezmaster             3
Name: label, dtype: int64

## Label distribution in Original Data

In [17]:
# Number of rows per label in train_data (the frame before outlier rows are dropped).
train_data['label'].value_counts()

smurf              2807886
neptune            1072017
normal              972781
satan                15892
ipsweep              12481
portsweep            10413
nmap                  2316
back                  2203
warezclient           1020
teardrop               979
pod                    264
guess_passwd            53
buffer_overflow         30
land                    21
warezmaster             20
imap                    12
rootkit                 10
loadmodule               9
ftp_write                8
multihop                 7
phf                      4
perl                     3
spy                      2
Name: label, dtype: int64

## Convert specific attack types to four general attack types in y_valid and y_test:

In [20]:
# Family name -> list of specific attack labels belonging to it.
attack_groups = {'Probe': Probe, 'DoS': DoS, 'R2L': R2L, 'U2R': U2R}

# Inverted lookup: specific attack label -> family name.
attack_to_group = {
    attack: group
    for group, attacks in attack_groups.items()
    for attack in attacks
}

# One replace per series instead of one pass per attack name. Labels that are not
# keys of the mapping (e.g. 'normal') are left untouched; no family name is itself
# a key, so the result matches replacing the names one at a time.
y_valid = y_valid.replace(attack_to_group)
y_test = y_test.replace(attack_to_group)

## Store preprocessed data for later usage:

In [21]:
# Persist every split next to the notebook, one pickle file per object.
for split, pickle_path in (
    (X_train, "X_train.pkl"),
    (y_train, "y_train.pkl"),
    (X_valid, "X_valid.pkl"),
    (y_valid, "y_valid.pkl"),
    (X_test, "X_test.pkl"),
    (y_test, "y_test.pkl"),
):
    split.to_pickle(pickle_path)