## Import necessary packages:

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Fixed random seed, passed as `random_state` to the train/validation split,
# the per-class sampling and the stratified shuffle split further down.
seed = 0

## Import data and assign feature name to each column:

In [6]:
# Read the feature names from 'kddcup.names': the first line is skipped and,
# on every remaining line, the text before the first ':' is the feature name.
with open('kddcup.names') as names_file:
    next(names_file)
    columns = [row.partition(':')[0] for row in names_file]

# The data files carry one extra trailing column holding the class label.
columns.append('label')

In [7]:
def _read_kdd_frame(path):
    """Load one header-less KDD CSV file, name its columns and clean the labels.

    Every '.' is removed from the values of the 'label' column.
    """
    frame = pd.read_csv(path, header=None)
    frame.columns = columns
    frame['label'] = [raw_label.replace('.', '') for raw_label in frame['label']]
    return frame


train_data = _read_kdd_frame('kddcup.data')

test_data = _read_kdd_frame('corrected')

In [None]:
# Stack the training rows on top of the test rows so both receive the same
# encoding; `boundary` is the number of training rows, i.e. the position at
# which the test rows start.
whole_data = pd.concat((train_data, test_data), axis=0)
boundary = len(train_data)

## One-hot encoding:

In [None]:
# Columns handed to pd.get_dummies for one-hot encoding.
categorical = [
    'protocol_type',
    'service',
    'flag',
    'land',
    'logged_in',
    'is_host_login',
    'is_guest_login',
]

# Pull the listed columns out, encode them, and put the encoded columns in
# front of the remaining (untouched) columns.
whole_data_categorical = whole_data.loc[:, categorical].copy()
whole_data_categorical_T = pd.get_dummies(whole_data_categorical)
whole_data = whole_data.drop(columns=categorical)
whole_data = pd.concat([whole_data_categorical_T, whole_data], axis=1)

## Split data into training and test set:

In [None]:
# Undo the earlier stacking: the first `boundary` rows are the training data,
# everything after them is the test data.
train_data, test_data = whole_data.iloc[:boundary], whole_data.iloc[boundary:]

# The test set is used as-is.
X_test = test_data.drop(columns='label')
y_test = test_data['label']

# Hold out 20% of the training rows for validation, keeping the label
# proportions identical in both parts.
train_features = train_data.drop(columns='label')
train_labels = train_data['label']
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features,
    train_labels,
    test_size=0.2,
    stratify=train_labels,
    random_state=seed,
)

## Standardize feature vectors using StandardScaler:

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to the training, validation and test features.
sscaler = preprocessing.StandardScaler()
sscaler.fit(X_train)
X_train, X_valid, X_test = (
    sscaler.transform(features) for features in (X_train, X_valid, X_test)
)

## Store preprocessed data for later usage:

In [None]:
# Write every preprocessed array to its own .npy file in the working directory.
arrays_by_file = {
    "X_train.npy": X_train,
    "y_train.npy": y_train,
    "X_valid.npy": X_valid,
    "y_valid.npy": y_valid,
    "X_test.npy": X_test,
    "y_test.npy": y_test,
}
for file_name, array in arrays_by_file.items():
    np.save(file_name, array)

## Load preprocessed data back:

In [2]:
# Reload the arrays written by the "store preprocessed data" cell.
#
# The label arrays hold Python strings, so np.save stored them as pickled
# object arrays. np.load refuses those unless allow_pickle=True is passed
# (it raises ValueError with the default allow_pickle=False).
# SECURITY: unpickling can execute arbitrary code -- only load label files
# that were produced by this notebook, never files from an untrusted source.
X_train = np.load("X_train.npy")
y_train = np.load("y_train.npy", allow_pickle=True)

X_valid = np.load("X_valid.npy")
y_valid = np.load("y_valid.npy", allow_pickle=True)

X_test = np.load("X_test.npy")
y_test = np.load("y_test.npy", allow_pickle=True)

## Sample a subset of training data from the original training set:

In [3]:
# Recombine X_train and y_train into one dataframe so that rows can be
# grouped and sampled per label. Feature columns are named '0', '1', ...
# and the last column is 'label'.
num_columns = X_train.shape[1]
dummy_columns = [str(i) for i in range(num_columns)] + ['label']

# Build the frame from the numeric matrix and attach the labels afterwards.
# Concatenating the float matrix with the string labels in NumPy would force
# every feature value into a generic Python object, which is far slower and
# much more memory hungry than keeping the features as floats.
X_train_df = pd.DataFrame(X_train, columns=dummy_columns[:-1])
X_train_df['label'] = np.asarray(y_train)

In [4]:
# Split the training frame into one sub-frame per distinct label,
# in the (sorted) order returned by np.unique.
label_list = np.unique(X_train_df['label'])

df_label_list = [
    X_train_df[X_train_df['label'] == current_label]
    for current_label in label_list
]

In [34]:
# Draw the same number of instances from every class to get a balanced
# training subset.
num_samples_per_label = 300

sampled_parts = []
for class_df in df_label_list:
    # Classes with fewer rows than requested can only reach the target size
    # by sampling with replacement; larger classes are sampled without it.
    needs_replacement = len(class_df) < num_samples_per_label
    sampled_parts.append(
        class_df.sample(n=num_samples_per_label, replace=needs_replacement, random_state=seed)
    )

# Concatenate all per-class samples once (the previous code referenced an
# undefined name and never accumulated the samples). ignore_index gives the
# result a clean 0..n-1 index, since sampling with replacement would
# otherwise leave duplicated index labels.
X_train_df_sampled = pd.concat(sampled_parts, axis=0, ignore_index=True)

In [40]:
# Decompose X_train_df_sampled into X_train and y_train again.
# Both are converted back to NumPy arrays, like X_valid/X_test and
# y_valid/y_test, because the following cells assign labels by position
# (`y_train[i] = ...`) and select feature columns with `X_train[:, idx]`,
# neither of which works on pandas objects. `copy=True` keeps the in-place
# label conversion from writing into (or being blocked by) the dataframe.
X_train = X_train_df_sampled.drop(columns='label').to_numpy(dtype=float)
y_train = X_train_df_sampled['label'].to_numpy(dtype=object, copy=True)

## Convert specific attack types to four general attack types:

In [44]:
# Specific attack names grouped into the four general attack categories.
Probe = ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan']

DoS = ['apache2', 'back', 'land', 'mailbomb', 'neptune', 'pod', 'processtable', 'smurf', 'teardrop', 'udpstorm']

# NOTE: the names of the next two groups used to be swapped. In the KDD Cup 99
# taxonomy buffer_overflow, loadmodule, perl and rootkit are user-to-root (U2R)
# attacks, while ftp_write, guess_passwd, imap, ... are remote-to-local (R2L).
# NOTE(review): some references list 'httptunnel' under R2L instead -- confirm.
U2R = ['buffer_overflow', 'httptunnel', 'loadmodule', 'perl', 'ps', 'rootkit', 'sqlattack', 'xterm']

R2L = ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'named', 'phf', 'sendmail', 'snmpgetattack', 'snmpguess',
       'spy', 'warezclient', 'warezmaster', 'worm', 'xlock', 'xsnoop']

# Flat lookup table (specific attack name -> general category), so each label
# is resolved with one dictionary lookup instead of scanning four lists.
attack_category = {
    attack: category
    for category, attacks in (('Probe', Probe), ('DoS', DoS), ('R2L', R2L), ('U2R', U2R))
    for attack in attacks
}


def to_attack_category(labels):
    """Map specific attack labels to their general attack category.

    Parameters
    ----------
    labels : iterable of str
        Labels such as 'smurf', 'nmap' or 'normal'.

    Returns
    -------
    numpy.ndarray
        New object array of the same length. Known attacks are replaced by
        'Probe', 'DoS', 'R2L' or 'U2R'; 'normal', already converted labels and
        unknown labels are passed through unchanged. Unknown labels are
        reported once per call instead of once per row.
    """
    converted = np.array([attack_category.get(label, label) for label in labels], dtype=object)

    recognised = set(attack_category.values()) | {'normal'}
    unknown = sorted(set(converted) - recognised, key=str)
    if unknown:
        print('Unknown attack type(s) encountered and left unconverted: {}'.format(unknown))

    return converted


# A fresh array is built for every split rather than assigning element by
# element, so the conversion behaves the same for NumPy arrays and pandas Series.
y_train = to_attack_category(y_train)
y_valid = to_attack_category(y_valid)
y_test = to_attack_category(y_test)

## Stratified holdout validation:

In [8]:
# One stratified shuffle split that holds out 20% of the rows; it is passed
# as the `cv` argument of the grid searches below.
sss = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=seed,
)

## Logistic Regression for OvR multiclass classification:
### Linear decision boundary:
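In scikit-learn, the `liblinear` solver used here supports L1 regularization but **not** softmax (multinomial) regression, so the model is trained one-vs-rest. <br/>
L1 regularization is used for **embedded** feature selection, because it drives the weights of uninformative features to exactly zero. <br/>
The fitted model can also serve as a feature selection **wrapper** for the subsequent SVM.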

In [None]:
# Candidate inverse regularization strengths: 30 values, log-spaced from 1e-5 to 1e5.
parameters = {'C': np.logspace(-5, 5, 30)}

# The L1 penalty yields sparse weights, which acts as a built-in feature
# selection; the one-vs-rest scheme fits one binary model per class.
l1_ovr_model = LogisticRegression(solver='liblinear', penalty='l1', multi_class='ovr')
gs_lgr = GridSearchCV(l1_ovr_model, parameters, cv=sss)
gs_lgr.fit(X_train[:10000], y_train[:10000])

# Score the best model found by the search on the validation set.
lgr = gs_lgr.best_estimator_
y_pred_valid = lgr.predict_proba(X_valid)

# Multiclass average ROC_AUC: one "this class vs. the rest" AUC per class,
# using that class's probability column, averaged over all classes.
avg_roc_auc = 0
for class_label, class_scores in zip(lgr.classes_, y_pred_valid.T):
    is_this_class = (y_valid == class_label).astype(int)
    avg_roc_auc += roc_auc_score(is_this_class, class_scores)
avg_roc_auc /= len(lgr.classes_)

print("The best average ROC_AUC for Logistic Regression on validation set is {:.4f}".format(avg_roc_auc))
display(gs_lgr.best_params_)

## Perform feature selection through the weights of previous Logistic Regression model:

In [59]:
# Keep every feature that received a non-zero weight in at least one of the
# per-class logistic regression models. Absolute values are summed so that
# weights of opposite sign in different classes cannot cancel out and make a
# feature that is actually used look unused.
weights_across_classes = np.sum(np.abs(lgr.coef_), axis=0)
idx_selected_features = np.nonzero(weights_across_classes)[0]

# np.asarray makes the column selection work whether a split is still a
# pandas DataFrame or already a NumPy array.
X_train, X_valid, X_test = (
    np.asarray(features, dtype=float)[:, idx_selected_features]
    for features in (X_train, X_valid, X_test)
)

## SVM with radial basis function (rbf) kernel for OvR multiclass classification:
### Non-linear decision boundary:

In [None]:
# Search grid: 10 log-spaced values of C (1e-5 .. 1e5) crossed with
# 5 log-spaced values of gamma (1e-5 .. 1).
parameters = {
    'gamma': np.logspace(-5, 0, 5),
    'C': np.logspace(-5, 5, 10),
}

rbf_svc = SVC(decision_function_shape='ovr', kernel='rbf')
gs_svm_rbf = GridSearchCV(rbf_svc, parameters, cv=sss)
gs_svm_rbf.fit(X_train, y_train)

# Score the best model found by the search on the validation set.
svm_rbf = gs_svm_rbf.best_estimator_
y_pred_valid = svm_rbf.decision_function(X_valid)

# Multiclass average ROC_AUC: one "this class vs. the rest" AUC per class,
# using that class's decision-function column, averaged over all classes.
avg_roc_auc = 0
for class_label, class_scores in zip(svm_rbf.classes_, y_pred_valid.T):
    is_this_class = (y_valid == class_label).astype(int)
    avg_roc_auc += roc_auc_score(is_this_class, class_scores)
avg_roc_auc /= len(svm_rbf.classes_)

print("The best average ROC_AUC for Support Vector Machine with rbf kernel on validation set is {:.4f}".format(avg_roc_auc))
display(gs_svm_rbf.best_params_)