## Import necessary packages:

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

## Import data and assign a feature name to each column:

In [2]:
# Build the list of column names from the KDD Cup names file: every line after
# the first contributes the text before its first ':' as a feature name.
with open('kddcup.names') as f:
    # Skip the first line (presumably the list of attack names rather than a
    # "name: type" entry -- confirm against the file).
    next(f)
    columns = [
        line.split(':')[0].strip()
        for line in f
        # Ignore blank lines so stray empty lines cannot add a bogus,
        # whitespace-only column name and break the column count.
        if line.strip()
    ]
# The loaded data frames are given these names plus a final 'label' column.
columns.append('label')

In [3]:
def _load_labelled_csv(path):
    """Read a header-less CSV, name its columns and strip '.' from the labels."""
    frame = pd.read_csv(path, header=None)
    frame.columns = columns
    # Remove every '.' character from the label strings.
    frame['label'] = frame['label'].apply(lambda value: value.replace('.', ''))
    return frame


train_data = _load_labelled_csv('kddcup.data')

test_data = _load_labelled_csv('corrected')

In [4]:
# Remember how many rows came from the training file so the stacked frame can
# be split back by position later on.
boundary = train_data.shape[0]
# Each frame carries its own 0..n-1 row labels; ignore_index=True gives the
# stacked frame a fresh, unique index so later column-wise concatenation does
# not have to align on duplicated labels. Row order is unchanged.
whole_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

## One-hot encoding:

In [5]:
# Columns treated as categorical; they are pulled out of the frame, expanded
# with get_dummies, and put back in front of the remaining columns.
categorical = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']
whole_data_categorical = whole_data[categorical].copy()
whole_data.drop(columns=categorical, inplace=True)
# Pin the indicator dtype: newer pandas defaults to bool, and bool columns
# mixed with numeric ones turn the frame into an object array on conversion
# to NumPy, which breaks the later np.save / np.load round trip.
# NOTE(review): without `columns=`, get_dummies only expands object/string/
# category columns; if the flag columns (land, logged_in, ...) load as
# integers they pass through unchanged -- confirm that is intended.
whole_data_categorical_T = pd.get_dummies(whole_data_categorical, dtype='uint8')
whole_data = pd.concat([whole_data_categorical_T, whole_data], axis=1)

## Split data into training and test set:

In [None]:
# Undo the earlier stacking: the first `boundary` rows are the training file,
# the remainder is the test file.
train_data, test_data = whole_data.iloc[:boundary], whole_data.iloc[boundary:]

# Separate features from the target, then hold out 20% of the training rows
# as a validation set, stratified on the label.
X_train = train_data.drop(columns='label')
y_train = train_data['label']
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=0,
    stratify=y_train,
)

X_test = test_data.drop(columns='label')
y_test = test_data['label']

## Store preprocessed data for later use:

In [7]:
# Persist the splits as plain (non-object) arrays so they can be read back
# with np.load's default allow_pickle=False:
#   * features are written as float64,
#   * labels are written as fixed-width unicode strings instead of a pickled
#     object array.
np.save("X_train.npy", np.asarray(X_train, dtype=np.float64))
np.save("y_train.npy", np.asarray(y_train, dtype=str))

np.save("X_valid.npy", np.asarray(X_valid, dtype=np.float64))
np.save("y_valid.npy", np.asarray(y_valid, dtype=str))

np.save("X_test.npy", np.asarray(X_test, dtype=np.float64))
np.save("y_test.npy", np.asarray(y_test, dtype=str))

## Load preprocessed data back:

In [8]:
# Reload the arrays written by the save step. String labels (and any
# mixed-dtype feature frame) are stored as pickled object arrays, which
# np.load rejects unless allow_pickle=True is passed; the flag has no effect
# on files that hold plain numeric arrays.
# NOTE: unpickling executes arbitrary code -- only load .npy files that this
# notebook produced itself.
X_train = np.load("X_train.npy", allow_pickle=True)
y_train = np.load("y_train.npy", allow_pickle=True)

X_valid = np.load("X_valid.npy", allow_pickle=True)
y_valid = np.load("y_valid.npy", allow_pickle=True)

X_test = np.load("X_test.npy", allow_pickle=True)
y_test = np.load("y_test.npy", allow_pickle=True)

## Standardize feature vectors using StandardScaler:

In [9]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to all three splits.
sscaler = preprocessing.StandardScaler()
sscaler.fit(X_train)
X_train = sscaler.transform(X_train)
X_valid = sscaler.transform(X_valid)
X_test = sscaler.transform(X_test)

## Logistic Regression for OvR multiclass classification:

In [None]:
# Stratified holdout validation: a single 80/20 split used as the CV scheme.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

parameters = {
    # Wider search, disabled in favour of the single value below:
    #'C': np.logspace(-5, 5, 30)
    'C': [1]
}

# L1 regularization mimics the feature selection process due to its sparsity.
# OvR easily handles unknown test class problem.
# saga is a stochastic solver, so random_state is fixed to keep the fit
# reproducible (matching the random_state=0 used for the data splits).
gs_smr = GridSearchCV(
    LogisticRegression(
        multi_class='ovr',
        penalty='l1',
        class_weight='balanced',
        solver='saga',
        random_state=0,
    ),
    parameters,
    cv=sss,
)
gs_smr.fit(X_train, y_train)

smr = gs_smr.best_estimator_
y_pred_valid = smr.predict_proba(X_valid)

# Multiclass average ROC_AUC score: one-vs-rest AUC per fitted class, averaged
# with equal weight. roc_auc_score raises ValueError when the binary target
# contains a single class, so classes with no positive (or no negative) sample
# in the validation labels are skipped and reported instead of aborting.
y_valid_array = np.asarray(y_valid)
per_class_auc = []
skipped_labels = []
for i, label in enumerate(smr.classes_):
    y_valid_label = (y_valid_array == label).astype(int)
    positives = y_valid_label.sum()
    if positives == 0 or positives == y_valid_label.size:
        skipped_labels.append(label)
        continue
    y_pred_valid_label = y_pred_valid[:, i]
    per_class_auc.append(roc_auc_score(y_valid_label, y_pred_valid_label))

# NaN signals that no class could be scored at all.
avg_roc_auc = sum(per_class_auc) / len(per_class_auc) if per_class_auc else float('nan')

if skipped_labels:
    print("ROC_AUC undefined (single class in validation labels) for: {}".format(skipped_labels))

print("The best average ROC_AUC for Softmax Regression on validation set is {:.4f}".format(avg_roc_auc))
display(gs_smr.best_params_)