## Import necessary packages:

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.metrics import average_precision_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

seed = 0

Probe = ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan']

DoS = ['apache2', 'back', 'land', 'mailbomb', 'neptune', 'pod', 'processtable', 'smurf', 'teardrop', 'udpstorm']

R2L = ['buffer_overflow', 'httptunnel', 'loadmodule', 'perl', 'ps', 'rootkit', 'sqlattack', 'xterm']

U2R = ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'named', 'phf', 'sendmail', 'snmpgetattack', 'snmpguess',
       'spy', 'warezclient', 'warezmaster', 'worm', 'xlock', 'xsnoop']

## Import data and assign feature name to each column:

In [2]:
columns = []
with open('kddcup.names') as f:
    next(f)
    for line in f:
        feature = line.split(':')[0]
        columns.append(feature)
columns.append('label')

In [3]:
train_data = pd.read_csv('kddcup.data', header=None)
train_data.columns = columns
train_data['label'] = train_data['label'].apply(lambda x: x.replace('.', ''))

test_data = pd.read_csv('corrected', header=None)
test_data.columns = columns
test_data['label'] = test_data['label'].apply(lambda x: x.replace('.', ''))

In [4]:
boundary = train_data.shape[0]
whole_data = pd.concat([train_data, test_data], axis=0)

## One-hot encoding:

In [5]:
categorical = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']
whole_data_categorical = whole_data[categorical].copy()
whole_data.drop(columns=categorical, inplace=True)
whole_data_categorical_T = pd.get_dummies(whole_data_categorical)
whole_data = pd.concat([whole_data_categorical_T, whole_data], axis=1)

## Split data into training and test set:

In [6]:
train_data = whole_data.iloc[:boundary]
test_data = whole_data.iloc[boundary:]

X_train, y_train = train_data.drop(columns='label'), train_data['label']
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=seed)
X_test, y_test = test_data.drop(columns='label'), test_data['label']

## Convert specific attack types to four general attack types in y_valid and y_test:

In [7]:
for attack in Probe:
    y_valid = y_valid.replace(attack, 'Probe')
    y_test = y_test.replace(attack, 'Probe')
    
for attack in DoS:
    y_valid = y_valid.replace(attack, 'DoS')
    y_test = y_test.replace(attack, 'DoS')
    
for attack in R2L:
    y_valid = y_valid.replace(attack, 'R2L')
    y_test = y_test.replace(attack, 'R2L')
    
for attack in U2R:
    y_valid = y_valid.replace(attack, 'U2R')
    y_test = y_test.replace(attack, 'U2R')

## Store preprocessed data for later usage:

In [8]:
X_train.to_pickle("X_train.pkl")
y_train.to_pickle("y_train.pkl")

X_valid.to_pickle("X_valid.pkl")
y_valid.to_pickle("y_valid.pkl")

X_test.to_pickle("X_test.pkl")
y_test.to_pickle("y_test.pkl")

## Load preprocessed data back:

In [2]:
X_train = pd.read_pickle("X_train.pkl")
y_train = pd.read_pickle("y_train.pkl")

X_valid = pd.read_pickle("X_valid.pkl")
y_valid = pd.read_pickle("y_valid.pkl")

X_test = pd.read_pickle("X_test.pkl")
y_test = pd.read_pickle("y_test.pkl")

## (new version) Sample a subset of training data from the original training set:

In [3]:
# Recombine X_train and y_train into a dataframe:
X_train_df = pd.concat([X_train, y_train], axis=1)

In [4]:
# Divide instances by label:
label_list = np.unique(X_train_df['label'])

df_label_list_dict = {'Probe': [], 'DoS': [], 'R2L': [], 'U2R': [], 'normal': []}
for label in label_list:
    df_label = X_train_df[X_train_df['label'] == label]
    
    if label in Probe:
        df_label_list_dict['Probe'].append(df_label)
    elif label in DoS:
        df_label_list_dict['DoS'].append(df_label)
    elif label in R2L:
        df_label_list_dict['R2L'].append(df_label)
    elif label in U2R:
        df_label_list_dict['U2R'].append(df_label)
    else:
        df_label_list_dict['normal'].append(df_label)

In [5]:
# Determine the maximum number of instances to be sampled from each label:
num_samples_per_label = 500

X_train_df_sampled = pd.DataFrame()
sample_weights = []

for cls, df_label_list in df_label_list_dict.items():
    sample_distribution_within_cls = []
    
    for df_label in df_label_list:
        num_instances = df_label.shape[0]
        num_samples = min(num_instances, num_samples_per_label)
        
        sample_distribution_within_cls.append(num_samples)
        df_sampled = df_label.sample(n=num_samples, random_state=seed)
        X_train_df_sampled = pd.concat([X_train_df_sampled, df_sampled], axis=0)
        
    sample_distribution_within_cls = np.array(sample_distribution_within_cls)
    weight_per_sample_group = sample_distribution_within_cls.sum() / sample_distribution_within_cls
    
    for i in range(len(sample_distribution_within_cls)):
        sample_weights += [weight_per_sample_group[i]] * sample_distribution_within_cls[i]

In [6]:
# Decompose X_train_df_sampled into X_train and y_train again:
X_train, y_train = X_train_df_sampled.drop(columns='label'), X_train_df_sampled['label']

## (old version) Sample a subset of training data from the original training set:

In [3]:
# Recombine X_train and y_train into a dataframe:
X_train_df = pd.concat([X_train, y_train], axis=1)

In [4]:
# Divide instances by label:
label_list = np.unique(X_train_df['label'])

df_label_list = []
for label in label_list:
    df_label_list.append(X_train_df[X_train_df['label'] == label])

In [5]:
# Determine how many instances are sampled from each label:
num_samples_per_label = 500
X_train_df_sampled = pd.DataFrame()

for df in df_label_list:
    num_instances = df.shape[0]
    
    replace = False
    if num_instances < num_samples_per_label:
        replace = True
    
    df_sampled = df.sample(n=num_samples_per_label, replace=replace, random_state=seed)
    X_train_df_sampled = pd.concat([X_train_df_sampled, df_sampled], axis=0)

In [6]:
# Decompose X_train_df_sampled into X_train and y_train again:
X_train, y_train = X_train_df_sampled.drop(columns='label'), X_train_df_sampled['label']

## Convert specific attack types to four general attack types in y_train:

In [7]:
for attack in Probe:
    y_train = y_train.replace(attack, 'Probe')
    
for attack in DoS:
    y_train = y_train.replace(attack, 'DoS')
    
for attack in R2L:
    y_train = y_train.replace(attack, 'R2L')
    
for attack in U2R:
    y_train = y_train.replace(attack, 'U2R')

## Standardize feature vectors using StandardScaler:

In [8]:
sscaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_valid, X_test = sscaler.transform(X_train), sscaler.transform(X_valid), sscaler.transform(X_test)

  return self.partial_fit(X, y)
  
  
  


## Stratified holdout validation:

In [9]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

## Stratified Cross Validation:

In [10]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

## Softmax Regression for multinomial classification:
### Linear decision boundary:
Combine L1 regularization for **embedded** feature selection. <br/>
Can also serve as a feature selection **wrapper** for the subsequent SVM.

In [11]:
parameters = {
    'C': np.logspace(-5, 5, 10)
}

# L1 regularization mimics the feature selection process due to its sparsity.
gs_lgr = GridSearchCV(LogisticRegression(multi_class='multinomial', penalty='l1', class_weight='balanced', solver='saga'), parameters, cv=skf)
gs_lgr.fit(X_train, y_train, sample_weights)

lgr = gs_lgr.best_estimator_
y_pred_valid = lgr.predict_proba(X_valid)

# Multiclass average PR score:
avg_PR = 0
for i, label in enumerate(lgr.classes_):
    y_valid_label = (y_valid == label).astype(int)
    y_pred_valid_label = y_pred_valid[:, i]
    avg_PR += average_precision_score(y_valid_label, y_pred_valid_label)
    
avg_PR /= len(lgr.classes_)

print ("The best average PR score for Logistic Regression on validation set is {:.4f}".format(avg_PR))
display(gs_lgr.best_params_)





The best average PR score for Logistic Regression on validation set is 0.6619


{'C': 599.4842503189421}

## Plot confusion matrix of Softmax Regression on validation set:

In [12]:
y_pred_valid = lgr.predict(X_valid)
cm = confusion_matrix(y_valid, y_pred_valid)
cm = pd.DataFrame(data=cm, index=[lgr.classes_], columns=[lgr.classes_])

In [13]:
percent_correct_horizontal = cm.sum(axis=1)
for i in range(cm.shape[0]):
    percent_correct_horizontal.iloc[i] = cm.iloc[i, i] / percent_correct_horizontal.iloc[i]
percent_correct_horizontal = round(percent_correct_horizontal * 100, 2)
percent_correct_horizontal = percent_correct_horizontal.apply(lambda x: str(x) + '%')

In [14]:
percent_correct_vertical = cm.sum(axis=0)
for i in range(cm.shape[0]):
    percent_correct_vertical.iloc[i] = cm.iloc[i, i] / percent_correct_vertical.iloc[i]
percent_correct_vertical = round(percent_correct_vertical * 100, 2)
percent_correct_vertical = percent_correct_vertical.apply(lambda x: str(x) + '%')

In [15]:
cm['%correct'] = percent_correct_horizontal
percent_correct_vertical.name = ('%Correct', 5)
cm = cm.append(percent_correct_vertical)
display(cm)

Unnamed: 0,DoS,Probe,R2L,U2R,normal,%correct
DoS,774560,1212,326,548,29,99.73%
Probe,38,8067,66,21,28,98.14%
R2L,0,0,8,3,0,72.73%
U2R,1,0,3,221,0,98.22%
normal,795,2372,1580,6799,183010,94.07%
%Correct,99.89%,69.24%,0.4%,2.91%,99.97%,


## Perform feature selection through the weights of previous Logistic Regression model:

In [16]:
weights_across_classes = np.sum(lgr.coef_, axis=0)
idx_selected_features = np.nonzero(weights_across_classes)[0]
selected_features = X_train_df.columns[idx_selected_features]
display(selected_features)

X_train, X_valid, X_test = X_train[:, idx_selected_features], X_valid[:, idx_selected_features], X_test[:, idx_selected_features]

Index(['land', 'logged_in', 'is_guest_login', 'protocol_type_icmp',
       'protocol_type_tcp', 'protocol_type_udp', 'service_Z39_50',
       'service_auth', 'service_bgp', 'service_csnet_ns', 'service_ctf',
       'service_daytime', 'service_discard', 'service_domain',
       'service_domain_u', 'service_eco_i', 'service_ecr_i', 'service_exec',
       'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher',
       'service_hostnames', 'service_http', 'service_imap4', 'service_kshell',
       'service_ldap', 'service_link', 'service_login', 'service_mtp',
       'service_name', 'service_netbios_dgm', 'service_netstat',
       'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other',
       'service_pop_3', 'service_printer', 'service_private',
       'service_remote_job', 'service_shell', 'service_smtp',
       'service_sql_net', 'service_ssh', 'service_supdup', 'service_systat',
       'service_telnet', 'service_tim_i', 'service_time', 'service_urp_i',
       'se

## SVM with radial basis function (rbf) kernel for OvR multiclass classification:
### Non-linear decision boundary:

In [None]:
parameters = {
    'C': np.logspace(-5, 5, 10),
    'gamma': np.logspace(-5, 0, 5)
}

gs_svm_rbf = GridSearchCV(SVC(kernel='rbf', decision_function_shape='ovr', class_weight='balanced'), parameters, cv=skf)
gs_svm_rbf.fit(X_train, y_train)

svm_rbf = gs_svm_rbf.best_estimator_
y_pred_valid = svm_rbf.decision_function(X_valid)

# Multiclass average PR score:
avg_PR = 0
for i, label in enumerate(svm_rbf.classes_):
    y_valid_label = (y_valid == label).astype(int)
    y_pred_valid_label = y_pred_valid[:, i]
    avg_PR += average_precision_score(y_valid_label, y_pred_valid_label)
    
avg_PR /= len(svm_rbf.classes_)

print ("The best average PR score for Support Vector Machine with rbf kernel on validation set is {:.4f}".format(avg_PR))
display(gs_svm_rbf.best_params_)

## Plot confusion matrix of SVM on validation set:

In [23]:
y_pred_valid = svm_rbf.predict(X_valid)
cm = confusion_matrix(y_valid, y_pred_valid)
cm = pd.DataFrame(data=cm, index=[svm_rbf.classes_], columns=[svm_rbf.classes_])

In [24]:
percent_correct_horizontal = cm.sum(axis=1)
for i in range(cm.shape[0]):
    percent_correct_horizontal.iloc[i] = cm.iloc[i, i] / percent_correct_horizontal.iloc[i]
percent_correct_horizontal = round(percent_correct_horizontal * 100, 2)
percent_correct_horizontal = percent_correct_horizontal.apply(lambda x: str(x) + '%')

In [25]:
percent_correct_vertical = cm.sum(axis=0)
for i in range(cm.shape[0]):
    percent_correct_vertical.iloc[i] = cm.iloc[i, i] / percent_correct_vertical.iloc[i]
percent_correct_vertical = round(percent_correct_vertical * 100, 2)
percent_correct_vertical = percent_correct_vertical.apply(lambda x: str(x) + '%')

In [26]:
cm['%correct'] = percent_correct_horizontal
percent_correct_vertical.name = ('%Correct', 5)
cm = cm.append(percent_correct_vertical)
display(cm)

Unnamed: 0,DoS,Probe,R2L,U2R,normal,%correct
DoS,774142,1768,0,143,622,99.67%
Probe,37,8087,61,14,21,98.38%
R2L,0,0,6,5,0,54.55%
U2R,0,3,2,206,14,91.56%
normal,834,2032,1269,2273,188148,96.71%
%Correct,99.89%,68.02%,0.45%,7.8%,99.65%,
