## Import necessary packages:

In [11]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.metrics import average_precision_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

seed = 0

Probe = ['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan']

DoS = ['apache2', 'back', 'land', 'mailbomb', 'neptune', 'pod', 'processtable', 'smurf', 'teardrop', 'udpstorm']

R2L = ['buffer_overflow', 'httptunnel', 'loadmodule', 'perl', 'ps', 'rootkit', 'sqlattack', 'xterm']

U2R = ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'named', 'phf', 'sendmail', 'snmpgetattack', 'snmpguess',
       'spy', 'warezclient', 'warezmaster', 'worm', 'xlock', 'xsnoop']

## Import data and assign feature name to each column:

In [2]:
columns = []
with open('../input/kddcup.names') as f:
    next(f)
    for line in f:
        feature = line.split(':')[0]
        columns.append(feature)
columns.append('label')

In [3]:
train_data = pd.read_csv('../input/kddcup.data', header=None)
train_data.columns = columns
train_data['label'] = train_data['label'].apply(lambda x: x.replace('.', ''))

test_data = pd.read_csv('../input/corrected', header=None)
test_data.columns = columns
test_data['label'] = test_data['label'].apply(lambda x: x.replace('.', ''))

In [4]:
boundary = train_data.shape[0]
whole_data = pd.concat([train_data, test_data], axis=0)

## One-hot encoding:

In [5]:
categorical = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']
whole_data_categorical = whole_data[categorical].copy()
whole_data.drop(columns=categorical, inplace=True)
whole_data_categorical_T = pd.get_dummies(whole_data_categorical)
whole_data = pd.concat([whole_data_categorical_T, whole_data], axis=1)

## Split data into training and test set:

In [6]:
train_data = whole_data.iloc[:boundary]
test_data = whole_data.iloc[boundary:]

X_train, y_train = train_data.drop(columns='label'), train_data['label']
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=seed)
X_test, y_test = test_data.drop(columns='label'), test_data['label']

## Convert specific attack types to four general attack types in y_valid and y_test:

In [7]:
for attack in Probe:
    y_valid = y_valid.replace(attack, 'Probe')
    y_test = y_test.replace(attack, 'Probe')
    
for attack in DoS:
    y_valid = y_valid.replace(attack, 'DoS')
    y_test = y_test.replace(attack, 'DoS')
    
for attack in R2L:
    y_valid = y_valid.replace(attack, 'R2L')
    y_test = y_test.replace(attack, 'R2L')
    
for attack in U2R:
    y_valid = y_valid.replace(attack, 'U2R')
    y_test = y_test.replace(attack, 'U2R')

## Store preprocessed data for later usage:

In [8]:
X_train.to_pickle("X_train.pkl")
y_train.to_pickle("y_train.pkl")

X_valid.to_pickle("X_valid.pkl")
y_valid.to_pickle("y_valid.pkl")

X_test.to_pickle("X_test.pkl")
y_test.to_pickle("y_test.pkl")

## Load preprocessed data back:

In [2]:
X_train = pd.read_pickle("X_train.pkl")
y_train = pd.read_pickle("y_train.pkl")

X_valid = pd.read_pickle("X_valid.pkl")
y_valid = pd.read_pickle("y_valid.pkl")

X_test = pd.read_pickle("X_test.pkl")
y_test = pd.read_pickle("y_test.pkl")

## Sample a subset of training data from the original training set:

In [3]:
# Recombine X_train and y_train into a dataframe:
X_train_df = pd.concat([X_train, y_train], axis=1)

In [4]:
# Divide instances by label:
label_list = np.unique(X_train_df['label'])

df_label_list_dict = {'Probe': [], 'DoS': [], 'R2L': [], 'U2R': [], 'normal': []}
for label in label_list:
    df_label = X_train_df[X_train_df['label'] == label]
    
    if label in Probe:
        df_label_list_dict['Probe'].append(df_label)
    elif label in DoS:
        df_label_list_dict['DoS'].append(df_label)
    elif label in R2L:
        df_label_list_dict['R2L'].append(df_label)
    elif label in U2R:
        df_label_list_dict['U2R'].append(df_label)
    else:
        df_label_list_dict['normal'].append(df_label)

In [5]:
# Determine the maximum number of instances to be sampled from each label:
num_samples_per_label = 2000

X_train_df_sampled = pd.DataFrame()
sample_weights = []

for cls, df_label_list in df_label_list_dict.items():
    sample_distribution_within_cls = []
    
    for df_label in df_label_list:
        num_instances = df_label.shape[0]
        num_samples = min(num_instances, num_samples_per_label)
        
        sample_distribution_within_cls.append(num_samples)
        df_sampled = df_label.sample(n=num_samples, random_state=seed)
        X_train_df_sampled = pd.concat([X_train_df_sampled, df_sampled], axis=0)
        
    sample_distribution_within_cls = np.array(sample_distribution_within_cls)
    weight_per_sample_group = sample_distribution_within_cls.sum() / sample_distribution_within_cls
    
    for i in range(len(sample_distribution_within_cls)):
        sample_weights += [weight_per_sample_group[i]] * sample_distribution_within_cls[i]

In [6]:
# Decompose X_train_df_sampled into X_train and y_train again:
X_train, y_train = X_train_df_sampled.drop(columns='label'), X_train_df_sampled['label']

## Convert specific attack types to four general attack types in y_train:

In [7]:
for attack in Probe:
    y_train = y_train.replace(attack, 'Probe')
    
for attack in DoS:
    y_train = y_train.replace(attack, 'DoS')
    
for attack in R2L:
    y_train = y_train.replace(attack, 'R2L')
    
for attack in U2R:
    y_train = y_train.replace(attack, 'U2R')

## Stratified holdout validation:

In [8]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

## Stratified Cross Validation:

In [9]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

## Make custom scorers for grid search:

In [22]:
def prob_avg_PR_scorer(clf, X, y):
    y_pred = clf.predict_proba(X)
    return average_precision_score(preprocessing.LabelBinarizer().fit_transform(y), y_pred, average='macro')

## Random Forest for multiclass classification:

In [23]:
parameters = {
    'n_estimators': [50, 100, 200, 400, 800, 1600],
    'max_features': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.]
}

gs_rf = GridSearchCV(RandomForestClassifier(criterion='entropy'), parameters, prob_avg_PR_scorer, cv=sss)
gs_rf.fit(X_train, y_train, sample_weights)
rf = gs_rf.best_estimator_

# Multiclass macro average PR_AUC:
avg_PR = prob_avg_PR_scorer(rf, X_valid, y_valid)

print ("The best average PR_AUC for Random Forest on validation set is {:.4f}".format(avg_PR))
display(gs_rf.best_params_)

The best average PR_AUC for Random Forest on validation set is 0.9479


{'max_features': 0.6, 'n_estimators': 400}

## Plot confusion matrix of Random Forest on validation set:

In [24]:
y_pred_valid = rf.predict(X_valid)
cm = confusion_matrix(y_valid, y_pred_valid)
cm = pd.DataFrame(data=cm, index=[rf.classes_], columns=[rf.classes_])

In [25]:
percent_correct_horizontal = cm.sum(axis=1)
for i in range(cm.shape[0]):
    percent_correct_horizontal.iloc[i] = cm.iloc[i, i] / percent_correct_horizontal.iloc[i]
percent_correct_horizontal = round(percent_correct_horizontal * 100, 2)
percent_correct_horizontal = percent_correct_horizontal.apply(lambda x: str(x) + '%')

In [26]:
percent_correct_vertical = cm.sum(axis=0)
for i in range(cm.shape[0]):
    percent_correct_vertical.iloc[i] = cm.iloc[i, i] / percent_correct_vertical.iloc[i]
percent_correct_vertical = round(percent_correct_vertical * 100, 2)
percent_correct_vertical = percent_correct_vertical.apply(lambda x: str(x) + '%')

In [27]:
cm['Recall%'] = percent_correct_horizontal
percent_correct_vertical.name = ('Precision%', 5)
cm = cm.append(percent_correct_vertical)
display(cm)

Unnamed: 0,DoS,Probe,R2L,U2R,normal,Recall%
DoS,776669,6,0,0,0,100.0%
Probe,0,8215,0,0,5,99.94%
R2L,0,0,10,1,0,90.91%
U2R,0,0,5,220,0,97.78%
normal,16,175,85,463,193817,99.62%
Precision%,100.0%,97.84%,10.0%,32.16%,100.0%,
