#### Initialization

In [1]:
import copy
import itertools
import numpy as np
import pandas as pd
from random import randint
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from statsmodels.tools.tools import add_constant
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

#### Reading Data

In [2]:
labels = pd.read_csv("labels.csv",sep = ":",header = None)
data   = pd.read_csv("kddcup.data_10_percent_corrected", names = labels.iloc[:,0].values)

In [3]:
def output_replace(df):
    replacements = {
        r'(smurf.|neptune.|back.|teardrop.|pod.|land.)' : 'dos',
        r'(normal.)' : 'normal',
        r'(satan.|ipsweep.|portsweep.|nmap.)' : 'probe',
        r'(warezclient.|guess_passwd.|warezmaster.|imap.|ftp_write.|multihop.|phf.|spy.)' : 'r2l',
        r'(buffer_overflow.|rootkit.|loadmodule.|perl.)' : 'u2r'
    }
    df.replace(replacements, regex=True, inplace=True)

#### Labeling Categorical Features

In [4]:
lb_make = LabelEncoder()
cat_columns = labels.loc[labels.iloc[:,1] == " symbolic.",0].values
data[cat_columns] = data[cat_columns].apply(lambda col: lb_make.fit_transform(col))

In [10]:
output_replace(data)
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
col_to_remove = [i for i in X.columns if len(X.loc[:,i].unique()) == 1]
a = X.drop(col_to_remove, axis = 1)

#### Method 1

In [None]:
vif = np.array([variance_inflation_factor(a.values, i) for i in range(a.shape[1])])

#### Method 3

In [6]:
size = len(a.columns)
Similiarity = np.zeros((size,size))
for col1,col2 in itertools.product(range(size), range(size)):
    pca  = PCA(n_components = 2)
    pca.fit(a.iloc[:,[col1,col2]])
    Similiarity[col1][col2] = np.amin(pca.explained_variance_)

In [43]:
k = 28
features_to_select = np.full(size, True)

while k != 1:
    t_Similiarity = copy.deepcopy(Similiarity[features_to_select])
    index = np.argpartition(t_Similiarity, k)

    min_index     = np.argmin(t_Similiarity[range(len(t_Similiarity)),index[:,k]])
    epsilon       = t_Similiarity[min_index, k]
    
    features_to_select[index[min_index,:k]] = False
    features_to_select[min_index] = True
    
    if((k + 1) > np.sum(features_to_select)):
        k = np.sum(features_to_select) - 1
        if k == 1:
            break
    
    next_epsilon = float('inf')
    while(epsilon < next_epsilon):
        k = k - 1
        if k == 1:
            break
        t_Similiarity  = Similiarity[features_to_select]
        index          = np.argpartition(t_Similiarity, k)
        min_index      = np.argmin(t_Similiarity[range(len(t_Similiarity)),index[:,k]])
        next_epsilon   = t_Similiarity[min_index, k]

print(np.sum(features_to_select))
op = a.iloc[:,features_to_select]

10


In [44]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
import warnings

def naive_bayes(X,y):
    accuracy_total = []
    precision_total = []
    recall_total = []
    f1_total = []
    acc_overall_total = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)
    mnb = MultinomialNB()
    mnb.fit(X_train, y_train)
    preds = mnb.predict(X_test)
    Accuracy = accuracy_score(y_test, preds)
    precision, recall, f1, support = precision_recall_fscore_support(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    print(cm)
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    accuracy_total.append(cm.diagonal())
    precision_total.append(precision)
    recall_total.append(recall)
    f1_total.append(f1)
    acc_overall_total.append(Accuracy)

    print("accuracy = ", np.array(accuracy_total).mean(axis=0))
    print('Precision: ', np.array(precision_total).mean(axis=0))
    print('Recall: ', np.array(recall_total).mean(axis=0))
    print('F1: ', np.array(f1_total).mean(axis=0))
    print("Accuracy overall = ", np.array(acc_overall_total).mean())
    
    return np.array(accuracy_total).mean(axis=0), np.array(f1_total).mean(axis=0)


print("Naive Bayes")
original_ueq_nb_acc, original_ueq_nb_f1 = naive_bayes(op,y)

Naive Bayes
[[56435     0    13   440 21372]
 [ 6995   182   345     6 11947]
 [    1     0     0     0   829]
 [  120     0     6    12    92]
 [    1     2     0     0     7]]
accuracy =  [0.7211219  0.00934531 0.         0.05217391 0.7       ]
Precision:  [8.88012966e-01 9.89130435e-01 0.00000000e+00 2.62008734e-02
 2.04397465e-04]
Recall:  [0.7211219  0.00934531 0.         0.05217391 0.7       ]
F1:  [7.95912899e-01 1.85156926e-02 0.00000000e+00 3.48837209e-02
 4.08675599e-04]
Accuracy overall =  0.5732098578007185


In [45]:
def knn_classify(X, Y, k):
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    knn.fit(X_train, y_train)
    preds = knn.predict(X_test)
    Accuracy = accuracy_score(y_test, preds)
    precision, recall, f1, support = precision_recall_fscore_support(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    
    print("accuracy = ", cm.diagonal())
    print('Precision: ', precision)
    print('Recall: ', recall)
    print('F1: ', f1)
    print("Accuracy overall = ", Accuracy)
    
    return cm.diagonal(), f1

original_ueq_knn_acc, original_ueq_knn_f1 = knn_classify(op, y, 9)

  'precision', 'predicted', average, warn_for)


accuracy =  [0.99984703 0.99824389 0.89628681 0.97101449 0.        ]
Precision:  [0.99879027 0.99824389 0.9929078  0.96172249 0.        ]
Recall:  [0.99984703 0.99824389 0.89628681 0.97101449 0.        ]
F1:  [0.99931837 0.99824389 0.94212651 0.96634615 0.        ]
Accuracy overall =  0.9985628257679268
