In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds
from sklearn import mixture
from sklearn.cluster import AgglomerativeClustering

In [2]:
class PCA:
    
    def get_best_k(self, X, error_percent):
        for col in X:
            mean = X[col].mean()
            std = X[col].std()
            X[col] = (X[col] - mean)/std
#             X[col] = (X[col]-mean)/std
        max_cols = X.shape[1]
        for K in range(1,max_cols):
            U,S,V = svds(X,k=K)
            X_approx = np.dot(U,np.dot(np.diag(S),V))
            new_s = np.sum(np.sum(np.square(np.subtract(X,X_approx))))
            old_s = np.sum(np.sum(np.square(X)))
            diff = float(new_s)/float(old_s)
            if diff < error_percent:
                return K
    
    def reduce_dimensions(self, X, K):
        U, S, V = svds(X, k=K)
        return U

In [3]:
class KMeans:
    centroids = None
    n_clusters = 5
    def fit(self, X, n_clusters):
        
        self.centroids = dict()
        self.n_clusters = n_clusters
        random.seed(1000)

        for i in range(0,n_clusters):
            curr_index = random.randint(0,len(X)-1)
            self.centroids[i] = X.iloc[curr_index].values

        clusters = [0]*X.shape[0]
        update_occurred = True 
        while update_occurred:
            update_occurred = False
            cluster_sum = dict()
            cluster_cardinality = dict()
            for i in range(0,n_clusters):
                cluster_sum[i] = np.zeros((1,X.shape[1]))
                cluster_cardinality[i] = 0
            index = 0
            for rand_index,row in X.iterrows():
                x = row.values
                best_cluster = -1
                best_distance = -1
                for i in range(0,n_clusters):
                    c = self.centroids[i]
                    dist = np.sum(np.square(np.subtract(c,x)))
                    if best_distance < 0 or dist < best_distance:
                        best_distance = dist
                        best_cluster = i
                if best_cluster != clusters[index]:
                    clusters[index] = best_cluster
                    update_occurred = True
                curr_cluster = clusters[index]
                cluster_sum[curr_cluster] += row.values.reshape(1,X.shape[1])
                cluster_cardinality[curr_cluster] += 1
                index += 1
            for i in range(0,n_clusters):
                self.centroids[i] = cluster_sum[i]/cluster_cardinality[i]

    def predict(self, X):
        y_predict = list()
        for index,row in X.iterrows():
            x = row.values
            best_cluster = -1
            best_distance = -1
            for i in range(0,self.n_clusters):
                c = self.centroids[i]
                dist = np.sum(np.square(np.subtract(c,x)))
                if best_distance < 0 or dist < best_distance:
                    best_distance = dist
                    best_cluster = i
            y_predict.append(best_cluster)
        return y_predict
    
    def compute_purity(self, y_predict, y_actual):
        correct = 0
        for i in range(0,len(y_actual)):
            if y_actual[i] == y_predict[i]:
                correct += 1
        purity = float(correct)/float(len(y_actual))
        return purity


In [4]:
data = pd.read_csv("intrusion_data.csv")
data.head()

Unnamed: 0,duration,service,src_bytes,dst_bytes,hot,num_failed_logins,num_compromised,num_root,num_file_creations,num_access_files,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,xAttack
0,0,25,193,441,0,0,0,0,0,0,...,255,1.0,0.0,0.07,0.04,0.0,0.04,0.0,0.0,normal
1,0,38,0,0,0,0,0,0,0,0,...,1,0.0,0.07,0.0,0.0,0.0,0.0,1.0,1.0,dos
2,0,25,167,9724,0,0,0,0,0,0,...,255,1.0,0.0,0.03,0.06,0.0,0.0,0.0,0.0,normal
3,0,20,1339,0,0,0,0,0,0,0,...,31,0.23,0.04,0.23,0.0,0.02,0.0,0.0,0.0,normal
4,0,37,0,0,0,0,0,0,0,0,...,25,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,dos


In [5]:
pca = PCA()
X = data[['duration', 'service', 'src_bytes', 'dst_bytes', 'hot', 'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations', 'num_access_files', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']].copy()

k = pca.get_best_k(X,0.1)
X_reduced = pca.reduce_dimensions(X,k)

cols = list()
for i in range(0,k):
    cols.append('A'+str(i+1))

data_reduced = pd.DataFrame(X_reduced)
data_reduced.columns = cols
data_reduced['xAttack'] = data['xAttack'].tolist()
data_reduced.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,xAttack
0,-0.002969,0.003576,0.000561,0.0001,0.000235,-0.000731,-0.000757,0.000434,0.001441,0.002427,0.000412,-0.000315,0.002731,-0.006115,normal
1,-0.002062,-0.003213,0.000425,-0.000181,-0.000503,0.001537,0.001171,-0.003214,0.008887,0.004238,0.005206,-0.00106,-0.016055,0.002908,dos
2,-0.003356,0.005635,0.000333,-3.9e-05,-0.000537,-0.000556,-0.000996,0.000584,0.000932,0.004289,0.000143,-0.000236,0.002607,-0.005963,normal
3,-0.00319,0.004051,-0.002394,-0.000323,-0.000392,0.001021,0.000166,9.4e-05,-0.003254,0.000866,-0.001226,1.6e-05,0.000984,-0.002709,normal
4,0.000557,-0.000543,0.000275,8.7e-05,-2.8e-05,8e-05,0.000235,-0.001652,0.003063,-0.0028,0.001162,-0.000199,0.003639,0.009708,dos


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    data_reduced[cols],
    data_reduced[['xAttack']],
    test_size=0.3,
    random_state=0)
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14
8293,0.002993,-0.000264,0.000374,-0.001043,0.000447,-0.000569,-0.001655,0.000573,-0.00153,0.003115,0.005438,-0.000509,0.002552,-0.004666
23442,0.008639,0.010591,-0.001638,0.000226,-0.000896,0.002523,0.000762,-0.00281,0.00411,-0.001876,0.000412,-0.000369,0.004292,0.008012
13790,-0.002682,-0.001967,-0.000175,-0.000652,0.000228,-0.000465,-0.000745,-0.000546,0.000588,0.001411,-0.00271,0.000258,0.003298,0.009618
4477,0.001895,-0.000552,0.001539,0.000493,0.000474,-0.000648,-0.000431,0.001013,0.002964,0.002215,-0.003299,-0.000199,0.002691,-0.006781
708,0.000868,0.003774,0.002506,0.003743,-0.000378,-0.001565,0.001704,0.013202,-0.010367,-0.002645,-0.004145,-0.000248,-0.016356,0.002328


In [7]:
def compute_purity(y_train, y_train_predict, y_actual, y_predict, y_label):
    
    cluster_label_map = dict()
    unique, counts = np.unique(y_train_predict, return_counts=True)
    cluster_dict = dict(zip(unique, counts))
    class_dict = dict()
    y_list = y_train[y_label].tolist()
    for i in y_train[y_label].unique():
        class_dict[i] = y_list.count(i)
    while cluster_dict:
        cluster = max(cluster_dict,key=cluster_dict.get)
        clas = max(class_dict,key=class_dict.get)
        cluster_label_map[cluster] = clas
        del cluster_dict[cluster]
        del class_dict[clas]
    y_pred = map(lambda x : cluster_label_map[x], y_predict)
    y_act = y_actual[y_label].tolist()
    
    correct = 0
    for i in range(0,len(y_act)):
        if y_act[i] == y_pred[i]:
            correct += 1
    purity = float(correct)/float(len(y_act))
    return purity

In [8]:
kms = KMeans()
kms.fit(X_train, 5)
y_pred_tr_kms = list(kms.predict(X_train))
y_pred_ts_kms = list(kms.predict(X_test))


In [10]:
train_purity = compute_purity(y_train, y_pred_tr_kms, y_train, y_pred_tr_kms, 'xAttack')
test_purity = compute_purity(y_train, y_pred_tr_kms, y_test, y_pred_ts_kms, 'xAttack')
print '******************** K-Means Clustering result ********************************'
print 'Train data set purity : '+str(round(train_purity*100,2))+'%'
print 'Test data set purity : '+str(round(test_purity*100,2))+'%'
print '************************************************************************'

******************** K-Means Clustering result ********************************
Train data set purity : 76.04%
Test data set purity : 75.95%
************************************************************************


In [11]:
gmm = mixture.GaussianMixture(n_components=5)
gmm.fit(X_train)
y_pred_tr_gmm = list(gmm.predict(X_train))
y_pred_ts_gmm = list(gmm.predict(X_test))

In [19]:
train_purity = compute_purity(y_train, y_pred_tr_gmm, y_train, y_pred_tr_gmm, 'xAttack')
test_purity = compute_purity(y_train, y_pred_tr_gmm, y_test, y_pred_ts_gmm, 'xAttack')
print '******************** Gaussian mixture models Clustering result ********************************'
print 'Train data set purity : '+str(round(train_purity*100,2))+'%'
print 'Test data set purity : '+str(round(test_purity*100,2))+'%'
print '************************************************************************'

******************** Gaussian mixture models Clustering result ********************************
Train data set purity : 63.6%
Test data set purity : 62.49%
************************************************************************


In [15]:
agg = AgglomerativeClustering(n_clusters=5)
y_pred_tr_agg = agg.fit_predict(X_train)
y_pred_ts_agg = agg.fit_predict(X_test)

In [18]:
train_purity = compute_purity(y_train, y_pred_tr_agg, y_train, y_pred_tr_agg, 'xAttack')
test_purity = compute_purity(y_train, y_pred_tr_agg, y_test, y_pred_ts_agg, 'xAttack')
print '******************** Gaussian mixture models Clustering result ********************************'
print 'Train data set purity : '+str(round(train_purity*100,2))+'%'
print 'Test data set purity : '+str(round(test_purity*100,2))+'%'

******************** Gaussian mixture models Clustering result ********************************
Train data set purity : 79.1%
Test data set purity : 52.65%
