In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds
from sklearn import mixture

In [2]:
class PCA:
    
    def get_best_k(self, X, error_percent):
        for col in X:
            mean = X[col].mean()
            std = X[col].std()
            X[col] = (X[col] - mean)/std
            X[col] = (X[col]-mean)/std
        max_cols = X.shape[1]
        for K in range(1,max_cols):
            U,S,V = svds(X,k=K)
            X_approx = np.dot(U,np.dot(np.diag(S),V))
            new_s = np.sum(np.sum(np.square(np.subtract(X,X_approx))))
            old_s = np.sum(np.sum(np.square(X)))
            diff = float(new_s)/float(old_s)
            if diff < error_percent:
                return K
    
    def reduce_dimensions(self, X, K):
        U, S, V = svds(X, k=K)
        return U

In [3]:
class KMeans:
    centroids = None
    n_clusters = 5
    def fit(self, X, n_clusters):
        
        self.centroids = dict()
        self.n_clusters = n_clusters
        random.seed(1000)

        for i in range(0,n_clusters):
            curr_index = random.randint(0,len(X)-1)
            self.centroids[i] = X.iloc[curr_index].values

        clusters = [0]*X.shape[0]
        update_occurred = True 
        while update_occurred:
            update_occurred = False
            cluster_sum = dict()
            cluster_cardinality = dict()
            for i in range(0,n_clusters):
                cluster_sum[i] = np.zeros((1,X.shape[1]))
                cluster_cardinality[i] = 0
            index = 0
            for rand_index,row in X.iterrows():
                x = row.values
                best_cluster = -1
                best_distance = -1
                for i in range(0,n_clusters):
                    c = self.centroids[i]
                    dist = np.sum(np.square(np.subtract(c,x)))
                    if best_distance < 0 or dist < best_distance:
                        best_distance = dist
                        best_cluster = i
                if best_cluster != clusters[index]:
                    clusters[index] = best_cluster
                    update_occurred = True
                curr_cluster = clusters[index]
                cluster_sum[curr_cluster] += row.values.reshape(1,X.shape[1])
                cluster_cardinality[curr_cluster] += 1
                index += 1
            for i in range(0,n_clusters):
                self.centroids[i] = cluster_sum[i]/cluster_cardinality[i]

    def predict(self, X, cluster_label_map):
        y_predict = list()
        for index,row in X.iterrows():
            x = row.values
            best_cluster = -1
            best_distance = -1
            for i in range(0,self.n_clusters):
                c = self.centroids[i]
                dist = np.sum(np.square(np.subtract(c,x)))
                if best_distance < 0 or dist < best_distance:
                    best_distance = dist
                    best_cluster = i
            y_predict.append(cluster_label_map[best_cluster])
        return y_predict
    
    def compute_purity(self, y_predict, y_actual):
        correct = 0
        for i in range(0,len(y_actual)):
            if y_actual[i] == y_predict[i]:
                correct += 1
        purity = float(correct)/float(len(y_actual))
        return purity


In [4]:
data = pd.read_csv("intrusion_data.csv")
data.head()

Unnamed: 0,duration,service,src_bytes,dst_bytes,hot,num_failed_logins,num_compromised,num_root,num_file_creations,num_access_files,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,xAttack
0,0,20,491,0,0,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,45,146,0,0,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,50,0,0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,dos
3,0,25,232,8153,0,0,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,25,199,420,0,0,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [5]:
pca = PCA()
X = data[['duration', 'service', 'src_bytes', 'dst_bytes', 'hot', 'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations', 'num_access_files', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']].copy()

k = pca.get_best_k(X,0.1)
X_reduced = pca.reduce_dimensions(X,k)

cols = list()
for i in range(0,k):
    cols.append('A'+str(i+1))

data_reduced = pd.DataFrame(X_reduced)
data_reduced.columns = cols
data_reduced['xAttack'] = data['xAttack'].tolist()
data_reduced.head()

Unnamed: 0,A1,A2,A3,A4,A5,xAttack
0,0.001116,0.001348,0.000815,0.000106,8e-05,normal
1,-0.002101,-0.002995,0.000735,0.00017,8.4e-05,normal
2,-0.003138,0.000949,0.001517,0.000167,8.6e-05,dos
3,0.000851,0.001742,-7e-05,9e-05,8e-05,normal
4,0.00154,0.001822,0.000728,9.3e-05,8e-05,normal


In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    data_reduced[cols],
    data_reduced[['xAttack']],
    test_size=0.3,
    random_state=0)
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5
81433,-0.003082,0.000961,0.001504,0.000165,8.5e-05
15397,0.001169,0.001737,-0.000108,8.5e-05,8e-05
49130,0.001491,0.001788,0.000802,9.6e-05,8e-05
114912,0.00149,0.00179,0.000808,9.6e-05,8e-05
50858,0.006493,-0.00305,0.000647,0.000183,6.5e-05


In [16]:
kms = KMeans()
kms.fit(X_train, 5)

In [18]:
cluster_label_map = dict()

cluster_label_map[0] = 'dos'
cluster_label_map[1] = 'probe'
cluster_label_map[2] = 'u2r'
cluster_label_map[3] = 'r2l'
cluster_label_map[4] = 'normal'

y_train_pred = kms.predict(X_train,cluster_label_map)
train_purity = kms.compute_purity(y_train_pred, y_train['xAttack'].tolist())

y_test_pred = kms.predict(X_test,cluster_label_map)
test_purity = kms.compute_purity(y_test_pred, y_test['xAttack'].tolist())

print '****************************************************'
print 'Train data set purity : '+str(round(train_purity*100,2))+'%'
print 'Test data set purity : '+str(round(test_purity*100,2))+'%'
print '****************************************************'

****************************************************
Train data set purity : 76.73
Test data set purity : 76.38
****************************************************
