In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split

In [2]:
class KMeans:
    centroids = None
    n_clusters = 5
    def fit(self, X, n_clusters):
        
        self.centroids = dict()
        self.n_clusters = n_clusters
        random.seed(1000)

        for i in range(0,n_clusters):
            curr_index = random.randint(0,len(X)-1)
            self.centroids[i] = X.iloc[curr_index].values

        clusters = [0]*X.shape[0]
        update_occurred = True 
        while update_occurred:
            update_occurred = False
            cluster_sum = dict()
            cluster_cardinality = dict()
            for i in range(0,n_clusters):
                cluster_sum[i] = np.zeros((1,X.shape[1]))
                cluster_cardinality[i] = 0
            index = 0
            for rand_index,row in X.iterrows():
                x = row.values
                best_cluster = -1
                best_distance = -1
                for i in range(0,n_clusters):
                    c = self.centroids[i]
                    dist = np.sum(np.square(np.subtract(c,x)))
                    if best_distance < 0 or dist < best_distance:
                        best_distance = dist
                        best_cluster = i
                if best_cluster != clusters[index]:
                    clusters[index] = best_cluster
                    update_occurred = True
                curr_cluster = clusters[index]
                cluster_sum[curr_cluster] += row.values.reshape(1,X.shape[1])
                cluster_cardinality[curr_cluster] += 1
                index += 1
            for i in range(0,n_clusters):
                self.centroids[i] = cluster_sum[i]/cluster_cardinality[i]

    def predict(self, X):
        y_predict = list()
        for index,row in X.iterrows():
            x = row.values
            best_cluster = -1
            best_distance = -1
            for i in range(0,self.n_clusters):
                c = self.centroids[i]
                dist = np.sum(np.square(np.subtract(c,x)))
                if best_distance < 0 or dist < best_distance:
                    best_distance = dist
                    best_cluster = i
            y_predict.append(best_cluster)
        return y_predict
    
    def compute_purity(self, y_predict, y_actual):
        correct = 0
        for i in range(0,len(y_actual)):
            if y_actual[i] == y_predict[i]:
                correct += 1
        purity = float(correct)/float(len(y_actual))
        return purity


In [3]:
def compute_purity(y_train, y_train_predict, y_actual, y_predict, y_label):
    
    cluster_label_map = dict()
    unique, counts = np.unique(y_train_predict, return_counts=True)
    cluster_dict = dict(zip(unique, counts))
    class_dict = dict()
    y_list = y_train[y_label].tolist()
    for i in y_train[y_label].unique():
        class_dict[i] = y_list.count(i)
    while cluster_dict:
        cluster = max(cluster_dict,key=cluster_dict.get)
        clas = max(class_dict,key=class_dict.get)
        cluster_label_map[cluster] = clas
        del cluster_dict[cluster]
        del class_dict[clas]
    y_pred = map(lambda x : cluster_label_map[x], y_predict)
    y_act = y_actual[y_label].tolist()
    
    correct = 0
    for i in range(0,len(y_act)):
        if y_act[i] == y_pred[i]:
            correct += 1
    purity = float(correct)/float(len(y_act))
    return purity

In [4]:
data = pd.read_csv("compressed_intrusion_data_b_1.csv",header=None)
heading = list()
for i in range(1,15):
    heading.append('A'+str(i))
heading.append('xAttack')
data.columns = heading
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,xAttack
0,0.999149,0.999867,0.999429,0.999334,0.998381,0.998826,0.999541,0.999238,0.999514,0.998777,0.997445,0.998937,0.999775,0.998463,dos
1,0.998301,0.999586,0.9979,0.999288,0.99898,0.999239,0.999535,0.999056,0.999458,0.997825,0.999055,0.999059,0.999442,0.997508,dos
2,0.996715,0.999078,0.997015,0.999075,0.998354,0.99889,0.999084,0.998914,0.9988,0.99625,0.998779,0.995177,0.999393,0.99356,normal
3,0.997598,0.999185,0.997257,0.998707,0.998721,0.998609,0.999049,0.99841,0.999241,0.996663,0.998553,0.996523,0.999378,0.994977,normal
4,0.999287,0.999792,0.998719,0.999638,0.999383,0.999291,0.999736,0.9993,0.999792,0.998496,0.999212,0.999503,0.999731,0.998759,probe


In [5]:
cols = list()
for i in range(1,15):
    cols.append('A'+str(i))
X_train, X_test, y_train, y_test = train_test_split(
    data[cols],
    data[['xAttack']],
    test_size=0.2,
    random_state=0)
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14
18296,0.99658,0.99885,0.996652,0.998737,0.998084,0.998447,0.998897,0.998551,0.998713,0.996109,0.998467,0.994812,0.999225,0.993711
21483,0.994615,0.998476,0.994385,0.9977,0.996425,0.997819,0.99845,0.997479,0.998453,0.994435,0.996808,0.99365,0.998734,0.992903
2039,0.996584,0.999372,0.997024,0.998655,0.99815,0.998664,0.998888,0.998475,0.99922,0.996683,0.998752,0.99639,0.999332,0.994896
8375,0.996589,0.998835,0.996684,0.998761,0.998191,0.99853,0.998918,0.998613,0.99879,0.99627,0.998498,0.995161,0.999326,0.99359
549,0.998787,0.999752,0.999227,0.999235,0.998421,0.998293,0.999272,0.998736,0.999414,0.998285,0.996649,0.998195,0.999693,0.997488


In [6]:
kms = KMeans()
kms.fit(X_train, 5)
y_pred_tr_kms = list(kms.predict(X_train))
y_pred_ts_kms = list(kms.predict(X_test))

In [7]:
train_purity = compute_purity(y_train, y_pred_tr_kms, y_train, y_pred_tr_kms, 'xAttack')
test_purity = compute_purity(y_train, y_pred_tr_kms, y_test, y_pred_ts_kms, 'xAttack')
print '******************** K-Means Clustering result ********************************'
print 'Train data set purity : '+str(round(train_purity*100,2))+'%'
print 'Test data set purity : '+str(round(test_purity*100,2))+'%'
print '************************************************************************'

******************** K-Means Clustering result ********************************
Train data set purity : 2.24%
Test data set purity : 2.36%
************************************************************************
