In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split

In [2]:
class KMeans:
    centroids = None
    n_clusters = 5
    def fit(self, X, n_clusters):
        
        self.centroids = dict()
        self.n_clusters = n_clusters
        random.seed(1000)

        for i in range(0,n_clusters):
            curr_index = random.randint(0,len(X)-1)
            self.centroids[i] = X.iloc[curr_index].values

        clusters = [0]*X.shape[0]
        update_occurred = True 
        while update_occurred:
            update_occurred = False
            cluster_sum = dict()
            cluster_cardinality = dict()
            for i in range(0,n_clusters):
                cluster_sum[i] = np.zeros((1,X.shape[1]))
                cluster_cardinality[i] = 0
            index = 0
            for rand_index,row in X.iterrows():
                x = row.values
                best_cluster = -1
                best_distance = -1
                for i in range(0,n_clusters):
                    c = self.centroids[i]
                    dist = np.sum(np.square(np.subtract(c,x)))
                    if best_distance < 0 or dist < best_distance:
                        best_distance = dist
                        best_cluster = i
                if best_cluster != clusters[index]:
                    clusters[index] = best_cluster
                    update_occurred = True
                curr_cluster = clusters[index]
                cluster_sum[curr_cluster] += row.values.reshape(1,X.shape[1])
                cluster_cardinality[curr_cluster] += 1
                index += 1
            for i in range(0,n_clusters):
                self.centroids[i] = cluster_sum[i]/cluster_cardinality[i]

    def predict(self, X):
        y_predict = list()
        for index,row in X.iterrows():
            x = row.values
            best_cluster = -1
            best_distance = -1
            for i in range(0,self.n_clusters):
                c = self.centroids[i]
                dist = np.sum(np.square(np.subtract(c,x)))
                if best_distance < 0 or dist < best_distance:
                    best_distance = dist
                    best_cluster = i
            y_predict.append(best_cluster)
        return y_predict
    
    def compute_purity(self, y_predict, y_actual):
        correct = 0
        for i in range(0,len(y_actual)):
            if y_actual[i] == y_predict[i]:
                correct += 1
        purity = float(correct)/float(len(y_actual))
        return purity


In [3]:
def compute_purity(y_train, y_train_predict, y_actual, y_predict, y_label):
    
    cluster_label_map = dict()
    unique, counts = np.unique(y_train_predict, return_counts=True)
    cluster_dict = dict(zip(unique, counts))
    class_dict = dict()
    y_list = y_train[y_label].tolist()
    for i in y_train[y_label].unique():
        class_dict[i] = y_list.count(i)
    while cluster_dict:
        cluster = max(cluster_dict,key=cluster_dict.get)
        clas = max(class_dict,key=class_dict.get)
        cluster_label_map[cluster] = clas
        del cluster_dict[cluster]
        del class_dict[clas]
    y_pred = map(lambda x : cluster_label_map[x], y_predict)
    y_act = y_actual[y_label].tolist()
    
    correct = 0
    for i in range(0,len(y_act)):
        if y_act[i] == y_pred[i]:
            correct += 1
    purity = float(correct)/float(len(y_act))
    return purity

In [4]:
data = pd.read_csv("compressed_intrusion_data_a_2.csv",header=None)
heading = list()
for i in range(1,15):
    heading.append('A'+str(i))
heading.append('xAttack')
data.columns = heading
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,xAttack
0,0.999031,0.999892,0.999173,0.999408,0.998316,0.999864,0.998168,0.999122,0.99961,0.999556,0.999529,0.999619,0.999908,0.999952,dos
1,0.999031,0.999892,0.999173,0.999408,0.998316,0.999864,0.998168,0.999122,0.99961,0.999556,0.999529,0.999619,0.999908,0.999952,dos
2,0.99903,0.999892,0.999172,0.999408,0.998315,0.999864,0.998167,0.999121,0.999609,0.999555,0.999529,0.999619,0.999908,0.999952,normal
3,0.999031,0.999892,0.999173,0.999408,0.998315,0.999864,0.998168,0.999122,0.999609,0.999556,0.999529,0.999619,0.999908,0.999952,normal
4,0.999031,0.999892,0.999173,0.999408,0.998316,0.999864,0.998168,0.999122,0.99961,0.999556,0.999529,0.999619,0.999908,0.999952,probe


In [5]:
cols = list()
for i in range(1,15):
    cols.append('A'+str(i))
X_train, X_test, y_train, y_test = train_test_split(
    data[cols],
    data[['xAttack']],
    test_size=0.2,
    random_state=0)
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14
18296,0.999027,0.999891,0.99917,0.999406,0.998311,0.999863,0.998163,0.999119,0.999608,0.999554,0.999528,0.999617,0.999908,0.999952
21483,0.998999,0.999887,0.999146,0.999388,0.998276,0.999858,0.998121,0.999095,0.999597,0.999539,0.999514,0.999604,0.999905,0.99995
2039,0.999031,0.999892,0.999173,0.999408,0.998316,0.999864,0.998168,0.999122,0.99961,0.999556,0.999529,0.999619,0.999908,0.999952
8375,0.999029,0.999891,0.999171,0.999407,0.998313,0.999864,0.998164,0.99912,0.999609,0.999554,0.999528,0.999618,0.999908,0.999952
549,0.999031,0.999892,0.999173,0.999408,0.998316,0.999864,0.998168,0.999122,0.99961,0.999556,0.999529,0.999619,0.999908,0.999952


In [6]:
kms = KMeans()
kms.fit(X_train, 5)
y_pred_tr_kms = list(kms.predict(X_train))
y_pred_ts_kms = list(kms.predict(X_test))

In [7]:
train_purity = compute_purity(y_train, y_pred_tr_kms, y_train, y_pred_tr_kms, 'xAttack')
test_purity = compute_purity(y_train, y_pred_tr_kms, y_test, y_pred_ts_kms, 'xAttack')
print '******************** K-Means Clustering result ********************************'
print 'Train data set purity : '+str(round(train_purity*100,2))+'%'
print 'Test data set purity : '+str(round(test_purity*100,2))+'%'
print '************************************************************************'

******************** K-Means Clustering result ********************************
Train data set purity : 36.95%
Test data set purity : 37.16%
************************************************************************
