In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split

In [2]:
class KMeans:
    """K-means clustering (Lloyd's algorithm) on squared Euclidean distance.

    After ``fit``, ``centroids`` maps each cluster id ``0 .. n_clusters-1`` to
    a ``(1, n_features)`` float array. Inputs are converted with
    ``np.asarray(X, dtype=float)``, so a numeric DataFrame or a 2-D array both
    work. Features are assumed to be finite (no NaN/inf); this is not checked.
    """

    # Mapping cluster id -> centroid array; ``None`` until ``fit`` is called.
    centroids = None
    # Number of clusters; overwritten by ``fit``.
    n_clusters = 5

    def _nearest_clusters(self, data):
        """Return an int array holding the closest centroid id for each row."""
        # One column of squared distances per centroid. ``argmin`` returns the
        # first minimum, so ties resolve to the lowest cluster id.
        distances = np.column_stack([
            np.sum(np.square(data - self.centroids[i]), axis=1)
            for i in range(self.n_clusters)
        ])
        return np.argmin(distances, axis=1)

    def fit(self, X, n_clusters, seed=1000):
        """Cluster the rows of ``X`` into ``n_clusters`` groups.

        Parameters
        ----------
        X : 2-D numeric table (DataFrame or array), one sample per row.
        n_clusters : int, number of clusters; must be in ``1 .. len(X)``.
        seed : int, seed of the private random generator that picks the
            initial centroids (default 1000).

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``n_clusters`` is outside ``1 .. len(X)``.
        """
        data = np.asarray(X, dtype=float)
        if data.ndim != 2:
            raise ValueError("X must be 2-dimensional (samples x features)")
        n_samples, n_features = data.shape
        if n_clusters < 1 or n_clusters > n_samples:
            raise ValueError(
                "n_clusters must be between 1 and the number of samples "
                "({0}), got {1}".format(n_samples, n_clusters))

        self.n_clusters = n_clusters

        # A private generator keeps the draw sequence reproducible without
        # reseeding the process-wide ``random`` state.
        rng = random.Random(seed)

        # Pick distinct rows as seeds: a repeated row yields two identical
        # centroids, one of which can never win a point and ends up empty.
        chosen = []
        while len(chosen) < n_clusters:
            candidate = rng.randint(0, n_samples - 1)
            if candidate not in chosen:
                chosen.append(candidate)
        self.centroids = dict(
            (i, data[row].reshape(1, n_features).copy())
            for i, row in enumerate(chosen))

        # Every sample starts in cluster 0; iterate until no sample moves.
        assignments = np.zeros(n_samples, dtype=int)
        while True:
            updated = self._nearest_clusters(data)
            changed = bool(np.any(updated != assignments))
            assignments = updated
            for i in range(n_clusters):
                members = data[assignments == i]
                # An empty cluster keeps its previous centroid; averaging zero
                # rows would produce NaN and corrupt later distance scans.
                if len(members) > 0:
                    self.centroids[i] = members.mean(axis=0).reshape(1, n_features)
            if not changed:
                break

    def predict(self, X):
        """Return a list with the nearest-centroid cluster id for each row of ``X``.

        Raises ``ValueError`` when called before ``fit``.
        """
        if self.centroids is None:
            raise ValueError("KMeans.predict called before fit")
        data = np.asarray(X, dtype=float)
        if data.shape[0] == 0:
            return []
        return self._nearest_clusters(data).tolist()

    def compute_purity(self, y_predict, y_actual):
        """Return the fraction of positions where the two label sequences agree.

        Labels are compared position by position, so ``y_predict`` must
        already use the same label vocabulary as ``y_actual``. Returns ``0.0``
        for empty ``y_actual``.
        """
        # Materialise as lists so a pandas Series with a non-default index is
        # compared positionally rather than looked up by index label.
        actual = list(y_actual)
        predicted = list(y_predict)
        if not actual:
            return 0.0
        correct = sum(1 for i in range(len(actual)) if actual[i] == predicted[i])
        return float(correct) / float(len(actual))


In [3]:
def compute_purity(y_train, y_train_predict, y_actual, y_predict, y_label):
    """Score cluster predictions against class labels via a rank-based mapping.

    Clusters (sized by ``y_train_predict``) and classes (counted in
    ``y_train[y_label]``) are paired greedily: the largest remaining cluster
    is mapped to the most frequent remaining class. ``y_predict`` is then
    translated through that mapping and compared position by position with
    ``y_actual[y_label]``.

    NOTE(review): this pairs clusters and classes by size rank only; it is not
    the per-cluster majority-class definition of purity. Confirm it is the
    intended metric.

    Parameters
    ----------
    y_train : DataFrame holding the training labels in column ``y_label``.
    y_train_predict : sequence of cluster ids predicted for the training rows.
    y_actual : DataFrame holding the reference labels in column ``y_label``.
    y_predict : sequence of cluster ids to score, aligned with ``y_actual``.
    y_label : name of the label column.

    Returns
    -------
    float
        Fraction of rows whose mapped label equals the reference label;
        ``0.0`` when ``y_actual`` has no rows.
    """
    # Cluster sizes on the training predictions, keyed by cluster id.
    unique, counts = np.unique(y_train_predict, return_counts=True)
    cluster_dict = dict(zip(unique, counts))

    # Class frequencies in one pass over the training labels.
    class_dict = dict()
    for label in y_train[y_label].tolist():
        class_dict[label] = class_dict.get(label, 0) + 1

    # Stop as soon as either side is exhausted: with more clusters than
    # classes, the surplus clusters simply stay unmapped.
    cluster_label_map = dict()
    while cluster_dict and class_dict:
        cluster = max(cluster_dict, key=cluster_dict.get)
        clas = max(class_dict, key=class_dict.get)
        cluster_label_map[cluster] = clas
        del cluster_dict[cluster]
        del class_dict[clas]

    # Unmapped or unseen cluster ids translate to a sentinel that equals no
    # label, so they count as misses instead of raising KeyError.
    unmapped = object()
    # A real list (not a lazy ``map``) so it can be indexed on Python 3 too.
    y_pred = [cluster_label_map.get(c, unmapped) for c in y_predict]
    y_act = y_actual[y_label].tolist()
    if not y_act:
        return 0.0

    correct = sum(1 for i in range(len(y_act)) if y_act[i] == y_pred[i])
    return float(correct) / float(len(y_act))

In [4]:
# The CSV has no header row, so column names are assigned here:
# fourteen feature columns A1..A14 followed by the label column 'xAttack'.
data = pd.read_csv("compressed_intrusion_data_a.csv", header=None)
heading = ['A' + str(i) for i in range(1, 15)] + ['xAttack']
data.columns = heading
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,xAttack
0,0.645767,0.623645,0.751609,0.62099,0.187584,0.437081,0.167047,0.484827,0.296492,0.665497,0.610416,0.681719,0.485009,0.764236,dos
1,0.187173,0.226052,0.759255,0.652515,0.248886,0.669317,0.690183,0.521835,0.828925,0.629275,0.597592,0.647198,0.265081,0.478803,dos
2,0.370563,0.530759,0.268358,0.383741,0.687811,0.529198,0.529663,0.422345,0.446172,0.54751,0.51003,0.286663,0.657549,0.305966,normal
3,0.529521,0.449775,0.451936,0.261041,0.622198,0.579708,0.556322,0.368201,0.561374,0.282816,0.343599,0.41769,0.432493,0.453444,normal
4,0.333522,0.423431,0.160733,0.860596,0.307309,0.217652,0.550967,0.17009,0.88963,0.49343,0.811699,0.630565,0.16023,0.863686,probe


In [5]:
# Feature columns A1..A14; the label column 'xAttack' is kept as a
# one-column DataFrame so it can later be indexed by column name.
cols = ['A' + str(i) for i in range(1, 15)]
features = data[cols]
labels = data[['xAttack']]
# 80/20 split with a fixed random_state so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14
18296,0.369217,0.439636,0.326777,0.293656,0.672573,0.386858,0.692029,0.425753,0.465841,0.52641,0.398399,0.297332,0.679643,0.364063
21483,0.339336,0.418255,0.353277,0.332655,0.531003,0.387853,0.498036,0.435508,0.315718,0.29196,0.431196,0.390751,0.388946,0.286649
2039,0.483022,0.511595,0.468983,0.317123,0.440961,0.409182,0.687845,0.572694,0.458821,0.221808,0.396689,0.378292,0.40095,0.265551
8375,0.407657,0.424531,0.329344,0.27803,0.67329,0.417963,0.665938,0.386011,0.496164,0.522846,0.432976,0.324949,0.709139,0.382005
549,0.609842,0.660047,0.768692,0.75268,0.378515,0.396208,0.277877,0.347399,0.275076,0.50482,0.384565,0.666073,0.468503,0.668844


In [6]:
# Fit five clusters on the training features, then label both splits.
kms = KMeans()
kms.fit(X_train, n_clusters=5)
y_pred_tr_kms, y_pred_ts_kms = (
    list(kms.predict(split)) for split in (X_train, X_test))

In [7]:
# The cluster-to-class mapping is always derived from the training split and
# then applied to whichever split is being scored.
train_purity = compute_purity(y_train, y_pred_tr_kms, y_train, y_pred_tr_kms, 'xAttack')
test_purity = compute_purity(y_train, y_pred_tr_kms, y_test, y_pred_ts_kms, 'xAttack')
# Single-argument print(...) emits the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('******************** K-Means Clustering result ********************************')
print('Train data set purity : '+str(round(train_purity*100,2))+'%')
print('Test data set purity : '+str(round(test_purity*100,2))+'%')
print('************************************************************************')

******************** K-Means Clustering result ********************************
Train data set purity : 76.53%
Test data set purity : 76.7%
************************************************************************
