In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn import mixture
# Seed NumPy's global random generator up front so that code drawing from it
# gives repeatable results when the notebook is run top to bottom.
np.random.seed(943)

In [2]:
def compute_purity(y_train, y_train_predict, y_actual, y_predict, y_label):
    """Score cluster assignments against true labels via a cluster-to-class map.

    The mapping is learned from the training split only: clusters are ranked
    by size, classes are ranked by frequency, and the k-th largest cluster is
    paired with the k-th most frequent class.  That mapping is then applied to
    ``y_predict`` and the result is compared element-wise with
    ``y_actual[y_label]``.

    NOTE(review): pairing by size rank is not the textbook purity measure
    (which gives every cluster its own majority class).  The pairing is kept
    unchanged here so previously reported numbers stay the same -- confirm
    that this is the intended metric.

    Parameters
    ----------
    y_train : pandas.DataFrame
        Training labels; ``y_train[y_label]`` must support ``.tolist()``.
    y_train_predict : sequence
        Cluster id assigned to each training row.
    y_actual : pandas.DataFrame
        True labels of the rows being scored; read through ``y_label``.
    y_predict : iterable
        Cluster id assigned to each row of ``y_actual``.
    y_label : str
        Name of the label column in ``y_train`` and ``y_actual``.

    Returns
    -------
    float
        Fraction of scored rows whose mapped cluster label equals the true
        label.  ``0.0`` is returned when there are no rows to score.

    Raises
    ------
    ValueError
        If ``y_predict`` and ``y_actual[y_label]`` differ in length.
    """
    # Cluster ids from largest to smallest training cluster.  sorted() is
    # stable (also with reverse=True), so ties keep their original order,
    # matching what repeated max()-and-delete would pick.
    unique, counts = np.unique(y_train_predict, return_counts=True)
    cluster_sizes = dict(zip(unique, counts))
    clusters_by_size = sorted(cluster_sizes, key=cluster_sizes.get, reverse=True)

    # Class frequencies in a single pass instead of one list.count() per class.
    class_counts = {}
    for label in y_train[y_label].tolist():
        class_counts[label] = class_counts.get(label, 0) + 1
    classes_by_freq = sorted(class_counts, key=class_counts.get, reverse=True)

    # zip() stops at the shorter ranking, so surplus clusters (more clusters
    # than classes) simply stay unmapped instead of crashing on an empty dict.
    cluster_label_map = dict(zip(clusters_by_size, classes_by_freq))

    y_act = y_actual[y_label].tolist()
    y_pred_clusters = list(y_predict)
    if len(y_pred_clusters) != len(y_act):
        raise ValueError(
            'y_predict has %d entries but y_actual has %d rows'
            % (len(y_pred_clusters), len(y_act)))
    if not y_act:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    # Sentinel that never equals a real label: rows that fall in an unmapped
    # or previously unseen cluster are counted as incorrect.
    unmapped = object()
    correct = 0
    for actual, cluster in zip(y_act, y_pred_clusters):
        if cluster_label_map.get(cluster, unmapped) == actual:
            correct += 1
    return float(correct) / float(len(y_act))

In [3]:
# Column names are assigned by hand because the CSV is read with header=None:
# fourteen feature columns A1..A14 followed by the label column 'xAttack'.
heading = ['A' + str(i) for i in range(1, 15)] + ['xAttack']
data = pd.read_csv("compressed_intrusion_data_b_1.csv", header=None)
data.columns = heading
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,xAttack
0,0.999149,0.999867,0.999429,0.999334,0.998381,0.998826,0.999541,0.999238,0.999514,0.998777,0.997445,0.998937,0.999775,0.998463,dos
1,0.998301,0.999586,0.9979,0.999288,0.99898,0.999239,0.999535,0.999056,0.999458,0.997825,0.999055,0.999059,0.999442,0.997508,dos
2,0.996715,0.999078,0.997015,0.999075,0.998354,0.99889,0.999084,0.998914,0.9988,0.99625,0.998779,0.995177,0.999393,0.99356,normal
3,0.997598,0.999185,0.997257,0.998707,0.998721,0.998609,0.999049,0.99841,0.999241,0.996663,0.998553,0.996523,0.999378,0.994977,normal
4,0.999287,0.999792,0.998719,0.999638,0.999383,0.999291,0.999736,0.9993,0.999792,0.998496,0.999212,0.999503,0.999731,0.998759,probe


In [4]:
# Feature columns A1..A14; the label is passed as a one-column DataFrame
# (double brackets) so it can still be indexed by column name afterwards.
cols = ['A' + str(i) for i in range(1, 15)]
# Hold out 30% of the rows for testing, with a fixed split seed.
X_train, X_test, y_train, y_test = train_test_split(
    data[cols], data[['xAttack']], test_size=0.3, random_state=0)
X_train.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14
8293,0.99767,0.999191,0.997461,0.998923,0.998688,0.998729,0.999146,0.998713,0.999155,0.996688,0.998785,0.9962,0.99935,0.994706
23442,0.997417,0.99906,0.997199,0.998791,0.998419,0.998403,0.999042,0.998551,0.998998,0.996353,0.998597,0.995799,0.999197,0.994226
13790,0.998996,0.999835,0.999337,0.999307,0.99847,0.99878,0.999467,0.999156,0.9995,0.998709,0.997409,0.998758,0.999763,0.998214
4477,0.996,0.999307,0.996788,0.998649,0.998312,0.998637,0.999113,0.998796,0.998817,0.997017,0.998827,0.996859,0.99938,0.995428
708,0.996381,0.999435,0.997565,0.999323,0.99888,0.999219,0.999233,0.999049,0.998921,0.996827,0.999037,0.996888,0.999649,0.993974


In [5]:
# Fit a 5-component Gaussian mixture on the training features only, then
# assign every training and test row to its most likely component.
# n_components=5 presumably matches the number of xAttack classes -- TODO confirm.
# The seed is pinned on the estimator itself so that re-running this cell on
# its own reproduces the same fit, instead of depending on how much of the
# global NumPy random stream has already been consumed.
gmm = mixture.GaussianMixture(n_components=5, random_state=943)
gmm.fit(X_train)
y_pred_tr_gmm = list(gmm.predict(X_train))
y_pred_ts_gmm = list(gmm.predict(X_test))

In [6]:
# Score the GMM clustering on both splits.  The cluster-to-class mapping is
# always derived from the training split (first two arguments); only the rows
# being scored (third and fourth arguments) change between the two calls.
train_purity = compute_purity(y_train, y_pred_tr_gmm, y_train, y_pred_tr_gmm, 'xAttack')
test_purity = compute_purity(y_train, y_pred_tr_gmm, y_test, y_pred_ts_gmm, 'xAttack')
# Single-argument print(...) calls emit the same text on Python 2 and Python 3,
# whereas the print statement form is a SyntaxError on Python 3.
print('******************** Gaussian mixture models Clustering result ********************************')
print('Train data set purity : '+str(round(train_purity*100,2))+'%')
print('Test data set purity : '+str(round(test_purity*100,2))+'%')
print('************************************************************************')

******************** Gaussian mixture models Clustering result ********************************
Train data set purity : 77.12%
Test data set purity : 77.52%
************************************************************************
