In [1]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds

In [2]:
# Read the intrusion records from the CSV file (resolved relative to the
# notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="intrusion_data.csv")

# Preview the first five rows (five is also head()'s default).
data.head(n=5)

Unnamed: 0,duration,service,src_bytes,dst_bytes,hot,num_failed_logins,num_compromised,num_root,num_file_creations,num_access_files,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,xAttack
0,0,20,491,0,0,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,45,146,0,0,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,50,0,0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,dos
3,0,25,232,8153,0,0,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,25,199,420,0,0,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [3]:
# Columns fed to the model as inputs.
feature_columns = [
    'duration',
    'service',
    'src_bytes',
    'dst_bytes',
    'hot',
    'num_failed_logins',
    'num_compromised',
    'num_root',
    'num_file_creations',
    'num_access_files',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
]

# The label is selected with a list so y_train / y_test stay single-column
# DataFrames rather than Series.
target_columns = ['xAttack']

# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[feature_columns],
    data[target_columns],
    test_size=0.3,
    random_state=0,
)

X_train.head(n=5)

Unnamed: 0,duration,service,src_bytes,dst_bytes,hot,num_failed_logins,num_compromised,num_root,num_file_creations,num_access_files,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
81433,0,37,0,0,0,0,0,0,0,0,...,255,15,0.06,0.05,0.0,0.0,1.0,1.0,0.0,0.0
15397,0,25,259,448,0,0,0,0,0,0,...,45,255,1.0,0.0,0.02,0.04,0.0,0.0,0.0,0.0
49130,0,25,338,3516,0,0,0,0,0,0,...,255,255,1.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0
114912,0,25,490,4997,0,0,0,0,0,0,...,255,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50858,0,25,0,0,0,0,0,0,0,0,...,66,52,0.79,0.08,0.02,0.0,0.0,0.0,0.79,1.0


In [4]:
# Standardize every feature to zero mean / unit variance.
#
# The statistics come from the training split only and are re-used for the
# test split, so nothing about the held-out rows leaks into the scaling.
feature_mean = X_train.mean()
feature_std = X_train.std()

# A constant training column has std == 0; dividing by it would fill the
# column with NaN/inf and poison the SVD that follows.  Dividing by 1 instead
# leaves such a column centred at zero.
feature_std = feature_std.replace(0, 1)

# Rebinding to new frames (instead of assigning column by column into the
# frames returned by the split) avoids pandas' chained-assignment warning and
# does the whole job in one vectorized step.
X_train = (X_train - feature_mean) / feature_std
X_test = (X_test - feature_mean) / feature_std

In [5]:
X_train.head()

Unnamed: 0,duration,service,src_bytes,dst_bytes,hot,num_failed_logins,num_compromised,num_root,num_file_creations,num_access_files,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
81433,-0.109973,0.300048,-0.007481,-0.005559,-0.094635,-0.026029,-0.011045,-0.011655,-0.026124,-0.041234,...,0.735195,-0.909794,-1.02706,-0.172624,-0.4798,-0.288736,1.605907,1.615509,-0.385668,-0.374397
15397,-0.109973,-0.429482,-0.007441,-0.005466,-0.094635,-0.026029,-0.011045,-0.011655,-0.026124,-0.041234,...,-1.381747,1.257753,1.066723,-0.43727,-0.415025,0.065666,-0.64041,-0.626017,-0.385668,-0.374397
49130,-0.109973,-0.429482,-0.007429,-0.004828,-0.094635,-0.026029,-0.011045,-0.011655,-0.026124,-0.041234,...,0.735195,1.257753,1.066723,-0.43727,-0.447413,-0.288736,-0.64041,-0.626017,-0.385668,-0.374397
114912,-0.109973,-0.429482,-0.007406,-0.00452,-0.094635,-0.026029,-0.011045,-0.011655,-0.026124,-0.041234,...,0.735195,1.257753,1.066723,-0.43727,-0.4798,-0.288736,-0.64041,-0.626017,-0.385668,-0.374397
50858,-0.109973,-0.429482,-0.007481,-0.005559,-0.094635,-0.026029,-0.011045,-0.011655,-0.026124,-0.041234,...,-1.170053,-0.575631,0.598963,-0.013836,-0.415025,-0.288736,-0.64041,-0.626017,2.200877,2.767558


In [6]:
# Pick the smallest rank K whose rank-K truncated SVD of X_train loses less
# than 10% of the total squared (Frobenius) energy.
#
# The squared reconstruction error of the rank-K truncated SVD equals the sum
# of the discarded squared singular values (Eckart-Young), so one
# decomposition gives the residual ratio for every K at once; there is no need
# to rebuild and subtract an approximation for each candidate.
X_train_matrix = np.asarray(X_train, dtype=float)

# svds() only accepts 1 <= k < min(shape).  Use the largest valid rank as the
# fallback so the following cell is never handed an out-of-range k.
max_rank = max(min(X_train_matrix.shape) - 1, 1)
best_k = max_rank

# np.linalg.svd returns the singular values in descending order.
squared_singular_values = np.square(
    np.linalg.svd(X_train_matrix, compute_uv=False))
total_energy = squared_singular_values.sum()

# An all-zero matrix has no energy to explain; keep the fallback rather than
# dividing by zero.
if total_energy > 0:
    # residual_ratio[K - 1] is the relative squared error left after keeping
    # the K largest components.
    residual_ratio = 1.0 - np.cumsum(squared_singular_values) / total_energy
    for K in range(1, max_rank + 1):
        if residual_ratio[K - 1] < 0.1:
            best_k = K
            break

In [7]:
# Factorize the training matrix only: X_train ~= U_train . diag(S_train) . V_train
U_train, S_train, V_train = svds(np.asarray(X_train, dtype=float), k=best_k)

# The test rows must be expressed in the SAME basis as the training rows.
# A second, independent SVD of X_test yields singular vectors that are
# unrelated to the training ones (different directions and arbitrary signs),
# so anything fitted on U_train could not be applied to it.  Instead the test
# rows are projected onto the training right-singular vectors and rescaled by
# the training singular values, mirroring U_train = X_train . V_train^T / S_train.
S_test, V_test = S_train, V_train
U_test = np.dot(np.asarray(X_test, dtype=float), V_train.T) / S_train

In [8]:
# Wrap the U matrices from the SVD cell in DataFrames (default integer row
# and column labels) so the later cells can use positional row access.
X_train_reduced, X_test_reduced = map(pd.DataFrame, (U_train, U_test))

In [9]:
# k-means (Lloyd's algorithm) on the reduced training rows.
#
# Results left for the following cells:
#   cluster   -- list holding the cluster id (0 .. N_CLUSTERS-1) of every
#                training row, in row order
#   centroids -- dict mapping cluster id -> centroid, shaped (1, n_dims)
N_CLUSTERS = 5

random.seed(1000)

points = X_train_reduced.values.astype(float)
n_points, n_dims = points.shape

# Seed each centroid with a randomly drawn training row (same seed and same
# randint draws as before, so the starting centroids are unchanged).
centroid_matrix = np.empty((N_CLUSTERS, n_dims))
for i in range(N_CLUSTERS):
    curr_index = random.randint(0, n_points - 1)
    centroid_matrix[i] = points[curr_index]

# Every row starts in cluster 0; iterate until an assignment pass changes
# nothing.
labels = np.zeros(n_points, dtype=int)
update_occurred = True
while update_occurred:
    # Assignment step: squared Euclidean distance from every row to every
    # centroid, computed one centroid at a time to keep temporaries small.
    sq_dist = np.empty((n_points, N_CLUSTERS))
    for i in range(N_CLUSTERS):
        sq_dist[:, i] = np.sum(np.square(points - centroid_matrix[i]), axis=1)

    # argmin returns the lowest index on ties, matching a strict "<" scan
    # over the centroids in order.
    new_labels = np.argmin(sq_dist, axis=1)
    update_occurred = bool(np.any(new_labels != labels))
    labels = new_labels

    # Update step: move each centroid to the mean of its members.
    for i in range(N_CLUSTERS):
        members = points[labels == i]
        if len(members) > 0:
            centroid_matrix[i] = members.mean(axis=0)
        # An empty cluster keeps its previous centroid.  Dividing its zero
        # sum by a zero count would produce a NaN centroid, and NaN distances
        # corrupt every later assignment pass.

# Expose the results under the names and container types the later cells use.
cluster = labels.tolist()
centroids = dict(
    (i, centroid_matrix[i].reshape(1, n_dims)) for i in range(N_CLUSTERS))
        
    

In [19]:
# Fixed one-to-one correspondence between cluster ids and attack categories,
# kept in both directions so it can be looked up by id or by name.
cluster_label_map = dict(enumerate(['dos', 'probe', 'u2r', 'r2l', 'normal']))
label_cluster_map = dict(
    (label, cluster_id) for cluster_id, label in cluster_label_map.items())


In [20]:
# Score the clustering: translate every training row's cluster id into a
# label through cluster_label_map and count how often it equals the true
# 'xAttack' label.
#
# NOTE(review): this measures agreement with the fixed cluster -> label
# mapping above; textbook purity would instead credit each cluster with its
# majority label.  Confirm which of the two is intended.
n_rows = len(X_train)

# Pull the label column out once as an array; a per-row
# y_train.iloc[i]['xAttack'] lookup is far slower on a large frame.
actual_labels = y_train['xAttack'].values

correct = 0
for cluster_id, actual in zip(cluster[:n_rows], actual_labels):
    if cluster_label_map[cluster_id] == actual:
        correct += 1

purity = float(correct)/float(n_rows) * 100

# Parenthesized single-argument print emits the same text on Python 2 and is
# valid syntax on Python 3 (the bare print statement is not).
print('Purity : '+str(purity))

Purity : 77.7412367744
