In [1]:
import numpy as np
import pandas as pd
import random
import math

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from sklearn import mixture
from sklearn.cluster import AgglomerativeClustering

np.random.seed(1299827)
np.seterr(all='raise')

{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

In [2]:
class Autoencoder:
    L = 3
    data_size = None
    W = None
    input_activation = 'linear'
    output_activation = 'linear'
    layer_activation = 'sigmoid'
    
    def __init__(self,orig_data_size,compressed_data_sizes=[1],input_activation = 'linear',output_activation = 'linear',layer_activation = 'sigmoid'):
        self.L = len(compressed_data_sizes) + 2
        self.data_size = orig_data_size
        self.W = list()
        self.input_activation = input_activation
        self.output_activation = output_activation
        self.layer_activation = layer_activation
        n = [orig_data_size]+compressed_data_sizes+[orig_data_size]
        for i in range(0,self.L-1):
            self.W.append(np.random.rand(n[i],n[i+1]))
            
    def g(self,z,activation):
        if activation == 'linear':
            return z
        if activation == 'sigmoid':
            z = np.clip(z,-709,36)
            return 1.0/(1.0+np.exp(-z))
#         if activation == 'relu':
#             return np.maximum(np.zeros(np.shape(z)),z)
        return z
    
    def g_prime(self,z,activation):
        if activation == 'linear':
            return np.ones(z.shape)
        if activation == 'sigmoid':
            g = self.g(z,activation)
            return np.multiply(g,(1.0-g))
    
    def compute_loss(self,orig_X):
        a = orig_X
        h = self.g(a,self.input_activation)
        for i in range(0,self.L-2):
            a = np.dot(h,self.W[i])
            h = self.g(a,self.layer_activation)
        a_out = np.dot(h,self.W[len(self.W)-1])
        X_hat = self.g(a_out,self.output_activation)
        X = np.subtract(X_hat, orig_X)
        return np.sum(np.dot(X.T,X))
            
    def propagate(self, X):
        a = list()
        h = list()
        a.append(X)
        h.append(self.g(a[0],self.input_activation))
        for i in range(0,self.L-2):
            a.append(np.dot(h[i],self.W[i]))
            h.append(self.g(a[i+1],self.layer_activation))

        a_out = np.dot(h[len(h)-1],self.W[len(self.W)-1])
        X_hat = self.g(a_out,self.output_activation)
        
        dLdO = 2.0*np.subtract(X_hat,X)
        dLda = np.multiply(dLdO,self.g_prime(a_out,self.output_activation))
        dW = list()
        for i in range(self.L-2,0,-1):
            dLdW = np.dot(h[i].T,dLda)
            dW = [dLdW]+dW 
            dLdh = np.dot(dLda,self.W[i].T)
            dLda = np.multiply(dLdh,self.g_prime(a[i],self.layer_activation))
        dLdW = np.dot(h[0].T,dLda)
        dW = [dLdW]+dW
        return dW
    
    def fit(self, X, alpha, epochs):
        prev_loss = np.inf
        for e in range(0,epochs):
            for i in range(0,X.shape[0]):
                orig_X = np.reshape(X[i],(1,X[i].shape[0]))
                dW = self.propagate(orig_X)
                for j in range(0,len(dW)):
                    self.W[j] = self.W[j] - alpha*dW[j]
            curr_loss = self.compute_loss(X)
            print 'Cost after '+str(e+1)+' epochs: '+str(curr_loss)
            if (e+1)%100 == 0:
                print 'Prev alpha : '+str(alpha)
                alpha = alpha/5
                print 'Next alpha : '+str(alpha)
                

In [3]:
data = pd.read_csv("intrusion_data.csv")
data.head()

Unnamed: 0,duration,service,src_bytes,dst_bytes,hot,num_failed_logins,num_compromised,num_root,num_file_creations,num_access_files,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,xAttack
0,0,25,193,441,0,0,0,0,0,0,...,255,1.0,0.0,0.07,0.04,0.0,0.04,0.0,0.0,normal
1,0,38,0,0,0,0,0,0,0,0,...,1,0.0,0.07,0.0,0.0,0.0,0.0,1.0,1.0,dos
2,0,25,167,9724,0,0,0,0,0,0,...,255,1.0,0.0,0.03,0.06,0.0,0.0,0.0,0.0,normal
3,0,20,1339,0,0,0,0,0,0,0,...,31,0.23,0.04,0.23,0.0,0.02,0.0,0.0,0.0,normal
4,0,37,0,0,0,0,0,0,0,0,...,25,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,dos


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data[['duration', 'service', 'src_bytes', 'dst_bytes', 'hot', 'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations', 'num_access_files', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']],
    data[['xAttack']],
    test_size=0.2,
    random_state=0)

In [5]:
for col in X_train:
    mean = X_train[col].mean()
    std = X_train[col].std()
    X_train[col] = (X_train[col] - mean)/std
    X_test[col] = (X_test[col]-mean)/std

In [6]:
ae = Autoencoder(len(X_train.columns),[14,14,14],'linear','linear','sigmoid')
orig_X = X_train.values


In [7]:
ae.fit(orig_X,0.001,10)

Cost after 1 epochs: 372748.85405407124
Cost after 2 epochs: 255794.69210298528
Cost after 3 epochs: 276223.9382463184
Cost after 4 epochs: 276333.4052467397
Cost after 5 epochs: 229751.6511246973
Cost after 6 epochs: 220071.72765227343


FloatingPointError: underflow encountered in divide

In [None]:
ae.fit(orig_X,1e-5,100)