In [1]:
import numpy as np
import pandas as pd
import random
import math
import time
import pickle

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

np.random.seed(1299827)

In [2]:
class Autoencoder:
    L = 3
    data_size = None
    W = None
    input_activation = 'linear'
    output_activation = 'linear'
    layer_activation = 'sigmoid'
    bottleneck_layer = 1
    
    def __init__(self,orig_data_size,compressed_data_sizes=[1],input_activation = 'linear',output_activation = 'linear',layer_activation = 'sigmoid'):
        self.L = len(compressed_data_sizes) + 2
        self.data_size = orig_data_size
        self.W = list()
        self.input_activation = input_activation
        self.output_activation = output_activation
        self.layer_activation = layer_activation
        n = [orig_data_size]+compressed_data_sizes+[orig_data_size]
        for i in range(0,self.L-1):
            self.W.append(np.random.rand(n[i],n[i+1]))
        self.bottleneck_layer = n.index(min(n))
            
    def g(self,z,activation):
        if activation == 'linear':
            return z
        if activation == 'sigmoid':
            z = np.clip(z,-709,36)
            return 1.0/(1.0+np.exp(-z))
        return z
    
    def g_prime(self,z,activation):
        if activation == 'linear':
            return np.ones(z.shape)
        if activation == 'sigmoid':
            g = self.g(z,activation)
            return np.multiply(g,(1.0-g))
    
    def compute_loss(self,orig_X):
        m = len(orig_X)
        a = orig_X
        h = self.g(a,self.input_activation)
        for i in range(0,self.L-2):
            a = np.dot(h,self.W[i])
            h = self.g(a,self.layer_activation)
        a_out = np.dot(h,self.W[len(self.W)-1])
        X_hat = self.g(a_out,self.output_activation)
        X = np.subtract(X_hat, orig_X)
        return (1.0/m)*np.sum(np.dot(X.T,X))
    
    def get_compressed_data(self,X):
        a = X
        h = self.g(a,self.input_activation)
        for i in range(0,self.bottleneck_layer):
            a = np.dot(h,self.W[i])
            h = self.g(a,self.layer_activation)
        return h
            
    def propagate(self, X):
        m = len(X)
        a = list()
        h = list()
        a.append(X)
        h.append(self.g(a[0],self.input_activation))
        for i in range(0,self.L-2):
            a.append(np.dot(h[i],self.W[i]))
            h.append(self.g(a[i+1],self.layer_activation))

        a_out = np.dot(h[len(h)-1],self.W[len(self.W)-1])
        X_hat = self.g(a_out,self.output_activation)
        
        dLdO = (2.0/m)*np.subtract(X_hat,X)
        dLda = np.multiply(dLdO,self.g_prime(a_out,self.output_activation))
        dW = list()
        for i in range(self.L-2,0,-1):
            dLdW = np.dot(h[i].T,dLda)
            dW = [dLdW]+dW 
            dLdh = np.dot(dLda,self.W[i].T)
            dLda = np.multiply(dLdh,self.g_prime(a[i],self.layer_activation))
        dLdW = np.dot(h[0].T,dLda)
        dW = [dLdW]+dW
        return dW
    
    def fit(self, X, alpha, epochs):
        for e in range(0,epochs):
            dW = self.propagate(X)
            for j in range(0,len(dW)):
                self.W[j] = self.W[j]-alpha*dW[j]
            curr_loss = self.compute_loss(X)
            print 'Cost after '+str(e+1)+' epochs: '+str(curr_loss)
    
    def save_weights(self):
        w_pickle = open('W_b_1'+str(int(time.time()))+'.pickle', 'wb')
        pickle.dump(self.W,w_pickle)
        w_pickle.close()
    
    def set_weights(self,file_name):
        w_pickle = open(file_name,'rb')
        self.W = pickle.load(w_pickle)
        w_pickle.close()


In [3]:
data = pd.read_csv("intrusion_data.csv")
data.head()

Unnamed: 0,duration,service,src_bytes,dst_bytes,hot,num_failed_logins,num_compromised,num_root,num_file_creations,num_access_files,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,xAttack
0,0,25,193,441,0,0,0,0,0,0,...,255,1.0,0.0,0.07,0.04,0.0,0.04,0.0,0.0,normal
1,0,38,0,0,0,0,0,0,0,0,...,1,0.0,0.07,0.0,0.0,0.0,0.0,1.0,1.0,dos
2,0,25,167,9724,0,0,0,0,0,0,...,255,1.0,0.0,0.03,0.06,0.0,0.0,0.0,0.0,normal
3,0,20,1339,0,0,0,0,0,0,0,...,31,0.23,0.04,0.23,0.0,0.02,0.0,0.0,0.0,normal
4,0,37,0,0,0,0,0,0,0,0,...,25,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,dos


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data[['duration', 'service', 'src_bytes', 'dst_bytes', 'hot', 'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations', 'num_access_files', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']],
    data[['xAttack']],
    test_size=0.2,
    random_state=0)

In [5]:
for col in X_train:
    mean = X_train[col].mean()
    std = X_train[col].std()
    X_train[col] = (X_train[col] - mean)/std
    X_test[col] = (X_test[col]-mean)/std

In [6]:
ae = Autoencoder(len(X_train.columns),[14],'sigmoid','sigmoid','sigmoid')
orig_X = X_train.values


In [7]:
ae.fit(orig_X,10,100)
ae.save_weights()

Cost after 1 epochs: 838.5662422605151
Cost after 2 epochs: 735.7466024176423
Cost after 3 epochs: 520.3203752237752
Cost after 4 epochs: 334.73771629299046
Cost after 5 epochs: 287.4616288790673
Cost after 6 epochs: 249.332811312265
Cost after 7 epochs: 145.06868518211346
Cost after 8 epochs: 88.69269413634402
Cost after 9 epochs: 59.35784153339741
Cost after 10 epochs: 46.996063560292015
Cost after 11 epochs: 42.021955236580986
Cost after 12 epochs: 39.31295488594804
Cost after 13 epochs: 39.30158226575668
Cost after 14 epochs: 39.252278177671094
Cost after 15 epochs: 38.20639826356936
Cost after 16 epochs: 38.205445263924894
Cost after 17 epochs: 38.20483332578536
Cost after 18 epochs: 38.204382363589964
Cost after 19 epochs: 38.2040263794407
Cost after 20 epochs: 38.203733442808705
Cost after 21 epochs: 38.2034855468239
Cost after 22 epochs: 38.20327148182365
Cost after 23 epochs: 38.20308376941126
Cost after 24 epochs: 38.20291715867666
Cost after 25 epochs: 38.202767815058046
Cos

In [None]:
# ae.set_weights('W_1553848799.pickle')
# print ae.compute_loss(orig_X)

In [8]:
test_X = X_test.values
print ae.compute_loss(test_X)

993.1266299446435


In [9]:
compressed_X = ae.get_compressed_data(orig_X)
compressed_X = np.append(compressed_X,y_train,axis=1)
compressed_test_X = ae.get_compressed_data(test_X)
compressed_test_X = np.append(compressed_test_X,y_test,axis=1)
compressed_data = np.append(compressed_X,compressed_test_X,axis=0)
np.savetxt("compressed_intrusion_data_b_1.csv", compressed_data, delimiter=",",fmt="%s")