In [1]:
import numpy as np
import pandas as pd
import random
import math
import time
import pickle

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

np.random.seed(1299827)

In [2]:
class Autoencoder:
    L = 3
    data_size = None
    W = None
    input_activation = 'linear'
    output_activation = 'linear'
    layer_activation = 'sigmoid'
    bottleneck_layer = 1
    
    def __init__(self,orig_data_size,compressed_data_sizes=[1],input_activation = 'linear',output_activation = 'linear',layer_activation = 'sigmoid'):
        self.L = len(compressed_data_sizes) + 2
        self.data_size = orig_data_size
        self.W = list()
        self.input_activation = input_activation
        self.output_activation = output_activation
        self.layer_activation = layer_activation
        n = [orig_data_size]+compressed_data_sizes+[orig_data_size]
        for i in range(0,self.L-1):
            self.W.append(np.random.rand(n[i],n[i+1]))
        self.bottleneck_layer = n.index(min(n))
            
    def g(self,z,activation):
        if activation == 'linear':
            return z
        if activation == 'sigmoid':
            z = np.clip(z,-709,36)
            return 1.0/(1.0+np.exp(-z))
        return z
    
    def g_prime(self,z,activation):
        if activation == 'linear':
            return np.ones(z.shape)
        if activation == 'sigmoid':
            g = self.g(z,activation)
            return np.multiply(g,(1.0-g))
    
    def compute_loss(self,orig_X):
        m = len(orig_X)
        a = orig_X
        h = self.g(a,self.input_activation)
        for i in range(0,self.L-2):
            a = np.dot(h,self.W[i])
            h = self.g(a,self.layer_activation)
        a_out = np.dot(h,self.W[len(self.W)-1])
        X_hat = self.g(a_out,self.output_activation)
        X = np.subtract(X_hat, orig_X)
        return (1.0/m)*np.sum(np.dot(X.T,X))
    
    def get_compressed_data(self,X):
        a = X
        h = self.g(a,self.input_activation)
        for i in range(0,self.bottleneck_layer):
            a = np.dot(h,self.W[i])
            h = self.g(a,self.layer_activation)
        return h
            
    def propagate(self, X):
        m = len(X)
        a = list()
        h = list()
        a.append(X)
        h.append(self.g(a[0],self.input_activation))
        for i in range(0,self.L-2):
            a.append(np.dot(h[i],self.W[i]))
            h.append(self.g(a[i+1],self.layer_activation))

        a_out = np.dot(h[len(h)-1],self.W[len(self.W)-1])
        X_hat = self.g(a_out,self.output_activation)
        
        dLdO = (2.0/m)*np.subtract(X_hat,X)
        dLda = np.multiply(dLdO,self.g_prime(a_out,self.output_activation))
        dW = list()
        for i in range(self.L-2,0,-1):
            dLdW = np.dot(h[i].T,dLda)
            dW = [dLdW]+dW 
            dLdh = np.dot(dLda,self.W[i].T)
            dLda = np.multiply(dLdh,self.g_prime(a[i],self.layer_activation))
        dLdW = np.dot(h[0].T,dLda)
        dW = [dLdW]+dW
        return dW
    
    def fit(self, X, alpha, epochs):
        for e in range(0,epochs):
            dW = self.propagate(X)
            for j in range(0,len(dW)):
                self.W[j] = self.W[j]-alpha*dW[j]
            curr_loss = self.compute_loss(X)
            print 'Cost after '+str(e+1)+' epochs: '+str(curr_loss)
    
    def save_weights(self):
        w_pickle = open('W_b_2'+str(int(time.time()))+'.pickle', 'wb')
        pickle.dump(self.W,w_pickle)
        w_pickle.close()
    
    def set_weights(self,file_name):
        w_pickle = open(file_name,'rb')
        self.W = pickle.load(w_pickle)
        w_pickle.close()


In [3]:
data = pd.read_csv("intrusion_data.csv")
data.head()

Unnamed: 0,duration,service,src_bytes,dst_bytes,hot,num_failed_logins,num_compromised,num_root,num_file_creations,num_access_files,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,xAttack
0,0,25,193,441,0,0,0,0,0,0,...,255,1.0,0.0,0.07,0.04,0.0,0.04,0.0,0.0,normal
1,0,38,0,0,0,0,0,0,0,0,...,1,0.0,0.07,0.0,0.0,0.0,0.0,1.0,1.0,dos
2,0,25,167,9724,0,0,0,0,0,0,...,255,1.0,0.0,0.03,0.06,0.0,0.0,0.0,0.0,normal
3,0,20,1339,0,0,0,0,0,0,0,...,31,0.23,0.04,0.23,0.0,0.02,0.0,0.0,0.0,normal
4,0,37,0,0,0,0,0,0,0,0,...,25,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,dos


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data[['duration', 'service', 'src_bytes', 'dst_bytes', 'hot', 'num_failed_logins', 'num_compromised', 'num_root', 'num_file_creations', 'num_access_files', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']],
    data[['xAttack']],
    test_size=0.2,
    random_state=0)

In [5]:
for col in X_train:
    mean = X_train[col].mean()
    std = X_train[col].std()
    X_train[col] = (X_train[col] - mean)/std
    X_test[col] = (X_test[col]-mean)/std

In [6]:
ae = Autoencoder(len(X_train.columns),[25,20,15,14,15,20,25],'sigmoid','sigmoid','sigmoid')
orig_X = X_train.values

In [7]:
ae.fit(orig_X,100,1000)
ae.save_weights()

Cost after 1 epochs: 879.1797914904564
Cost after 2 epochs: 879.1759801273694
Cost after 3 epochs: 879.169220197337
Cost after 4 epochs: 879.1534983992373
Cost after 5 epochs: 879.08406679226
Cost after 6 epochs: 873.34721392235
Cost after 7 epochs: 766.9896736456645
Cost after 8 epochs: 714.1817276799521
Cost after 9 epochs: 714.1768397731452
Cost after 10 epochs: 714.1647485017937
Cost after 11 epochs: 714.0867544770658
Cost after 12 epochs: 667.9005996829403
Cost after 13 epochs: 663.0352966010205
Cost after 14 epochs: 614.1826432461021
Cost after 15 epochs: 614.1607080371502
Cost after 16 epochs: 613.9178966386586
Cost after 17 epochs: 564.0707578050718
Cost after 18 epochs: 521.9811888016577
Cost after 19 epochs: 479.17656534446917
Cost after 20 epochs: 478.87736322815044
Cost after 21 epochs: 438.1911129843502
Cost after 22 epochs: 438.1737048180804
Cost after 23 epochs: 437.99696713023866
Cost after 24 epochs: 362.2960447055431
Cost after 25 epochs: 362.2942152351024
Cost after 

Cost after 203 epochs: 54.2378541248201
Cost after 204 epochs: 54.23768398393795
Cost after 205 epochs: 54.2374982228771
Cost after 206 epochs: 54.23728518101633
Cost after 207 epochs: 54.23702010166804
Cost after 208 epochs: 54.23664092341592
Cost after 209 epochs: 54.23594583899654
Cost after 210 epochs: 54.23387986687863
Cost after 211 epochs: 54.21162464101979
Cost after 212 epochs: 47.22789969939214
Cost after 213 epochs: 47.22781383177344
Cost after 214 epochs: 47.227728843207586
Cost after 215 epochs: 47.227644717785765
Cost after 216 epochs: 47.22756144001607
Cost after 217 epochs: 47.227478994808855
Cost after 218 epochs: 47.22739736746321
Cost after 219 epochs: 47.22731654365341
Cost after 220 epochs: 47.22723650941678
Cost after 221 epochs: 47.227157251141506
Cost after 222 epochs: 47.22707875555467
Cost after 223 epochs: 47.22700100971171
Cost after 224 epochs: 47.22692400098537
Cost after 225 epochs: 47.226847717055435
Cost after 226 epochs: 47.226772145899346
Cost after 2

Cost after 402 epochs: 47.21916829550848
Cost after 403 epochs: 47.219141695397695
Cost after 404 epochs: 47.219115159747254
Cost after 405 epochs: 47.21908868431638
Cost after 406 epochs: 47.219062264491654
Cost after 407 epochs: 47.21903589523288
Cost after 408 epochs: 47.219009571010545
Cost after 409 epochs: 47.21898328573126
Cost after 410 epochs: 47.218957032649996
Cost after 411 epochs: 47.21893080426549
Cost after 412 epochs: 47.218904592195045
Cost after 413 epochs: 47.21887838702462
Cost after 414 epochs: 47.218852178126895
Cost after 415 epochs: 47.21882595344022
Cost after 416 epochs: 47.21879969919788
Cost after 417 epochs: 47.21877339959452
Cost after 418 epochs: 47.21874703637101
Cost after 419 epochs: 47.21872058829527
Cost after 420 epochs: 47.218694030505816
Cost after 421 epochs: 47.21866733367478
Cost after 422 epochs: 47.21864046293002
Cost after 423 epochs: 47.2186133764504
Cost after 424 epochs: 47.21858602361542
Cost after 425 epochs: 47.218558342532695
Cost aft

Cost after 601 epochs: 42.20987245088817
Cost after 602 epochs: 42.20986308832169
Cost after 603 epochs: 42.20985372987663
Cost after 604 epochs: 42.20984437409943
Cost after 605 epochs: 42.20983501940837
Cost after 606 epochs: 42.20982566407806
Cost after 607 epochs: 42.209816306221164
Cost after 608 epochs: 42.20980694376786
Cost after 609 epochs: 42.20979757444136
Cost after 610 epochs: 42.20978819573029
Cost after 611 epochs: 42.20977880485607
Cost after 612 epochs: 42.2097693987344
Cost after 613 epochs: 42.20975997393049
Cost after 614 epochs: 42.209750526606356
Cost after 615 epochs: 42.20974105245759
Cost after 616 epochs: 42.20973154663851
Cost after 617 epochs: 42.20972200367222
Cost after 618 epochs: 42.20971241734267
Cost after 619 epochs: 42.20970278056293
Cost after 620 epochs: 42.2096930852156
Cost after 621 epochs: 42.20968332195564
Cost after 622 epochs: 42.209673479967506
Cost after 623 epochs: 42.20966354666178
Cost after 624 epochs: 42.2096535072941
Cost after 625 e

Cost after 800 epochs: 39.20354002200551
Cost after 801 epochs: 39.20353714601834
Cost after 802 epochs: 39.203534275502435
Cost after 803 epochs: 39.203531410439666
Cost after 804 epochs: 39.20352855081254
Cost after 805 epochs: 39.20352569660302
Cost after 806 epochs: 39.20352284779365
Cost after 807 epochs: 39.20352000436669
Cost after 808 epochs: 39.20351716630463
Cost after 809 epochs: 39.20351433359019
Cost after 810 epochs: 39.20351150620587
Cost after 811 epochs: 39.203508684134576
Cost after 812 epochs: 39.2035058673589
Cost after 813 epochs: 39.203503055861894
Cost after 814 epochs: 39.2035002496264
Cost after 815 epochs: 39.20349744863557
Cost after 816 epochs: 39.203494652872465
Cost after 817 epochs: 39.20349186232024
Cost after 818 epochs: 39.203489076962235
Cost after 819 epochs: 39.203486296781755
Cost after 820 epochs: 39.20348352176223
Cost after 821 epochs: 39.20348075188711
Cost after 822 epochs: 39.203477987140026
Cost after 823 epochs: 39.20347522750457
Cost after

Cost after 1000 epochs: 39.203054442820736


In [None]:
# ae.set_weights('W_1553848799.pickle')
# print ae.compute_loss(orig_X)

In [8]:
test_X = X_test.values
print ae.compute_loss(test_X)

993.2082201235902


In [9]:
compressed_X = ae.get_compressed_data(orig_X)
compressed_X = np.append(compressed_X,y_train,axis=1)
compressed_test_X = ae.get_compressed_data(test_X)
compressed_test_X = np.append(compressed_test_X,y_test,axis=1)
compressed_data = np.append(compressed_X,compressed_test_X,axis=0)
np.savetxt("compressed_intrusion_data_b_2.csv", compressed_data, delimiter=",",fmt="%s")