In [3]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [34]:
from sklearn import svm
from sklearn.metrics import roc_curve, auc
from BaseOneClass import CentroidBasedOneClassClassifier,Centroid_Classifier

## Tạo bộ train, test từ các bộ Preprocessing Data

In [5]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, MaxAbsScaler

# Read the preprocessed NSL-KDD train and test CSV files as float32 matrices
# (train first, then test). Later cells treat the last column as the label.
train_data, test_data = [
    np.genfromtxt(csv_path, dtype=np.float32, delimiter=',')
    for csv_path in ('Data_fromLoi/NSLKDD_Train.csv', 'Data_fromLoi/NSLKDD_Test.csv')
]

In [6]:
# --- Training set: normal traffic only (one-class setting) -----------------
y_train = train_data[:, -1]                # label is the last column
x_train = train_data[y_train == 0]         # keep only rows labelled 0 (normal)
x_train = x_train[:, 0:-1]                 # drop the label column
print("Normal training data: ", x_train.shape[0])

# Subsample the normal rows. The shuffle uses its own seeded generator so the
# selected subset is identical on every "Restart & Run All" and the global
# NumPy random state is left untouched.
N_TRAIN_SAMPLES = 6734
TRAIN_SHUFFLE_SEED = 0
np.random.RandomState(TRAIN_SHUFFLE_SEED).shuffle(x_train)
x_train = x_train[:N_TRAIN_SAMPLES]        # keep the first N_TRAIN_SAMPLES shuffled rows


# --- Test set: normal rows first, then anomalous rows -----------------------
y_test = test_data[:, -1]                  # label is the last column
x_test = test_data[:, 0:-1]                # every column except the label

test_X0 = x_test[y_test == 0]              # normal test rows (label == 0)
test_X1 = x_test[y_test > 0]               # anomalous test rows (label > 0)
print("Normal testing data: ", test_X0.shape[0])
print("Anomaly testing data: ", test_X1.shape[0])

x_test = np.concatenate((test_X0, test_X1))

# Labels are rebuilt in the same order as the concatenation above.
test_y0 = np.full((len(test_X0)), True, dtype=bool)
test_y1 = np.full((len(test_X1)), False, dtype=bool)
y_test = np.concatenate((test_y0, test_y1))

# Binary labels (1 = normal, 0 = anomaly) used later for the AUC computation.
# The builtin `int` replaces `np.int`, which was only an alias for it and has
# been removed from recent NumPy releases.
y_test = y_test.astype(int)

# Fit the scaler on the training rows only, then apply it to both splits so
# no test statistics leak into the scaling.
scaler = MaxAbsScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

Normal training data:  67343
Normal testing data:  9711
Anomaly testing data:  12833


In [7]:
# --- Optimisation hyper-parameters -----------------------------------------
learning_rate = 0.01   # step size passed to the optimizer
num_steps = 500        # iterations of the outer training loop
batch_size = 100       # rows per mini-batch
display_step = 10      # evaluate and print every `display_step` steps

# Width of the input layer = number of feature columns in the training matrix.
n_input = x_train.shape[1]
print(n_input)

# --- Layer widths of the encoder (the decoder mirrors them) -----------------
num_hidden_1 = 85      # first hidden layer
num_hidden_2 = 49      # second hidden layer
num_hidden_3 = 12      # third hidden layer: the latent (bottleneck) dimension

# Clear the default graph before (re)building the model in it.
tf.reset_default_graph()
# Graph input: a batch of feature rows, any batch size, n_input columns.
X = tf.placeholder(tf.float32, shape=[None, n_input])

122


In [37]:
def AUC_AE(x_test, y_test):
    """ROC/AUC of the autoencoder when rows are scored by reconstruction error.

    Relies on the notebook-level ``sess``, ``decoder_op`` and ``X``.

    Args:
        x_test: feature matrix fed through the autoencoder.
        y_test: binary ground-truth labels passed to ``roc_curve``.

    Returns:
        Tuple ``(FPR, TPR, auc)`` as produced by ``roc_curve`` / ``auc``.
    """
    reconstruction = sess.run(decoder_op, feed_dict={X: x_test})
    # Mean squared reconstruction error of every row.
    row_errors = np.square(reconstruction - x_test).mean(axis=1)

    # The error is negated so that a small error yields a high score.
    fpr, tpr, _ = roc_curve(y_test, -row_errors)
    return fpr, tpr, auc(fpr, tpr)

def AUC_SVM(z_train, z_test, y_test):
    """ROC/AUC of a one-class SVM fitted on ``z_train`` and scored on ``z_test``.

    Args:
        z_train: rows used to fit the one-class SVM.
        z_test: rows to score with the fitted model.
        y_test: binary ground-truth labels passed to ``roc_curve``.

    Returns:
        Tuple ``(FPR, TPR, auc)`` as produced by ``roc_curve`` / ``auc``.
    """
    one_class_svm = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
    one_class_svm.fit(z_train)

    # The signed decision-function value is used directly as the score.
    scores = one_class_svm.decision_function(z_test)
    fpr, tpr, _ = roc_curve(y_test, scores)
    return fpr, tpr, auc(fpr, tpr)

def AUC_CEN(z_train, z_test, y_test):
    """ROC/AUC of the centroid-based one-class classifier.

    Args:
        z_train: rows used to fit ``CentroidBasedOneClassClassifier``.
        z_test: rows to score with the fitted model.
        y_test: binary ground-truth labels passed to ``roc_curve``.

    Returns:
        Tuple ``(FPR, TPR, auc)`` as produced by ``roc_curve`` / ``auc``.
    """
    centroid_model = CentroidBasedOneClassClassifier()
    centroid_model.fit(z_train)

    # NOTE(review): get_density comes from BaseOneClass and is not visible
    # here; the negation presumably turns a distance-like value into a
    # "higher = more normal" score - confirm against that module.
    scores = -centroid_model.get_density(z_test)

    fpr, tpr, _ = roc_curve(y_test, scores)
    return fpr, tpr, auc(fpr, tpr)

In [8]:
def normalize_data(data):
    """Divide ``data`` by its maximum absolute value taken along axis 0.

    Args:
        data: tensor (or tensor-convertible value) to rescale.

    Returns:
        ``data`` divided element-wise by the axis-0 maximum of ``|data|``.
        No guard is applied, so an entry whose maximum is 0 is divided by zero.
    """
    max_abs_along_axis0 = tf.reduce_max(tf.abs(data), axis=0)
    return data / max_abs_along_axis0

In [9]:
def xavier_init(size):
    """Return a random-normal initial value whose spread depends on the fan-in.

    The standard deviation is ``1 / sqrt(size[0] / 2)``, i.e. ``sqrt(2 / fan_in)``;
    only the first entry of ``size`` enters the formula.

    Args:
        size: shape of the tensor to create; ``size[0]`` is taken as the fan-in.

    Returns:
        A ``tf.random_normal`` tensor of shape ``size``.
    """
    fan_in = size[0]
    stddev = 1. / tf.sqrt(fan_in / 2.)
    return tf.random_normal(shape=size, stddev=stddev)

In [10]:
# Encoder weight matrices, one tf.Variable per layer, initialised by
# xavier_init: n_input -> num_hidden_1 -> num_hidden_2 -> num_hidden_3.
weights_en = {
    'encoder_h1': tf.Variable(xavier_init([n_input, num_hidden_1])),
    'encoder_h2': tf.Variable(xavier_init([num_hidden_1, num_hidden_2])),
    'encoder_h3': tf.Variable(xavier_init([num_hidden_2, num_hidden_3]))
}

# Decoder weights are tied to the encoder: each entry is the transpose of the
# matching encoder matrix rather than a new variable, so updating an encoder
# matrix also changes the decoder layer that mirrors it.
weights_de = {
    'decoder_h1': tf.transpose(weights_en['encoder_h3']),    # num_hidden_3 -> num_hidden_2
    'decoder_h2': tf.transpose(weights_en['encoder_h2']),    # num_hidden_2 -> num_hidden_1
    'decoder_h3': tf.transpose(weights_en['encoder_h1']),    # num_hidden_1 -> n_input
}

# Bias vectors, all zero-initialised. Unlike the weights, the decoder biases
# are independent variables (one per decoder layer output width).
biases = {
    'encoder_b1': tf.Variable(tf.zeros(shape=[num_hidden_1])),
    'encoder_b2': tf.Variable(tf.zeros(shape=[num_hidden_2])),
    'encoder_b3': tf.Variable(tf.zeros(shape=[num_hidden_3])),

    'decoder_b1': tf.Variable(tf.zeros(shape=[num_hidden_2])),
    'decoder_b2': tf.Variable(tf.zeros(shape=[num_hidden_1])),
    'decoder_b3': tf.Variable(tf.zeros(shape=[n_input]))
}

Instructions for updating:
Colocations handled automatically by placer.


In [11]:
def encoder(x):
    """Map input rows to the latent code through three tanh layers.

    Each layer computes ``tanh(h @ W + b)`` with the matching entries of the
    notebook-level ``weights_en`` and ``biases`` dictionaries.

    Args:
        x: input tensor whose last dimension matches ``weights_en['encoder_h1']``.

    Returns:
        Output tensor of the third (latent) layer.
    """
    layer_keys = (
        ('encoder_h1', 'encoder_b1'),
        ('encoder_h2', 'encoder_b2'),
        ('encoder_h3', 'encoder_b3'),
    )
    hidden = x
    for weight_key, bias_key in layer_keys:
        pre_activation = tf.add(tf.matmul(hidden, weights_en[weight_key]), biases[bias_key])
        hidden = tf.nn.tanh(pre_activation)
    return hidden



def decoder(x):
    """Map a latent code back to input space through three tanh layers.

    Each layer computes ``tanh(h @ W + b)`` with the matching entries of the
    notebook-level ``weights_de`` and ``biases`` dictionaries. The output layer
    also uses tanh, so every reconstructed value lies in [-1, 1].

    Args:
        x: latent tensor whose last dimension matches ``weights_de['decoder_h1']``.

    Returns:
        Output tensor of the third decoder layer (the reconstruction).
    """
    layer_keys = (
        ('decoder_h1', 'decoder_b1'),
        ('decoder_h2', 'decoder_b2'),
        ('decoder_h3', 'decoder_b3'),
    )
    hidden = x
    for weight_key, bias_key in layer_keys:
        pre_activation = tf.add(tf.matmul(hidden, weights_de[weight_key]), biases[bias_key])
        hidden = tf.nn.tanh(pre_activation)
    return hidden


# --- Wire the autoencoder ---------------------------------------------------
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# Weight of the latent-code penalty in the loss below.
alpha = 10

# Reconstruction, latent code and target; the target is the input itself.
y_pred = decoder_op
x_encoder = encoder_op
y_true = X

# Loss = mean squared reconstruction error
#        + alpha * mean squared value of the latent code.
reconstruction_error = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
latent_penalty = tf.reduce_mean(tf.pow(x_encoder, 2))
loss = reconstruction_error + alpha * latent_penalty

# Adadelta is the optimizer in use.
optimizer = tf.train.AdadeltaOptimizer(learning_rate).minimize(loss)

# Op that assigns every variable its initial value; run once per session.
init = tf.global_variables_initializer()

In [None]:
# Phase 1: train the autoencoder, periodically reporting the AUC of the three
# detectors (reconstruction error, one-class SVM, centroid) on the test set.
sess = tf.Session()
sess.run(init)

# Only whole mini-batches are used; trailing rows that do not fill a batch
# are skipped on every pass.
num_batch = x_train.shape[0] // batch_size

for step in range(num_steps):
    loss_sum = 0
    for start in range(0, num_batch * batch_size, batch_size):
        batch_x = x_train[start:start + batch_size]
        _, batch_loss = sess.run([optimizer, loss], feed_dict={X: batch_x})
        loss_sum = loss_sum + batch_loss

    # Report on step 1 and on every multiple of display_step (including 0).
    if step % display_step == 0 or step == 1:
        # Latent codes of the training and test rows feed the SVM/centroid models.
        z_train = sess.run(x_encoder, feed_dict={X: x_train})
        z_test = sess.run(x_encoder, feed_dict={X: x_test})

        auc_ae = AUC_AE(x_test, y_test)[2]
        auc_svm = AUC_SVM(z_train, z_test, y_test)[2]
        auc_cen = AUC_CEN(z_train, z_test, y_test)[2]
        print('Step %3.0i: Minibatch Loss: %0.4f - AUC_AE %0.3f - AUC_SVM:%0.3f - AUC_CEN:%0.3f'
              % (step, loss_sum / num_batch, auc_ae, auc_svm, auc_cen))

Step   0: Minibatch Loss: 1.2973 - AUC_AE 0.755 - AUC_SVM:0.894 - AUC_CEN:0.817
Step   1: Minibatch Loss: 1.1992 - AUC_AE 0.780 - AUC_SVM:0.896 - AUC_CEN:0.828
Step  10: Minibatch Loss: 0.5800 - AUC_AE 0.873 - AUC_SVM:0.908 - AUC_CEN:0.903
Step  20: Minibatch Loss: 0.3534 - AUC_AE 0.911 - AUC_SVM:0.919 - AUC_CEN:0.923
Step  30: Minibatch Loss: 0.2503 - AUC_AE 0.932 - AUC_SVM:0.924 - AUC_CEN:0.927
Step  40: Minibatch Loss: 0.1897 - AUC_AE 0.938 - AUC_SVM:0.930 - AUC_CEN:0.927
Step  50: Minibatch Loss: 0.1505 - AUC_AE 0.941 - AUC_SVM:0.932 - AUC_CEN:0.928
Step  60: Minibatch Loss: 0.1238 - AUC_AE 0.943 - AUC_SVM:0.934 - AUC_CEN:0.930
Step  70: Minibatch Loss: 0.1045 - AUC_AE 0.948 - AUC_SVM:0.936 - AUC_CEN:0.934
Step  80: Minibatch Loss: 0.0900 - AUC_AE 0.948 - AUC_SVM:0.939 - AUC_CEN:0.938
Step  90: Minibatch Loss: 0.0787 - AUC_AE 0.946 - AUC_SVM:0.942 - AUC_CEN:0.942
Step 100: Minibatch Loss: 0.0697 - AUC_AE 0.944 - AUC_SVM:0.943 - AUC_CEN:0.945
Step 110: Minibatch Loss: 0.0623 - AUC_A