<a href="https://colab.research.google.com/github/winstondo/Python-Autoencoder-NN/blob/master/Autoencoder_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing

from numpy.random import seed
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from keras.layers import Input, Dense #refer to: https://keras.io/layers/core/
from keras.models import Model

import io
from google.colab import files
#minimize this as it takes a long time or find an alternative

# Upload the CSV from the local machine through the Colab file picker.
# The result is indexed by file name and each entry holds the raw file bytes.
uploaded = files.upload() #uploads the csv file (local)

# The original hard-coded lookup raised a bare KeyError whenever the file was
# uploaded under any other name (a renamed copy, for instance). Prefer the
# expected name, but fall back to the upload when exactly one file was given.
dataset_name = 'benign_traffic.csv'
if dataset_name not in uploaded:
    if len(uploaded) == 1:
        dataset_name = next(iter(uploaded))
    else:
        raise KeyError(
            "expected an upload named 'benign_traffic.csv', got: %s" % sorted(uploaded)
        )

train_dataset = pd.read_csv(io.BytesIO(uploaded[dataset_name]))
print('dimentions of training dataset:', train_dataset.shape)

#commented out for debugging purposes. 
#target = train_dataset['target']
#train_id = train_dataset['ID']
#test_id = test['ID']

#train_dataset.drop(['target'], axis=1, inplace=True)
#train_dataset.drop(['ID'], axis=1, inplace=True)
#test.drop(['ID'], axis=1, inplace=True)


#scaling the data
# axis = 0 rescales every column (feature) independently; minmax_scale maps
# each one onto its default range of [0, 1].
scaled_train = minmax_scale(train_dataset, axis = 0)
#defining number of features of data set (the column count of the scaled array)
num_feat = scaled_train.shape[1]
print('number of features is:', num_feat)
print('shape of data is: ', scaled_train.shape)
print('datatype of trained data is: ', scaled_train.dtype)



#splits the data into groups to train and test. refer to https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
#in this case 60% of data is used for training and the remaining 40% is used for testing
#An autoencoder reconstructs its own input, so the same array is passed as both
#features and targets (Y_train / Y_test are therefore copies of X_train / X_test).
#
#numpy.random.seed() returns None, so the previous `random_state = seed(2017)`
#really passed random_state=None and was reproducible only through the side
#effect on numpy's global generator. Seed the global generator explicitly (as
#before) and hand the split its own integer seed.
seed(2017)
X_train, X_test, Y_train, Y_test = train_test_split(scaled_train, scaled_train, train_size = 0.6, random_state = 2017)

# In the paper the autoencoder's input layer is as wide as the number of
# features in the data set (115 for this data).

# Width of the bottleneck (the innermost encoder layer). The paper sets it to
# 25% of the input dimension.
# NOTE(review): 25% of 115 would be about 29, but 4 is used here - confirm intent.
encoding_dim = 4

input_dim = Input(shape = (num_feat, ))
#input_dim = Input(shape = (ncol, ))
print('the input dimention is currenly set to:', input_dim)


# Target structure quoted from the paper: "four hidden layers of encoders were
# set at decreasing sizes of 75%, 50%, 33%, and 25% of the input layer's
# dimension. The next layers were decoders, with the same sizes as the
# encoders, however with an increasing order (starting from 33%..."
# NOTE(review): the widths used below (115, 86, 43, 4) do not follow those
# percentages of a 115-wide input - confirm whether that is deliberate.

def _relu_dense(units, previous):
    """Stack a Dense layer of `units` neurons with ReLU activation on `previous`."""
    # relu is simply max(0, x); its smooth approximation is softplus (not softmax).
    # background: https://medium.com/@danqing/a-practical-guide-to-relu-b83ca804f1f7
    return Dense(units, activation = 'relu')(previous)

# Encoder Layers: 115 -> 86 -> 43 -> encoding_dim
encode_lay1 = _relu_dense(115, input_dim)
encode_lay2 = _relu_dense(86, encode_lay1)
encode_lay3 = _relu_dense(43, encode_lay2)
encode_lay4 = _relu_dense(encoding_dim, encode_lay3)

# Decoder Layers: mirror of the encoder, fed by the bottleneck output
decode_lay1 = _relu_dense(43, encode_lay4)
decode_lay2 = _relu_dense(86, decode_lay1)
decode_lay3 = _relu_dense(115, decode_lay2)
# The output layer has one unit per feature and a sigmoid activation.
#decode_lay4 = Dense(ncol, activation = 'sigmoid')(decode_lay3)
decode_lay4 = Dense(num_feat, activation = 'sigmoid')(decode_lay3)

# Join encoder and decoder into one model mapping the input to its reconstruction
autoencoder = Model(inputs = input_dim, outputs = decode_lay4)

# Compile the model; optimizers are described at https://keras.io/optimizers/
# alternative tried earlier: optimizer = 'adadelta', loss = 'binary_crossentropy'
autoencoder.compile(loss = 'mean_squared_error', optimizer = 'RMSprop')

autoencoder.summary()

# The paper trains for 150 epochs on the Samsung device; 32 are used here.
# One epoch is a full forward and backward pass over the entire training set.
epochs = 32 #150

# batch_size is the number of samples processed per gradient update (not the
# number of groups the data set is split into). Large data sets are fed in
# batches because backpropagating over everything at once is too expensive.
# NOTE(review): a batch size of 1 means one update per sample, which is slow;
# 32 is the previously used value - confirm which one is intended.
batch_size = 1 #32

# Train the network to reproduce its own input; the held-out split is only
# used for validation. Row order is kept fixed (shuffle = False).
trained_autoencoder = autoencoder.fit(
    X_train,
    X_train,
    batch_size = batch_size,
    epochs = epochs,
    shuffle = False,
    validation_data = (X_test, X_test),
)



Saving benign_traffic.csv to benign_traffic.csv
dimentions of training dataset: (52150, 115)
number of features is: 115
shape of data is:  (52150, 115)
datatype of trained data is:  float64


the input dimention is currenly set to: Tensor("input_1:0", shape=(?, 115), dtype=float32)


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 115)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 115)               13340     
_________________________________________________________________
dense_2 (Dense)              (None, 86)                9976      
_________________________________________________________________
dense_3 (Dense)              (None, 43)                3741      
_________________________________________________________________
dense_4 (Dense)              (None, 4)