#### 1. Data Processing:

In [102]:
# a) Import the following libraries: 
import json
import optparse
import os
import sys
from collections import OrderedDict

import keras
import numpy
import pandas
from keras.callbacks import TensorBoard
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.models import Sequential, load_model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [103]:
# b) read data
# The file has no header row, and fields are quoted with '|' instead of '"'.
dataframe = pandas.read_csv(
    "dev-access.csv",
    header=None,
    quotechar='|',
    engine='python',
)

In [104]:
# c) convert to a numpy.ndarray type
# One row per log record; the columns are split into features/target below.
dataset = dataframe.values

In [105]:
# d) Check the shape of the data set
# Displayed as (rows, columns); the cells below index columns 0 and 1.
dataset.shape

(26773, 2)

In [106]:
# e) Store all rows and the 0th index as the feature data: 
# NOTE: basic slicing returns a view, so in-place writes to X also modify
# `dataset`.
X = dataset[:,0]

In [107]:
# f) Store all rows and index 1 as the target variable:
# `dataset` is an object-dtype array (it mixes the JSON strings with the
# labels), so a plain slice would hand Keras an object array, which newer
# Keras/TensorFlow versions refuse to convert to a tensor. Cast to a numeric
# dtype up front.
# Assumes the labels are numeric 0/1 flags, as the sigmoid +
# binary_crossentropy models below require -- a non-numeric label fails
# loudly here rather than inside .fit().
Y = dataset[:, 1].astype('float32')

In [108]:
# g) Clean up the predictors. This includes removing features that are not
#    valuable, such as timestamp and source.
# Keys stripped from every request log before tokenizing.
dropped_keys = ('timestamp', 'headers', 'source', 'route', 'responsePayload')

cleaned_logs = []
for item in X:
    # Parse with OrderedDict so the surviving keys keep their original order.
    reqJson = json.loads(item, object_pairs_hook=OrderedDict)
    for key in dropped_keys:
        # pop() with a default tolerates an absent key, so re-running this
        # cell on already-cleaned records is a no-op instead of a KeyError.
        reqJson.pop(key, None)
    # Compact separators: serialize without any padding whitespace.
    cleaned_logs.append(json.dumps(reqJson, separators=(',', ':')))

# Rebind X to a fresh object array instead of writing through the view into
# `dataset`, so the raw data stays intact and the cell can be re-run safely.
X = numpy.array(cleaned_logs, dtype=object)

In [109]:
# Inspect the cleaned records (numpy abbreviates the middle of long arrays).
X

array(['{"method":"post","query":{},"path":"/login","statusCode":401,"requestPayload":{"username":"Carl2","password":"bo"}}',
       '{"method":"post","query":{},"path":"/login","statusCode":401,"requestPayload":{"username":"pafzah","password":"worldburn432"}}',
       '{"method":"post","query":{},"path":"/login","statusCode":401,"requestPayload":{"username":"Panos1","password":"najrijkom"}}',
       ...,
       '{"method":"post","query":{},"path":"/checkout","statusCode":400,"requestPayload":{"creditCard":"<script src=\\"http://attacker/malicious\\u2011script.js\\"></script>"}}',
       '{"method":"post","query":{},"path":"/checkout","statusCode":400,"requestPayload":{"creditCard":"<meta http-equiv=\\"refresh\\">"}}',
       '{"method":"post","query":{},"path":"/checkout","statusCode":400,"requestPayload":{"creditCard":"<meta http-equiv=\\"refresh\\">"}}'],
      dtype=object)

In [110]:
# h) Tokenize our data
# Character-level tokens; only tab and newline are filtered out, so the JSON
# punctuation stays in the vocabulary.
tokenizer = Tokenizer(char_level=True, filters='\t\n')
tokenizer.fit_on_texts(X)

# Vocabulary size (+1); used later as the Embedding layers' input_dim.
num_words = 1 + len(tokenizer.word_index)
# From here on X holds integer sequences rather than JSON strings.
X = tokenizer.texts_to_sequences(X)

In [111]:
# i) Every observation has a different length, so pad/truncate them all to a
#    common length.
max_log_length = 1024
# 'pre' is the library default for both options; it is spelled out here so
# the reader can see that padding and truncation happen at the front.
X_processed = sequence.pad_sequences(
    X,
    maxlen=max_log_length,
    padding='pre',
    truncating='pre',
)

In [112]:
# j) Create your train set to be 75% of the data and your test set to be 25%
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    Y,
    test_size=0.25,
    random_state=42,
)

#### 2. Model 1 - RNN:

In [113]:
# a) Sequential model made of:
# b)   an Embedding layer (token id -> 32-dim vector),
# c)   a SimpleRNN layer,
# d)   a single-unit sigmoid Dense layer for the binary prediction.
model = Sequential([
    Embedding(input_dim=num_words, output_dim=32, input_length=max_log_length),
    keras.layers.SimpleRNN(units=32, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
# e) Compile model using the .compile() method
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [114]:
# f) Print the model summary
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 4,129
Trainable params: 4,129
Non-trainable params: 0
_________________________________________________________________
None


In [115]:
# g) Fit the model on the train data; a quarter of the training rows is held
#    out for validation after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.25,
)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x169733f98>

In [121]:
# h) Evaluate on the held-out test data; the result is [loss, accuracy].
results = model.evaluate(
    x=X_test,
    y=y_test,
    batch_size=128,
)
print('test loss, test acc:', results)

test loss, test acc: [0.5785805242147808, 0.6386315822601318]


#### 3 Model 2 - LSTM + Dropout Layers:

In [122]:
# a) LSTM network: Embedding -> LSTM (with recurrent dropout) -> Dropout ->
#    single-unit sigmoid output.
model_2 = Sequential([
    Embedding(input_dim=num_words, output_dim=32, input_length=max_log_length),
    LSTM(units=64, recurrent_dropout=0.5),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])
# b) Compile model using the .compile() method
model_2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [123]:
# c) Print the model summary
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model_2.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 26,913
Trainable params: 26,913
Non-trainable params: 0
_________________________________________________________________
None


In [124]:
# d) Fit the model on the train data; a quarter of the training rows is held
#    out for validation after every epoch.
model_2.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.25,
)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x169748320>

In [127]:
# e) Evaluate on the held-out test data; the result is [loss, accuracy].
results_2 = model_2.evaluate(
    x=X_test,
    y=y_test,
    batch_size=128,
)
print('test loss, test acc:', results_2)

test loss, test acc: [0.14875343917669523, 0.9678816795349121]


#### 4 Model 3: Build Your Own

In [129]:
# a) Custom RNN: Embedding -> two stacked LSTMs -> Dropout -> Dense(100, relu)
#    -> single-unit sigmoid output.
model_3 = Sequential([
    Embedding(input_dim=num_words, output_dim=32, input_length=max_log_length),
    # return_sequences=True passes the full sequence on to the second LSTM.
    LSTM(units=64, recurrent_dropout=0.5, return_sequences=True),
    LSTM(units=64, recurrent_dropout=0.5),
    Dropout(0.5),
    Dense(units=100, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
# b) Compile; this model uses SGD where the previous two used adam.
model_3.compile(
    optimizer='SGD',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [130]:
# c) Print the model summary
# summary() prints the table itself, so it needs no print() wrapper.
model_3.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 1024, 32)          2016      
_________________________________________________________________
lstm_5 (LSTM)                (None, 1024, 64)          24832     
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 100)               6500      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 66,473
Trainable params: 66,473
Non-trainable params: 0
_________________________________________________

In [131]:
# d) Fit the model on the train data; a quarter of the training rows is held
#    out for validation after every epoch.
model_3.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.25,
)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x157ca59e8>

In [132]:
# e) Evaluate on the held-out test data; the result is [loss, accuracy].
results_3 = model_3.evaluate(
    x=X_test,
    y=y_test,
    batch_size=128,
)
print('test loss, test acc:', results_3)

test loss, test acc: [0.6922881944983761, 0.4887959361076355]


#### Conceptual Questions: 

###### 5) Explain the difference between the relu activation function and the sigmoid activation function.
Sigmoid function is usually used in the last layer of the neural network for the binary classification questions.
A sigmoid function transforms any real input into an output strictly between 0.0 and 1.0: large positive inputs are squashed toward 1.0 and large negative inputs toward 0.0, without ever reaching either value. When used in a neural network, this leads to saturation near 1.0 and 0.0, where the gradient is almost zero, while the function is most sensitive to change around its midpoint. This means that when using the function to train a neural network, especially one with many layers, it becomes increasingly difficult for the network to adapt its weights and thus improve performance. The sigmoid function can also cause neural networks to suffer from the vanishing gradient problem, since the error is backpropagated through the layers and shrinks dramatically with each hidden layer.

ReLU is usually used in the hidden layer.
ReLU takes an input and outputs it unchanged if it is positive and outputs 0 if it is negative. ReLU combines the benefits of a linear activation function (the gradient does not vanish for positive inputs) with the ability to model complex relationships. Unlike sigmoid, ReLU is a piecewise linear function: it is the identity for positive inputs and constant zero for negative inputs, and its nonlinearity comes from the kink at zero. The ReLU function is also much less computationally taxing than sigmoid.
###### 6) Describe what one epoch actually is (epoch was a parameter used in the .fit() method).
In terms of artificial neural networks, an epoch refers to one cycle through the full training dataset. Usually, training a neural network takes more than a few epochs. In other words, if we feed a neural network the training data for more than one epoch in different patterns, we hope for a better generalization when given a new "unseen" input (test data). An epoch is often mixed up with an iteration. 
An iteration is a single step on one batch of the training data; the number of iterations per epoch is the number of batches needed to cover the whole training set once.
One motivation for training over several epochs is that (especially for large but finite training sets) it gives the network a chance to revisit earlier data and readjust the model parameters, so that the model is not biased towards the last few data points seen during training.  
###### 7) Explain how dropout works (you can look at the keras code and/or documentation) for (a) training, and (b) test data sets.
Dropout is a simple way to prevent neural networks from overfitting.
The key idea is to randomly drop units (along with their connections) from the neural network during training. This prevents units from co-adapting too much. During training, dropout samples from an exponential number of different “thinned” networks. At test time, it is easy to approximate the effect of averaging the predictions of all these thinned networks by simply using a single unthinned network that has smaller weights. This significantly reduces overfitting and gives major improvements over other regularization methods. Dropout improves the performance of neural networks on supervised learning tasks in vision, speech recognition, document classification and computational biology, obtaining state-of-the-art results on many benchmark data sets. Keras implements the equivalent "inverted" form: (a) during training, the `Dropout` layer zeroes each input with probability `rate` and scales the surviving inputs up by 1/(1 - rate); (b) at test time the layer is inactive and passes its input through unchanged.


###### 8) Explain why problems such as this homework assignment are better modeled with RNNs than CNNs. What type of problem will CNNs outperform RNNs on?
RNNs were designed to work with sequence prediction problems. RNNs in general and LSTMs in particular have had the most success when working with sequences of words and paragraphs, generally called natural language processing. This includes both sequences of text and sequences of spoken language represented as a time series. They are also used as generative models that require a sequence output, not only with text, but also in applications such as generating handwriting.
So problems such as this homework assignment are better modeled with RNNs than CNNs.

CNNs were designed to map image data to an output variable.

They have proven so effective that they are the go-to method for any type of prediction problem involving image data as an input.
The benefit of using CNNs is their ability to develop an internal representation of a two-dimensional image. This allows the model to learn position- and scale-invariant structures in the data, which is important when working with images.
CNNs work well with data that has a spatial relationship.

The CNN input is traditionally two-dimensional, a field or matrix, but can also be changed to be one-dimensional, allowing it to develop an internal representation of a one-dimensional sequence.

This allows the CNN to be used more generally on other types of data that have a spatial relationship. For example, there is an order relationship between words in a document of text. There is an ordered relationship in the time steps of a time series.

Although not specifically developed for non-image data, CNNs achieve state-of-the-art results on problems such as document classification used in sentiment analysis and related problems.

###### 9) Explain what RNN problem is solved using LSTM and briefly describe how.
LSTM was designed to mitigate the vanishing gradient problem in RNNs.

Theoretically, information in an RNN is supposed to flow across arbitrarily long sequences, but in practice this doesn't hold up.

In a simple RNN with sigmoid or tanh neuron units, the later output nodes of the network are less sensitive to the input at time t = 1. This happens due to the vanishing gradient problem. (See the fading of the color in the figure above)

An LSTM allows the preservation of gradients. The memory cell remembers the first input as long as the forget gate is open and the input gate is closed.
The output gate provides finer control to switch the output layer on or off without altering the cell contents.