In [1]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import utils, preprocessing

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 1.6.0


Using TensorFlow backend.


In [2]:
data = pd.read_csv("owasp10.csv")
token_data = 'owasp10_token.pickle'
encode_data = 'owasp10_encode.pickle'
model_data = 'owasp10_model.h5'

In [3]:
data.head()

Unnamed: 0,post,tags
0,assertivesilk,normal
1,dominatefunny,normal
2,nodeartist,normal
3,shootrevise,normal
4,electrondeal,normal


In [4]:
tag_num = data['tags'].nunique()
data['tags'].value_counts()

normal    25336
attack      358
Name: tags, dtype: int64

In [5]:
max_words  = 10000
tokenizer = text.Tokenizer(num_words=max_words, char_level=False)

In [6]:
max_len = 50
tokenizer.fit_on_texts(data['post'])
sequences = tokenizer.texts_to_sequences(data['post'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data_sec = pad_sequences(sequences, maxlen=max_len)

Found 87545 unique tokens.


In [7]:
import pickle
# save the token data if you want to
with open(token_data, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
# Split data into train and test
train_size = int(len(data_sec) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data_sec) - train_size))

Train size: 20555
Test size: 5139


In [9]:
x_train = data_sec[:train_size]
x_test = data_sec[train_size:]

test_posts_doc = data['post'][train_size:]

In [10]:
train_tags = data['tags'][:train_size]
test_tags = data['tags'][train_size:]

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [11]:
# save the encoder if you want to
with open(encode_data, 'wb') as handle:
    pickle.dump(encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)
print(y_train.shape)

(20555, 2)


In [13]:
from keras.layers import LSTM
# Build the model
model = Sequential()
model.add(Embedding(10000, 128, input_length=max_len))
#model.add(Flatten())
model.add(LSTM(32))
model.add(Dense(tag_num, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 128)           1280000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                20608     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 66        
Total params: 1,300,674
Trainable params: 1,300,674
Non-trainable params: 0
_________________________________________________________________


In [14]:
# This model trains very quickly and 2 epochs are already more than enough
# Training for more epochs will likely lead to overfitting on this dataset
# You can try tweaking these hyperparamaters when using this model with your own data
batch_size = 32
epochs = 2

In [15]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (20555, 50)
x_test shape: (5139, 50)
y_train shape: (20555, 2)
y_test shape: (5139, 2)


In [16]:
# model.fit trains the model# model. 
# The validation_split param tells Keras what % of our training data should be used in the validation set
# You can see the validation loss decreasing slowly when you run this
# Because val_loss is no longer decreasing we stop training to prevent overfitting
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_test, y_test))

Train on 20555 samples, validate on 5139 samples
Epoch 1/2
Epoch 2/2


In [17]:
# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.008752962600371578
Test accuracy: 0.9961081922553026


In [18]:
#save model if you want to
model.save(model_data)

In [20]:
# Here's how to generate a prediction on individual examples
text_labels = encoder.classes_ 
for i in range(10):
    print(np.array([x_test[i]]))
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(test_posts_doc.iloc[i][:50], "...")
    print(prediction)
    print('Actual label:' + test_tags.iloc[i])
    print('Predicted label: ' + predicted_label + "\n")

[[ 822 1361   26  221    7    7    9   89   62  177    5 5893  244  241
    36 3121 6997   86   46    6  753 1097    5   27   67   17   30    1
   360  977   18   40  335  253   17    3  545  176   15   12  163  285
     2    6  156  706   69  481   89 1268]]
While watching BLACKWATER VALLEY EXORCISM, I encou ...
[[7.5070911e-06 9.9999523e-01]]
Actual label:normal
Predicted label: normal

[[1865    6    5    3  991 7464   12    6    2    1  107  850   25   94
    71  125   95   81    1 5731    4 6476   14    3 2713   23 1273    8
  1919  300   73   76    3   19   12    1 8638 1294   98   25  423    8
     3 2799    2 5204   12   16 9082 4299]]
Glenn Ford, a New York boy who has been saving his ...
[[7.2624393e-06 9.9999547e-01]]
Actual label:normal
Predicted label: normal

[[ 755    6   12    2    1   19  152   25    3  185  140  269   29   30
    10  686  287  174    5    9  110   21  152  471   12   21  852   37
     3    7    7    5 2962   10   53   75  180   12  201   25   77 2083


In [34]:
request = 'One'
request = [request]

tokenizer.fit_on_texts(request)
req_mat = tokenizer.texts_to_sequences(request)
data_sec = pad_sequences(req_mat, maxlen=max_len)
prediction = model.predict(np.array(data_sec))
print(np.array(data_sec))
predicted_label = encoder.classes_[np.argmax(prediction)]
print(prediction)
print(predicted_label)

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0 26]]
[[0.01843764 0.9799241 ]]
normal
