In [169]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [170]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import utils, preprocessing

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 1.6.0


In [171]:
# The CSV was generated from this query: https://bigquery.cloud.google.com/savedquery/513927984416:c494494324be4a80b1fc55f613abb39c
# The data is also publicly available at this Cloud Storage URL: https://storage.googleapis.com/tensorflow-workshop-examples/stack-overflow-data.csv
data = pd.read_csv("owasp10_contami_200.csv")

In [172]:
data.head()

Unnamed: 0,post,tags
0,cat training movie,normal
1,cat declaration movie,normal
2,cat umbrella movie,normal
3,cat sign movie,normal
4,cat lock movie,normal


In [173]:
tag_num = data['tags'].nunique()
data['tags'].value_counts()

normal    25536
attack      358
Name: tags, dtype: int64

In [174]:
max_words  = 1000
tokenizer = text.Tokenizer(num_words=max_words, char_level=False)

In [175]:
max_len = 20
tokenizer.fit_on_texts(data['post'])
sequences = tokenizer.texts_to_sequences(data['post'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data_sec = pad_sequences(sequences, maxlen=max_len)

Found 87547 unique tokens.


In [176]:
import pickle
# save the token data if you want to
with open('ognl_embed.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [177]:
# Split data into train and test
train_size = int(len(data_sec) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data_sec) - train_size))

Train size: 20715
Test size: 5179


In [178]:
x_train = data_sec[:train_size]
x_test = data_sec[train_size:]

test_posts_doc = data['post'][train_size:]

In [179]:
train_tags = data['tags'][:train_size]
test_tags = data['tags'][train_size:]

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [180]:
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)
print(y_train.shape)

(20715, 2)


glove_dir = '/Users/watarium/anaconda3/doc_classification/'

embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

In [181]:
# Build the model
model = Sequential()
model.add(Embedding(10000, 128, input_length=max_len))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(tag_num, activation='sigmoid'))
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 20, 128)           1280000   
_________________________________________________________________
flatten_9 (Flatten)          (None, 2560)              0         
_________________________________________________________________
dense_17 (Dense)             (None, 32)                81952     
_________________________________________________________________
dense_18 (Dense)             (None, 2)                 66        
Total params: 1,362,018
Trainable params: 1,362,018
Non-trainable params: 0
_________________________________________________________________


In [182]:
# This model trains very quickly and 2 epochs are already more than enough
# Training for more epochs will likely lead to overfitting on this dataset
# You can try tweaking these hyperparamaters when using this model with your own data
batch_size = 32
epochs = 3

In [183]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (20715, 20)
x_test shape: (5179, 20)
y_train shape: (20715, 2)
y_test shape: (5179, 2)


In [184]:
# Inspect our training and test data (this is helpful to debug)
print('x_train:', x_train)
print('x_test:', x_test)
print('y_train:', y_train)
print('y_test:', y_test)

x_train: [[  0   0   0 ...   0 927  15]
 [  0   0   0 ...   0 927  15]
 [  0   0   0 ...   0 927  15]
 ...
 [  1 101 387 ...  41   4 154]
 [  6 265  53 ...   2 501 133]
 [418 967 557 ... 201  59 363]]
x_test: [[ 91 103  38 ... 475   4 200]
 [ 72   9   5 ...  44   2 914]
 [  1 107 151 ...  10 260  68]
 ...
 [286  20  38 ...   4   1 680]
 [ 67   3 161 ...  30   1 168]
 [ 60  13 109 ... 265   3 160]]
y_train: [[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]
y_test: [[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [185]:
# model.fit trains the model# model. 
# The validation_split param tells Keras what % of our training data should be used in the validation set
# You can see the validation loss decreasing slowly when you run this
# Because val_loss is no longer decreasing we stop training to prevent overfitting
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_test, y_test))

Train on 20715 samples, validate on 5179 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [186]:
# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.01480041226116443
Test accuracy: 0.9944004635250138


In [187]:
#save model if you want to
model.save('struts_embed.h5')

In [188]:
# Here's how to generate a prediction on individual examples
text_labels = encoder.classes_ 
for i in range(100):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(test_posts_doc.iloc[i][:50], "...")
    print(prediction)
    print('Actual label:' + test_tags.iloc[i])
    print('Predicted label: ' + predicted_label + "\n")

This is one particular Stooge short that actually  ...
[[1.9395337e-10 9.9998701e-01]]
Actual label:normal
Predicted label: normal

[[1.3080516e-09 9.9993932e-01]]
Actual label:normal
Predicted label: normal

This movie is the funniest danish movie I've ever  ...
[[8.9507796e-10 9.9995708e-01]]
Actual label:normal
Predicted label: normal

I shot this movie. I am very proud of the film. It ...
[[1.01372966e-10 9.99988794e-01]]
Actual label:normal
Predicted label: normal

How pointless, hideous characters and boring film. ...
[[5.8359595e-10 9.9996865e-01]]
Actual label:normal
Predicted label: normal

Stranded in Space (1972) MST3K version - a very no ...
[[9.6624964e-10 9.9995756e-01]]
Actual label:normal
Predicted label: normal

This wretched psychodrama uses every shabby device ...
[[9.513487e-11 9.999821e-01]]
Actual label:normal
Predicted label: normal

If you don't like Mel Brooks, you won't like this  ...
[[5.9276706e-10 9.9995852e-01]]
Actual label:normal
Predicted label: normal


In [189]:
y_softmax = model.predict(x_test)
y_pred_1d = []

for i in range(0, len(y_softmax)):
    probs = y_softmax[i]
    predicted_index = np.argmax(probs)
    y_pred_1d.append(predicted_index)