In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import utils, preprocessing

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 1.6.0


Using TensorFlow backend.


In [3]:
# The CSV was generated from this query: https://bigquery.cloud.google.com/savedquery/513927984416:c494494324be4a80b1fc55f613abb39c
# The data is also publicly available at this Cloud Storage URL: https://storage.googleapis.com/tensorflow-workshop-examples/stack-overflow-data.csv
data = pd.read_csv("owasp10.csv")

In [4]:
data.head()

Unnamed: 0,post,tags
0,Chucky is back but this time he is not scary (...,normal
1,I loved this film. It was so intelligent but i...,normal
2,One True Thing rises above its potentially sch...,normal
3,I had seen this movie when I was a boy (Before...,normal
4,Describing this film is a difficult task. On t...,normal


In [5]:
tag_num = data['tags'].nunique()
data['tags'].value_counts()

normal    25237
attack      358
Name: tags, dtype: int64

In [6]:
max_words  = 1000
tokenizer = text.Tokenizer(num_words=max_words, char_level=False)

In [7]:
max_len = 20
tokenizer.fit_on_texts(data['post'])
sequences = tokenizer.texts_to_sequences(data['post'])
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data_sec = pad_sequences(sequences, maxlen=max_len)

Found 87446 unique tokens.


In [8]:
import pickle
# save the token data if you want to
with open('ognl_embed.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# Split data into train and test
train_size = int(len(data_sec) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data_sec) - train_size))

Train size: 20476
Test size: 5119


In [10]:
x_train = data_sec[:train_size]
x_test = data_sec[train_size:]

test_posts_doc = data['post'][train_size:]

In [11]:
train_tags = data['tags'][:train_size]
test_tags = data['tags'][train_size:]

# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

In [12]:
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)
print(y_train.shape)

(20476, 2)


glove_dir = '/Users/watarium/anaconda3/doc_classification/'

embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

In [45]:
from keras import layers
from keras.optimizers import RMSprop
# Build the model
model = Sequential()
model.add(layers.Embedding(10000, 128, input_length=max_len))
model.add(layers.Conv1D(32, 9, activation='relu'))
model.add(layers.MaxPooling1D(9))
model.add(layers.GlobalMaxPooling1D())
model.add(Dense(tag_num))
model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(lr=1e-4),
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 20, 128)           1280000   
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 12, 32)            36896     
_________________________________________________________________
max_pooling1d_15 (MaxPooling (None, 1, 32)             0         
_________________________________________________________________
global_max_pooling1d_7 (Glob (None, 32)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 66        
Total params: 1,316,962
Trainable params: 1,316,962
Non-trainable params: 0
_________________________________________________________________


In [46]:
# This model trains very quickly and 2 epochs are already more than enough
# Training for more epochs will likely lead to overfitting on this dataset
# You can try tweaking these hyperparamaters when using this model with your own data
batch_size = 32
epochs = 3

In [47]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (20476, 20)
x_test shape: (5119, 20)
y_train shape: (20476, 2)
y_test shape: (5119, 2)


In [48]:
# Inspect our training and test data (this is helpful to debug)
print('x_train:', x_train)
print('x_test:', x_test)
print('y_train:', y_train)
print('y_test:', y_test)

x_train: [[219 162   4 ... 668 798 154]
 [  2  30 274 ...  75  14  69]
 [ 10  25 581 ...  23  93  10]
 ...
 [144 209  46 ...   3 172 211]
 [  4  16   2 ... 471 445 154]
 [591  41   4 ... 208   3 778]]
x_test: [[  1   1 240 ...   1  36  61]
 [100   1 116 ... 368 980 332]
 [ 10  18  21 ...   3   9 455]
 ...
 [286  20  38 ...   4   1 680]
 [ 67   3 161 ...  30   1 168]
 [ 60  13 109 ... 265   3 160]]
y_train: [[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]
y_test: [[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [49]:
# model.fit trains the model# model. 
# The validation_split param tells Keras what % of our training data should be used in the validation set
# You can see the validation loss decreasing slowly when you run this
# Because val_loss is no longer decreasing we stop training to prevent overfitting
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_test, y_test))

Train on 20476 samples, validate on 5119 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [50]:
# Evaluate the accuracy of our trained model
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.01660668825685865
Test accuracy: 0.5860519633439398


In [51]:
#save model if you want to
model.save('struts_embed.h5')

In [52]:
# Here's how to generate a prediction on individual examples
text_labels = encoder.classes_ 
for i in range(100):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(test_posts_doc.iloc[i][:50], "...")
    print(prediction)
    print('Actual label:' + test_tags.iloc[i])
    print('Predicted label: ' + predicted_label + "\n")

Ah, Hitchcock! It's hard to find a bad Hitchcock m ...
[[-0.2537244  1.8084865]]
Actual label:normal
Predicted label: normal

The show start out with the boat. Desmond was i it ...
[[-0.24536777  1.7228221 ]]
Actual label:normal
Predicted label: normal

Well well, I had seen a lot of reviews on this one ...
[[-0.17205976  1.6474828 ]]
Actual label:normal
Predicted label: normal

I was a huge "SNL" fan back in the days of Chevy C ...
[[-0.19610998  1.7456791 ]]
Actual label:normal
Predicted label: normal

One of the best western movies ever made. Unfortun ...
[[-0.23682725  2.0291522 ]]
Actual label:normal
Predicted label: normal

I was stunned by this film. Afterwards, I didn't e ...
[[-0.29374173  1.9593643 ]]
Actual label:normal
Predicted label: normal

This movie was so awful, so boring, so badly misca ...
[[-0.21617347  1.8033592 ]]
Actual label:normal
Predicted label: normal

Right there. Good, entertaining and accurate era-f ...
[[-0.21276781  1.7303869 ]]
Actual label:normal
Pre

In [53]:
y_softmax = model.predict(x_test)
y_pred_1d = []

for i in range(0, len(y_softmax)):
    probs = y_softmax[i]
    predicted_index = np.argmax(probs)
    y_pred_1d.append(predicted_index)