In [1]:
import numpy as np
import pandas as pd
import keras as keras

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

from keras import utils
from keras.models import Sequential
from keras.preprocessing import text, sequence
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dense, Activation, Dropout, Embedding, SpatialDropout1D, LSTM


# Load the post-processed dataset (CSV-formatted despite the .txt extension)
filename = 'DataFiles/PostProcessing.txt'
df = pd.read_csv(filename)

# Report how many rows belong to each class to check the dataset balance
class_counts = df['class'].value_counts()
print(class_counts)

Using TensorFlow backend.


access control     350
database design    350
security           302
privacy            246
Name: class, dtype: int64


In [2]:
# The tokenizer vocabulary is built from every sentence in the dataset,
# so tokenizing and padding happen here, before the train/test split.

# Vocabulary cap handed to the Tokenizer: only the most frequent words
# (up to this limit) are kept when texts are converted to sequences
max_words = 2000

# Every sequence is padded or truncated to exactly this many tokens
max_length = 100

# Size of each word vector learned by the Embedding layer
embedding_dim = 100

tokenizer = text.Tokenizer(num_words=max_words, char_level=False)

# Learn the word index from the full corpus
corpus = df['sentence'].values
tokenizer.fit_on_texts(corpus)

# Map each sentence to a list of integer word indices, then pad/truncate
# to a fixed width so the result is a single 2-D array
sequences = tokenizer.texts_to_sequences(corpus)
sentences = pad_sequences(sequences, maxlen=max_length)
print("Shape of sentences:", sentences.shape)

# One-hot encode the class labels (one column per distinct class)
classes = pd.get_dummies(df['class'].values)
print("Shape of classes:", classes.shape)

Shape of sentences: (1248, 100)
Shape of classes: (1248, 4)


In [3]:
# Hold out 10% of the padded sequences and their one-hot labels as the test
# set; the fixed random_state keeps the split identical between runs
X_train, X_test, Y_train, Y_test = train_test_split(
    sentences,
    classes,
    test_size=0.10,
    random_state=42,
)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(1123, 100) (1123, 4)
(125, 100) (125, 4)


In [4]:
# Training hyperparameters: number of passes over the training data and
# number of samples per gradient update
epochs = 30
batch_size = 32

# Callbacks handed to model.fit:
#
# EarlyStopping halts training once val_accuracy has failed to improve by
# at least min_delta for `patience` consecutive epochs
#
# ModelCheckpoint writes the model to best_RNN_model.h5 every time
# val_accuracy reaches a new maximum during the current training session
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=0.001,
    patience=10,
)
checkpoint = ModelCheckpoint(
    'best_RNN_model.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callback_list = [early_stopping, checkpoint]

# Embedding -> LSTM -> softmax over the 4 classes
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=sentences.shape[1]),
    # SpatialDropout1D(0.3),
    LSTM(100, dropout=0.25, recurrent_dropout=0.25),
    Dense(4, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [5]:
# Train with the early-stopping and checkpoint callbacks attached; 10% of the
# training data is set aside for the validation metrics they monitor
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=callback_list,
)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1010 samples, validate on 113 samples
Epoch 1/30

Epoch 00001: val_accuracy improved from -inf to 0.53097, saving model to best_RNN_model.h5
Epoch 2/30

Epoch 00002: val_accuracy improved from 0.53097 to 0.68142, saving model to best_RNN_model.h5
Epoch 3/30

Epoch 00003: val_accuracy improved from 0.68142 to 0.71681, saving model to best_RNN_model.h5
Epoch 4/30

Epoch 00004: val_accuracy improved from 0.71681 to 0.79646, saving model to best_RNN_model.h5
Epoch 5/30

Epoch 00005: val_accuracy did not improve from 0.79646
Epoch 6/30

Epoch 00006: val_accuracy improved from 0.79646 to 0.83186, saving model to best_RNN_model.h5
Epoch 7/30

Epoch 00007: val_accuracy did not improve from 0.83186
Epoch 8/30

Epoch 00008: val_accuracy did not improve from 0.83186
Epoch 9/30

Epoch 00009: val_accuracy did not improve from 0.83186
Epoch 10/30

Epoch 00010: val_accuracy did not improve from 0.83186
Epoch 11/30

Epoch 00011: val_accuracy did not improve from 0.83186
Epoch 12/30

Epoch 000

In [6]:
# Score the model as it stands after the final training epoch (not the
# checkpointed best) on the held-out test set
score = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=1)

# model.evaluate returns the loss first, followed by the compiled metrics
final_test_accuracy = score[1]
print('Test accuracy at completion of training session:', final_test_accuracy)

Test accuracy at completion of training session: 0.7839999794960022


In [8]:
# This is the best accuracy of the model during the current training session
#
# If we get a new best from ALL training sessions ---> copy best_RNN_model.h5
# and rename it to best_overall_model.h5
#
# ModelCheckpoint was created without save_weights_only, so best_RNN_model.h5
# holds the full model (architecture, weights and compile settings). Loading
# it directly avoids repeating the layer stack here, where it could drift out
# of sync with the definition that was actually trained.
model = keras.models.load_model("best_RNN_model.h5")

# Evaluate the checkpointed model on the held-out test set; score[1] is the
# accuracy metric the model was compiled with
score = model.evaluate(X_test, Y_test,
                       batch_size=batch_size, verbose=1)
print('Best test accuracy from training session:', score[1])

Best test accuracy from training session: 0.800000011920929
