In [71]:
import pandas as pd
import numpy as np
import keras as keras

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

from keras import utils
from keras.models import Sequential
from keras.preprocessing import text, sequence
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Get the post processed Data
filename = 'DataFiles/PostProcessing.txt'
df = pd.read_csv(filename)

# Show the balance of the dataset
print(df['class'].value_counts())

database design    350
access control     350
security           302
privacy            246
Name: class, dtype: int64


In [72]:
# Going to get the data first then split it


# Dict of words will truncate anything over 2500
max_words = 2500

# Only allow sentences with less than 100 words
max_length = 100

tokenizer = text.Tokenizer(num_words=max_words, char_level=False)

# Build the tokenizer for all words in sentences
tokenizer.fit_on_texts(df['sentence'].values)

# Convert sentences to sequences rather than matrices
sentences = tokenizer.texts_to_sequences(df['sentence'].values)

# Pad sentences within the max_length
sentences = pad_sequences(sentences, maxlen=max_length)
print ("Shape of sentences:", sentences.shape)

# Pull out the classes 
classes = pd.get_dummies(df['class'].values)
print ("Shape of classes:", classes.shape)

Shape of sentences: (1248, 100)
Shape of classes: (1248, 4)


In [73]:
# Split testing and training data
X_train, X_test, Y_train, Y_test = train_test_split(sentences,classes, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(1123, 100) (1123, 4)
(125, 100) (125, 4)


In [74]:
# Hyperparameters that can be tuned
epochs = 20
batch_size = 16

# List of callbacks to add to the model
#
# Early stopping will stop training the model if it begins to overfit
#
# Checkpoint will save the best model from the current training session - based on the highest validation accuracy

early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, min_delta=0.001, mode='max')
checkpoint = ModelCheckpoint('best_ANN_model.h5', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callback_list = [early_stopping, checkpoint]

model = Sequential()
model.add(Dense(256, input_shape=(sentences.shape[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(4))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,callbacks=callback_list)

Train on 1010 samples, validate on 113 samples
Epoch 1/20

Epoch 00001: val_accuracy improved from -inf to 0.31858, saving model to best_ANN_model.h5
Epoch 2/20

Epoch 00002: val_accuracy improved from 0.31858 to 0.34513, saving model to best_ANN_model.h5
Epoch 3/20

Epoch 00003: val_accuracy improved from 0.34513 to 0.35398, saving model to best_ANN_model.h5
Epoch 4/20

Epoch 00004: val_accuracy did not improve from 0.35398
Epoch 5/20

Epoch 00005: val_accuracy did not improve from 0.35398
Epoch 6/20

Epoch 00006: val_accuracy did not improve from 0.35398
Epoch 7/20

Epoch 00007: val_accuracy did not improve from 0.35398
Epoch 8/20

Epoch 00008: val_accuracy did not improve from 0.35398
Epoch 9/20

Epoch 00009: val_accuracy did not improve from 0.35398
Epoch 10/20

Epoch 00010: val_accuracy did not improve from 0.35398
Epoch 11/20

Epoch 00011: val_accuracy did not improve from 0.35398
Epoch 12/20

Epoch 00012: val_accuracy did not improve from 0.35398
Epoch 13/20

Epoch 00013: val_ac

In [75]:
score = model.evaluate(X_test, Y_test,
                       batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])

Test accuracy: 0.328000009059906


In [76]:
# This is the best accuracy of the model during the current training session
#
# If we get a new best from ALL training session ---> Copy best_ANN_model.h5 and rename it to best_overallANN_model.h5
#
# Need to rebuild a model instance
model = Sequential()
model.add(Dense(256, input_shape=(sentences.shape[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(4))
model.add(Activation('softmax'))


# Load the best weights that were saved from the training session
model.load_weights("best_ANN_model.h5")

# Compile the new model instance
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

score = model.evaluate(X_test, Y_test,
                       batch_size=batch_size, verbose=1)
print('Best test accuracy from training session based on val_accuracy:', score[1])

Best test accuracy from training session based on val_accuracy: 0.328000009059906
