In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

tf.config.experimental.list_physical_devices('GPU')

Using TensorFlow backend.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
# Hyperparameters

vocab_size = 400
embedding_dim = 64
max_length = 50
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [3]:
# Load Data

training_sentences = np.load("Datasets/Type4/training_sentences.npy")
testing_sentences = np.load("Datasets/Type4/testing_sentences.npy")
training_labels = np.load("Datasets/Type4/training_labels.npy")
testing_labels = np.load("Datasets/Type4/testing_labels.npy")

print(training_sentences.shape)
print(training_labels.shape)
print(testing_sentences.shape)
print(testing_labels.shape)

(60,)
(60,)
(22,)
(22,)


In [4]:
# encode label values as integers

encoder = LabelEncoder()
encoder.fit(training_labels)
# convert integers to dummy variables (i.e. one hot encoded)
training_labels = np_utils.to_categorical(encoder.transform(training_labels))
testing_labels = np_utils.to_categorical(encoder.transform(testing_labels))

In [5]:
# Initialize Tokenizer

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'you': 2, 'your': 3, 'to': 4, 'the': 5, 'we': 6, 'of': 7, 'information': 8, 'or': 9, 'and': 10, 'in': 11, 'on': 12, 'personal': 13, 'may': 14, 'use': 15, 'consent': 16, 'can': 17, 'user': 18, 'have': 19, 'right': 20, 'a': 21, 'cannot': 22, 'if': 23, 'data': 24, 'that': 25, 'by': 26, 'out': 27, 'not': 28, 'settings': 29, 'will': 30, 'with': 31, 'email': 32, 'address': 33, 'profile': 34, 'google': 35, 'privacy': 36, 'about': 37, 'control': 38, 'is': 39, 'for': 40, 'opt': 41, 'be': 42, 'services': 43, 'cookies': 44, 'private': 45, 'policy': 46, 'our': 47, 'delete': 48, 'third': 49, 'party': 50, 'share': 51, 'without': 52, 'any': 53, 'this': 54, 'content': 55, 'network': 56, 'able': 57, 'collect': 58, 'as': 59, 'how': 60, 'processing': 61, 'at': 62, 'when': 63, 'social': 64, 'request': 65, 'through': 66, 'advertising': 67, 'browser': 68, 'even': 69, 'marketing': 70, 'youe': 71, 'provide': 72, 'change': 73, 'ability': 74, 'quora': 75, 'us': 76, 'choices': 77, 'using': 78, 'befo

In [6]:
# Tokenize the sentences

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [7]:
# Sequential LSTM Model

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])

model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 64)            25600     
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                24832     
____________________________________________________________

In [8]:
# Set Training Parameters

num_epochs = 50
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

filepath = "Weights/Type4.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=1, callbacks = [checkpoint])

Train on 60 samples, validate on 22 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/50
Epoch 00001: val_acc improved from -inf to 0.59091, saving model to Weights/Type4.hdf5
Epoch 2/50
Epoch 00002: val_acc did not improve from 0.59091
Epoch 3/50
Epoch 00003: val_acc did not improve from 0.59091
Epoch 4/50
Epoch 00004: val_acc did not improve from 0.59091
Epoch 5/50
Epoch 00005: val_acc did not improve from 0.59091
Epoch 6/50
Epoch 00006: val_acc did not improve from 0.59091
Epoch 7/50
Epoch 00007: val_acc did not improve from 0.59091
Epoch 8/50
Epoch 00008: val_acc did not improve from 0.59091
Epoch 9/50
Epoch 00009: val_acc did not improve from 0.59091
Epoch 10/50
Epoch 00010: val_acc did not improve from 0.59091
Epoch 11/50
Epoch 00011: val_acc did not improve from 0.59091
Epoch 12/50
Epoch 00012: val_acc did not improve from 0.59091
Epoch 13/50
Epoch 00013: val_acc did not improve from 0.59091
Epoch 14/50
Epoch 00014: val

Epoch 31/50
Epoch 00031: val_acc did not improve from 0.59091
Epoch 32/50
Epoch 00032: val_acc did not improve from 0.59091
Epoch 33/50
Epoch 00033: val_acc did not improve from 0.59091
Epoch 34/50
Epoch 00034: val_acc did not improve from 0.59091
Epoch 35/50
Epoch 00035: val_acc did not improve from 0.59091
Epoch 36/50
Epoch 00036: val_acc did not improve from 0.59091
Epoch 37/50
Epoch 00037: val_acc did not improve from 0.59091
Epoch 38/50
Epoch 00038: val_acc did not improve from 0.59091
Epoch 39/50
Epoch 00039: val_acc did not improve from 0.59091
Epoch 40/50
Epoch 00040: val_acc did not improve from 0.59091
Epoch 41/50
Epoch 00041: val_acc did not improve from 0.59091
Epoch 42/50
Epoch 00042: val_acc did not improve from 0.59091
Epoch 43/50
Epoch 00043: val_acc did not improve from 0.59091
Epoch 44/50
Epoch 00044: val_acc did not improve from 0.59091
Epoch 45/50
Epoch 00045: val_acc did not improve from 0.59091
Epoch 46/50
Epoch 00046: val_acc did not improve from 0.59091
Epoch 47

In [9]:
avg = 0

for i in range(len(training_sequences)):
    avg = avg + len(training_sequences[i])
    
print(avg/(len(training_sequences)))

19.733333333333334


In [10]:
# Finding the longest sentence overall (Ignore the error)

mini = 0
index = 0
indextest = 0
for i in range(len(training_sequences)):
    if len(training_sequences[i]) > mini:
        mini = len(training_sequences[i])
        index = i
        
for i in range(len(testing_sequences)):
    if len(testing_sequences[i]) > mini:
        mini = len(testing_sequences[i])
        indextest = i

print(mini)
print(index)
print(training_sentences[index])
print("\n")
print(testing_sentences[indextest])

46
17
We rely on your consent to use your User Personal Information under the following circumstances: when you fill out the information in your user profile; when you decide to participate in a GitHub training, research project, beta program, or survey; and for marketing purposes, where applicable.


You cannot edit or delete the answers that you post.



''

In [11]:
# if input() == "yes":
#     model.save("Weights/Type4_LSTM.h5")

In [12]:
# Load the previously saved weights
model = tf.keras.models.load_model("Weights/Type4.hdf5")
# Re-evaluate the model
loss, acc = model.evaluate(testing_padded, testing_labels, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

22/22 - 0s - loss: 0.6912 - acc: 0.5909
Restored model, accuracy: 59.09%


In [13]:
if input() == "yes":
    model.save("Weights/Type4_{:5.2f}.hdf5".format(100 * acc))

no
