In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Ask TensorFlow which GPU devices it can see; as the last expression of the
# cell, the returned list is what gets displayed as the cell output.
tf.config.experimental.list_physical_devices('GPU')

Using TensorFlow backend.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
# Hyperparameters shared by the tokenizer, the padding step and the model

vocab_size = 200                    # vocabulary size used by the tokenizer and the embedding layer
embedding_dim = 62                  # length of each embedding vector
max_length = 50                     # fixed sequence length after padding/truncation
trunc_type = padding_type = 'post'  # truncate and pad at the end of each sequence
oov_tok = "<OOV>"                   # placeholder token for out-of-vocabulary words

In [3]:
# Load Data: the train/test sentences and labels are stored as separate .npy files

training_sentences = np.load("Datasets/Type7/training_sentences.npy")
testing_sentences = np.load("Datasets/Type7/testing_sentences.npy")
training_labels = np.load("Datasets/Type7/training_labels.npy")
testing_labels = np.load("Datasets/Type7/testing_labels.npy")

# Print the shapes: train sentences, train labels, test sentences, test labels.
for loaded in (training_sentences, training_labels, testing_sentences, testing_labels):
    print(loaded.shape)

(20,)
(20,)
(7,)
(7,)


In [4]:
# Encode label values as integers, then convert them to one-hot vectors.

encoder = LabelEncoder()
encoder.fit(training_labels)

# Pin the one-hot width to the number of classes the encoder was fitted on.
# Without num_classes, to_categorical infers the width from the largest value
# present in its input, so a split that happens to lack the highest class
# would get narrower label vectors than the other split.
num_classes = len(encoder.classes_)

# convert integers to dummy variables (i.e. one hot encoded)
training_labels = np_utils.to_categorical(encoder.transform(training_labels), num_classes=num_classes)
testing_labels = np_utils.to_categorical(encoder.transform(testing_labels), num_classes=num_classes)

In [5]:
# Initialize Tokenizer and build its vocabulary from the training sentences only

tokenizer = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(training_sentences)

# Mapping from each word to its integer index, printed for inspection
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'the': 2, 'we': 3, 'under': 4, 'age': 5, 'of': 6, 'information': 7, '13': 8, 'that': 9, 'a': 10, 'if': 11, 'have': 12, 'children': 13, 'personal': 14, 'from': 15, 'or': 16, 'are': 17, 'platform': 18, 'account': 19, 'not': 20, 'collected': 21, 'on': 22, 'delete': 23, 'child': 24, 'to': 25, 'you': 26, 'in': 27, 'will': 28, 'and': 29, 'consent': 30, 'user': 31, 'is': 32, 'does': 33, 'collect': 34, 'become': 35, 'aware': 36, 'person': 37, 'without': 38, 'parental': 39, 'learn': 40, 'as': 41, 'even': 42, 'who': 43, 'so': 44, 'may': 45, 'an': 46, 'github': 47, 'any': 48, 'users': 49, 'knowingly': 50, 'otherwise': 51, 'directed': 52, 'at': 53, 'for': 54, 'has': 55, 'been': 56, 'this': 57, 'terminate': 58, 'relevant': 59, 'reason': 60, 'suspect': 61, 'cannot': 62, 'close': 63, 'content': 64, 'they': 65, 'until': 66, '18': 67, 'tiktok': 68, 'well': 69, 'event': 70, 'anyone': 71, 'provide': 72, 'separate': 73, 'experience': 74, 'younger': 75, 'united': 76, 'states': 77, 'take': 78, 

In [6]:
# Turn the sentences into integer sequences, then pad/truncate them to max_length

# Both splits are padded with exactly the same settings.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [7]:
# Sequential LSTM Model: embedding -> bidirectional LSTM -> two dense layers -> softmax

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(2, activation='softmax'))  # one output unit per class

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 62)            12400     
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               65024     
____________________________________________________________

In [8]:
# Set Training Parameters and fit the model

num_epochs = 100

# Hand fit() plain NumPy arrays for both inputs and labels.
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

# Write the model to `filepath` only when the monitored validation accuracy improves.
filepath = "Weights/Type7.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)

history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=1,
    callbacks=[checkpoint],
)

Train on 20 samples, validate on 7 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.57143, saving model to Weights/Type7.hdf5
Epoch 2/100

Epoch 00002: val_acc did not improve from 0.57143
Epoch 3/100

Epoch 00003: val_acc did not improve from 0.57143
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.57143
Epoch 5/100

Epoch 00005: val_acc did not improve from 0.57143
Epoch 6/100

Epoch 00006: val_acc did not improve from 0.57143
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.57143
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.57143
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.57143
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.57143
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.57143
Epoch 12/100

Epoch 00012: val_acc did not improve from 0.57143
Epoch 13/100

Epoch 00013: val_acc did not improve from 0.57143
Epo


Epoch 00043: val_acc did not improve from 0.57143
Epoch 44/100

Epoch 00044: val_acc did not improve from 0.57143
Epoch 45/100

Epoch 00045: val_acc did not improve from 0.57143
Epoch 46/100

Epoch 00046: val_acc did not improve from 0.57143
Epoch 47/100

Epoch 00047: val_acc did not improve from 0.57143
Epoch 48/100

Epoch 00048: val_acc did not improve from 0.57143
Epoch 49/100

Epoch 00049: val_acc did not improve from 0.57143
Epoch 50/100

Epoch 00050: val_acc did not improve from 0.57143
Epoch 51/100

Epoch 00051: val_acc did not improve from 0.57143
Epoch 52/100

Epoch 00052: val_acc did not improve from 0.57143
Epoch 53/100

Epoch 00053: val_acc did not improve from 0.57143
Epoch 54/100

Epoch 00054: val_acc did not improve from 0.57143
Epoch 55/100

Epoch 00055: val_acc did not improve from 0.57143
Epoch 56/100

Epoch 00056: val_acc did not improve from 0.57143
Epoch 57/100

Epoch 00057: val_acc did not improve from 0.57143
Epoch 58/100

Epoch 00058: val_acc did not improve fr

Epoch 87/100

Epoch 00087: val_acc did not improve from 0.57143
Epoch 88/100

Epoch 00088: val_acc did not improve from 0.57143
Epoch 89/100

Epoch 00089: val_acc did not improve from 0.57143
Epoch 90/100

Epoch 00090: val_acc did not improve from 0.57143
Epoch 91/100

Epoch 00091: val_acc did not improve from 0.57143
Epoch 92/100

Epoch 00092: val_acc did not improve from 0.57143
Epoch 93/100

Epoch 00093: val_acc did not improve from 0.57143
Epoch 94/100

Epoch 00094: val_acc did not improve from 0.57143
Epoch 95/100

Epoch 00095: val_acc did not improve from 0.57143
Epoch 96/100

Epoch 00096: val_acc did not improve from 0.57143
Epoch 97/100

Epoch 00097: val_acc did not improve from 0.57143
Epoch 98/100

Epoch 00098: val_acc did not improve from 0.57143
Epoch 99/100

Epoch 00099: val_acc did not improve from 0.57143
Epoch 100/100

Epoch 00100: val_acc did not improve from 0.57143


In [9]:
# Mean number of tokens per training sequence.
# `avg` holds the total token count; the division happens at print time.
avg = sum(len(sequence) for sequence in training_sequences)

print(avg / len(training_sequences))

24.55


In [10]:
# Find the longest tokenized sentence in each split and the longest length overall.
# Each split is searched independently, so the test sentence printed below is
# really the longest test sentence even when the overall maximum lies in the
# training split (previously indextest stayed at 0 in that case).

train_lengths = [len(seq) for seq in training_sequences]
test_lengths = [len(seq) for seq in testing_sequences]

# max() returns the first maximal element, so ties resolve to the earliest index.
index = max(range(len(train_lengths)), key=train_lengths.__getitem__)
indextest = max(range(len(test_lengths)), key=test_lengths.__getitem__)

# Name kept from the original cell; it holds the MAXIMUM length across both splits.
mini = max(train_lengths[index], test_lengths[indextest])

print(mini)
print(index)
print(training_sentences[index])
print("\n")
print(testing_sentences[indextest])

36
7
Even if we become aware that personal information has been collected on the Platform from a person under the age of 13, we cannot delete this information and terminate the account until the person does so.


Children under the age of 13 are not allowed to create an account or otherwise use the Services.


In [11]:
# NOTE(review): disabled manual save of the in-memory model. It looks redundant
# with the ModelCheckpoint callback used during training — confirm and consider
# deleting this cell.
# if input() == "yes":
#     model.save("Weights/Type7_LSTM.h5")

In [12]:
# Restore the checkpointed model from disk (load_model returns a complete model,
# replacing the in-memory `model`).
model = tf.keras.models.load_model("Weights/Type7.hdf5")

# Re-evaluate the restored model on the test split
loss, acc = model.evaluate(
    testing_padded,
    testing_labels,
    verbose=2,
)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

7/7 - 0s - loss: 0.6931 - acc: 0.5714
Restored model, accuracy: 57.14%


In [13]:
# Optionally keep a copy of the restored model, tagged with its test accuracy.
# The prompt explains why the cell is waiting; only the exact answer "yes" saves.
if input("Save this model under an accuracy-tagged filename? (yes/no): ") == "yes":
    # "{:.2f}" avoids the padding space that "{:5.2f}" would put into the
    # filename when the accuracy is below 10% (e.g. "Type7_ 5.00.hdf5").
    model.save("Weights/Type7_{:.2f}.hdf5".format(100 * acc))

no
