In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

tf.config.experimental.list_physical_devices('GPU')

Using TensorFlow backend.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
# Hyperparameters

vocab_size = 700
embedding_dim = 32
max_length = 60
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [3]:
# Load Data

training_sentences = np.load("Datasets/Type2/training_sentences.npy")
testing_sentences = np.load("Datasets/Type2/testing_sentences.npy")
training_labels = np.load("Datasets/Type2/training_labels.npy")
testing_labels = np.load("Datasets/Type2/testing_labels.npy")

print(training_sentences.shape)
print(training_labels.shape)
print(testing_sentences.shape)
print(testing_labels.shape)

(162,)
(162,)
(55,)
(55,)


In [4]:
# encode label values as integers

encoder = LabelEncoder()
encoder.fit(training_labels)
# convert integers to dummy variables (i.e. one hot encoded)
training_labels = np_utils.to_categorical(encoder.transform(training_labels))
testing_labels = np_utils.to_categorical(encoder.transform(testing_labels))

In [5]:
# Initialize Tokenizer

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'information': 2, 'we': 3, 'to': 4, 'and': 5, 'your': 6, 'or': 7, 'the': 8, 'of': 9, 'with': 10, 'our': 11, 'share': 12, 'third': 13, 'personal': 14, 'will': 15, 'that': 16, 'may': 17, 'as': 18, 'other': 19, 'in': 20, 'you': 21, 'on': 22, 'not': 23, 'party': 24, 'data': 25, 'use': 26, 'a': 27, 'such': 28, 'providers': 29, 'if': 30, 'consent': 31, 'user': 32, 'for': 33, 'do': 34, 'service': 35, 'be': 36, 'advertisers': 37, 'platform': 38, 'about': 39, 'parties': 40, 'is': 41, 'any': 42, 'services': 43, 'github': 44, 'content': 45, 'by': 46, 'us': 47, 'also': 48, 'privacy': 49, 'reddit': 50, 'business': 51, 'partners': 52, 'their': 53, 'companies': 54, 'analytics': 55, 'users': 56, 'who': 57, 'sell': 58, 'cookies': 59, 'collect': 60, 'this': 61, 'it': 62, 'provide': 63, 'law': 64, 'including': 65, 'geeksforgeeks': 66, 'assets': 67, 'have': 68, 'transfer': 69, 'ad': 70, 'partner': 71, 'social': 72, 'affiliates': 73, 'ads': 74, 'application': 75, 'measurement': 76, 'disclose':

In [6]:
# Tokenize the sentences

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [7]:
# Sequential LSTM Model

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences = True)),    
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 60, 32)            22400     
_________________________________________________________________
bidirectional (Bidirectional (None, 60, 128)           49664     
____________________________________________________________

In [8]:
# Set Training Parameters

num_epochs = 50
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

filepath = "Weights/Type2.hdf5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

# history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=1, callbacks = [checkpoint])

In [9]:
avg = 0

for i in range(len(training_sequences)):
    avg = avg + len(training_sequences[i])
    
print(avg/(len(training_sequences)))

26.808641975308642


In [10]:
mini = 0
index = 0
for i in range(len(training_sequences)):
    if len(training_sequences[i]) > mini:
        mini = len(training_sequences[i])
        index = i
        
for i in range(len(testing_sequences)):
    if len(testing_sequences[i]) > mini:
        mini = len(testing_sequences[i])
        index = i

print(mini)
print(index)
print(training_sentences[index])
print("\n")
print(testing_sentences[index])

100
23
We share your User Personal Information, if you consent, after letting you know what information will be shared, with whom, and why.


If you are located in the European Union or other regions with laws governing data collection and use that may differ from Chinese or U.S. law, please note that we may transfer information, including personal information, to a country and jurisdiction that does not have the same data protection laws as your jurisdiction, and you consent to the transfer of information to China or the U.S. or any other country in which Company or its parent, subsidiaries, affiliates or service providers maintain facilities and the use and disclosure of information about you as described in this Privacy Policy.


In [11]:
# if input() == "yes":
#     model.save("Weights/Type2_LSTM.h5")

In [12]:
# Load the previously saved weights
model = tf.keras.models.load_model("Weights/Type2.hdf5")
# Re-evaluate the model
loss, acc = model.evaluate(testing_padded, testing_labels, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
55/55 - 1s - loss: 2.5693 - acc: 0.4909
Restored model, accuracy: 49.09%


In [14]:
if input() == "yes":
    model.save("Weights/Type2_{:5.2f}.hdf5".format(100 * acc))

yes
