In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

tf.config.experimental.list_physical_devices('GPU')

Using TensorFlow backend.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
# Hyperparameters

vocab_size = 600
embedding_dim = 128
max_length = 120
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [3]:
# Load Data

training_sentences = np.load("Datasets/Type1/training_sentences.npy")
testing_sentences = np.load("Datasets/Type1/testing_sentences.npy")
training_labels = np.load("Datasets/Type1/training_labels.npy")
testing_labels = np.load("Datasets/Type1/testing_labels.npy")

print(training_sentences.shape)
print(training_labels.shape)
print(testing_sentences.shape)
print(testing_labels.shape)

(171,)
(171,)
(58,)
(58,)


In [4]:
# encode label values as integers

encoder = LabelEncoder()
encoder.fit(training_labels)
# convert integers to dummy variables (i.e. one hot encoded)
training_labels = np_utils.to_categorical(encoder.transform(training_labels))
testing_labels = np_utils.to_categorical(encoder.transform(testing_labels))

In [5]:
# Initialize Tokenizer

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'your': 2, 'and': 3, 'information': 4, 'you': 5, 'we': 6, 'the': 7, 'to': 8, 'collect': 9, 'or': 10, 'from': 11, 'about': 12, 'use': 13, 'platform': 14, 'of': 15, 'device': 16, 'with': 17, 'as': 18, 'services': 19, 'our': 20, 'will': 21, 'other': 22, 'including': 23, 'in': 24, 'may': 25, 'when': 26, 'if': 27, 'not': 28, 'third': 29, 'mobile': 30, 'us': 31, 'address': 32, 'a': 33, 'such': 34, 'also': 35, 'on': 36, 'that': 37, 'for': 38, 'account': 39, 'provide': 40, 'using': 41, 'contacts': 42, 'content': 43, 'access': 44, 'data': 45, 'an': 46, 'user': 47, 'receive': 48, 'location': 49, 'party': 50, 'personal': 51, 'email': 52, 'browser': 53, 'network': 54, 'phone': 55, 'users': 56, 'social': 57, 'baidu': 58, 'parties': 59, 'contact': 60, 'through': 61, 'any': 62, 'are': 63, 'do': 64, 'facebook': 65, 'automatically': 66, 'certain': 67, 'ip': 68, 'have': 69, 'which': 70, 'like': 71, 'quora': 72, 'providers': 73, 'products': 74, 'github': 75, 'permission': 76, 'system': 77, '

In [6]:
# Tokenize the sentences

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [7]:
# Sequential LSTM Model

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences = True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences = True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 128)          76800     
_________________________________________________________________
bidirectional (Bidirectional (None, 120, 128)          98816     
____________________________________________________________

In [8]:
# Set Training Parameters

num_epochs = 50
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=1)

Train on 171 samples, validate on 58 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [9]:
avg = 0

for i in range(len(training_sequences)):
    avg = avg + len(training_sequences[i])
    
print(avg/(len(training_sequences)))

25.982456140350877


In [10]:
mini = 0
index = 0
indextest = 0
for i in range(len(training_sequences)):
    if len(training_sequences[i]) > mini:
        mini = len(training_sequences[i])
        index = i
        
for i in range(len(testing_sequences)):
    if len(testing_sequences[i]) > mini:
        mini = len(testing_sequences[i])
        indextest = i

print(mini)
print(index)
print(training_sentences[index])
print("\n")
print(testing_sentences[indextest])
input()

108
141
If you c, we will access and collect your phone contacts, including the names, phone numbers, addresses and any other information that you have stored on your phone about your contacts in order to determine if they are using the Platform by matching them with existing users of the Platform.If you chose to find other users through your phone contacts, we will access and collect your phone contacts, including the names, phone numbers, addresses and any other information that you have stored on your phone about your contacts in order to determine if they are using the Platform by matching them with existing users of the Platform.


This is information we collect from every visitor to the Website, whether they have an Account or not.



''

In [11]:
if input() == "yes":
    model.save("Weights/Type1_LSTM.h5")




In [12]:
# Load the previously saved weights
if input() == "yes":
    model.load_weights("Weights/Type1_LSTM.h5")

    # Re-evaluate the model
    loss, acc = model.evaluate(testing_padded, testing_labels, verbose=2)
    print("Restored model, accuracy: {:5.2f}%".format(100*acc))


