# University of Stavanger DAT550 - Data Mining and Deep Learning
## Hands-on 10: Text classification using LSTMs

In [151]:
import tensorflow as tf
import numpy as np
import tensorflow.keras.layers
from keras_preprocessing import sequence
from tensorflow.keras.datasets import imdb

In [152]:
tf.__version__

'2.1.0'

In [185]:
max_len = 100
number_of_words = 10000

In [186]:
# (X_train, y_train), (x_test, y_test)  = imdb.load_data(num_words = number_of_words)
pad_id = 0
start_id = 1
oov_id = 2
index_offset = 2
 
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=number_of_words, start_char=start_id,
                                                                        oov_char=oov_id, index_from=index_offset)
 

word2idx = tf.keras.datasets.imdb.get_word_index()
 
idx2word = {v + index_offset: k for k, v in word2idx.items()}
 
idx2word[pad_id] = '<PAD>'
idx2word[start_id] = '<START>'
idx2word[oov_id] = '<OOV>'
 
rnn_cell_size = max_len
 
x_train = sequence.pad_sequences(x_train,
                                 maxlen=max_len,
                                 truncating='post',
                                 padding='post',
                                 value=pad_id)
x_test = sequence.pad_sequences(x_test, maxlen=max_len,
                                truncating='post',
                                padding='post',
                                value=pad_id)


In [187]:
def convertIntToWord(sentence):
    sentence_str = ''
    for word in sentence:
            sentence_str = sentence_str + ' ' + idx2word[word]
    return sentence_str
convertIntToWord(x_train[0])

" <START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <OOV> is an amazing actor and now the same being director <OOV> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <OOV> and would recommend it to everyone to watch and the fly fishing was"

In [188]:
len(word2idx)

88584

In [189]:
unkset = set()
for x in x_train:
    for wordid in x:
        word = idx2word[wordid]
        if word not in embeddings_index:
                unkset.add(word)

In [190]:
len(unkset)

206

In [191]:
y_train[0:100]

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0])

In [192]:
import os
import numpy as np
embeddings_index = {}
embedding_size = 100
GLOVE_DIR='/Users/vsetty/repos/BRENDA-new/new-extension/BRENDA/Server/model/static/glove/'
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [193]:
embedding_matrix = np.random.random((len(word2idx) + 1, embedding_size))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [195]:

embedding_layer = tf.keras.layers.Embedding(number_of_words, embedding_size, 
                                            input_length=max_len, trainable=True)

In [196]:
model = tf.keras.Sequential()

In [197]:
model.add(embedding_layer)

In [198]:
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=100,  activation='tanh')))

In [199]:
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [200]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 100)          1000000   
_________________________________________________________________
bidirectional_8 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 201       
Total params: 1,161,001
Trainable params: 1,161,001
Non-trainable params: 0
_________________________________________________________________


In [201]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [202]:
model.fit(x_train, y_train, epochs=2, batch_size = 64)

Train on 25000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x155ca1208>

In [203]:
loss, accuracy = model.evaluate(x_test, y_test)



In [204]:
embedding_layer = tf.keras.layers.Embedding(len(word2idx) + 1,
                            embedding_size,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=True)
model_with_pretrained_vec = tf.keras.Sequential()
model_with_pretrained_vec.add(embedding_layer)
model_with_pretrained_vec.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=100,  activation='tanh')))
model_with_pretrained_vec.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
model_with_pretrained_vec.summary()
model_with_pretrained_vec.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model_with_pretrained_vec.fit(x_train, y_train, epochs=2, batch_size = 64)

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 100, 100)          8858500   
_________________________________________________________________
bidirectional_9 (Bidirection (None, 200)               160800    
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 201       
Total params: 9,019,501
Trainable params: 9,019,501
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x14eaba710>

In [207]:
loss, accuracy = model_with_pretrained_vec.evaluate(x_test, y_test)

