In [None]:
# Mount Google Drive at /content/drive; the dataset and model artifacts used below live there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import files

In [None]:
def getLocalFiles():
    """Upload files through the Colab file picker and save them locally.

    Each entry returned by ``files.upload()`` is written in binary mode to a
    file named after the entry's key. Nothing is written when the upload
    returns no entries.
    """
    uploaded = files.upload()
    for name, content in uploaded.items():
        # A context manager guarantees the handle is flushed and closed,
        # instead of relying on garbage collection of an anonymous file object.
        with open(name, 'wb') as out_file:
            out_file.write(content)
# Run the upload helper defined above.
getLocalFiles()

In [None]:
import pandas as pd
import numpy as np
from keras.layers import LSTM, Activation, Dropout, Dense, Input
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
from keras.utils import np_utils
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Abort early unless TensorFlow reports the first GPU; otherwise announce the device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [None]:
# tf.debugging.set_log_device_placement(True)

In [None]:
# Load the cleaned review sample from Drive into a DataFrame.
data = pd.read_json('/content/drive/My Drive/Master Thesis/Data/sample_1000000_2005_2018_cleaned.json')

In [None]:
# Shift the star rating down by one so the labels are zero-based before one-hot encoding.
data['stars_reduce'] = data['stars']-1

In [None]:
# Show the first row to check the available columns.
data.head(1)

Unnamed: 0,index,text,stars,cleaned_text,stars_reduce
1105058,2344352,Excellent food and customer service! My mom fo...,5,Excellent food and customer service ! My mom f...,4


In [None]:
# One-hot encode the zero-based star labels; the shape is displayed as a sanity check.
dummy_y = np_utils.to_categorical(data.stars_reduce)
dummy_y.shape

(1000000, 5)

In [None]:
# Hold out 20% of the reviews; the fixed random_state keeps the split repeatable.
X_train, X_test,Y_train, Y_test = train_test_split(data.cleaned_text, dummy_y, test_size=0.2, random_state = 45)

In [None]:
# Fit the vocabulary on the training texts only, so the held-out texts do not influence it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# words_to_index is the dictionary mapping each word to its integer index.
words_to_index = tokenizer.word_index

In [None]:
# Persist the fitted tokenizer to Drive so the same word -> index mapping can be reloaded later.
with open('/content/drive/My Drive/Master Thesis/Model/keras_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def read_glove_vector(glove_vec):
  """Read a GloVe vector text file into a word -> embedding dictionary.

  Each non-empty line is split on whitespace: the first field is taken as the
  word and the remaining fields as the components of its vector.

  Args:
    glove_vec: Path of the GloVe vector file (opened as UTF-8 text).

  Returns:
    dict mapping each word to a 1-D ``np.float64`` array holding its embedding.
  """
  word_to_vec_map = {}
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    for line in f:
      w_line = line.split()
      # Skip blank lines (e.g. a trailing empty line) instead of failing on w_line[0].
      if not w_line:
        continue
      word_to_vec_map[w_line[0]] = np.array(w_line[1:], dtype=np.float64)
  return word_to_vec_map

In [None]:
# Load the GloVe vectors from Drive into a word -> vector dictionary.
word_to_vec_map = read_glove_vector('/content/drive/My Drive/Master Thesis/Model/glove.twitter.27B/glove.twitter.27B.50d.txt')

In [None]:
# Every review is padded/truncated to this many tokens.
maxLen = 300
# Keras Tokenizer indices start at 1 (0 is left for padding), so the largest index
# equals len(words_to_index). The embedding table therefore needs one extra row;
# without the +1 the last word's index falls outside both the matrix and the layer.
vocab_len = len(words_to_index) + 1
# Embedding dimensionality, read off one known GloVe entry.
embed_vector_len = word_to_vec_map['moon'].shape[0]

In [None]:
# Row i of the matrix holds the GloVe vector of the word whose tokenizer index is i.
# Words without a GloVe entry keep their all-zero row.
# NOTE(review): vocab_len must be larger than the highest tokenizer index for every
# row assignment and every lookup to stay in range.
emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is None:
    continue
  emb_matrix[index, :] = embedding_vector

# Frozen embedding layer initialised from the matrix built above.
embedding_layer = Embedding(
    input_dim=vocab_len,
    output_dim=embed_vector_len,
    input_length=maxLen,
    weights=[emb_matrix],
    trainable=False,
)

In [None]:
def lstm_model(input_shape):
  """Build the stacked-LSTM classifier on top of the shared embedding layer.

  The network is: input -> module-level ``embedding_layer`` -> LSTM(128,
  returning the full sequence) -> Dropout(0.5) -> LSTM(128) -> Dropout(0.5)
  -> Dense(5, softmax).

  Args:
    input_shape: Shape tuple passed to ``Input``.

  Returns:
    An uncompiled ``Model`` from the input to the softmax output.
  """
  X_indices = Input(input_shape)

  hidden = embedding_layer(X_indices)
  # First recurrent layer returns the whole sequence so the second LSTM can consume it.
  hidden = LSTM(128, return_sequences=True)(hidden)
  hidden = Dropout(0.5)(hidden)
  # Second recurrent layer returns only its final output.
  hidden = LSTM(128)(hidden)
  hidden = Dropout(0.5)(hidden)
  class_probs = Dense(5, activation='softmax')(hidden)

  return Model(inputs=X_indices, outputs=class_probs)

In [None]:
# Build the network for sequences of maxLen token indices and print its layer summary.
model = lstm_model((maxLen,))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 300)]             0         
                                                                 
 embedding (Embedding)       (None, 300, 50)           8609550   
                                                                 
 lstm (LSTM)                 (None, 300, 128)          91648     
                                                                 
 dropout (Dropout)           (None, 300, 128)          0         
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 5)                 645   

In [None]:
# Convert the training texts to index sequences and pad them at the end up to maxLen.
X_train_indices = pad_sequences(
    tokenizer.texts_to_sequences(X_train), maxlen=maxLen, padding='post'
)

print(X_train_indices.shape)

# Same encoding for the held-out texts.
X_test_indices = pad_sequences(
    tokenizer.texts_to_sequences(X_test), maxlen=maxLen, padding='post'
)

(800000, 300)


In [None]:
# Compile with Adam and categorical cross-entropy, then train for up to 20 epochs.
adam = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy'],
)
# NOTE(review): monitor='loss' watches the training loss, not the validation loss — confirm intended.
callback = EarlyStopping(monitor='loss', patience=2)
history = model.fit(
    X_train_indices,
    Y_train,
    batch_size=64,
    epochs=20,
    validation_data=(X_test_indices, np.asarray(Y_test)),
    callbacks=[callback],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8ec5a5d9d0>

In [None]:
# Evaluate on the held-out split; the result lists the loss followed by the compiled accuracy metric.
model.evaluate(X_test_indices, Y_test)



[0.9427026510238647, 0.5941299796104431]

In [None]:
#plot accuracy
# Take the epoch count from the recorded history: EarlyStopping can end training
# before the 20 requested epochs, and a hard-coded 20 would then not match the
# length of the recorded metrics.
epochs = len(history.history['accuracy'])
plt.figure(figsize=(15, 7))
plt.plot(range(epochs), history.history['accuracy'])
plt.plot(range(epochs), history.history['val_accuracy'])
plt.legend(['training_acc', 'validation_acc'])
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('Accuracy')

In [None]:
#plot loss
# Derive the x-axis from the recorded history so the plot matches the number of
# epochs that actually ran and does not depend on a variable from another cell.
loss_epochs = range(len(history.history['loss']))
plt.figure(figsize=(15, 7))
plt.plot(loss_epochs, history.history['loss'])
plt.plot(loss_epochs, history.history['val_loss'])
# These curves are losses, so label them as such (previously labelled as accuracy).
plt.legend(['training_loss', 'validation_loss'])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('Loss')

In [None]:
# Predicted class probabilities for every held-out review.
preds = model.predict(X_test_indices)

In [None]:
# argmax gives the zero-based class index; +1 undoes the earlier "stars - 1" shift.
star_pred = np.argmax(preds, axis=1)+1
star_pred

array([5, 5, 2, ..., 5, 4, 4])

In [None]:
# Save the trained model to Drive.
# NOTE(review): this path says 2015_2018 while the data file loaded above says 2005_2018 — confirm which is right.
model.save('/content/drive/My Drive/Master Thesis/Model/glove_lstm_2015_2018_1000000')



INFO:tensorflow:Assets written to: /content/drive/My Drive/Master Thesis/Model/glove_lstm_2015_2018_1000000/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/Master Thesis/Model/glove_lstm_2015_2018_1000000/assets


In [None]:
# Reload the model from the path it was just saved to, to verify the round trip.
loaded_model = keras.models.load_model('/content/drive/My Drive/Master Thesis/Model/glove_lstm_2015_2018_1000000')

In [None]:
# Restore the tokenizer from the pickle file written earlier in this notebook.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you trust.
with open('/content/drive/My Drive/Master Thesis/Model/keras_tokenizer.pickle', 'rb') as handle:
    loaded_tokenizer = pickle.load(handle)

In [None]:
# Re-encode the held-out texts with the tokenizer restored from disk, padded at the end to maxLen.
X_test_indices = pad_sequences(
    loaded_tokenizer.texts_to_sequences(X_test), maxlen=maxLen, padding='post'
)

In [None]:
# Confirm the re-encoded test matrix has the expected shape.
X_test_indices.shape

(200000, 300)

In [None]:
# Sanity check: the model reloaded from disk must reproduce the in-memory model's predictions.
np.testing.assert_allclose(
    model.predict(X_test_indices), loaded_model.predict(X_test_indices)
)

In [None]:
# Class probabilities from the reloaded model for the held-out reviews.
loaded_preds = loaded_model.predict(X_test_indices)

In [None]:
# Map the most probable zero-based class back to a star rating (+1 undoes "stars - 1").
star_pred = np.argmax(loaded_preds, axis=1)+1

In [None]:
# Display the star ratings predicted by the reloaded model.
star_pred

array([5, 5, 2, ..., 5, 4, 4])