In [None]:
# Colab-only setup: mount the user's Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from google.colab import files

In [None]:
def getLocalFiles():
    """Upload files via ``files.upload()`` and write each one to disk.

    Every entry returned by the upload call is written in binary mode to a
    file named after its key. Nothing is written when no files are returned.

    Returns:
        None
    """
    _files = files.upload()
    # Iterating an empty mapping is a no-op, so no explicit length check is needed.
    for name, content in _files.items():
        # Context manager guarantees the handle is flushed and closed,
        # instead of relying on garbage collection of an anonymous file object.
        with open(name, 'wb') as fh:
            fh.write(content)
getLocalFiles()

In [21]:
import pandas as pd
import numpy as np
from keras.layers import GRU, Activation, Dropout, Dense, Input
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
from keras.utils import np_utils
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

In [None]:
# Abort early unless TensorFlow reports the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

In [None]:
# tf.debugging.set_log_device_placement(True)

In [2]:
# Load the cleaned sample (JSON) into a DataFrame.
# NOTE(review): absolute, machine-specific path — this cell fails on any other
# machine; consider a configurable data directory.
data = pd.read_json('/Users/yuhanqi/Desktop/Master Thesis/code/data/sample_1000000_2005_2018_cleaned.json')

In [3]:
# Shift the star rating down by one so it can serve as a zero-based class id
# (presumably stars range 1-5, giving classes 0-4 — confirm against the data).
data['stars_reduce'] = data['stars']-1

In [4]:
data.head(1)

Unnamed: 0,index,text,stars,cleaned_text,stars_reduce
1105058,2344352,Excellent food and customer service! My mom fo...,5,Excellent food and customer service ! My mom f...,4


In [5]:
# One-hot encode the zero-based labels; the trailing expression displays the
# resulting (rows, classes) shape.
dummy_y = np_utils.to_categorical(data.stars_reduce)
dummy_y.shape

(1000000, 5)

In [6]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is repeatable across runs.
X_train, X_test,Y_train, Y_test = train_test_split(data.cleaned_text, dummy_y, test_size=0.2, random_state = 45)

In [7]:
# Build the vocabulary from the training texts only (the test texts are not
# used for fitting).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# words_to_index is the dictionary mapping each word to its integer index.
words_to_index = tokenizer.word_index

In [8]:
def read_glove_vector(glove_vec):
  """Read a GloVe-style text file into a word -> vector dictionary.

  Each non-empty line is expected to hold a token followed by its
  whitespace-separated vector components.

  Args:
    glove_vec: Path to the embeddings text file (opened as UTF-8).

  Returns:
    dict mapping each token (str) to a 1-D ``np.float64`` array built from
    the remaining fields of its line.

  Raises:
    ValueError: If a vector component cannot be converted to a float.
  """
  word_to_vec_map = {}
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    for line in f:
      w_line = line.split()
      # Blank lines (e.g. a trailing newline at end of file) carry no entry;
      # indexing w_line[0] on them would raise IndexError.
      if not w_line:
        continue
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)
  return word_to_vec_map

In [9]:
word_to_vec_map = read_glove_vector('/Users/yuhanqi/Desktop/Master Thesis/code/model/glove.twitter.27B/glove.twitter.27B.50d.txt')

In [10]:
# Fixed sequence length used for padding and for the Embedding layer's input_length.
maxLen = 300
# +1 because Tokenizer word indices start at 1; index 0 is left unassigned
# (it is the value pad_sequences fills with by default).
vocab_len = len(words_to_index)+1
# Read the embedding dimensionality off one known vector; 'moon' is only a
# probe word and this raises KeyError if it is absent from the loaded file.
embed_vector_len = word_to_vec_map['moon'].shape[0]

In [11]:
# Row i of emb_matrix holds the pretrained vector for the word whose tokenizer
# index is i; words missing from the GloVe map keep an all-zero row.
emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  if word in word_to_vec_map:
    emb_matrix[index, :] = word_to_vec_map[word]

# Frozen embedding layer initialised from the pretrained matrix.
embedding_layer = Embedding(
  input_dim=vocab_len,
  output_dim=embed_vector_len,
  input_length=maxLen,
  weights = [emb_matrix],
  trainable=False,
)

In [12]:
def gru_model(input_shape):
  """Assemble the stacked-GRU classifier on top of the module-level embedding_layer.

  Args:
    input_shape: Shape tuple handed to ``Input`` (the notebook passes ``(maxLen,)``).

  Returns:
    An uncompiled ``Model`` mapping index sequences to a 5-way softmax output.
  """
  token_ids = Input(input_shape)
  hidden = embedding_layer(token_ids)

  # Layers are created in the same order they are applied, so Keras'
  # automatic layer names match the sequential definition.
  stack = (
    GRU(128, return_sequences=True),  # emit every timestep for the next GRU
    Dropout(0.5),
    GRU(128),                         # keep only the final state
    Dropout(0.5),
    Dense(5, activation='softmax'),
  )
  for layer in stack:
    hidden = layer(hidden)

  return Model(inputs=token_ids, outputs=hidden)

In [13]:
# Instantiate the network for index sequences of length maxLen and print its
# layer-by-layer summary.
model = gru_model((maxLen,))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 300)]             0         
                                                                 
 embedding (Embedding)       (None, 300, 50)           8609600   
                                                                 
 gru (GRU)                   (None, 300, 128)          69120     
                                                                 
 dropout (Dropout)           (None, 300, 128)          0         
                                                                 
 gru_1 (GRU)                 (None, 128)               99072     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 5)                 645   

2022-05-31 17:01:26.176921: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
def _to_padded_indices(texts):
  """Convert texts to tokenizer index sequences, post-padded to maxLen."""
  sequences = tokenizer.texts_to_sequences(texts)
  return pad_sequences(sequences, maxlen=maxLen, padding='post')

X_train_indices = _to_padded_indices(X_train)
print(X_train_indices.shape)

X_test_indices = _to_padded_indices(X_test)

(800000, 300)


In [15]:
# Compile with Adam and categorical cross-entropy (labels are one-hot), then train.
adam = tf.keras.optimizers.Adam(learning_rate = 1e-4)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
# Early stopping watches the held-out loss: validation data is supplied, and
# training loss alone tends to keep decreasing, so monitoring it would rarely
# trigger and would not guard against overfitting.
# NOTE(review): the test split doubles as the validation set here, so the
# final evaluation on it is not fully independent — consider a separate split.
callback = EarlyStopping(monitor='val_loss', patience=2)
epochs = 5
history = model.fit(
  X_train_indices, Y_train,
  validation_data=(X_test_indices, Y_test),
  batch_size=64, epochs=epochs, callbacks=[callback],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1e52a6050>

In [16]:
# Evaluate on the held-out split; given the compile() call this yields
# [loss, accuracy].
model.evaluate(X_test_indices, Y_test)



[0.8060845136642456, 0.6443750262260437]

In [23]:
#plot accuracy
# Take the x-axis length from what was actually recorded: early stopping can end
# training before `epochs`, and plotting range(epochs) against a shorter
# history raises a shape-mismatch ValueError.
epochs_ran = len(history.history['accuracy'])
plt.figure(figsize=(15, 7))
plt.plot(range(epochs_ran), history.history['accuracy'])
plt.plot(range(epochs_ran), history.history['val_accuracy'])
plt.legend(['training_acc', 'validation_acc'])
plt.title('Accuracy')

NameError: name 'history' is not defined

<Figure size 1080x504 with 0 Axes>

In [None]:
#plot loss
# Take the x-axis length from the recorded history instead of a hard-coded epoch
# count: the model was trained for at most 5 epochs, so range(20) would not
# match the number of recorded loss values and plt.plot would raise.
epochs_ran = len(history.history['loss'])
plt.figure(figsize=(15, 7))
plt.plot(range(epochs_ran), history.history['loss'])
plt.plot(range(epochs_ran), history.history['val_loss'])
# These curves are losses, so label them as such.
plt.legend(['training_loss', 'validation_loss'])
plt.title('Loss')

In [None]:
# preds = model.predict(X_test_indices)

In [17]:
# Persist the trained model to disk.
# NOTE(review): absolute, machine-specific path. The directory name says
# "2015_2018" while the loaded data file is named "..._2005_2018_..." —
# confirm which year range is intended.
model.save('/Users/yuhanqi/Desktop/Master Thesis/code/model/glove_gru_2015_2018_1000000')

2022-06-01 01:01:37.750820: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /Users/yuhanqi/Desktop/Master Thesis/code/model/glove_gru_2015_2018_1000000/assets


INFO:tensorflow:Assets written to: /Users/yuhanqi/Desktop/Master Thesis/code/model/glove_gru_2015_2018_1000000/assets


In [24]:
# Reload the model from the same path it was saved to.
loaded_model = keras.models.load_model('/Users/yuhanqi/Desktop/Master Thesis/code/model/glove_gru_2015_2018_1000000')

In [25]:
# Round-trip check: the reloaded model must reproduce the in-memory model's
# predictions on the test inputs; assert_allclose raises AssertionError otherwise.
# Note this runs predict() over the full test set twice.
np.testing.assert_allclose(
    model.predict(X_test_indices), loaded_model.predict(X_test_indices)
)

In [26]:
# Class probabilities from the reloaded model for every test row.
preds = loaded_model.predict(X_test_indices)
# The most probable class id is zero-based; add 1 to undo the earlier
# stars-1 shift and get star ratings back.
stars_preds = preds.argmax(axis=1) + 1

In [27]:
stars_preds

array([5, 5, 3, ..., 5, 3, 4])