In [None]:
# Mount Google Drive at /content/drive; the dataset, GloVe and model paths used
# further down all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from google.colab import files

In [None]:
def getLocalFiles():
    """Upload files through the Colab file picker and save them locally.

    Iterates over the name -> content mapping returned by ``files.upload()``
    and writes each entry in binary mode to a file named after its key
    (relative to the current working directory). Nothing is written when no
    file was uploaded.
    """
    uploaded = files.upload()
    for name, content in uploaded.items():
        # Use a context manager so the handle is flushed and closed right away
        # instead of being left open until garbage collection.
        with open(name, 'wb') as out_file:
            out_file.write(content)
getLocalFiles()

In [None]:
import pandas as pd
import numpy as np
from keras.layers import Activation, Dropout, Dense, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
from keras.utils import np_utils
import string
import re
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from keras.preprocessing.sequence import pad_sequences
import keras
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.under_sampling import RandomUnderSampler
from datetime import datetime

In [None]:
# Fail fast when TensorFlow does not report a GPU as device 0; otherwise print
# the device name that was found.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# tf.debugging.set_log_device_placement(True)

In [None]:
# Load the cleaned review data; later cells read its 'cleaned_text' and 'stars' columns.
data = pd.read_json('/content/drive/My Drive/Master Thesis/Data/train_test_2005_2018_cleaned.json')

In [None]:
# Balance the star classes by randomly discarding reviews from the larger classes.
# A fixed random_state makes the sampled subset (and everything trained on it)
# reproducible across kernel restarts.
uos = RandomUnderSampler(random_state=45)
# fit_resample needs a 2-D feature array, so the text column is reshaped to (n, 1);
# the target is passed as a plain 1-D array instead of a column vector.
data_x, data_y = uos.fit_resample(
    np.array(data['cleaned_text']).reshape(-1, 1),
    np.array(data['stars']),
)
# Rebuild a two-column frame from the resampled arrays (column 0 of data_x is the text).
data_us = pd.DataFrame({'cleaned_text': data_x[:, 0], 'stars': data_y})

In [None]:
# Show the number of reviews per star rating in the undersampled frame.
data_us.stars.value_counts()
#data_us.head(1)

In [None]:
# Shift the labels down by one so they can be used as zero-based class ids
# (presumably stars run 1..5 -> 0..4; the model below has 5 output units).
data_us['stars_reduce'] = data_us['stars']-1

In [None]:
# One-hot encode the zero-based labels; the shape is displayed as a sanity check.
dummy_y = np_utils.to_categorical(data_us.stars_reduce)
dummy_y.shape

In [None]:
# 80/20 split with a fixed seed.
# NOTE(review): the "test" part is also passed as validation_data to model.fit
# and drives early stopping, so it is not an independent test set.
X_train, X_test,Y_train, Y_test = train_test_split(data_us.cleaned_text, dummy_y, test_size=0.2, random_state = 45)

In [None]:
# Build the vocabulary from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# words_to_index is the dictionary mapping words to their respective index.
words_to_index = tokenizer.word_index

In [None]:
def read_glove_vector(glove_vec):
  """Read a GloVe text file into a word -> embedding dictionary.

  Each non-empty line is expected to hold a token followed by its
  whitespace-separated vector components.

  Args:
    glove_vec: Path to the GloVe vectors file (read as UTF-8 text).

  Returns:
    dict mapping each word (str) to a 1-D ``np.float64`` array.
  """
  word_to_vec_map = {}
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    for line in f:
      w_line = line.split()
      # Skip blank lines (e.g. a stray empty line at the end of the file)
      # instead of failing with IndexError on w_line[0].
      if not w_line:
        continue
      word_to_vec_map[w_line[0]] = np.array(w_line[1:], dtype=np.float64)
  return word_to_vec_map

In [None]:
# Parse the GloVe Twitter vectors from Drive (50-dimensional, going by the file name).
word_to_vec_map = read_glove_vector('/content/drive/My Drive/Master Thesis/Model/glove.twitter.27B/glove.twitter.27B.50d.txt')

In [None]:
# Sequence length every review is padded/truncated to before entering the model.
maxLen = 300
# One extra row beyond the vocabulary: index 0 is the value pad_sequences fills
# with, so word indices are assumed to start at 1 (Keras Tokenizer convention).
vocab_len = len(words_to_index)+1
# Read the embedding size from any stored vector rather than from one hard-coded
# word, which would raise KeyError if that token were missing from the file.
embed_vector_len = next(iter(word_to_vec_map.values())).shape[0]

In [None]:
# Row i holds the GloVe vector of the word with tokenizer index i; words without
# a pre-trained vector keep their all-zero row.
emb_matrix = np.zeros(shape=(vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is None:
    continue
  emb_matrix[index] = embedding_vector

# Frozen embedding layer initialised with the pre-trained vectors.
embedding_layer = Embedding(
    input_dim=vocab_len,
    output_dim=embed_vector_len,
    input_length=maxLen,
    weights=[emb_matrix],
    trainable=False,
)

In [None]:
def conv1d_model(input_shape):
  """Build the 1-D convolutional classifier on top of the shared embedding layer.

  The network applies the module-level ``embedding_layer``, then four
  Conv1D (kernel size 3, ReLU) + Dropout(0.5) stages with 256, 128, 128 and
  128 filters, global max pooling, a 128-unit ReLU dense layer and a 5-unit
  softmax output.

  Args:
    input_shape: Shape tuple of the token-index input, e.g. ``(maxLen,)``.

  Returns:
    The uncompiled Keras ``Model``.
  """
  token_ids = Input(input_shape)

  features = embedding_layer(token_ids)

  # Same layer order as writing the four conv/dropout pairs out by hand.
  for n_filters in (256, 128, 128, 128):
    features = Conv1D(n_filters, 3, activation='relu')(features)
    features = Dropout(0.5)(features)

  features = GlobalMaxPooling1D()(features)
  features = Dense(128, activation='relu')(features)
  class_probs = Dense(5, activation='softmax')(features)

  return Model(inputs=token_ids, outputs=class_probs)

In [None]:
# Build the CNN for inputs of maxLen token indices and print its layer summary.
model = conv1d_model((maxLen,))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 300)]             0         
                                                                 
 embedding (Embedding)       (None, 300, 50)           8609550   
                                                                 
 lstm (LSTM)                 (None, 300, 128)          91648     
                                                                 
 dropout (Dropout)           (None, 300, 128)          0         
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense (Dense)               (None, 5)                 645   

In [None]:
# Convert the training texts to sequences of word indices.
X_train_indices = tokenizer.texts_to_sequences(X_train)

# Pad with zeros at the end so every sequence has exactly maxLen entries.
# NOTE(review): `truncating` is left at its default, so over-long reviews are
# cut according to that default rather than 'post' - confirm this is intended.
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')

print(X_train_indices.shape)

# Same encoding for the validation/test split, using the tokenizer fitted on the training texts.
X_test_indices = tokenizer.texts_to_sequences(X_test)

X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

(800000, 300)


In [None]:
# Adam with a small learning rate; categorical cross-entropy matches the one-hot
# targets and the softmax output layer.
adam = tf.keras.optimizers.Adam(learning_rate = 5e-5)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
# Stop once val_loss has not improved for 2 consecutive epochs and roll the model
# back to the best epoch. Without restore_best_weights the model keeps the weights
# of the last (worse) epoch, and those are what get evaluated and saved afterwards.
callback = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
# Upper bound on the number of epochs; early stopping may end training sooner.
epochs = 20

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
  269/12500 [..............................] - ETA: 6:08 - loss: 0.9533 - accuracy: 0.5702

In [None]:
# Train with early stopping, validating on the held-out split, and report the
# wall-clock duration of the fit call.
start_time = datetime.now()
history = model.fit(X_train_indices, Y_train, validation_data=(X_test_indices, np.asarray(Y_test)), batch_size=64, epochs=epochs, callbacks=[callback])
end_time = datetime.now()
print('--- Duration ---{}'.format(end_time - start_time))

In [None]:
# Loss and accuracy of the trained model on the held-out split.
model.evaluate(X_test_indices, Y_test)

array([[  63,  153,   17, ...,   25,    2,  446],
       [  76,   20,  102, ...,    0,    0,    0],
       [  21, 5121,  108, ...,    0,    0,    0],
       ...,
       [  28,   22,    1, ...,    0,    0,    0],
       [6873,   32,    1, ...,    0,    0,    0],
       [   5,  283,   49, ...,    0,    0,    0]], dtype=int32)

In [None]:
#plot accuracy
# Plot against the epochs actually recorded: EarlyStopping can end training before
# `epochs` is reached, and plt.plot(range(epochs), ...) then fails because x and y
# have different lengths. With no x given, matplotlib uses 0..len(y)-1, which is
# the same axis range(epochs) produced for a full run.
plt.figure(figsize=(15, 7))
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.legend(['training_acc', 'validation_acc'])
plt.xlabel('epoch')
plt.title('Accuracy')

In [None]:
#plot loss
# Plot against the epochs actually recorded: EarlyStopping can end training before
# the configured epoch count, and plt.plot(range(epochs), ...) then fails because x
# and y have different lengths. With no x given, matplotlib uses 0..len(y)-1.
# The former `epochs = 20` reassignment is dropped: it is no longer needed here and
# would silently override the value chosen in the training configuration cell.
plt.figure(figsize=(15, 7))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.legend(['training_loss', 'validation_loss'])
plt.xlabel('epoch')
plt.title('Loss')

In [None]:
# preds = model.predict(X_test_indices)

In [None]:
# Persist the trained model to Drive.
# NOTE(review): the directory name says 2015_2018 while the training file is named
# train_test_2005_2018 - confirm which year range is correct.
model.save('/content/drive/My Drive/Master Thesis/Model/glove_cnn_2015_2018')

In [None]:
# Reload the model from the path it was saved to.
loaded_model = keras.models.load_model('/content/drive/My Drive/Master Thesis/Model/glove_cnn_2015_2018')

In [None]:
# Round-trip check: the reloaded model must produce the same predictions as the
# in-memory model on the held-out split (raises AssertionError otherwise).
np.testing.assert_allclose(
    model.predict(X_test_indices), loaded_model.predict(X_test_indices)
)

In [None]:
# Load the 2019 restaurant reviews that are scored with the reloaded model below.
# The original line was cut off in the middle of the string literal (unterminated
# string); the '.json' ending is assumed from the training file's naming - confirm
# the exact file name on Drive.
data_test = pd.read_json('/content/drive/My Drive/Master Thesis/Data/restaurant_review_2019_cleaned.json')

In [None]:
# Encode the 2019 reviews with the tokenizer fitted on the training texts.
test_indices = tokenizer.texts_to_sequences(data_test.cleaned_text)

# Pad to the same maxLen the model was built for.
test_indices = pad_sequences(test_indices, maxlen=maxLen, padding='post')

print(test_indices.shape)

In [None]:
# Predictions of the reloaded model for the 2019 reviews (one row of softmax outputs per review).
preds = loaded_model.predict(test_indices)

In [None]:
# Convert the predictions to star ratings: argmax gives the zero-based class and
# +1 undoes the `stars - 1` shift applied to the training labels.
# Reads `preds` (the reloaded model's predictions); the previously referenced name
# `loaded_preds` is never defined and raised NameError.
star_pred = np.argmax(preds, axis=1)+1

In [None]:
# Confusion matrix of true vs. predicted star ratings on the 2019 reviews.
confusion_matrix(data_test.stars, star_pred)

In [None]:
# Classification metrics for the 2019 reviews (true stars vs. predicted stars).
print(classification_report(data_test.stars, star_pred))