<a href="https://colab.research.google.com/github/timthedev07/trip-advisor-rating-prediction/blob/dev/trip_advisor_rating_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries.

In [None]:
!pip uninstall tensorflow -y
!pip install tensorflow==2.9.1

In [17]:
import os
import string
import re
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, MaxPool1D, Dropout, LSTM, Conv1D, Bidirectional
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.callbacks import EarlyStopping
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
stopWords = set(stopwords.words("english"))
stopWords.remove("not")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Add extra words to stopwords.

In [None]:
!wget "https://gist.githubusercontent.com/timthedev07/541d7d2cf1d9c890b49e2f9887d0f25a/raw/95ec7e56566e2064b1ea9ce2701bd9ad8f3e7d88/extra-stowords.txt" -O data/extra-stopwords.txt
with open("data/extra-stopwords.txt", "r", encoding = "utf-8") as f:
    words = set([i.strip("\n") for i in f.readlines()])
    stopWords = stopWords.union(words)

Load dataset.

In [6]:
!rm -rf sample_data
!rm -rf data
!mkdir data
%cd data
!wget 'https://storage.googleapis.com/kaggle-data-sets/897156/1526618/compressed/tripadvisor_hotel_reviews.csv.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20220821%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220821T110449Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=44238f9c56891e2bb9cdfdcc9aeca33c4356159af9fe70424754f3b5b971c6cc7037ada855f9c2ef3536197a810864641c16e853d76e124d5b311eb074a3bc3503d2f0ad9b0fd400c15c2701e48beb2f25433156ed5f5da3c23c5c8d142161c095bea640f5a6f05e1154171f2654fb2f2be9d5951ed467a56117f8f5dafe307fe302fbb7ecdacc3ba9489702f84ab7559d26805f88d8165617893348844c72cdbb2cd22f8d0104b0256fa8b31217dc168c1a7df0706623c055100f8f5c11f09dccfc718d4aa0ff5e713661e45bc054bafd000fa7f8bb890685330b4f63b1432d8e09372def1afbecdb73906a620c58b5ffe3daa1cc019fa3f37478ab62410978' -O reviews.csv.zip
!unzip -ojqp "*.csv.zip" > reviews.csv
!rm -rf *.zip
%cd ..


/content/data
--2022-08-21 13:07:44--  https://storage.googleapis.com/kaggle-data-sets/897156/1526618/compressed/tripadvisor_hotel_reviews.csv.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20220821%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220821T110449Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=44238f9c56891e2bb9cdfdcc9aeca33c4356159af9fe70424754f3b5b971c6cc7037ada855f9c2ef3536197a810864641c16e853d76e124d5b311eb074a3bc3503d2f0ad9b0fd400c15c2701e48beb2f25433156ed5f5da3c23c5c8d142161c095bea640f5a6f05e1154171f2654fb2f2be9d5951ed467a56117f8f5dafe307fe302fbb7ecdacc3ba9489702f84ab7559d26805f88d8165617893348844c72cdbb2cd22f8d0104b0256fa8b31217dc168c1a7df0706623c055100f8f5c11f09dccfc718d4aa0ff5e713661e45bc054bafd000fa7f8bb890685330b4f63b1432d8e09372def1afbecdb73906a620c58b5ffe3daa1cc019fa3f37478ab62410978
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.188.48, 172.253.62.128, 17

In [7]:
df = pd.read_csv("data/reviews.csv")

x = df["Review"]
y = pd.get_dummies(df["Rating"])

trainX, validationX, trainY, validationY = train_test_split(x, y, random_state = 42, shuffle = True, test_size = 0.2)

LABELS = [1, 2, 3, 4, 5]

Text cleaning function.

In [93]:
def customStandardization(text: tf.Tensor):
    # to lower case
    text = tf.strings.lower(text)
    # expand contraction
    pairs = [
        ("won't", "will not"),
        ("can't", "can not"),
        ("n't", " not"),
        ("'re", " are"),
        ("'s", " is"),
        ("'d", " would"),
        ("'ll", " will"),
        ("'t", " not"),
        ("'ve", " have"),
        ("'m", " am"),
    ]
    for contracted, replacement in pairs:
        text = tf.strings.regex_replace(text, contracted, replacement)
    
    # clean special symbols
    text = tf.strings.regex_replace(text, r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?", " ")
    text = tf.strings.regex_replace(text, r"[^A-Za-z0-9]+", " ")

    # remove stopwords
    for i in stopWords:
        text = tf.strings.regex_replace(text, f"[^A-Za-z0-9_]+{i}[^A-Za-z0-9_]+", " ")

    return text

The model.

In [88]:
VOCAB_SIZE = 15000
SEQUENCE_LENGTH = 32
EMBEDDING_DIM = 32

Vectorization = TextVectorization(
    max_tokens = VOCAB_SIZE,
    output_sequence_length = SEQUENCE_LENGTH,
    output_mode = "int",
    standardize = customStandardization,
)

Vectorization.adapt(np.array(x))

EmbeddingLayer = Embedding(
    VOCAB_SIZE,
    EMBEDDING_DIM,
    name="embedding_layer",
)


In [109]:
model = Sequential([
    Vectorization,
    EmbeddingLayer,
    MaxPool1D(),
    LSTM(64, recurrent_dropout = 0.4, dropout = 0.4, return_sequences = True),
    LSTM(32, recurrent_dropout = 0.4, dropout = 0.4),
    Dense(5, activation = "softmax")
], name = "CNN_hotel_review")

model.compile(
    optimizer='rmsprop',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[
        "accuracy"
    ]
)

model.summary()

Model: "CNN_hotel_review"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_5 (TextV  (None, 32)               0         
 ectorization)                                                   
                                                                 
 embedding_layer (Embedding)  (None, 32, 32)           480000    
                                                                 
 max_pooling1d_19 (MaxPoolin  (None, 16, 32)           0         
 g1D)                                                            
                                                                 
 lstm_27 (LSTM)              (None, 16, 64)            24832     
                                                                 
 lstm_28 (LSTM)              (None, 32)                12416     
                                                                 
 dense_25 (Dense)            (None, 5)            

Train the model.

In [110]:
EPOCHS = 20

callback = EarlyStopping(patience = 1)

model.fit(
    trainX,
    trainY,
    validation_data = (validationX, validationY),
    batch_size = 32,
    epochs = EPOCHS,
    callbacks = [callback]
)

Epoch 1/20
Epoch 2/20


<keras.callbacks.History at 0x7fa967da4d90>

In [114]:
text = """
This must be one of the worst hotels I have ever visited the staff are on helpful and argumentative the rooms are dirty dated and dangerous with exposed wires sticking out of the wall broken doors cigarette burns and all the furniture I would avoid this hotel at all costs be a little extra and stay somewhere else
"""
model.predict([text])



array([[9.8406905e-01, 1.1029845e-02, 7.4953819e-04, 1.5442398e-03,
        2.6073342e-03]], dtype=float32)

In [111]:
text = """
We are currently staying here and can’t wait to get out. I called ahead to advise that my son has a dust allergy so the room needs to be really clean but this made no difference as the room is disgusting.

The bath is so dirty that I couldn’t bath my children, there is an open drain in the bathroom which absolutely stinks, not just a slight smell but you open the door and it makes you feel sick. I need to have a shower but don’t know if I can hold my breath that long!

There is old rotting food down the side of the beds. The mattresses are very stained and I had the joy of having to sleep on it as the sheet wasn’t even tucked it by housekeeping just thrown on the bed and it is so small that even when I tucked it in it spent all night coming untucked. There is no mattress protector on the bed and I imagine the same thing must happen to everyone that stays here so the amount of skin/ fluids that must be on that mattress.

When entering the hotel my husband actually asked if I had booked them into a hostel because it looks really unloved and even the inside of the windows in reception are black, clearly not cleaned for years.

The single glazed windows are so thin it sounds like there are no windows at all. There is a lot of traffic that goes past overnight and we were all woken 10+ times especially with the number of emergency vehicles going past all through the night.

I wouldn’t advise staying here unless you are desperate. With all of the reviews about poor cleanliness and seeing it hasn’t improved the hotel clearly doesn’t care. Glad I didn’t book breakfast here as I want to get the hell out of here"""
model.predict([text])



array([[0.8652912 , 0.10382041, 0.011507  , 0.00782244, 0.01155901]],
      dtype=float32)

In [115]:
model.save("model")

# compress model
!zip -r model.zip model

# download model
from google.colab import files
files.download("model.zip")

  adding: model/ (stored 0%)
  adding: model/saved_model.pb (deflated 87%)
  adding: model/assets/ (stored 0%)
  adding: model/variables/ (stored 0%)
  adding: model/variables/variables.index (deflated 61%)
  adding: model/variables/variables.data-00000-of-00001 (deflated 19%)
  adding: model/keras_metadata.pb (deflated 89%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>