# Group 1 Final Project

## 0. Imports and installations

In [4]:
!pip install unidecode
!pip install contractions
!pip3 install spacy
!pip install transformers

Collecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 4.2 MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.2
Collecting contractions
  Downloading contractions-0.0.58-py2.py3-none-any.whl (8.0 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Downloading anyascii-0.3.0-py3-none-any.whl (284 kB)
[K     |████████████████████████████████| 284 kB 3.9 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.2.tar.gz (321 kB)
[K     |████████████████████████████████| 321 kB 35.4 MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  Created wheel for pyahocorasick: filename=pyahocorasick-1.4.2-cp37-cp37m-linux_x86_64.whl size=85455 sha256=8a4348d54b32e9a3441ed8e4f767abcfffaaf60e28453ce511e5b4d9f7e07fa1
  Stored in direct

In [5]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
import spacy
import unidecode
import contractions
import transformers
from transformers import TFBertForSequenceClassification
from transformers import BertTokenizer
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [6]:
# get the data from kaggle
! mkdir -p ~/.kaggle;
! cp kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d arushchillar/disneyland-reviews
! unzip disneyland-reviews.zip

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python2.7/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python2.7/dist-packages/kaggle/api/kaggle_api_extended.py", line 146, in authenticate
    self.config_file, self.config_dir))
IOError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.
unzip:  cannot find or open disneyland-reviews.zip, disneyland-reviews.zip.zip or disneyland-reviews.zip.ZIP.


## 1. Data Exploration

In [15]:
df = pd.read_csv('/content/DisneylandReviews.csv', encoding='latin-1')
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [16]:
df.Rating.value_counts()

5    23146
4    10775
3     5109
2     2127
1     1499
Name: Rating, dtype: int64

We have a lot of 5 stars ratings compared to the other categories. 

In [17]:
df.Year_Month.min(), df.Year_Month[df.Year_Month != 'missing'].max()

('2010-10', '2019-5')

The reviews were written between 2010 and 2019. 

In [18]:
df.Reviewer_Location.unique()

array(['Australia', 'Philippines', 'United Arab Emirates',
       'United Kingdom', 'Singapore', 'India', 'Malaysia',
       'United States', 'Canada', 'Myanmar (Burma)', 'Hong Kong', 'China',
       'Indonesia', 'Qatar', 'New Zealand', 'Sri Lanka', 'Uganda',
       'Thailand', 'Austria', 'South Africa', 'Saudi Arabia', 'Japan',
       'Israel', 'South Korea', 'Turkey', 'Macau', 'Egypt', 'Mexico',
       'Mauritius', 'Sweden', 'Brazil', 'Kenya', 'Vietnam', 'Portugal',
       'Cambodia', 'Zambia', 'Croatia', 'France', 'Taiwan', 'Oman',
       'Colombia', 'Norway', 'Kuwait', 'Netherlands', 'Barbados',
       'Finland', 'Bosnia and Herzegovina', 'Brunei', 'Bahrain',
       'Maldives', 'Ireland', 'Russia', 'Romania',
       'Northern Mariana Islands', 'Germany', 'Chile', 'Isle of Man',
       'Pakistan', 'Ukraine', 'Greece', 'Switzerland', 'Spain', 'Estonia',
       "Côte d'Ivoire", 'Guam', 'Bangladesh', 'Belgium', 'Italy',
       'Botswana', 'Denmark', 'Argentina', 'Peru', 'Lithuania', 'I

The reviews come from many different locations.

In [19]:
df.Branch.unique()

array(['Disneyland_HongKong', 'Disneyland_California', 'Disneyland_Paris'],
      dtype=object)

The reviews concern three Disneyland parks: Paris, California and Hong kong. 

In [20]:
df.Review_Text.str.len().mean()

696.8175403225806

The average review contains 697 characters. 

## 2. Preprocessing

In [21]:
# function used to clean the text from stopwords, punctuations etc.

nlp = spacy.load('en_core_web_sm')

def strip_html_tags(text):
    """remove html tags from text"""
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text(separator=" ")
    return stripped_text


def remove_whitespace(text):
    """remove extra whitespaces from text"""
    text = text.strip()
    return " ".join(text.split())


def remove_accented_chars(text):
    """remove accented characters from text, e.g. café"""
    text = unidecode.unidecode(text)
    return text


def expand_contractions(text):
    """expand shortened words, e.g. don't to do not"""
    text = contractions.fix(text)
    return text


def text_preprocessing(text, accented_chars=True, contractions=True, 
                       convert_num=True, extra_whitespace=True, 
                       lemmatization=True, lowercase=True, punctuations=True,
                       remove_html=True, remove_num=True, special_chars=True, 
                       stop_words=True):
    """preprocess text with default option set to true for all steps"""
    if remove_html == True: #remove html tags
        text = strip_html_tags(text)
    if extra_whitespace == True: #remove extra whitespaces
        text = remove_whitespace(text)
    if accented_chars == True: #remove accented characters
        text = remove_accented_chars(text)
    if contractions == True: #expand contractions
        text = expand_contractions(text)
    if lowercase == True: #convert all characters to lowercase
        text = text.lower()

    doc = nlp(text) #tokenise text

    clean_text = []
    
    for token in doc:
        flag = True
        edit = token.text
        # remove stop words
        if stop_words == True and token.is_stop and token.pos_ != 'NUM': 
            flag = False
        # remove punctuations
        if punctuations == True and token.pos_ == 'PUNCT' and flag == True: 
            flag = False
        # remove special characters
        if special_chars == True and token.pos_ == 'SYM' and flag == True: 
            flag = False
        # remove numbers
        if remove_num == True and (token.pos_ == 'NUM' or token.text.isnumeric()) \
        and flag == True:
            flag = False
        # convert number words to numeric numbers
        if convert_num == True and token.pos_ == 'NUM' and flag == True:
            edit = w2n.word_to_num(token.text)
        # convert tokens to base form
        elif lemmatization == True and token.lemma_ != "-PRON-" and flag == True:
            edit = token.lemma_
        # append tokens edited and not removed to list 
        if edit != "" and flag == True:
            clean_text.append(edit)        
    return clean_text

In [22]:
# preprocessing
df['processed_text'] = 0
for i in tqdm(range(43)):
  df['processed_text'][i * 1000:(i+1) *1000] = df['Review_Text'][i * 1000:(i+1) *1000].apply(text_preprocessing)

  0%|          | 0/43 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [23]:
# it took a long time so lets save the df
df['processed_text'] = df['processed_text'].str.join(' ')
df.to_csv('processed_data.csv')

In [24]:
df.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch,processed_text
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong,disneyland find disneyland hong kong similar l...
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong,d time visit hk disneyland time stay tomorrowl...
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong,thanks god wasn t hot humid visit park big iss...
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong,hk disneyland great compact park unfortunately...
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong,location city take hour kowlon kid like disney...


In [25]:
# function used for tokanization, padding and data splitting

def get_sequences(texts, tokenizer, train=True, max_seq_length=None):
    sequences = tokenizer.texts_to_sequences(texts)
    
    if train == True:
        max_seq_length = np.max(list(map(len, sequences)))
    
    sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')
    
    return sequences


def preprocess_inputs(df):
    df = df.copy()
    
    # Limit data to only the review and rating column
    y = df['Rating']
    X = df['processed_text']
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
    # Fit tokenizer
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)
    print("Vocab length:", len(tokenizer.word_index) + 1)
    
    # Convert texts to sequences
    X_train = get_sequences(X_train, tokenizer, train=True)
    X_test = get_sequences(X_test, tokenizer, train=False, max_seq_length=X_train.shape[1])
    
    return X_train, X_test, y_train, y_test, tokenizer

In [26]:
# spliting the data

X_train, X_test, y_train, y_test, tokenizer = preprocess_inputs(df)

Vocab length: 29574


## 3. Models and results

### 3.1 Classic ML models

#### 3.1.1 RF

In [27]:
rf_model =  RandomForestClassifier(n_estimators = 100)
_ = rf_model.fit(X_train, y_train)

In [28]:
rf_preds = rf_model.predict(X_test)
print(classification_report(y_test, rf_preds))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       440
           2       1.00      0.00      0.00       638
           3       0.00      0.00      0.00      1472
           4       0.27      0.03      0.06      3243
           5       0.55      0.98      0.70      7004

    accuracy                           0.54     12797
   macro avg       0.36      0.20      0.15     12797
weighted avg       0.42      0.54      0.40     12797



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


RF's accuracy is similar to the baseline, perhapse due to the sparsity of the input data.

#### 3.1.2 SVM

In [29]:
svm_model = LinearSVC()
_ = svm_model.fit(X_train, y_train)



In [30]:
svm_preds = svm_model.predict(X_test)
print(classification_report(y_test, svm_preds))

              precision    recall  f1-score   support

           1       0.03      0.03      0.03       440
           2       0.07      0.20      0.10       638
           3       0.12      0.09      0.10      1472
           4       0.25      0.34      0.29      3243
           5       0.59      0.43      0.49      7004

    accuracy                           0.34     12797
   macro avg       0.21      0.22      0.20     12797
weighted avg       0.40      0.34      0.36     12797



The SVM model have a very low accuracy, lets try to binaries the labels and see how he deals with it:

In [31]:
y_train_binary = y_train.apply(lambda x: 0 if x <4 else 1)
y_test_binary = y_test.apply(lambda x: 0 if x <4 else 1)

In [32]:
svm_binary_model = LinearSVC()
_ = svm_binary_model.fit(X_train, y_train_binary)



In [33]:
svm_binary_preds = svm_binary_model.predict(X_test)
print(classification_report(y_test_binary, svm_binary_preds))

              precision    recall  f1-score   support

           0       0.30      0.21      0.25      2550
           1       0.82      0.88      0.85     10247

    accuracy                           0.75     12797
   macro avg       0.56      0.54      0.55     12797
weighted avg       0.71      0.75      0.73     12797



### 3.2 DL models

#### 3.2.1 Word2Vec

In [None]:
# NN was used as a regression model in this case

inputs = tf.keras.Input(shape=(1645,))
x = tf.keras.layers.Embedding(
    input_dim=37846,
    output_dim=64
)(inputs)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

w2v_model = tf.keras.Model(inputs=inputs, outputs=outputs)

w2v_model.compile(
    optimizer='adam',
    loss='mse'
)

history = w2v_model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=4,
            restore_best_weights=True
        )
    ], verbose=1
)

# try both regression and classification?

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [None]:
w2v_y_pred = np.squeeze(w2v_model.predict(X_test))
w2v_y_pred_int = pd.Series(w2v_y_pred).apply(lambda x: round(x))
print(classification_report(y_test, w2v_y_pred_int))

In [None]:
print(classification_report(y_test, y_pred_int_))

              precision    recall  f1-score   support

           1       0.60      0.03      0.05       440
           2       0.33      0.30      0.32       638
           3       0.35      0.42      0.38      1472
           4       0.35      0.48      0.41      3243
           5       0.78      0.67      0.72      7004
           6       0.00      0.00      0.00         0

    accuracy                           0.55     12797
   macro avg       0.40      0.32      0.31     12797
weighted avg       0.60      0.55      0.56     12797



  _warn_prf(average, modifier, msg_start, len(result))


The accuracy is very low, let's try to to binerize the target into three bins

In [None]:
y_train_3 = y_train.apply(lambda x: 1 if x in [4, 5] else (0 if x==3 else -1))
y_test_3 = y_test.apply(lambda x: 1 if x in [4, 5] else (0 if x==3 else -1))

In [None]:
inputs = tf.keras.Input(shape=(1645,))
x = tf.keras.layers.Embedding(
    input_dim=37846,
    output_dim=64
)(inputs)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

w2v_3_model = tf.keras.Model(inputs=inputs, outputs=outputs)

w2v_3_model.compile(
    optimizer='adam',
    loss='mse'
)

history = w2v_3_model.fit(
    X_train,
    y_train_3,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=4,
            restore_best_weights=True
        )
    ], verbose=1
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [None]:
y_pred_3 = np.squeeze(w2v_3_model.predict(X_test))

In [None]:
y_pred_3_ = pd.Series(y_pred_3).apply(lambda x: round(x))

In [None]:
y_pred_3_.unique()

array([ 1,  0, -1])

In [None]:
y_pred_changed = []
for pred in y_pred_3_:
    if pred >= 4:
        y_pred_changed.append(5)
    elif pred < 3:
        y_pred_changed.append(1)
    else :
        y_pred_changed.append(pred)

In [None]:
print(classification_report(y_test_3, y_pred_changed))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00      1078
           0       0.00      0.00      0.00      1472
           1       0.80      1.00      0.89     10247

    accuracy                           0.80     12797
   macro avg       0.27      0.33      0.30     12797
weighted avg       0.64      0.80      0.71     12797



  _warn_prf(average, modifier, msg_start, len(result))


After label binnig the accuracy incrased from 55% to 80%. It's possible the model is having a hard time seperatin 4 from 5 and 1 from 2.

#### 3.2.2 Bert transformer

In [None]:
df_train, df_test = train_test_split(df, test_size=0.1)

def imdb_gen(df):
    def g():
        for row in df.itertuples():
            text = row.processed_text
            label = 1 if row.Rating > 3 else 0
            # label = row.Rating
            tokenized = tokenizer(text, max_length=128, padding='max_length',
                                  truncation=True ) ## ANSWER TO QUESTION 13 HERE
            yield {k: np.array(tokenized[k]) for k in tokenized}, label
    return g

input_names = ['input_ids', 'token_type_ids', 'attention_mask']
data_types = ({k: tf.int32 for k in input_names}, tf.int64)
data_shapes = ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([]))

imdb_data_train = tf.data.Dataset.from_generator(
    imdb_gen(df_train),
    data_types, data_shapes
).shuffle(100).batch(32).repeat(4)

imdb_data_test = tf.data.Dataset.from_generator(
    imdb_gen(df_test),
    data_types, data_shapes
).shuffle(100).batch(32).repeat(4)

In [None]:
mrpc_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

loss = SparseCategoricalCrossentropy(from_logits =True)
adam = Adam(learning_rate = 3e-5)

In [None]:
mrpc_model.compile(optimizer= adam, loss= loss, metrics = 'accuracy')

In [None]:
mrpc_model.fit(imdb_data_train, validation_data=imdb_data_test, epochs=100,
               steps_per_epoch=64, validation_steps=16, verbose=1, callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=4,
            restore_best_weights=True
        )
    ])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100


<keras.callbacks.History at 0x7f34fad15a90>

In [None]:
# Save the entire model as a SavedModel.
!mkdir -p saved_model
mrpc_model.save('saved_model/my_transformer_model')



INFO:tensorflow:Assets written to: saved_model/my_transformer_model/assets


INFO:tensorflow:Assets written to: saved_model/my_transformer_model/assets


In [None]:
# downloads the models from colab

!zip -r /content/my_transformer_model.zip /content/saved_model/my_transformer_model

files.download("/content/my_transformer_model.zip")

  adding: content/saved_model/my_transformer_model/ (stored 0%)
  adding: content/saved_model/my_transformer_model/saved_model.pb (deflated 92%)
  adding: content/saved_model/my_transformer_model/variables/ (stored 0%)
  adding: content/saved_model/my_transformer_model/variables/variables.data-00000-of-00001 (deflated 15%)
  adding: content/saved_model/my_transformer_model/variables/variables.index (deflated 81%)
  adding: content/saved_model/my_transformer_model/assets/ (stored 0%)
  adding: content/saved_model/my_transformer_model/keras_metadata.pb (deflated 95%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 4. Future Plans

* feature selection:
  * Integrate the unused feature in the dataset into the models

* Optimizations:
  * Optimize the preprocessing differently for each model
  * Optimize models's hyperparameters

* Additional models to test:
  * Doc2Vec