# W266 Final Project

Authors: Satheesh Joseph, Catherine Mou, Yi Zhang

## Downloading and loading the data

We acquired the dataset from the researchers in the form of Sqlite `.db` files.

In [2]:
import os, sys, re, json, time, unittest
import itertools, collections
from importlib import reload
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import datetime

import numpy as np
from scipy import stats
import pandas as pd
import sqlite3
import unicodedata
import nltk

import tensorflow as tf
from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import classification_report

2021-11-28 08:13:29.529622: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.11.0


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Flatten, Lambda, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import tensorflow.compat.v1 as tf1
from tensorflow.keras.models import Model
import tensorflow.compat.v1.keras.backend as K
import tensorflow_hub as hub

import tensorflow.keras as keras
from tensorflow.python.keras import utils


In [4]:
# Download the files if they're not here
if 'data' not in os.listdir('.') or not os.listdir('data'):
    os.system('wget https://storage.googleapis.com/mids-w266-final-project-data/yelpHotelData.db -P data/')
    os.system('wget https://storage.googleapis.com/mids-w266-final-project-data/yelpResData.db -P data/')
    print('Data downloaded successfully!')
else:
    print('Already downloaded data')

Already downloaded data


In [5]:
tf.__version__

'2.3.4'

In [7]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### If GPU is out of memory

clean gpu process to resolve out of memorty problem

!lsof /dev/nvidia*

!sudo kill -9 3389

In [8]:
con = sqlite3.connect('data/yelpResData.db')
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

con = sqlite3.connect('data/yelpHotelData.db')
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('review',), ('restaurant',), ('reviewer',)]
[('review',), ('sqlite_stat1',), ('sqlite_stat2',), ('reviewer',), ('hotel',)]


In [9]:
# Reading from the hotels database
hotels_db = sqlite3.connect("data/yelpHotelData.db")
hotels = pd.read_sql_query("SELECT * FROM hotel", hotels_db)
hotel_reviews = pd.read_sql_query("SELECT * FROM review WHERE flagged in ('Y', 'N')", hotels_db)
hotel_reviewers = pd.read_sql_query("SELECT * FROM reviewer", hotels_db)


print(f'The data set contains {len(hotels)} hotels, {len(hotel_reviews)} reviews, and {len(hotel_reviewers)} reviewers')

The data set contains 283086 hotels, 5858 reviews, and 5123 reviewers


In [10]:
# Reading from the restaurant database
restaurant_db = sqlite3.connect("data/yelpResData.db")
restaurant_db.text_factory = lambda x: x.decode("utf-8", errors='ignore')
restaurants = pd.read_sql_query("SELECT * FROM restaurant", restaurant_db)
restaurant_reviews = pd.read_sql_query("SELECT * FROM review WHERE flagged in ('Y', 'N')", restaurant_db)
restaurant_reviewers = pd.read_sql_query("SELECT * FROM reviewer", restaurant_db)


print(f'The data set contains {len(restaurants)} restaurants, {len(restaurant_reviews)} reviews, and {len(restaurant_reviewers)} reviewers')

The data set contains 242652 restaurants, 67019 reviews, and 16941 reviewers


In [11]:
# Data Cleaning
hotel_reviews['reviewContent'] = hotel_reviews['reviewContent'].apply(lambda x: unicodedata.normalize('NFKD', x))
hotel_reviews['date'] = hotel_reviews['date'].apply(lambda x: datetime.datetime.strptime(x.strip().split(' ')[-1], '%m/%d/%Y'))

restaurant_reviews['reviewContent'] = restaurant_reviews['reviewContent'].apply(lambda x: unicodedata.normalize('NFKD', x))
restaurant_reviews['date'] = restaurant_reviews['date'].apply(lambda x: datetime.datetime.strptime(x.strip().split(' ')[-1], '%m/%d/%Y'))


In [12]:
# Split data according to the research paper
hotel_X_test = hotel_reviews[hotel_reviews['date'] >= datetime.datetime(2012, 1, 1)]
hotel_y_test = hotel_X_test['flagged'] == 'Y'
hotel_X_tran_and_dev = hotel_reviews[hotel_reviews['date'] < datetime.datetime(2012, 1, 1)]
hotel_X_train, hotel_X_dev, hotel_y_train, hotel_y_dev = train_test_split(hotel_X_tran_and_dev, hotel_X_tran_and_dev['flagged']=='Y')

restaurant_X_test = restaurant_reviews[restaurant_reviews['date'] >= datetime.datetime(2012, 1, 1)]
restaurant_y_test = restaurant_X_test['flagged'] == 'Y'
restaurant_X_tran_and_dev = restaurant_reviews[restaurant_reviews['date'] < datetime.datetime(2012, 1, 1)]
restaurant_X_train, restaurant_X_dev, restaurant_y_train, restaurant_y_dev = train_test_split(restaurant_X_tran_and_dev, restaurant_X_tran_and_dev['flagged']=='Y')


## GAN Based Model

In [13]:
# Generating fake reviews from gpt-2 based on the training fake review set.
# Catherine ran it in an AWS instance and it takes a really long time.
# Commenting out the code to not run over and over again.
##########################
!pip install transformers
!pip install torch



In [13]:
tf1.disable_eager_execution()

def encode(le, labels):
    enc = le.transform(labels)
    return keras.utils.to_categorical(enc)

def decode(le, one_hot):
    dec = np.argmax(one_hot, axis=1)
    return le.inverse_transform(dec)

def ELMoEmbedding(x):
    return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]

embed = hub.Module("https://tfhub.dev/google/elmo/3")
le = preprocessing.LabelEncoder()

In [14]:
hotel_reviews_fake = hotel_X_train[hotel_X_train['flagged'] == 'Y']
np.savetxt(r'/home/jupyter/w266-final-project/new GAN/hotel_fake.txt', hotel_reviews_fake['reviewContent'].values, fmt='%s')
res_reviews_fake = restaurant_X_train[restaurant_X_train['flagged'] == 'Y']
np.savetxt(r'/home/jupyter/w266-final-project/new GAN/res_fake.txt', res_reviews_fake['reviewContent'].values, fmt='%s')

In [15]:
source = hotel_reviews_fake['reviewContent']
print(len(source))
source = list(source)
type(source)

439


list

In [15]:
source[401]

'We stayed in July, 2011 for 3 nights. Okay, meant to say just a few comments and this ended up being much more...I will admit that I was a little worried we were too "casual" of a couple to stay there. No need to worry. There was a diverse group of people that stayed there - business travelers, young people, "my age" range and thankfully older, people in jeans or shorts or dressed to the nines. We wanted a hotel w/o kids, and found it. I know how that sounds, I have older kids and this was our much deserved getaway so did not want to be surrounded by kids of any age!Location was excellent! It was within walking distance of restaurants, Magnificent Mile, etc. There is a small parking lot right next to the hotel but runs about $40 per night. It is valet parking and unlimited exits w/o extra charges. Very nice hotel, though we didn\'t get a chance to visit the spa, bar, club or restaurant. Some other reviewers complained about noise from the club, but we didn\'t hear anything and we were

In [None]:
# -------------dont run this!--------------------
############ train gpt2 based on hotel data, already saved##############
!python GAN_tuning.py \
    --output_dir hotel_GAN \
    --model_name_or_path=gpt2 \
    --do_train \
    --train_data_file='/home/jupyter/w266-final-project/new GAN/hotel_fake.txt' \
    --per_gpu_train_batch_size=2 \
    --num_train_epochs=5

In [46]:
!python GAN_generation.py \
--model_name_or_path hotel_GAN \
--length=500 \
--seed=50 \
--review_num=4 \
--num_examples=430

2021-11-27 15:43:02.353943: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.11.0
11/27/2021 15:43:08 - INFO - __main__ -   Namespace(device=device(type='cuda'), k=0, length=500, model_name_or_path='hotel_GAN', n_gpu=1, no_cuda=False, num_examples=430, p=0.9, prompt='', repetition_penalty=1.0, review_num=4, seed=50, stop_token=None, temperature=1.0)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting 

In [16]:
generated_hotel_fake = pd.read_fwf('generated_hotel_fake.txt', header=None)
generated_hotel_fake = generated_hotel_fake[0]
generated_hotel_fake_Y = ['Y']*len(generated_hotel_fake)
choose_fake_X = np.asarray(list(generated_hotel_fake))
le.fit(generated_hotel_fake_Y)
choose_fake_Y = np.asarray(encode(le, list(generated_hotel_fake_Y)))

In [17]:
len(choose_fake_Y)

5677

In [50]:
input_text = Input(shape=(1,), dtype=tf.string)
embedding = Lambda(ELMoEmbedding, output_shape=(1024, ))(input_text)
dense1 = Dense(256, activation='relu')(embedding)
x2 = Dense(50, activation='relu')(dense1)
x3 = Dense(20, activation='relu')(x2)
pred = Dense(2, activation='softmax')(x3)
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [51]:
with tf1.Session() as session:
    K.set_session(session)
    session.run(tf1.global_variables_initializer())
    session.run(tf1.tables_initializer())
    model.load_weights('/home/jupyter/w266-final-project/new GAN/elmo-hotel-model-multi.h5')
    predicts = model.predict(choose_fake_X, batch_size=32)
    
# hotel_y_preds = decode(le, predicts)
# print(hotel_y_preds)


2021-11-27 23:47:58.732652: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2199995000 Hz
2021-11-27 23:47:58.733281: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x564b2e110c30 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-11-27 23:47:58.733319: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2021-11-27 23:47:58.930562: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-27 23:47:58.931478: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x564b2fe0c160 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-11-27 23:47:58.931509: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2021-11-2




2021-11-27 23:48:02.672888: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-11-27 23:48:02.672937: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263]      0 
2021-11-27 23:48:02.672948: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0:   N 
2021-11-27 23:48:02.674444: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-27 23:48:02.675219: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-27 23:48:02.675842: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 13996 MB memory) -> physical GPU (device: 0, name: Tesla T4, pc

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
2021-11-27 23:48:04.285771: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.11
2021-11-27 23:48:05.417139: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.8


In [52]:
np.around(predicts,3)
# print(predicts*1000)

array([[0.648, 0.352],
       [0.796, 0.204],
       [0.215, 0.785],
       ...,
       [0.281, 0.719],
       [0.033, 0.967],
       [0.354, 0.646]], dtype=float32)

In [54]:
use_GAN = []
for i in predicts:
    if i[0]>=0.8:
        use_GAN.append('use')
    else:
        use_GAN.append('no use')
        

In [55]:
len(use_GAN)

5677

In [56]:
len(generated_hotel_fake)

5677

In [57]:
GAN_data_raw = pd.DataFrame(
    {'generated_hotel_fake': generated_hotel_fake,
     'use_GAN': use_GAN
    })

In [60]:
GAN_data = GAN_data_raw[GAN_data_raw['use_GAN'] == 'use']
# GAN_data
np.savetxt(r'/home/jupyter/w266-final-project/new GAN/final_GAN.txt', GAN_data['generated_hotel_fake'].values, fmt='%s')

In [18]:
GAN_data = pd.read_fwf('final_GAN.txt', header=None)
GAN_data = GAN_data[0]
GAN_label = ['Y'] * len(GAN_data)
GAN_label = pd.Series(GAN_label) 
hotel_X_train_concat = pd.concat([hotel_X_train['reviewContent'], GAN_data])
hotel_Y_train_concat = pd.concat([hotel_X_train['flagged'], GAN_label])

In [19]:
hotel_X_train_concat

4976    My husband and I stayed at the Allerton hotel ...
2473    My husband and I stayed here the night we elop...
262     I hop around all of the IHG properties in Chic...
1832    Like all of the other W properties, I love the...
3075    This place is slick. I arrived Friday night af...
                              ...                        
1969    Best hotel in the block  and now my room is em...
1970    The hotel arrived with little to no fuss.  As ...
1971    First time I stayed here in October.  The loca...
1972    great location as it has a lot of shops and re...
1973    I booked a cab but never took the time to book...
Length: 5376, dtype: object

In [20]:
# y = list(hotel_y_train_concat)
# x = list(hotel_X_train_concat)
y = list(hotel_Y_train_concat[:10])
x = list(hotel_X_train_concat[:10])
le = preprocessing.LabelEncoder()
le.fit(y)
x_enc = x
y_enc = encode(le, y)

# x_train = np.asarray(x_enc[:3402])
# y_train = np.asarray(y_enc[:3402])
x_train = np.asarray(x_enc[:])
y_train = np.asarray(y_enc[:])

In [21]:
y_train

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [94]:
input_text = Input(shape=(1,), dtype=tf.string)
embedding = Lambda(ELMoEmbedding, output_shape=(1024, ))(input_text)
dense1 = Dense(256, activation='relu')(embedding)
x2 = Dense(50, activation='relu')(dense1)
x3 = Dense(20, activation='relu')(x2)
pred = Dense(2, activation='softmax')(x3)
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [95]:
with tf1.Session() as session:
    K.set_session(session)
    session.run(tf1.global_variables_initializer())
    session.run(tf1.tables_initializer())
    history = model.fit(x_train, y_train, epochs=6, batch_size=32)
    model.save_weights('./hotel-model-finalGAN-ELMo.h5')

2021-11-28 00:35:17.565997: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-28 00:35:17.566512: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-11-28 00:35:17.566959: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.11.0
2021-11-28 00:35:17.567049: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.11
2021-11-28 00:35:17.567090: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
2021-11-28 00:35:17.567123: I tensorflow/stream_executor/platform/default

Train on 5376 samples
Epoch 1/6
 128/5376 [..............................] - ETA: 5:02 - loss: 0.7045 - accuracy: 0.5234

2021-11-28 00:35:30.142573: W tensorflow/core/kernels/gpu_utils.cc:49] Failed to allocate memory for convolution redzone checking; skipping this check. This is benign and only means that we won't check cudnn for out-of-bounds reads and writes. This message will only be printed once.


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [106]:
hotel_X_dev_finalGAN = np.asarray(list(hotel_X_dev['reviewContent'])[:])
hotel_Y_dev_finalGAN = np.asarray(encode(le, list(hotel_X_dev['flagged']))[:]) 

with tf1.Session() as session:
    K.set_session(session)
    session.run(tf1.global_variables_initializer())
    model.load_weights('./hotel-model-finalGAN-ELMo.h5')
    predicts = model.predict(hotel_X_dev_finalGAN, batch_size=32)

hotel_Y_dev_finalGAN = decode(le, hotel_Y_dev_finalGAN)
hotel_y_preds = decode(le, predicts)

metrics.confusion_matrix(hotel_Y_dev_finalGAN, hotel_y_preds)
print(metrics.classification_report(hotel_Y_dev_finalGAN, hotel_y_preds))

2021-11-28 01:12:49.999594: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-28 01:12:50.000067: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-11-28 01:12:50.000214: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.11.0
2021-11-28 01:12:50.000266: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.11
2021-11-28 01:12:50.000295: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
2021-11-28 01:12:50.000319: I tensorflow/stream_executor/platform/default

              precision    recall  f1-score   support

           N       0.94      0.67      0.78       996
           Y       0.23      0.71      0.35       138

    accuracy                           0.67      1134
   macro avg       0.59      0.69      0.56      1134
weighted avg       0.86      0.67      0.73      1134



In [107]:
hotel_X_test_finalGAN = np.asarray(list(hotel_X_test['reviewContent'])[:])
hotel_Y_test_finalGAN = np.asarray(encode(le, list(hotel_X_test['flagged']))[:]) 

with tf1.Session() as session:
    K.set_session(session)
    session.run(tf1.global_variables_initializer())
    model.load_weights('./hotel-model-finalGAN-ELMo.h5')
    predicts = model.predict(hotel_X_test_finalGAN, batch_size=32)

hotel_Y_test_finalGAN = decode(le, hotel_Y_test_finalGAN)
hotel_y_preds = decode(le, predicts)

metrics.confusion_matrix(hotel_Y_test_finalGAN, hotel_y_preds)
print(metrics.classification_report(hotel_Y_test_finalGAN, hotel_y_preds))

2021-11-28 01:15:50.715339: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-28 01:15:50.715780: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-11-28 01:15:50.715923: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.11.0
2021-11-28 01:15:50.715973: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.11
2021-11-28 01:15:50.716007: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
2021-11-28 01:15:50.716037: I tensorflow/stream_executor/platform/default

              precision    recall  f1-score   support

           N       0.88      0.56      0.68      1103
           Y       0.22      0.62      0.32       219

    accuracy                           0.57      1322
   macro avg       0.55      0.59      0.50      1322
weighted avg       0.77      0.57      0.62      1322



In [57]:
def finalGAN_LSTM_hotel(num_epochs=2, sequence_length=50):
    m5_hotel_vectorizer = TextVectorization(output_sequence_length=sequence_length)
    m5_hotel_vectorizer.adapt(hotel_X_train_concat.to_numpy())
    m5_hotel_voc = m5_hotel_vectorizer.get_vocabulary()
    print(f"Model 5: Hotels reviews vocabulary size is {len(m5_hotel_voc)}")

    elmo_embeddings = get_elmo_embedding(m5_hotel_voc)

    # Building the embedding layer using Elmo results
    elmo_embedding_dim = 1024
    num_words = len(m5_hotel_voc)

    m5_hotel_embedding_matrix = np.zeros((num_words, elmo_embedding_dim))
    for i, word in enumerate(m5_hotel_voc):
        m5_hotel_embedding_matrix[i] = elmo_embeddings[i][0]

    m5_hotel_embedding_layer = Embedding(
        num_words,
        elmo_embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(m5_hotel_embedding_matrix),
        trainable=False,
    )

    # # Vectorize the input
    hotel_X_train_ready = m5_hotel_vectorizer(hotel_X_train_concat).numpy()
    hotel_X_dev_ready = m5_hotel_vectorizer(hotel_X_dev['reviewContent']).numpy()
    hotel_X_test_ready = m5_hotel_vectorizer(hotel_X_test['reviewContent']).numpy()
    print(f'training set shape: {hotel_X_train_ready.shape}')

    # # Build and train the model with 
    model = Sequential(name='model_5_hotel')
    model.add(m5_hotel_embedding_layer)
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    print(model.summary())
    model.fit(hotel_X_train_ready, hotel_Y_train_concat, epochs=num_epochs)

#     # # Evaluate on the dev set
    hotel_y_predicted = model.predict(hotel_X_dev_ready)
    print(classification_report(hotel_y_predicted > 0.5, hotel_y_dev))
    print('Dev set class distribution')
    print(hotel_X_dev['flagged'].value_counts() / len(hotel_X_dev))

#     # # Evaluate on the test set
    hotel_y_predicted = model.predict(hotel_X_test_ready)
    print(classification_report(hotel_y_predicted > 0.5, hotel_y_test))
    print('Test set class distribution')
    print(hotel_X_test['flagged'].value_counts() / len(hotel_X_test))
finalGAN_LSTM_hotel()

2021-11-28 02:08:53.093026: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2199995000 Hz
2021-11-28 02:08:53.093332: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56434e13bb00 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-11-28 02:08:53.093364: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2021-11-28 02:08:53.554667: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-28 02:08:53.555369: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56434f8b58a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-11-28 02:08:53.555398: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2021-11-2

KeyboardInterrupt: 

In [29]:
hotel_X_train_concat = hotel_X_train_concat[:2].to_numpy()

In [None]:
sequence_length = 10
m5_hotel_vectorizer = TextVectorization(output_sequence_length=sequence_length)
m5_hotel_vectorizer.adapt(hotel_X_train_concat)
# m5_hotel_voc = m5_hotel_vectorizer.get_vocabulary()
# print(f"Model 5: Hotels reviews vocabulary size is {len(m5_hotel_voc)}")



In [None]:
elmo_embeddings = get_elmo_embedding(m5_hotel_voc)

# Building the embedding layer using Elmo results
elmo_embedding_dim = 1024
num_words = len(m5_hotel_voc)

m5_hotel_embedding_matrix = np.zeros((num_words, elmo_embedding_dim))
for i, word in enumerate(m5_hotel_voc):
    m5_hotel_embedding_matrix[i] = elmo_embeddings[i][0]

m5_hotel_embedding_layer = Embedding(
    num_words,
    elmo_embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(m5_hotel_embedding_matrix),
    trainable=False,
)

# # Vectorize the input
hotel_X_train_ready = m5_hotel_vectorizer(hotel_X_train_concat).numpy()
hotel_X_dev_ready = m5_hotel_vectorizer(hotel_X_dev['reviewContent']).numpy()
hotel_X_test_ready = m5_hotel_vectorizer(hotel_X_test['reviewContent']).numpy()
print(f'training set shape: {hotel_X_train_ready.shape}')

# # Build and train the model with 
model = Sequential(name='model_5_hotel')
model.add(m5_hotel_embedding_layer)
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print(model.summary())
model.fit(hotel_X_train_ready, hotel_Y_train_concat, epochs=num_epochs)

#     # # Evaluate on the dev set
hotel_y_predicted = model.predict(hotel_X_dev_ready)
print(classification_report(hotel_y_predicted > 0.5, hotel_y_dev))
print('Dev set class distribution')
print(hotel_X_dev['flagged'].value_counts() / len(hotel_X_dev))

#     # # Evaluate on the test set
hotel_y_predicted = model.predict(hotel_X_test_ready)
print(classification_report(hotel_y_predicted > 0.5, hotel_y_test))
print('Test set class distribution')
print(hotel_X_test['flagged'].value_counts() / len(hotel_X_test))

1, present what we did

2, chagne the narrative, we don't use behavior data, not using paper as benchmark, we use baseline, we also compare to other baseline. conclusion: just using language not good, significantly better than smaller dataset

3, technically wrong, we pretend we didn't know the test data is not data. 


# 