# W266 Final Project

Authors: Satheesh Joseph, Catherine Mou, Yi Zhang

## Downloading and loading the data

We acquired the dataset from the researchers in the form of Sqlite `.db` files.

In [1]:
import os, sys, re, json, time, unittest
import itertools, collections
from importlib import reload
from sklearn.model_selection import train_test_split

import numpy as np
from scipy import stats
import pandas as pd
import sqlite3

import nltk

import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation, Embedding


In [3]:
# Download the files if they're not here
if 'data' not in os.listdir('.') or not os.listdir('data'):
    os.system('wget https://storage.googleapis.com/mids-w266-final-project-data/yelpHotelData.db -P data/')
    os.system('wget https://storage.googleapis.com/mids-w266-final-project-data/yelpResData.db -P data/')
    print('Data downloaded successfully!')
else:
    print('Already downloaded data')

Already downloaded data


In [4]:
con = sqlite3.connect('data/yelpResData.db')
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

con = sqlite3.connect('data/yelpHotelData.db')
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('review',), ('restaurant',), ('reviewer',)]
[('review',), ('sqlite_stat1',), ('sqlite_stat2',), ('reviewer',), ('hotel',)]


In [5]:
# Reading from the hotels database
hotels_db = sqlite3.connect("data/yelpHotelData.db")
hotels = pd.read_sql_query("SELECT * FROM hotel", hotels_db)
hotel_reviews = pd.read_sql_query("SELECT * FROM review WHERE flagged in ('Y', 'N')", hotels_db)
hotel_reviewers = pd.read_sql_query("SELECT * FROM reviewer", hotels_db)


print(f'The data set contains {len(hotels)} hotels, {len(hotel_reviews)} reviews, and {len(hotel_reviewers)} reviewers')

The data set contains 283086 hotels, 5858 reviews, and 5123 reviewers


In [6]:
# Reading from the restaurant database
restaurant_db = sqlite3.connect("data/yelpResData.db")
restaurant_db.text_factory = lambda x: x.decode("utf-8", errors='ignore')
restaurants = pd.read_sql_query("SELECT * FROM restaurant", restaurant_db)
restaurant_reviews = pd.read_sql_query("SELECT * FROM review WHERE flagged in ('Y', 'N')", restaurant_db)
restaurant_reviewers = pd.read_sql_query("SELECT * FROM reviewer", restaurant_db)


print(f'The data set contains {len(restaurants)} restaurants, {len(restaurant_reviews)} reviews, and {len(restaurant_reviewers)} reviewers')

The data set contains 242652 restaurants, 67019 reviews, and 16941 reviewers


# Exploratory Data Analysis

## ToDo: Performan EDA

In [7]:
hotel_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5858 entries, 0 to 5857
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date           5858 non-null   object
 1   reviewID       5858 non-null   object
 2   reviewerID     5858 non-null   object
 3   reviewContent  5858 non-null   object
 4   rating         5858 non-null   int64 
 5   usefulCount    5858 non-null   int64 
 6   coolCount      5858 non-null   int64 
 7   funnyCount     5858 non-null   int64 
 8   flagged        5858 non-null   object
 9   hotelID        5858 non-null   object
dtypes: int64(4), object(6)
memory usage: 457.8+ KB


In [8]:
restaurant_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67019 entries, 0 to 67018
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date           67019 non-null  object
 1   reviewID       67019 non-null  object
 2   reviewerID     67019 non-null  object
 3   reviewContent  67019 non-null  object
 4   rating         67019 non-null  int64 
 5   usefulCount    67019 non-null  int64 
 6   coolCount      67019 non-null  int64 
 7   funnyCount     67019 non-null  int64 
 8   flagged        67019 non-null  object
 9   restaurantID   67019 non-null  object
dtypes: int64(4), object(6)
memory usage: 5.1+ MB


In [9]:
reviews = pd.concat([restaurant_reviews, hotel_reviews.rename(columns={'hotelID':'restaurantID'})], ignore_index=True)
reviews.groupby('reviewerID').agg({"usefulCount": np.sum, 
                                   "coolCount": np.sum, 
                                   "funnyCount": np.sum}).sort_values(by=['usefulCount'], ascending=False)
reviews[reviews['reviewerID'] == 'w-w-k-QXosIKQ8HQVwU6IQ']['reviewContent']

94       ***Alinea is truly a one-of-a-kind experience;...
29403    ***Graham Elliot serves up refined casual food...
43054    ***Longman & Eagle is a true gastropub--a casu...
71630    ***While the rooms are small, Hotel Felix is a...
Name: reviewContent, dtype: object

In [10]:
reviews.groupby('flagged').agg('sum')

Unnamed: 0_level_0,rating,usefulCount,coolCount,funnyCount
flagged,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,252531,65773,42690,36444
Y,34611,0,0,0


# Baseline Model

Try a Simple LSTM model on the Hotel data set.

In [11]:
# Split train/test data for hotel reviews
X_train, X_test, y_train, y_test = train_test_split(hotel_reviews, hotel_reviews['flagged']=='Y')

In [12]:
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(X_train['reviewContent'])
sequences = tokenizer.texts_to_sequences(X_train['reviewContent'])
train_data = pad_sequences(sequences, maxlen=100)

tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(X_test['reviewContent'])
sequences = tokenizer.texts_to_sequences(X_test['reviewContent'])
test_data = pad_sequences(sequences, maxlen=100)


In [13]:
model = Sequential()
model.add(Embedding(20000, 100, input_length=100))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [14]:
model.fit(train_data, y_train, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f5c8e6dccd0>

In [15]:
model.evaluate(test_data, y_test)



[0.4302046597003937, 0.8627986311912537]