# Yelp Review Star Classifier

Build a classifier to predict the star of the review from the review context.

In [2]:
from time import time
from pprint import pprint

import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

In [3]:
# Read toronto restaurant review data
print('Loading Toronto restaurant reviews:')

# Read data from gcp
# df = pd.read_csv('gs://yelp_review_toronto_restaurant/toronto_restaurant_review.csv', index_col=0)

# Read data from local file
df = pd.read_csv('toronto_restaurant_review.csv')

Loading Toronto restaurant reviews:


In [39]:
df.head().to_clipboard()

In [4]:
# Preview review data
print(f'Total {df.shape[0]} reviews')
print('Review preview:')
df.head()

Total 376702 reviews
Review preview:


Unnamed: 0.1,Unnamed: 0,business_id,cool,date,funny,review_id,stars_x,text,useful,user_id,...,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars_y,state
0,0,AakkkTuGZA2KBodKi2_u8A,0,2012-07-16 00:37:14,1,JVcjMhlavKKn3UIt9p9OXA,1,I cannot believe how things have changed in 3 ...,1,TpyOT5E16YASd7EWjLQlrw,...,Toronto,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,43.649674,-79.435116,Pho Phuong,M6K 1T9,55,3.5,ON
1,1,AakkkTuGZA2KBodKi2_u8A,0,2014-02-24 01:45:02,0,vKhtzhPUz9RJbllyvHm3qA,3,"Pretty good, food,, about the same as other vi...",0,G-9ujgKmc1J2k7HSqXszsw,...,Toronto,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,43.649674,-79.435116,Pho Phuong,M6K 1T9,55,3.5,ON
2,2,AakkkTuGZA2KBodKi2_u8A,0,2016-02-12 00:25:23,0,Je6AF9sTKwXwOVw2YHR1dg,5,I've been going to this place since it opened ...,0,NA4sslQXta6U263fqzwKiw,...,Toronto,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,43.649674,-79.435116,Pho Phuong,M6K 1T9,55,3.5,ON
3,3,AakkkTuGZA2KBodKi2_u8A,0,2013-05-07 06:03:17,0,b_xVF8U5Vqljz58OUEjqgA,4,One of the best Vietnamese places I`ve tried i...,1,1fNQRju9gmoCEvbPQBSo7w,...,Toronto,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,43.649674,-79.435116,Pho Phuong,M6K 1T9,55,3.5,ON
4,4,AakkkTuGZA2KBodKi2_u8A,0,2011-11-30 16:46:24,0,vFPpG1xDBSWcvy_165fxKg,3,"This place is just ok. Nice atmosphere, big op...",0,fYJGKhZK2FZckYWDMdCooA,...,Toronto,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1,43.649674,-79.435116,Pho Phuong,M6K 1T9,55,3.5,ON


In [29]:
n_samples = 5000
n_jobs = 1

X = df.text[:n_samples]
y = df.stars_x[:n_samples]

In [8]:
def select_best_params(X, y, pipeline, params, grid_search):
    print(f'pipeline: {[name for name, _ in pipeline.steps]}')
    print(f'parameters: {params}')
    t0 = time()
    grid_search.fit(X, y)
    print(f'done in {time() - t0}s')
    print(f'Best score: {grid_search.best_score_}')
    print('Best parameters set:')
    best_params = grid_search.best_estimator_.get_params()
    for param_name in sorted(params.keys()):    
        print(f'{param_name}, {best_params[param_name]}')

### Multinomial Naive Bayes

In [5]:
# Define pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

In [6]:
# Define parameters
parameters = {
    'vect__min_df': (1, 5),
    'vect__max_df': (0.6, 0.75, 0.9),
    'vect__ngram_range': ((1,1), (1,2)),
    'vect__stop_words': (None, 'english'),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    # 'tfidf__use_idf': ('l1', 'l2'),
    # 'clf__alpha': (0.1, 1, 10),
    # 'clf__fit_prior': (True, False)
}

In [7]:
# Define grid search
grid_search = GridSearchCV(pipeline, parameters, n_jobs=n_jobs, cv=5)

In [8]:
print(f'pipeline: {[name for name, _ in pipeline.steps]}')
print(f'parameters: {parameters}')
t0 = time()
grid_search.fit(df.text, df.stars_x)
print(f'done in {time() - t0}s')
print(f'Best score: {grid_search.best_score_}')
print('Best parameters set:')
best_params = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):    
    print(f'{param_name}, {best_params[param_name]}')

pipeline: ['vect', 'tfidf', 'clf']
parameters: {'vect__min_df': (1, 5), 'vect__max_df': (0.6, 0.75, 0.9), 'vect__ngram_range': ((1, 1), (1, 2)), 'vect__stop_words': (None, 'english')}
done in 7479.055411100388s
Best score: 0.5182159774561776
Best parameters set:
vect__max_df, 0.6
vect__min_df, 5
vect__ngram_range, (1, 1)
vect__stop_words, None


### Stochastic Gradient Descent

In [28]:
# Define pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier())
])

In [35]:
# Define parameters
parameters = {
    'vect__min_df': (1, 5),
    'vect__max_df': (0.6, 0.8),
    'vect__ngram_range': ((1,1), (1,2)),
    'vect__stop_words': (None, 'english'),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    # 'tfidf__use_idf': ('l1', 'l2'),
    'clf__max_iter': (5, 10),
    'clf__random_state': [42],
    'clf__tol': [None]
}

In [36]:
# Define grid search
grid_search = GridSearchCV(pipeline, parameters, n_jobs=n_jobs, cv=5)

In [37]:
select_best_params(X, y, pipeline, parameters, grid_search)

pipeline: ['vect', 'tfidf', 'clf']
parameters: {'vect__min_df': (1, 5), 'vect__max_df': (0.6, 0.8), 'vect__ngram_range': ((1, 1), (1, 2)), 'vect__stop_words': (None, 'english'), 'clf__max_iter': (5, 10), 'clf__random_state': [42], 'clf__tol': [None]}
done in 126.31073689460754s
Best score: 0.5078
Best parameters set:
clf__max_iter, 5
clf__random_state, 42
clf__tol, None
vect__max_df, 0.6
vect__min_df, 1
vect__ngram_range, (1, 2)
vect__stop_words, None
