# Yelp Data Challenge - NLP

BitTiger DS501

Zhenning Tan 6/17/2017

In [1]:
import pandas as pd
import pprint

In [2]:
# Load the restaurant reviews. index_col=0 reads the first (unnamed) CSV column
# back as the row index instead of keeping it as a spurious "Unnamed: 0" column.
df = pd.read_csv('yelp_dataset_challenge_round9/last_2_years_restaurant_reviews.csv', index_col=0)

In [3]:
# Preview the first five rows to see which columns are available
df.head()

Unnamed: 0,business_id,name,categories,ave_stars,cool,date,funny,review_id,stars,text,type,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2016-07-28,0,iHP55csZHjPGqOMwIo70qQ,5,Exceptional...exceptional steakhouse!! Ordered...,review,0,TU5j2S_Ub__ojLOpD_UepQ
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2016-07-17,0,GWI2xpBBwxK9-w1etLz51A,5,In a city with overrated 'celebrity' steakhous...,review,0,OC_WdUmY2fK-c1SD4JqSsw
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2016-10-30,0,CyZFXdnTCgpnKHNtIiKvaQ,3,The service was great. The appetizer bread was...,review,0,A6zYXofgFj6UhonFPrEDHw
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2016-08-08,0,QxE_WJYBMsgzPk9ZBJ6bgA,5,"Great service, great food, great environment. ...",review,0,WHT6g24E7_B9aZiKgUgB6Q
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2016-10-15,0,gvvuzwPWHuRdKVv-P6OTRw,5,"We were served chicken skewers, prosciutto, an...",review,0,Cn8UFE9uvIt-yFnASEmJnQ


### Define your feature variable: here, the text of the review

In [4]:
# Take the column that contains the review text and save it to a variable named "documents"
# (df["text"] is a pandas Series holding one review string per row)
documents = df["text"]

In [5]:
# Inspect the documents: check how many reviews there are and peek at the first one
# (documents is a pandas Series of review strings)
print(documents.shape)
print(documents[0])

(111548L,)
Exceptional...exceptional steakhouse!! Ordered ribeye bone out and I've never had such an amazing cut of meat in my life! I died and went to heaven! The service is phenomenal! Had a huge party of about 15 and everyone was well attended too! The sides are superb too! Such a great atmosphere! Sophisticated an chic! Loved loved this place! A MUST when visiting Vegas!


### Define your target variable (any categorical variable that may be meaningful)

#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) rating

In [6]:
# Binary target saved as "target": 1 for a perfect (5-star) review, 0 for 1-4 stars
target = df["stars"].eq(5).astype("int")

#### You may want to look at the statistic of the target variable

In [7]:
# Count how many reviews fall into each class (1 = 5 stars, 0 = 1-4 stars)
target.value_counts()

0    56928
1    54620
Name: stars, dtype: int64

The two classes are roughly balanced (about 51% imperfect vs. 49% perfect reviews).

## Let's create training dataset and test dataset

In [8]:
from sklearn.model_selection import train_test_split

`documents` is your X and `target` is your y.
Now split the data into a training set and a test set.

In [9]:
# Hold out 30% of the reviews as the test set; the fixed random_state keeps the split reproducible
(documents_train, documents_test,
 target_train, target_test) = train_test_split(
    documents,
    target,
    test_size=0.3,
    random_state=100
)

In [10]:
# Check the sizes of the training and test splits
documents_train.shape, documents_test.shape

((78083L,), (33465L,))

## Let's get NLP representation of the documents

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [60]:
# Create TfidfVectorizer, and name it vectorizer
# stop_words = "english" applies scikit-learn's built-in English stop-word list;
# max_features = 1000 caps the size of the vocabulary that is kept
vectorizer = TfidfVectorizer(stop_words = "english",
                             max_features = 1000)

In [61]:
# Learn the vocabulary and IDF weights from the training reviews and vectorize them.
# The result is deliberately kept as the scipy sparse matrix returned by
# fit_transform: a dense copy of this matrix is mostly zeros and costs hundreds
# of MB, and the classifiers and cosine_similarity used in this notebook all
# accept sparse input (X_test is kept sparse in the same way).
X_train = vectorizer.fit_transform(documents_train)

In [62]:
# Rows are training reviews, columns are vocabulary terms
X_train.shape

(78083L, 1000L)

In [63]:
# Pull the learned vocabulary out of the fitted vectorizer and show its first 100 terms
vocab = vectorizer.get_feature_names()
print(vocab[:100])

[u'00', u'10', u'100', u'11', u'12', u'15', u'16', u'20', u'24', u'25', u'30', u'40', u'45', u'50', u'95', u'99', u'able', u'absolutely', u'accommodating', u'actually', u'add', u'added', u'affordable', u'afternoon', u'ago', u'ahead', u'amazing', u'ambiance', u'ambience', u'american', u'appetizer', u'appetizers', u'area', u'aren', u'arrived', u'asada', u'asian', u'ask', u'asked', u'asking', u'ate', u'atmosphere', u'attention', u'attentive', u'attitude', u'authentic', u'available', u'average', u'avocado', u'avoid', u'away', u'awesome', u'awful', u'ayce', u'bacon', u'bad', u'bag', u'baked', u'banana', u'bar', u'barely', u'bartender', u'based', u'basic', u'basically', u'bbq', u'bean', u'beans', u'beat', u'beautiful', u'beef', u'beer', u'beers', u'believe', u'bellagio', u'belly', u'benedict', u'best', u'better', u'big', u'birthday', u'bit', u'bite', u'bites', u'black', u'bland', u'blue', u'boba', u'bomb', u'bone', u'bottle', u'bowl', u'bowls', u'box', u'boy', u'boyfriend', u'bread', u'break

In [64]:
# Use the trained model to transform your test data
# (transform only: the vocabulary fitted on the training data is reused, not refitted)
X_test = vectorizer.transform(documents_test)

In [65]:
# Rows are test reviews, columns are vocabulary terms
X_test.shape

(33465, 1000)

## Similar review search engine

In [18]:
import numpy as np

# We will need these helper methods pretty soon

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Return the labels that belong to the n largest entries of lst,
    ordered from the largest value downwards.
    lst and labels are parallel: labels[i] is the label of lst[i].

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    # np.argsort ranks in ascending order, so reverse it before taking the first n
    descending_order = np.argsort(lst)[::-1]
    top_indices = descending_order[:n]
    return [labels[idx] for idx in top_indices]

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices, ordered from the lowest
    value upwards (so the most extreme entry comes first).

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["rabbit", "mouse"]
    '''
    # np.argsort sorts in ascending order, so the first n indices are the n lowest values
    lowest_indices = np.argsort(lst)[:n]
    return [labels[i] for i in lowest_indices]


In [19]:
# Let's use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [66]:
# Draw an arbitrary review from test (unseen in training) documents.
# A seeded RandomState (same seed as the random_state used elsewhere in this
# notebook) makes the draw, and therefore the search results below, reproducible.
unseen_review = np.random.RandomState(100).choice(documents_test.values)
unseen_review

'Horrible service. We went there on a Tuesday night and it was nearly empty. The waiter was never checked on us I had to call every time we wanted to get more drinks fries order everything. Great food when it finally came but horrible service'

In [67]:
# Transform the drawn review(s) to vector(s)
# transform() takes an iterable of documents, hence the one-element list;
# toarray() converts the sparse result to a dense numpy array
unseen_review_vec = vectorizer.transform([unseen_review]).toarray()
unseen_review_vec.shape

(1L, 1000L)

In [68]:
# Calculate the similarity score(s) between vector(s) and training vectors
# (one row per training review, one column per query vector)
scores = cosine_similarity(X_train, unseen_review_vec)

In [69]:
# One similarity score per training review for the single query
scores.shape

(78083L, 1L)

In [70]:
# Let's find top 5 similar reviews
# scores is a column (one score per training review); flatten() makes it 1-D so it
# lines up position-by-position with the list of training documents used as labels
n = 5
pprint.pprint(get_top_values(scores.flatten(), n, list(documents_train)))

["horrible, horrible, horrible.  food, service price.  it's an overpriced tourist trap.  Pease do yourself a favor, go to in n out.",
 "Went out for lunch on Christmas Eve with my family here and the service was horrible.  I figured since it was slow and no one was there the service would be great and the food would come out fast but it was the opposite!   The waiter didn't even ask us what we wanted to drink also we ordered a latte and it was COLD!   Then didn't even apologize for it or ask if we wanted something else it was horrible the server we had was name : Karina . Don't ever get her for your waiter ask for someone else , she wasn't every happy or cheerful overall it was a horrible service",
 'Horrible service. Horrible food. Appetizer came out AFTER our entrees. Do yourself a favor and go somewhere else',
 'Horrible horrible horrible!!!! It was my sons birthday ruined by the horrible service and rude people that work here! We have been coming here for years never again!! NEVER'

In [71]:
# Show the review that was used as the search query
print('Our search query:')
print(unseen_review)

Our search query:
Horrible service. We went there on a Tuesday night and it was nearly empty. The waiter was never checked on us I had to call every time we wanted to get more drinks fries order everything. Great food when it finally came but horrible service


In [72]:
# Show the n training reviews most similar to the query, best match first
print('Most %s similar reviews:' % n)
pprint.pprint(get_top_values(scores.flatten(), n, list(documents_train)))

Most 5 similar reviews:
["horrible, horrible, horrible.  food, service price.  it's an overpriced tourist trap.  Pease do yourself a favor, go to in n out.",
 "Went out for lunch on Christmas Eve with my family here and the service was horrible.  I figured since it was slow and no one was there the service would be great and the food would come out fast but it was the opposite!   The waiter didn't even ask us what we wanted to drink also we ordered a latte and it was COLD!   Then didn't even apologize for it or ask if we wanted something else it was horrible the server we had was name : Karina . Don't ever get her for your waiter ask for someone else , she wasn't every happy or cheerful overall it was a horrible service",
 'Horrible service. Horrible food. Appetizer came out AFTER our entrees. Do yourself a favor and go somewhere else',
 'Horrible horrible horrible!!!! It was my sons birthday ruined by the horrible service and rude people that work here! We have been coming here for ye

#### Q: Does the result make sense to you?

A: The result makes sense. The query and the returned reviews all complain about horrible service at a restaurant, so they share both the topic and the negative sentiment.

## Classifying positive/negative review

#### Naive-Bayes Classifier

In [73]:
# Build a Naive-Bayes Classifier

from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes model (default settings) on the TF-IDF training features
NB = MultinomialNB()
NB.fit(X_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [74]:
# Get score for training set (mean accuracy of the Naive Bayes model)
NB.score(X_train, target_train)

0.80805040789928662

In [75]:
# Get score for test set (mean accuracy of the Naive Bayes model on unseen reviews)
NB.score(X_test, target_test)

0.80693261616614376

#### Logistic Regression Classifier

In [76]:
# Build a Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression

# Default hyperparameters; random_state is fixed so the fit is repeatable
logistic_clf = LogisticRegression(random_state = 100)
logistic_clf.fit(X_train, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=100, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [77]:
# Get score for training set (mean accuracy of the logistic regression model)
logistic_clf.score(X_train, target_train)

0.83034719465184481

In [78]:
# Get score for test set (mean accuracy of the logistic regression model on unseen reviews)
logistic_clf.score(X_test, target_test)

0.82393545495293585

#### Q: What are the key features(words) that make the positive prediction?

In [79]:
# Rank the vocabulary by logistic-regression coefficient and show the n largest
# (the words that push a prediction towards the positive class the most)
n = 20
print(get_top_values(logistic_clf.coef_.flatten(), n, vocab))

[u'amazing', u'best', u'awesome', u'incredible', u'delicious', u'thank', u'excellent', u'perfect', u'gem', u'great', u'perfection', u'disappoint', u'phenomenal', u'favorite', u'fantastic', u'highly', u'bomb', u'perfectly', u'love', u'greeted']


#### Q: What are the key features(words) that make the negative prediction?

In [80]:
# Rank the vocabulary by logistic-regression coefficient and show the n smallest
# (the words that push a prediction towards the negative class the most)
n = 20
print(get_bottom_values(logistic_clf.coef_.flatten(), n, vocab))

[u'worst', u'horrible', u'rude', u'mediocre', u'slow', u'terrible', u'ok', u'bland', u'disappointing', u'okay', u'awful', u'unfortunately', u'overpriced', u'poor', u'dry', u'average', u'worse', u'charged', u'decent', u'soggy']


A: [u'worst',
 u'horrible',
 u'ok',
 u'slow',
 u'rude',
 u'mediocre',
 u'terrible',
 u'okay',
 u'disappointing',
 u'bland',
 u'unfortunately',
 u'overpriced',
 u'poor',
 u'awful',
 u'dry',
 u'lacking',
 u'average',
 u'decent',
 u'meh',
 u'reason']

#### Random Forest Classifier

In [81]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
# Default hyperparameters; random_state is fixed so the forest is repeatable
rf_clf = RandomForestClassifier(random_state = 100)
rf_clf.fit(X_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=100,
            verbose=0, warm_start=False)

In [82]:
# Get score for training set (mean accuracy of the random forest)
rf_clf.score(X_train, target_train)

0.99006185725445994

In [83]:
# Get score for test set (mean accuracy of the random forest on unseen reviews)
rf_clf.score(X_test, target_test)

0.77310623038995963

#### Q: What do you see from the training score and the test score?

A: The test score (about 0.77) is much worse than the training score (about 0.99), which indicates overfitting. Cross validation can be used to tune the model and reduce this gap.

#### Q: Can you tell what features (words) are important by inspecting the RFC model?

In [84]:
# Most important words (first line) and least important words (second line)
# according to the random forest's feature importances
n = 20
print(get_top_values(rf_clf.feature_importances_, n, vocab))
print(get_bottom_values(rf_clf.feature_importances_, n, vocab))

[u'amazing', u'great', u'best', u'delicious', u'awesome', u'love', u'didn', u'bad', u'definitely', u'good', u'ok', u'vegas', u'just', u'favorite', u'friendly', u'place', u'excellent', u'wasn', u'worst', u'said']
[u'clearly', u'speak', u'soy', u'placed', u'box', u'mistake', u'supposed', u'brown', u'selections', u'king', u'showed', u'tomatoes', u'pulled', u'window', u'known', u'asking', u'missing', u'menus', u'dont', u'mashed']


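Words with strong sentiment have high importance in the model, while neutral words or words without sentiment have low importance. However, low importance does not always mean a word carries no sentiment: the low-importance list can also contain words with negative sentiment, like "crap", "wtf", "unprofessional". (Note: these three examples do not appear in the output printed above, so they presumably come from a different run of the notebook.)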

## Extra Credit #1: Use cross validation to evaluate your classifiers

[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)

In [87]:
from sklearn.model_selection import cross_val_score

In [86]:
# 5-fold cross-validated scores of the logistic regression on the training data
cross_val_score(logistic_clf, X_train, target_train, cv = 5)

array([ 0.82334486,  0.82160466,  0.82242572,  0.82594775,  0.82056865])

In [88]:
# 5-fold cross-validated scores of the random forest on the training data
cross_val_score(rf_clf, X_train, target_train, cv = 5)

array([ 0.77609169,  0.77268361,  0.7722208 ,  0.77382172,  0.77196465])

Cross validation shows that the cross-validated scores are similar to the scores on the test data (about 0.82 for logistic regression and about 0.77 for random forest).

## Extra Credit #2: Use grid search to find the best predictive classifier


[sklearn grid search tutorial (with cross validation)](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)

[sklearn grid search documentation (with cross validation)](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)

In [40]:
from sklearn.model_selection import GridSearchCV

#### Optimize logistic regression classifier

In [41]:
# Search grid: three values of C (inverse regularization strength) x two penalty
# types, each combination evaluated with 5-fold cross validation
params = {"C": [0.5, 0.7, 1],
         "penalty": ["l2", "l1"]}
gs_logistic = GridSearchCV(logistic_clf, param_grid = params, cv=5)
gs_logistic.fit(X_train, target_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=100, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l2', 'l1'], 'C': [0.5, 0.7, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [42]:
# Best parameter combination found by the grid search and its mean cross-validated score
print(gs_logistic.best_estimator_)
print(gs_logistic.best_score_)

LogisticRegression(C=0.7, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=100, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.828976857959


In [43]:
# Full per-combination results (fit/score times, train/test scores, ranks)
gs_logistic.cv_results_

{'mean_fit_time': array([ 1.49439998,  1.56599998,  1.55580001,  1.71240005,  1.62759995,
         1.78979993]),
 'mean_score_time': array([ 0.05619993,  0.05680003,  0.05280004,  0.05900002,  0.05799999,
         0.06100006]),
 'mean_test_score': array([ 0.82840055,  0.82865669,  0.82845178,  0.82897686,  0.82841336,
         0.82882318]),
 'mean_train_score': array([ 0.83977626,  0.83763751,  0.84108896,  0.84036217,  0.84212632,
         0.84199825]),
 'param_C': masked_array(data = [0.5 0.5 0.7 0.7 1 1],
              mask = [False False False False False False],
        fill_value = ?),
 'param_penalty': masked_array(data = ['l2' 'l1' 'l2' 'l1' 'l2' 'l1'],
              mask = [False False False False False False],
        fill_value = ?),
 'params': ({'C': 0.5, 'penalty': 'l2'},
  {'C': 0.5, 'penalty': 'l1'},
  {'C': 0.7, 'penalty': 'l2'},
  {'C': 0.7, 'penalty': 'l1'},
  {'C': 1, 'penalty': 'l2'},
  {'C': 1, 'penalty': 'l1'}),
 'rank_test_score': array([6, 3, 4, 1, 5, 2]),
 'spl

In [44]:
# Score of the best logistic regression found by the grid search on the held-out test set
gs_logistic.best_estimator_.score(X_test, target_test)

0.8300014940983117

In [45]:
# Top positive (first line) and top negative (second line) words according to the
# tuned logistic regression.
# n is set explicitly, as in the other ranking cells, so this cell does not
# depend on whatever value n was left with by a previously executed cell.
n = 20
print(get_top_values(gs_logistic.best_estimator_.coef_.flatten(), n, vocab))
print(get_bottom_values(gs_logistic.best_estimator_.coef_.flatten(), n, vocab))

[u'amazing', u'best', u'incredible', u'heaven', u'awesome', u'thank', u'delicious', u'gem', u'perfect', u'disappoint', u'excellent', u'phenomenal', u'great', u'perfection', u'bomb', u'soooo', u'fantastic', u'favorite', u'sooooo', u'highly']
[u'worst', u'horrible', u'disappointing', u'rude', u'mediocre', u'terrible', u'slow', u'bland', u'ok', u'lacking', u'awful', u'okay', u'poor', u'meh', u'unfortunately', u'disgusting', u'worse', u'overpriced', u'charged', u'nasty']


There is not much improvement for logistic regression after searching over C and penalty. However, through cross validation, we can see how the classifier performs with different parameters. This gives us confidence in our model's performance. 

#### Optimize random forest classifier

In [106]:
from time import time

# Grid-search the number of trees for the random forest (depth and features per
# split are held at a single value each) and report the wall-clock time used
t0 = time()
params = {
    "n_estimators": [30, 50, 100],
    "max_depth": [15],
    "max_features": [30]
}
gs_rf = GridSearchCV(rf_clf, param_grid=params, cv=5)
gs_rf.fit(X_train, target_train)
print("time used: %s seconds" % (time() - t0))

time used: 358.618999958 seconds


In [107]:
# Best random forest found by the grid search and its mean cross-validated score
print(gs_rf.best_estimator_)
print(gs_rf.best_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features=30, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=100,
            verbose=0, warm_start=False)
0.784216794949


In [108]:
# Full per-combination results (fit/score times, train/test scores, ranks)
gs_rf.cv_results_

{'mean_fit_time': array([ 10.08859997,  16.5928    ,  32.62979999]),
 'mean_score_time': array([ 0.15960007,  0.23779998,  0.4382    ]),
 'mean_test_score': array([ 0.78022105,  0.78298733,  0.78421679]),
 'mean_train_score': array([ 0.81705685,  0.81972388,  0.82251578]),
 'param_max_depth': masked_array(data = [15 15 15],
              mask = [False False False],
        fill_value = ?),
 'param_max_features': masked_array(data = [30 30 30],
              mask = [False False False],
        fill_value = ?),
 'param_n_estimators': masked_array(data = [30 50 100],
              mask = [False False False],
        fill_value = ?),
 'params': ({'max_depth': 15, 'max_features': 30, 'n_estimators': 30},
  {'max_depth': 15, 'max_features': 30, 'n_estimators': 50},
  {'max_depth': 15, 'max_features': 30, 'n_estimators': 100}),
 'rank_test_score': array([3, 2, 1]),
 'split0_test_score': array([ 0.78153413,  0.78422333,  0.78755282]),
 'split0_train_score': array([ 0.81648923,  0.81876251,  0.

There are many parameters to tune in a random forest model, including the number of trees, the maximum number of features per split, the tree depth, etc. Due to limited computation power, I was not able to fully explore the grid. 