# Yelp Data Challenge - NLP

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('data/last_2_years_restaurant_reviews.csv')

In [4]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,type,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2015-06-26,0,nCqdz-NW64KazpxqnDr0sQ,1,I mainly went for the ceasar salad prepared ta...,review,0,0XVzm4kVIAaH4eQAxWbhvw
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2015-06-29,0,iwx6s6yQxc7yjS7NFANZig,4,Nice atmosphere and wonderful service. I had t...,review,0,2aeNFntqY2QDZLADNo8iQQ
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2015-04-05,0,2HrBENXZTiitcCJfzkELgA,2,To be honest it really quit aweful. First the ...,review,0,WFhv5pMJRDPWSyLnKiWFXA
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2016-02-16,0,6YNPXoq41qTMZ2TEi0BYUA,2,"The food was decent, but the service was defin...",review,0,2S6gWE-K3DHNcKYYSgN7xA
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2016-02-08,1,4bQrVUiRZ642odcKCS0OhQ,2,If you're looking for craptastic service and m...,review,1,rCTVWx_Tws2jWi-K89iEyw


### Define feature variables, here is the text of the review

In [5]:
# Take the values of the column that contains review text data
documents = df.text.values

In [6]:
documents[:2]

array([ 'I mainly went for the ceasar salad prepared tableside.  I ate in the bar, the bartender was very nice and helpful.  I got the grilled cheese with tomato soup.  Grilled cheese was very good but the soup was nothing special.  Now the salad that i read one reviewer said the best in vegas, which is the only reason i came.  Knowing that they put anchovies in it when they prepare tableside, i was going to tell them to hold off on that once they get started.  So as im waiting for them to come up and make it, they bring it already prepared.  What is that?  The whole point of getting it is to watch it being done and see that its made fresh.  So obviously the anchovies were already in it, and since i explained i didnt want them, they made another.   I was told its a fire hazard to prepare it in the bar area so they made it on the side when i wasnt looking.  The few bites i took werent that good.  So i watch them make the 2nd salad in the hallway.  Needless to say, it was totally flavorl

### Define target variable

#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) rating

In [7]:
df['perfection'] = df['stars'].apply(lambda x : int(x == 5))
target = df['perfection'].values


#### Look at the statistic of the target variable

In [8]:
print "mean:{}".format(target.mean())


mean:0.46076595353


## Create training dataset and test dataset

In [9]:
from sklearn.cross_validation import train_test_split



In [11]:
documents_train, documents_test, target_train, target_test = train_test_split(documents, target, 
                                                                              test_size=0.3, random_state=11)

## Get NLP representation of the documents

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Create TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = "english", max_features = 5000)

In [14]:
# Train the model with training data
vectors_train = vectorizer.fit_transform(documents_train).toarray()

In [34]:
# Get the vocab of tfidf
words = vectorizer.get_feature_names()


In [16]:
# Use the trained model to transform test data
vectors_test = vectorizer.transform(documents_test).toarray()

## Similar review search engine

In [17]:
import numpy as np

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  # np.argsort by default sorts values in ascending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    return [labels[i] for i in np.argsort(lst)[:n]]


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# Draw an arbitrary review from test (unseen in training) documents
query = documents_test[666]

In [20]:
# Transform the drawn review(s) to vector(s)
vector_query = vectorizer.transform([query]).toarray()
vector_query.shape,vectors_train.shape

((1, 5000), (243333, 5000))

In [21]:
# Calculate the similarity score(s) between vector(s) and training vectors
scores = cosine_similarity(vector_query,vectors_train)


In [24]:
documents_train

array([ "The food is delicious every time I come here. I like my stuff very spicy.. I usually order level 25, up to level 50 and it still has taste and flavor, not overpowered by the extreme spice. They are always friendly and I've never had a bad experience.",
       'Love the combination of White Peach Oolong tea with Lychee jelly. Loco moco with it is perfectionnnnn',
       'Seems like this place has gotten a majority of their negative reviews very recently... \n\nWe ordered a teriyaki chicken dish, the middle pieces were literally raw... We show it the waiter, he agrees, takes the whole dish back. He comes back sometime later with the same dish with the same chicken cooked. The raw chicken was sitting on a bed of cabbage n he re-cooked the chicken and put it on the same used plate. Unbelievable! \n\nOtherwise none of the dishes were that great...\n\nWe ordered a "Popcorn Lobster"roll... It was almost $20 for this roll... Would be a fair price if we had actually gotten lobster, how

In [25]:
# find top 5 similar reviews
n = 5
similar_reviews = get_top_values(scores[0], n, documents_train)

In [26]:
print 'Our search query:'
print  query


Our search query:
Finally decided i would try this joint out despite the long line thats always outside. NOW i understand why people wait in line... Its pretty much worth it, ant the prices are great! I love the fact that this isnt just a burger place they have other popular options. Costumer service was pretty good too (:


In [33]:
print 'Most %s similar reviews:' % n
for i, reviews in enumerate(similar_reviews):
    print 'No.%d' % (i+1)
    print reviews

Most 5 similar reviews:
No.1
Good food but very long time to wait in line
No.2
It's a long line.. Expect to wait. So we went to just ordered their Blizzard type. It's pretty good!!! Definitely we will try to get here and eat here and try their special burger
No.3
If you never tried it, must experience it. Great little burgers that will fill you up. 10 should be enough for a person. Good place for late night in Vegas. Opens 24/7.  Line isnt so long. Worth to try and wait for. Great for hangovers after the club.
No.4
Loved this place so much we went twice!!
Dont let the long line discourage you, the wait isnt very long and its well worth it. 
The hells kitchen burger has everything you would ever want and more! We also ordered jalapeno poppers which were a little large for just two people but so good, truffle fries were fantastic. If you like beer this place is also for you so many options on tap!!
I just wish they had one of these in Seattle!
No.5
The food and service is great! Expect a

#### Comment: The result makes sense. These 5 comments are all positive comments and all refer to "wait" "long" and "line" mentioned in query.

## Classifying positive/negative review

#### Naive-Bayes Classifier

In [37]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(vectors_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
# Get score for training set
nb_model.score(vectors_train, target_train)

0.80487644503622613

In [39]:
# Get score for test set
nb_model.score(vectors_test, target_test)

0.80151698214525435

#### Logistic Regression Classifier

In [42]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(vectors_train, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [43]:
lr_model.score(vectors_train, target_train)

0.83580936412241658

In [44]:
lr_model.score(vectors_test,target_test)

0.82732102103829852

In [52]:
# key features(words) that make the positive prediction
n = 20
features_for_positive = get_top_values(lr_model.coef_[0], n, words)
features_for_positive

[u'amazing',
 u'best',
 u'awesome',
 u'thank',
 u'incredible',
 u'phenomenal',
 u'perfect',
 u'delicious',
 u'fantastic',
 u'highly',
 u'outstanding',
 u'excellent',
 u'perfection',
 u'great',
 u'heaven',
 u'favorite',
 u'notch',
 u'fabulous',
 u'perfectly',
 u'die']

In [53]:
# key features(words) that make the negative prediction
n = 20
features_for_negative = get_bottom_values(lr_model.coef_[0], n, words)
features_for_negative


[u'worst',
 u'horrible',
 u'bland',
 u'disappointing',
 u'rude',
 u'ok',
 u'mediocre',
 u'terrible',
 u'okay',
 u'slow',
 u'lacking',
 u'lacked',
 u'poor',
 u'awful',
 u'overpriced',
 u'meh',
 u'average',
 u'unfortunately',
 u'disgusting',
 u'tasteless']

#### Random Forest Classifier

In [54]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(vectors_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [55]:
rf_model.score(vectors_train, target_train)

0.9897424517019886

In [56]:
rf_model.score(vectors_test,target_test)

0.77243350018219126

Comment: The model overfits the training data.

#### Important features (words) of the RFC model

In [57]:
n = 20
get_top_values(rf_model.feature_importances_, n, words)

[u'best',
 u'amazing',
 u'delicious',
 u'awesome',
 u'great',
 u'love',
 u'good',
 u'vegas',
 u'bad',
 u'didn',
 u'definitely',
 u'worst',
 u'ok',
 u'place',
 u'excellent',
 u'wasn',
 u'pretty',
 u'food',
 u'just',
 u'rude']

## Use cross validation to evaluate classifiers


In [59]:
from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier()
score_rf = cross_val_score(rf, vectors_train, target_train, cv=5)
score_rf

array([ 0.76861593,  0.76783036,  0.7746476 ,  0.768216  ,  0.77027083])

In [60]:
lr = LogisticRegression()
score_lr = cross_val_score(lr, vectors_train, target_train, cv=5)
score_lr

array([ 0.82481302,  0.82423408,  0.82934698,  0.82548391,  0.82566884])

In [61]:
nb = MultinomialNB()
score_nb = cross_val_score(nb, vectors_train, target_train, cv=5)
score_nb

array([ 0.8026013 ,  0.7997    ,  0.80462746,  0.80066165,  0.80201784])

## Use grid search to find best predictable classifier


In [72]:
from sklearn.model_selection import GridSearchCV
param_grid = [
  {'C': [1, 10, 100, 1000], 'penalty': ['l1','l2'],'random_state':[1]}
 ]
lr_gridsearch = GridSearchCV(lr,param_grid, cv = 10)
lr_gridsearch.fit(vectors_train, target_train)


GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'penalty': ['l1', 'l2'], 'C': [1, 10, 100, 1000], 'random_state': [1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [76]:
print lr_gridsearch.best_params_
print lr_gridsearch.cv_results_['mean_test_score'].mean()

{'penalty': 'l1', 'C': 1, 'random_state': 1}
0.824612259743
