# Yelp Data Challenge - NLP

BitTiger DS501-1802

May 2018

Yan Wei

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [8]:
df = pd.read_csv('data/last_2_years_restaurant_reviews.csv')

In [9]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id,date_new
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2016-03-31,0,6SgvNWJltnZhW7duJgZ42w,5,This is mine and my fiancé's favorite steakhou...,0,oFyOUOeGTRZhFPF9uTqrTQ,2016-03-31
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2015-06-29,0,iwx6s6yQxc7yjS7NFANZig,4,Nice atmosphere and wonderful service. I had t...,0,2aeNFntqY2QDZLADNo8iQQ,2015-06-29
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2015-03-16,0,UVUMu_bELdA56Ryfbur-DA,5,Every year a group of us (we had 6 this year) ...,1,gmPP4YFrgYsYQqPYokMgFA,2015-03-16
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2016-02-10,0,UxFpgng8dPMWOj99653k5Q,5,Truly Fantastic! Best Steak ever. Service was...,0,aVOGlN9fZ-BXcbtj6dbf0g,2016-02-10
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"['Cajun/Creole', 'Steakhouses', 'Restaurants']",4.0,0,2017-02-14,0,Xp3ppynEvVu1KxDHQ3ae8w,5,Delmonico Steakhouse is a steakhouse owned by ...,0,KC8H7qTZVPIEnanw9fG43g,2017-02-14


### Define your feature variables, here is the text of the review

In [10]:
# Take the values of the column that contains review text data, save to a variable named "documents".
documents = df['text'].values

In [18]:
# inspect your documents, e.g. check the size, take a peek at elements of the numpy array
documents.shape, documents.dtype

((515752,), dtype('O'))

In [19]:
documents[:2]

array([ "This is mine and my fiancé's favorite steakhouse here in Las Vegas! We often stop in and see our favorite server, Michael Jackson, who simply completes the fine dining experience for us. He is always so pleasant and knowledgable about the menu and suggests amazing dishes each time! I have never been disappointed here and will be back soon!",
       "Nice atmosphere and wonderful service. I had the dinner special which was a wedge salad, a petite ribeye, and desert.   \n\nThe salad was as expected. Nothing to jump up and down about. The petite ribeye was tasty. Just a minute over cooked, but overall it was good. The desert was ice cream and cake. As I'm not a cake eater I can't tell you about that, but the ice cream was good. \n\nOverall, it was a pleasant dinner."], dtype=object)

### Define your target variable (any categorical variable that may be meaningful)

#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) rating

In [38]:
# Make a column and take the values, save to a variable named "target"
target = []
for i in range(df.shape[0]):
    if df.loc[i, 'stars'] == 5:
        rating = 1
    else:
        rating = 0
    target.append(rating)
target = np.asarray(target)

In [39]:
target[:5]

array([1, 0, 1, 1, 1])

In [40]:
target.shape, target.dtype

((515752,), dtype('int64'))

#### You may want to look at the statistic of the target variable

In [41]:
# To be implemented, 46.4% of reviews give 5 stars.
target.mean()

0.46397299477268145

## Let's create training dataset and test dataset

In [42]:
from sklearn.cross_validation import train_test_split



In [None]:
# Documents is your X, target is your y
# Now split the data to training set and test set

In [43]:
# Split to documents_train, documents_test, target_train, target_test
documents_train, documents_test, target_train, target_test = train_test_split(documents, target, test_size = 0.3)

## Let's get NLP representation of the documents

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
# Create TfidfVectorizer, and name it vectorizer
from nltk.corpus import stopwords
vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'), max_features = 2000)

In [48]:
# Train the model with your training data
vectors = vectorizer.fit_transform(documents_train).toarray()

In [49]:
# Get the vocab of your tfidf
words = vectorizer.get_feature_names()

In [50]:
# Use the trained model to transform your test data
X_test = vectorizer.transform(documents_test)

In [51]:
vectors.shape

(361026, 2000)

## Similar review search engine

In [105]:
import numpy as np

# We will need these helper methods pretty soon

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  # np.argsort by default sorts values in ascending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    return [labels[i] for i in np.argsort(lst)[:n]]


In [55]:
# Let's use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [88]:
# Draw an arbitrary review from test (unseen in training) documents
query = np.random.choice(documents_test)

In [89]:
query

'A really nice, relaxed setting, especially if you sit outside. We got there too early on a Sunday morning for the Bellagio fountains to be on, which was a shame, but it was still a pleasant place to be. Service was friendly and prompt. The food, overall, was very good. My 5 year old son got a yogurt parfait with cherry compote, which was perhaps too small for adult appetites, but suited him perfectly. My wife enjoyed her breakfast -- I forgot what it was, maybe Eggs Florentine. I was a little disappointed in what I ordered, which was the Egg White Omelette. It was a serious pile of egg whites, with a drizzle of "sauce vert" - a green sauce - on the top. Egg white overload. Really my fault, though, because the dish was as described and I should have made a better choice.\n\nWhen in Vegas, I\'d definitely return to Mon Ami Gabi.'

In [83]:
# Transform the drawn review(s) to vector(s)

In [90]:
query_vectorized = vectorizer.transform([query]).todense()

In [92]:
# Calculate the similarity score(s) between vector(s) and training vectors
cos_similarity = cosine_similarity(query_vectorized, vectors)

In [114]:
cos_similarity.shape

(1, 361026)

In [101]:
documents_train.shape

(361026,)

In [102]:
type(documents_train)

numpy.ndarray

In [103]:
type(cos_similarity)

numpy.ndarray

In [116]:
# Let's find top 5 similar reviews
n = 5
top_5_reviews = get_top_values(cos_similarity, n, documents_train)

In [107]:
print('Our search query:')
print(query) 

Our search query:
A really nice, relaxed setting, especially if you sit outside. We got there too early on a Sunday morning for the Bellagio fountains to be on, which was a shame, but it was still a pleasant place to be. Service was friendly and prompt. The food, overall, was very good. My 5 year old son got a yogurt parfait with cherry compote, which was perhaps too small for adult appetites, but suited him perfectly. My wife enjoyed her breakfast -- I forgot what it was, maybe Eggs Florentine. I was a little disappointed in what I ordered, which was the Egg White Omelette. It was a serious pile of egg whites, with a drizzle of "sauce vert" - a green sauce - on the top. Egg white overload. Really my fault, though, because the dish was as described and I should have made a better choice.

When in Vegas, I'd definitely return to Mon Ami Gabi.


In [117]:
print('Most %s similar reviews:' % n)
print(top_5_reviews)  

Most 5 similar reviews:
[array([ "McFries from McDonald's generally taste great, but this time :  Soggy, Cold, No Flavor, and pod was missing a scoop of fries. What happen to savory and crispy?",
       "Worst experience everrrrrr!!! Waiting about 20 minutes for someone to approach us. Ryan the bartender was very rude. When we asked him about beer selections he was very short and not interested in helping us out. We are in the serving industry so we know how to tip, I understand when it's busy but is attitude was very unprofessional. We will not be back here for sure.",
       'Best spaghetti and meatballs i have ever had out to eat. Will eat here again. Sanwiches are great too. True italian.',
       ...,
       "Breakfast was so delicious!  Best egg white omelette and pancake I've had in a very long time!  Service was friendly and quick too.",
       'Beautiful patio views of the Bellagio fountains..both daytime and nighttime. \nGreat breakfast place! Egg white turkey omelet is a goo

#### Q: Does the result make sense to you?

A: The top two reviews confuses me cuz they are negative comments while our query is quite positive. The other similar reviews mentioned egg white, same as the query. 

## Classifying positive/negative review

In [125]:
X = vectors
y = target_train
y_test = target_test

In [120]:
X.shape

(361026, 2000)

In [121]:
y.shape

(361026,)

#### Naive-Bayes Classifier

In [123]:
# Build a Naive-Bayes Classifier

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X, y) 

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [124]:
# Get score for training set
model.score(X, y)

0.80505005179682354

In [126]:
# Get score for test set
model.score(X_test, y_test)

0.80325220066440028

#### Logistic Regression Classifier

In [149]:
# Build a Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression()

model_lr.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [150]:
# Get score for training set
model_lr.score(X, y)

0.83165478386598191

In [151]:
# Get score for test set
model_lr.score(X_test, y_test)

0.82869718082287402

#### Q: What are the key features(words) that make the positive prediction?

In [153]:
model_lr.coef_.shape

(1, 2000)

In [154]:
type(model_lr.coef_[0])

numpy.ndarray

In [155]:
# Let's find it out by ranking
n = 20
get_top_values(model_lr.coef_[0], n, words)

['amazing',
 'best',
 'awesome',
 'phenomenal',
 'incredible',
 'thank',
 'delicious',
 'fantastic',
 'perfect',
 'heaven',
 'perfection',
 'impeccable',
 'outstanding',
 'excellent',
 'favorite',
 'highly',
 'fabulous',
 'great',
 'beyond',
 'perfectly']

In [142]:
sorted_index = np.argsort(model.coef_)[::-1]

In [148]:
sorted_index.shape

(1, 2000)

In [146]:
sorted_index[:4]

array([[1975,  845, 1466, ...,  124,  179,   68]])

A: It totally makes sense. Positive reviews are 5-star reviews in our case, thus top 20 words are all strongly positive comments, such as 'incredible', 'fantastic', 'phenomenal'. 

#### Q: What are the key features(words) that make the negative prediction?

In [156]:
# Let's find it out by ranking
n = 20
get_bottom_values(model_lr.coef_[0], n, words) 

['worst',
 'horrible',
 'rude',
 'terrible',
 'disappointing',
 'mediocre',
 'bland',
 'ok',
 'lacked',
 'awful',
 'okay',
 'poor',
 'disgusting',
 'worse',
 'slow',
 'lacking',
 'meh',
 'charged',
 'average',
 'overpriced']

A: It also sounds reasonable to see the top rank 20 words for negative reviews (less than 5 stars) are those with negative sentiment such as 'worst', 'horrible', 'disappointing'. 

#### Random Forest Classifier

In [159]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators = 200, max_depth = 30, min_samples_leaf = 3, random_state = 1)

model_rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=3, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [160]:
# Get score for training set
model_rf.score(X, y)

0.84139092475334187

In [161]:
# Get score for test set
model_rf.score(X_test, y_test)

0.79569690937528281

#### Q: What do you see from the training score and the test score?

A: Training score is clearly better than test score, which suggests overfitting. Should tune random forest further to reduce its complexity. 

#### Q: Can you tell what features (words) are important by inspecting the RFC model?

In [163]:
get_top_values(model_rf.feature_importances_, n, words)

['amazing',
 'best',
 'great',
 'delicious',
 'love',
 'ok',
 'awesome',
 'bad',
 'vegas',
 'definitely',
 'worst',
 'good',
 'would',
 'pretty',
 'favorite',
 'horrible',
 'highly',
 'minutes',
 'perfect',
 'terrible']

## Extra Credit #1: Use cross validation to evaluate your classifiers

[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)

In [168]:
# To be implemented
from sklearn.model_selection import cross_val_score
score = cross_val_score(model_lr, X, y, cv=5)

In [169]:
score

array([ 0.82815832,  0.82790665,  0.82689564,  0.82947164,  0.82925005])

In [170]:
score.mean()

0.8283364637952364

## Extra Credit #2: Use grid search to find best predictable classifier


[sklearn grid search tutorial (with cross validation)](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)

[sklearn grid search documentation (with cross validation)](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)

In [171]:
# laptop crushed, had to interrupt kernel...
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score

# clf = RandomForestClassifier()

# param_grid = {'n_estimators': [100, 200],
#             'max_depth': [10, 15, 20],
#              'min_samples_leaf': [5, 10],
#              'n_jobs':[-1]
#              }

clf = LogisticRegression()

param_grid = {'penalty': ['l1', 'l2'],
            'C': [0.1, 1, 10]
             }

acc_scorer = make_scorer(roc_auc_score)

model_gs = GridSearchCV(clf, param_grid, cv=5, scoring=acc_scorer)

model_gs = model_gs.fit(X, y)

In [173]:
# Select best combination of parameters, named it model_best.
model_best = model_gs.best_estimator_


In [174]:
model_best

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [179]:
print('Score for logistic regression on test set: \n')
print('    Previous: %.6f' % (model_lr.score(X_test, y_test)))
print('    Grid Search: %.6f' % (model_best.score(X_test, y_test)))

Score for logistic regression on test set: 

    Previous: 0.828697
    Grid Search: 0.833919
