<a href="https://colab.research.google.com/github/yongjinjiang/Yelp-PredictRating/blob/master/Yelp_Dataset_Part_II_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## In this notebook, we build three models:
- Naive Bayes
- Logistic regression
- Random forest

We then compare their performance.

# Yelp Data Challenge 
## Part II - NLP

In [0]:
from google.colab import drive
drive.mount('/content/drive/')

In [0]:
import os
os.chdir("/content/drive/My Drive/Colab Notebooks/Yelp-NLP")

In [0]:
import pandas as pd
import numpy as np

In [0]:
df = pd.read_csv('./last_3_years_restaurant_reviews.csv')

In [0]:
df.head()

In [0]:
df.shape

In [0]:
df['review_cnt'] = df.groupby(['business_id'])['review_id'].transform('count')

In [0]:
df['review_cnt'].quantile(q=[0.5,0.75,0.90])

In [0]:
import matplotlib.pyplot as plt

% matplotlib inline

In [0]:
df['review_cnt'].value_counts().plot.hist()

### Define feature variables, here is the text of the review

In [0]:
documents = df['text']

In [0]:
# get non na indx
indx = pd.notnull(documents)

In [0]:
documents = documents[indx]

In [0]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents = documents.values

In [0]:
# inspect your documents, e.g. check the size, take a peek at elements of the numpy array
documents[:5]

### Define target variable (any categorical variable that may be meaningful)

#### For example, I am interested in perfect (5 stars) versus imperfect (1-4 stars) ratings

In [0]:
df.info()

In [0]:
#checking summary statisics for avg_star and stars
var_list = [u'avg_stars',u'stars',u'cool',u'funny',u'useful']
df[var_list].describe()

In [0]:
# Make a column and take the values, save to a variable named "target"
# tried cutoff >3, pos/neg around 7:3; cutoff > 4, pos/neg around 1:1
#condition1 = (df['stars']>3)
#df_reduced.drop(['is_pos_review'],axis=1,inplace=True)

In [0]:
df['is_pos_review'] = (df['stars']>4)

In [0]:
df['is_pos_review'].value_counts(normalize=True)

In [0]:
target = df['is_pos_review'].values

In [0]:
target = target[indx]

In [0]:
target[:10]

#### You may want to look at the statistics of the target variable

In [0]:
# Mean of avg_stars over the rows flagged as positive reviews.
# is_pos_review is already a boolean column, so it can be used directly as the row mask.
df.loc[df['is_pos_review'], 'avg_stars'].mean()

In [0]:
documents.size,target.size

## Let's create training dataset and test dataset

In [0]:
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection (already used elsewhere in this notebook).
from sklearn.model_selection import train_test_split

In [0]:
# Documents is your X, target is your y
# Now split the data to training set and test set

In [0]:
# Split to documents_train, documents_test, target_train, target_test
# test_size=0.2 holds out 20% of the documents for testing (80% are used for training);
# random_state=0 makes the shuffle reproducible.
documents_train,documents_test,target_train,target_test = train_test_split(documents,target,test_size=0.2, random_state=0)

## Let's get NLP representation of the documents

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
# Create TfidfVectorizer, and name it vectorizer
vectorizer = TfidfVectorizer(stop_words='english',max_features=5000,min_df=1)

In [0]:
# Train the model with your training data
x_train = vectorizer.fit_transform(documents_train).toarray()

In [0]:
# Get the vocab of your tfidf
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it and fall back to the old name otherwise.
# list(...) keeps features_name a plain list in both cases.
features_name = list(vectorizer.get_feature_names_out()
                     if hasattr(vectorizer, 'get_feature_names_out')
                     else vectorizer.get_feature_names())

In [0]:
# Use the trained model to transform your test data
x_test = vectorizer.transform(documents_test).toarray()

##### using 2-gram of words

In [0]:
# Create a second TfidfVectorizer (vectorizer2) that uses unigrams and bigrams
vectorizer2 = TfidfVectorizer(stop_words='english',max_features=5000,min_df=1,ngram_range=(1, 2))
# Learn the vocabulary/IDF weights on the training documents only
x_train2 = vectorizer2.fit_transform(documents_train).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back to the old name otherwise. list(...) keeps a plain list.
features_name2 = list(vectorizer2.get_feature_names_out()
                      if hasattr(vectorizer2, 'get_feature_names_out')
                      else vectorizer2.get_feature_names())
# Re-use the fitted vocabulary to transform the test documents
x_test2 = vectorizer2.transform(documents_test).toarray()

## Similar review search engine

In [0]:
import numpy as np

# We will need these helper methods pretty soon

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Return the labels that belong to the n largest entries of lst,
    ordered from the largest value downwards.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    # np.argsort yields indices in ascending order of value; reverse to get descending
    ascending_idx = np.argsort(lst)
    descending_idx = ascending_idx[::-1]
    picked = []
    for idx in descending_idx[:n]:
        picked.append(labels[idx])
    return picked

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Return the labels that belong to the n smallest entries of lst,
    ordered from the smallest value upwards.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["rabbit", "mouse"]
    '''
    # np.argsort yields indices in ascending order of value, so the first n are the smallest
    ascending_idx = np.argsort(lst)
    picked = []
    for idx in ascending_idx[:n]:
        picked.append(labels[idx])
    return picked


In [0]:
# Let's use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
# Draw an arbitrary review from test (unseen in training) documents
sample_review = [documents_test[10]]
sample_review

In [0]:
# Transform the drawn review(s) to vector(s)
vector_review = vectorizer.transform(sample_review).toarray()

In [0]:
vector_review2 = vectorizer2.transform(sample_review).toarray()

In [0]:
# Calculate the similarity score(s) between the query vector(s) and the first 100 TEST vectors
# (x_test[:100], not the training vectors).
# NOTE: the query is documents_test[10], so the query itself is among the 100 candidates.
similarity_scores = cosine_similarity(vector_review, x_test[:100])

In [0]:
similarity_scores2 = cosine_similarity(vector_review2,x_test2[:100])

In [0]:
similarity_scores[0]

In [0]:
# Let's find top 5 similar reviews
n = 5
top_similar_review = get_top_values(similarity_scores[0], n, documents_test[:100])

In [0]:
n = 5
top_similar_review2 = get_top_values(similarity_scores2[0],n,documents_test[:100])

In [0]:
print('Our search query:')
print(sample_review) # To be added

In [0]:
# Print the retrieved reviews, most similar first, with their zero-based rank
print('Most %s similar reviews:' % n)
for rank, review_text in enumerate(top_similar_review):
    print('top %s review:' % rank)
    print(review_text)

In [0]:
# Same listing for the unigram+bigram vectorizer results
print('Most %s similar reviews:' % n)
for rank, review_text in enumerate(top_similar_review2):
    print('top %s review:' % rank)
    print(review_text)

#### Q: Does the result make sense to you?

A: The top 5 reviews capture some key elements of the sample review, such as casino, service, decor, and cleaning, but cosine similarity doesn't capture the meaning of a sentence very well. The sample review is a strongly positive review phrased with double negatives, whereas the top 5 reviews lean toward the negative side about hotels/casinos.

After fitting two models with different n-gram ranges, the top 5 reviews seem to be unchanged.

## Classifying positive/negative review

### Helper Function

In [0]:
from sklearn.metrics import precision_score,accuracy_score,recall_score,f1_score,roc_auc_score

In [0]:
# Helper method to print metric scores    
def get_performance_metrics(y_train, y_train_pred, y_test, y_test_pred, threshold=0.5):
    """Print a table of AUC, accuracy, precision, recall and F1 for train and test.

    y_train / y_test are the true labels. y_train_pred / y_test_pred are
    continuous scores: AUC is computed on the raw scores, while the other
    metrics use the hard labels obtained with ``score > threshold``.
    Nothing is returned; the table is printed.
    """
    def _metrics_for(y_true, y_score):
        # Binarise once and reuse the labels for every threshold-based metric
        y_label = y_score > threshold
        return [roc_auc_score(y_true, y_score),
                accuracy_score(y_true, y_label),
                precision_score(y_true, y_label),
                recall_score(y_true, y_label),
                f1_score(y_true, y_label)]

    train_column = _metrics_for(y_train, y_train_pred)
    test_column = _metrics_for(y_test, y_test_pred)

    table = pd.DataFrame(
        {'metrics': ['AUC', 'Accuracy', 'Precision', 'Recall', 'f1-score'],
         'train': train_column,
         'test': test_column},
        columns=['metrics', 'train', 'test'])
    all_metrics = table.set_index('metrics')
    print(all_metrics)

In [0]:
#helper function to plot roc curve
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_train, y_train_pred, y_test, y_test_pred):
    """Draw the train and test ROC curves on one figure, with AUC in the legend.

    y_train / y_test are the true labels; y_train_pred / y_test_pred are the
    continuous scores passed to roc_auc_score and roc_curve.
    """
    line_width = 2

    # Compute both curves first, in the order train then test
    curves = []
    for y_true, y_score, colour, legend_fmt in (
            (y_train, y_train_pred, 'green', 'ROC Train (AUC = %0.4f)'),
            (y_test, y_test_pred, 'darkorange', 'ROC Test (AUC = %0.4f)')):
        area = roc_auc_score(y_true, y_score)
        fpr, tpr, _ = roc_curve(y_true, y_score)
        curves.append((fpr, tpr, colour, legend_fmt % area))

    plt.figure()
    for fpr, tpr, colour, legend_text in curves:
        plt.plot(fpr, tpr, color=colour,
                 lw=line_width, label=legend_text)
    # Diagonal reference line from (0, 0) to (1, 1)
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [0]:
# helper function to train models
# define function to perform train, test, and get model performance
def train_test_model(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training data, then report train/test performance.

    The classifier is fitted in place on (X_train, y_train). Scores for both
    splits are taken from column 1 of ``clf.predict_proba`` and passed to
    get_performance_metrics (printed table) and plot_roc_curve (ROC figure).
    Nothing is returned.

    clf must implement ``fit`` and ``predict_proba``.
    """
    # Fit a model by providing X and y from training set
    clf.fit(X_train, y_train)

    # Only the probabilities are needed downstream: the metrics helper does its
    # own thresholding, so the hard-label clf.predict() calls were unused work.
    # Column 1 corresponds to clf.classes_[1] (presumably the positive class - verify for non-boolean targets).
    p_train_pred = clf.predict_proba(X_train)[:,1]
    p_test_pred = clf.predict_proba(X_test)[:,1]

    # print model results
    get_performance_metrics(y_train, p_train_pred, y_test, p_test_pred)
    plot_roc_curve(y_train, p_train_pred, y_test, p_test_pred)

### Naive-Bayes Classifier

In [0]:
# Build a Naive-Bayes Classifier

from sklearn.naive_bayes import MultinomialNB

# Build a Naive-Bayes Classifier
clf_nb = MultinomialNB()

clf_nb.fit(x_train, target_train)

In [0]:
# Get score for training set & test set
train_test_model(clf_nb, x_train, target_train, x_test, target_test)

### Logistic Regression Classifier

In [0]:
# Build a Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression

clf_lrc = LogisticRegression()

clf_lrc.fit(x_train, target_train)

In [0]:
# Get score for training set & test set
train_test_model(clf_lrc, x_train, target_train, x_test, target_test)

#### Q: What are the key features(words) that make the positive prediction?

In [0]:
# Let's find it out by ranking
n = 20
get_top_values(clf_lrc.coef_[0], n, features_name)

A: The top 20 words are all positive adjectives.

#### Q: What are the key features(words) that make the negative prediction?

In [0]:
# Let's find it out by ranking
n = 20
get_bottom_values(clf_lrc.coef_[0], n, features_name)

A: All the words are negative adjectives.

### Random Forest Classifier

In [0]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# random_state is fixed so the forest (and every number derived from it below)
# is reproducible across kernel restarts, matching the seeded train/test split.
clf_rfc = RandomForestClassifier(max_depth=20, n_estimators=10, min_samples_leaf=100, random_state=0)

clf_rfc.fit(x_train, target_train)

In [0]:
# Get score for training set & test set
train_test_model(clf_rfc, x_train, target_train, x_test, target_test)

#### Q: What do you see from the training score and the test score?

A: Comparing the three classifiers, random forest has the lowest accuracy and AUC on both the train and test data. For each metric, the train and test values are quite close.

#### Q: Can you tell what features (words) are important by inspecting the RFC model?

In [0]:
n = 20
get_top_values(clf_rfc.feature_importances_, n, features_name)

Compared to the logistic regression results, the top 20 features in RF are a mix of positive and negative words. They also include some neutral words (hotel, money, ...).

## Additional Approach #1: Use cross validation to evaluate classifiers

[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)

In [0]:
# too slow, not used
from sklearn.metrics import precision_score,accuracy_score,recall_score,f1_score,roc_auc_score
from sklearn.model_selection import cross_val_score,KFold
def get_scores(clf,X,y,num_folds=5):
    """Print the cross-validated mean and std of five classification metrics.

    Runs one ``num_folds``-fold cross-validation of ``clf`` on (X, y) and
    prints a table indexed by metric name (AUC, Accuracy, Precision, Recall,
    f1-score) with columns ``cv_mean`` and ``cv_std``. Nothing is returned.

    All five metrics are collected in a single cross_validate pass, so each
    fold is fitted once instead of once per metric.

    NOTE: a later cell in this notebook defines another ``get_scores`` that
    shadows this one once it has been executed.
    """
    # Local import: this cell only imports cross_val_score / KFold itself.
    from sklearn.model_selection import cross_validate

    # (row label, scikit-learn scorer name), in display order
    metrics = [('AUC', 'roc_auc'),
               ('Accuracy', 'accuracy'),
               ('Precision', 'precision'),
               ('Recall', 'recall'),
               ('f1-score', 'f1')]
    cv_results = cross_validate(clf, X, y, cv=num_folds,
                                scoring=dict(metrics),
                                return_train_score=False)

    # cross_validate stores each metric's per-fold scores under 'test_<key>'
    fold_scores = [cv_results['test_' + label] for label, _ in metrics]
    all_metrics = pd.DataFrame({'metrics': [label for label, _ in metrics],
                                'cv_mean': [scores.mean() for scores in fold_scores],
                                'cv_std': [scores.std() for scores in fold_scores]},
                               columns=['metrics','cv_mean','cv_std']).set_index('metrics')
    print(all_metrics)

In [0]:
from sklearn.metrics import precision_score,accuracy_score,recall_score,f1_score,roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
def get_scores(clf,X,y,num_folds=5):
    """Cross-validate ``clf`` on (X, y) and return the per-fold results.

    Returns the dict produced by ``cross_validate``; per-fold test scores are
    stored under 'test_accuracy', 'test_prec', 'test_roc_auc', 'test_recall'
    and 'test_f1'. Train scores are not computed.

    The built-in scorer names are used for every metric. In particular
    'roc_auc' ranks on the estimator's continuous output (decision_function /
    predict_proba); make_scorer(roc_auc_score) would instead score the hard
    labels returned by predict(), which is not a real AUC.
    """
    scoring = {'accuracy': 'accuracy',
               'prec': 'precision',
               'roc_auc': 'roc_auc',
               'recall': 'recall',
               'f1': 'f1'}
    cv_results = cross_validate(clf, X, y, scoring=scoring,cv=num_folds,return_train_score=False)
    return cv_results

In [0]:
cv_metrics_nb = get_scores(clf_nb,x_train,target_train,num_folds=5)

In [0]:
cv_metrics_lrc = get_scores(clf_lrc,x_train,target_train,num_folds=5)

In [0]:
cv_metrics_rfc = get_scores(clf_rfc,x_train,target_train,num_folds=5)

In [0]:
cv_metrics_nb

In [0]:
cv_metrics_lrc

In [0]:
cv_metrics_rfc

CV scores are lower than single train/test split, but the model performance ranking is consistent with previous results. Logistic regression classifier > naive bayes > random forest

## Additional Approach #2: Use grid search to find the best-performing classifier


[sklearn grid search tutorial (with cross validation)](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)

[sklearn grid search documentation (with cross validation)](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)

### Logistic Regression Classifier

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier.
clf = LogisticRegression()

# Choose some parameter combinations to try
param_grid = {'penalty':['l1','l2'],
               'C':[0.5,1,5,10],
               'solver':['liblinear']
             }

# Metric used to compare parameter combinations.
# The built-in 'roc_auc' scorer ranks on the model's continuous output;
# make_scorer(roc_auc_score) would compute AUC on the hard labels from predict().
acc_scorer = 'roc_auc'

# Run the grid search (5-fold cross-validation for every parameter combination)
grid_obj = GridSearchCV(clf, param_grid, cv=5, scoring=acc_scorer)
grid_obj = grid_obj.fit(x_train, target_train)

# Set the clf to the best combination of parameters.
# With the default refit=True, best_estimator_ has already been refit on the
# whole training set, so no additional clf.fit() call is needed here.
clf = grid_obj.best_estimator_
clf

In [0]:
# Train test model
train_test_model(clf, x_train, target_train, x_test, target_test)

### Naive Bayes Classifier

In [0]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier.
clf = MultinomialNB()

# Choose some parameter combinations to try
param_grid = {'alpha':[0.5,1,2]
             }

# Metric used to compare parameter combinations.
# The built-in 'roc_auc' scorer ranks on the model's continuous output;
# make_scorer(roc_auc_score) would compute AUC on the hard labels from predict().
acc_scorer = 'roc_auc'

# Run the grid search (5-fold cross-validation for every parameter combination)
grid_obj = GridSearchCV(clf, param_grid, cv=5, scoring=acc_scorer)
grid_obj = grid_obj.fit(x_train, target_train)

# Set the clf to the best combination of parameters.
# With the default refit=True, best_estimator_ has already been refit on the
# whole training set, so no additional clf.fit() call is needed here.
clf = grid_obj.best_estimator_
clf

In [0]:
train_test_model(clf, x_train, target_train, x_test, target_test)

### Random Forest Classifier

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier.
# n_jobs is an execution setting rather than a hyper-parameter, so it is set on
# the estimator instead of being listed in the grid; random_state makes the
# search reproducible.
clf = RandomForestClassifier(n_jobs=-1, random_state=0)

# Choose some parameter combinations to try
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was removed
# in scikit-learn 1.3.
param_grid = {'n_estimators': [100,200],
              'max_features': ['sqrt'],
              'criterion': ['gini'],
              'max_depth': [8,16,32],
              'min_samples_split': [5,10,20,60],
              'min_samples_leaf': [2,5,10,20]
             }

# Metric used to compare parameter combinations.
# The built-in 'roc_auc' scorer ranks on the model's continuous output;
# make_scorer(roc_auc_score) would compute AUC on the hard labels from predict().
acc_scorer = 'roc_auc'

# Run the grid search (5-fold cross-validation for every parameter combination)
grid_obj = GridSearchCV(clf, param_grid, cv=5, scoring=acc_scorer)
grid_obj = grid_obj.fit(x_train, target_train)

# Set the clf to the best combination of parameters.
# With the default refit=True, best_estimator_ has already been refit on the
# whole training set, so no additional clf.fit() call is needed here.
clf = grid_obj.best_estimator_
clf

In [0]:
train_test_model(clf, x_train, target_train, x_test, target_test)

In [0]:
print('hello world')