In [85]:
import pandas as pd
import _pickle
import numpy as np
from os.path import join
from tqdm import tqdm, tqdm_notebook
import re
from sklearn.feature_extraction.text import CountVectorizer
import gc
import logging
import nltk

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

This analysis focuses primarily on predicting restaurant ratings from the text of their reviews.

Note: Inspired by, and using code snippets from, https://www.kaggle.com/c/word2vec-nlp-tutorial/

# 1. Set variables

In [87]:
# 1. Set variables
input_folder = join('pickles','3.agg_dfs')
save_folder = join('pickles','4.restaurant_text_predictions')
stopwords_path = join('pickles','useful_objects','eng_stopwords.pkl')

# Register TQDM (progress bar) with Pandas so that `progress_apply` is available
# and long-running apply calls show their progress
tqdm_notebook().pandas(desc='Progress Bar')

# Load stopwords pickle (originally from NLTK downloads).
# The context manager closes the file handle as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code - only load pickles created by this project.
with open(stopwords_path, 'rb') as stopwords_file:
    stopwords = _pickle.load(stopwords_file)




# 2. Load data and filter for restaurants

In [33]:
# 2. Load data
# Context managers close each file handle as soon as its pickle has been read.
# NOTE: unpickling can execute arbitrary code - only load pickles created by the earlier pipeline steps.
with open(join(input_folder,'business_data.pkl'),'rb') as business_file:
    business_data = _pickle.load(business_file)
with open(join(input_folder,'review_data.pkl'),'rb') as review_file:
    review_data = _pickle.load(review_file)


# 3. Filter restaurants only for business data and reviews

# Keep the businesses whose `categories` value contains 'Restaurants'
restaurant_filter = business_data.categories.apply(lambda x: 'Restaurants' in x)

restaurant_data = business_data[restaurant_filter]
restaurant_ids = set(restaurant_data.business_id)

# Vectorised membership test instead of a Python-level lambda per review row
restaurant_review_filter = review_data['business_id'].isin(restaurant_ids)
restaurant_reviews = review_data[restaurant_review_filter]

# Attach the business attributes to every review; columns present in both frames
# (other than the join key) receive pandas' default _x (review) / _y (business) suffixes
restaurant_reviews = pd.merge(restaurant_reviews,restaurant_data,
                              how='left',
                              left_on='business_id',
                              right_on='business_id')

# Filter for reviews in majority English-speaking states to ensure language processing appropriate
english_speaking_states = {'PA','NC','SC','WI','IL','AZ','CA','NV','FL','NM','ON','TX','EDH','MLN',
                           'HAM','SCB','ELN','FIF','NTH','XGL','KHL','MN','AK'}

eng_restaurant_reviews = restaurant_reviews[restaurant_reviews.state.isin(english_speaking_states)]

# Bag of Words Model

## a. Prepare and save file

In [59]:
# 4. Import stopwords

# Note - Those stopwords were initially downloaded from the NLTK English stopwords corpus
# The file must be opened in binary mode ('rb' belongs to open(), not to load());
# the context manager also closes the handle once the pickle has been read.
with open(stopwords_path, 'rb') as stopwords_file:
    stopwords = _pickle.load(stopwords_file)


# 5. Process text
# Placeholder column holding the raw text; it is replaced by the cleaned text further down.
# `assign` returns a new frame, so nothing is written into the filtered selection that
# `eng_restaurant_reviews` was created from (avoids pandas' SettingWithCopyWarning).
eng_restaurant_reviews = eng_restaurant_reviews.assign(processed_text_BoW=eng_restaurant_reviews['text'])

def clean_reviews_BoW(review_text, stops=None):
    """Converts review text for bag of words analysis.

    Parameters
    ----------
    review_text : str
        Raw text of one review.
    stops : collection of str, optional
        Words to drop from the review. When omitted, the notebook-level
        `stopwords` collection is used.

    Returns
    -------
    str
        The lower-cased words of the review joined by single spaces, with
        every non-letter character and every stopword removed.
    """
    if stops is None:
        # Reuse the notebook-level collection directly when it is already a set;
        # otherwise convert it so the membership tests below are hash lookups
        stops = stopwords if isinstance(stopwords, (set, frozenset)) else set(stopwords)

    # 1. Replace every character that is not an ASCII letter with a space
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 2. Convert to lower case, split into individual words
    words = letters_only.lower().split()

    # 3. Remove stopwords
    relevant_words = [word for word in words if word not in stops]

    # 4. Join words back
    return ' '.join(relevant_words)

# Split into training and test data

perc_training_data = 0.7
num_reviews = len(eng_restaurant_reviews)

num_training_obs = int(round(perc_training_data * num_reviews, 0))

# Renumber the rows 0..num_reviews-1 so the positions sampled below line up with the rows
eng_restaurant_reviews = eng_restaurant_reviews.reset_index(drop=True)

# Fixed seed keeps the train/test split reproducible between runs
np.random.seed(500)
training_indices = np.random.choice(num_reviews, num_training_obs, replace=False)

# Boolean flag per row: True for the sampled training rows, False for the test rows
is_training_row = np.zeros(num_reviews, dtype=bool)
is_training_row[training_indices] = True
eng_restaurant_reviews['training_indices'] = is_training_row

eng_restaurant_reviews['processed_text_BoW'] = eng_restaurant_reviews['text'].progress_apply(clean_reviews_BoW)

# Reduce features to reduce memory use
eng_restaurant_reviews = eng_restaurant_reviews[['business_id','stars_x','stars_y','date','text','city','open','state',
                                                'review_count','processed_text_BoW','training_indices']]

# Persist the prepared frame; the context manager flushes and closes the file
with open(join(save_folder,'eng_restaurant_reviews.pkl'),'wb') as output_file:
    _pickle.dump(eng_restaurant_reviews, output_file)

## b. Prepare Features to Run Model

In [None]:
# Note - You can start from here if needed, as this loads the file saved by the previous step
# The context manager closes the file handle once the pickle has been read.
with open(join(save_folder,'eng_restaurant_reviews.pkl'),'rb') as reviews_file:
    eng_restaurant_reviews = _pickle.load(reviews_file)

In [None]:
# Functions
def reduce_ratings(series):
    """
    Collapses one star rating (1-5) into one of three classes.

    0: below average (1 or 2 stars), 1: average (3 stars), 2: above average (4 or 5 stars).
    Three classes are treated as sufficient accuracy to gauge business quality.

    Despite the parameter name, `series` is a single rating value used as a
    lookup key; a value outside 1-5 raises KeyError.
    """
    star_to_class = {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}
    return star_to_class[series]

def run_model(name, model, X_train=None, Y_train=None, X_test=None, Y_test=None):
    """
    Fits a scikit-learn style classifier and prints its score on the test data.

    Parameters
    ----------
    name : str
        Label used in the progress and result messages.
    model : object
        Estimator exposing `fit(X, y)` and `score(X, y)`.
    X_train, Y_train, X_test, Y_test : optional
        Training / test features and labels. Any argument left as None falls
        back to the notebook-level BoW_X_train, BoW_Y_train, BoW_X_test and
        BoW_Y_test respectively, so existing `run_model(name, model)` calls
        behave as before.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    # Resolve each split explicitly so the function no longer depends only on hidden globals
    X_train = BoW_X_train if X_train is None else X_train
    Y_train = BoW_Y_train if Y_train is None else Y_train
    X_test = BoW_X_test if X_test is None else X_test
    Y_test = BoW_Y_test if Y_test is None else Y_test

    print('Fitting {} model'.format(name))
    model.fit(X_train, Y_train)
    print('Model {} has finished training'.format(name))
    print('The {} model has an accuracy of {} \n'.format(name, model.score(X_test, Y_test)))

In [38]:
# The bag-of-words vocabulary is capped at the 5,000 most common words: this keeps memory
# use manageable, and additional features do not increase performance significantly
vectorizer = CountVectorizer(max_features = 5000,
                             analyzer = 'word',
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None)


# Collapse the star rating in `stars_x` into the three target classes
eng_restaurant_reviews['reduced_rating'] = eng_restaurant_reviews.stars_x.progress_apply(reduce_ratings)

# Split the rows using the flag stored in the `training_indices` column
is_training_row = eng_restaurant_reviews['training_indices'].eq(True)
is_test_row = eng_restaurant_reviews['training_indices'].eq(False)
eng_restaurant_reviews_training = eng_restaurant_reviews[is_training_row]
eng_restaurant_reviews_test = eng_restaurant_reviews[is_test_row]

# Training features: the vocabulary is fitted on the training text only
BoW_training_data_features = vectorizer.fit_transform(eng_restaurant_reviews_training.processed_text_BoW)
BoW_X_train = BoW_training_data_features
BoW_Y_train = np.array(eng_restaurant_reviews_training.reduced_rating)

# Test features: encoded with the vocabulary fitted above
BoW_X_test = vectorizer.transform(eng_restaurant_reviews_test.processed_text_BoW)
BoW_Y_test = np.array(eng_restaurant_reviews_test.reduced_rating)

## c. Run and Compare Models

In [47]:
# Candidate classifiers to compare on the bag-of-words features

# Logistic regression with the library's default settings
LR = LogisticRegression(n_jobs=-1, verbose=10)

# Multinomial logistic regression fitted with the 'sag' solver, capped at 600 iterations
LR2 = LogisticRegression(
    solver='sag',
    multi_class='multinomial',
    max_iter = 600,
    random_state=555,
    n_jobs=-1,
    verbose=10,
)

# Linear support vector classifier
SVM = LinearSVC(verbose=10)

# Extremely randomized trees, seeded for reproducibility
ExtraTrees = ExtraTreesClassifier(
    random_state=500,
    n_jobs=-1,
    verbose=10,
)

In [48]:
# Fit and score every candidate in turn; each entry pairs a display name with its estimator
models_to_compare = [
    ('Logistic Regression simple', LR),
    ('Logistic Regression multinomial', LR2),
    ('Support Vector Machines', SVM),
    ('Extremely Randomized Trees', ExtraTrees),
]

for name, model in models_to_compare:
    run_model(name, model)

Fitting Logistic Regression simple model
[LibLinear]Model Logistic Regression simple has finished training
The Logistic Regression simple model has an accuracy of 0.8370720063086776 

Fitting Logistic Regression multinomial model
max_iter reached after 1089 seconds
Model Logistic Regression multinomial has finished training


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 18.1min
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 18.1min finished


The Logistic Regression multinomial model has an accuracy of 0.8395102249597716 

Fitting Support Vector Machines model
[LibLinear]



Model Support Vector Machines has finished training
The Support Vector Machines model has an accuracy of 0.8335660013427253 

Fitting Extremely Randomized Trees model
building tree 1 of 10building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10



[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed: 18.5min remaining: 43.1min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 18.6min remaining: 18.6min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 18.7min remaining:  8.0min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 18.9min finished


Model Extremely Randomized Trees has finished training


[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.7s remaining:    1.6s
[Parallel(n_jobs=10)]: Done   5 out of  10 | elapsed:    0.7s remaining:    0.7s
[Parallel(n_jobs=10)]: Done   7 out of  10 | elapsed:    0.7s remaining:    0.3s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.7s finished


The Extremely Randomized Trees model has an accuracy of 0.7853600315433882 



In [51]:
# Same multinomial 'sag' configuration as LR2, but with the iteration cap raised to 2000
LR3 = LogisticRegression(
    solver='sag',
    multi_class='multinomial',
    max_iter = 2000,
    random_state=555,
    n_jobs=-1,
    verbose=10,
)
run_model('Logistic Regression 3', LR3)

Fitting Logistic Regression 3 model
convergence after 522 epochs took 1218 seconds
Model Logistic Regression 3 has finished training


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 20.3min
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 20.3min finished


The Logistic Regression 3 model has an accuracy of 0.8395102249597716 



## d. Evaluate Performance 

In [71]:
# Confusion matrix of LR3 on the test set, shown as a DataFrame
# (class codes: 0 = below average, 1 = average, 2 = above average)
LR3_test_predictions = LR3.predict(BoW_X_test)
pd.DataFrame(confusion_matrix(BoW_Y_test, LR3_test_predictions))

Unnamed: 0,0,1,2
0,74896,7232,11039
1,13925,19949,30439
2,4317,8349,299049


In [78]:
# Human-readable labels, listed in the order of the encoded classes 0, 1, 2
target_names = [
    'Below average (1 or 2 stars)',
    'Average (3 stars)',
    'Above average (4 or 5 stars)',
]

# Per-class precision / recall / F1 for LR3 on the test set
LR3_report = classification_report(BoW_Y_test, LR3.predict(BoW_X_test), target_names=target_names)
print(LR3_report)

                              precision    recall  f1-score   support

Below average (1 or 2 stars)       0.80      0.80      0.80     93167
           Average (3 stars)       0.56      0.31      0.40     64313
Above average (4 or 5 stars)       0.88      0.96      0.92    311715

                 avg / total       0.82      0.84      0.82    469195



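As may be expected, the text of average (3-star) reviews is not discriminative enough to separate them reliably from reviews of truly mediocre and above-average restaurants: recall for the average class is only 0.31, and most of its misclassified reviews are predicted as above average.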

However, from a product perspective it may still make sense to keep three classes in order to differentiate between restaurants.

## e. Potential Refinements with more time and resources

* Increase number of samples
* Perform grid search on SVM and LR models
* Look at other models
* Rebalance examples, especially average class
* See what performance is with just two classes (suspect accuracy would increase further, but less differentiation)