In [2]:
import psycopg2 as pg
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import Data_Clean_Process as dc
import tn_helper as tn
# from yellowbrick.classifier import ConfusionMatrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.kernel_approximation import Nystroem, RBFSampler
# from sklearn import svm
from sklearn.svm import LinearSVC
%matplotlib inline


# Classifying Yelp Restaurants as Successful #

Yelp holds contests for students and the public where they release a large amount of data and then have the participants compete to reach one of a few specialized goals, ranging from building social network graphs to image recognition. We are using this data to tackle our own question: can we predict whether a restaurant has a good rating (four or five stars) or a bad rating (one, two, or three stars)? Because of the size of the dataset, we are storing it on a Google Cloud SQL server, so our first step is to fetch only the data that we want.

## Data Processing ##
Once the data was on our SQL server, we selected a subset of restaurants that we wanted to focus on and then looked at all of the reviews for each restaurant. We then computed some descriptive statistics about the review set for each restaurant to use as features in our classification model. The final list of features we used is listed at the end of this section, but some of the things we looked at were the review word counts and the engagement the reviews received from other users. After creating those features on the SQL server, we downloaded the data to Python to set up the model.

In [9]:
# One-off refresh path, kept commented out: presumably re-downloads the data
# through Data_Clean_Process and caches it as a local pickle so the notebook
# can be re-run without server access. "serverpassword" is a placeholder;
# never commit a real credential here.
#business_data = dc.download_data("serverpassword")
#business_data.to_pickle("business_data.zip")
# Load the cached copy. Unpickling can execute arbitrary code, so only read
# pickle files that you produced yourself.
business_data = pd.read_pickle("business_data.zip")

Since our data cleaning was just formatting strings and filling in missing values without imputation, we clean before doing our train test split.

In [None]:
# Clean the raw frame. dc.clean_data returns the cleaned data together with
# the numerical and categorical feature selectors used by later cells.
cleaned = dc.clean_data(business_data)
clean_data, numerical_features, categorical_features = cleaned

# Train/test split; the split logic itself lives in dc.tt_split.
X_train, X_test, y_train, y_test = dc.tt_split(
    clean_data,
    numerical_features,
    categorical_features,
)


## Pipelines ##
We set up pipelines to use for hyperparameter tuning for all of the models that we used to classify our Yelp restaurants. The first step is to make a column transformer that applies the correct preprocessing steps to each feature column in our data.

In [None]:
# Column-wise preprocessing shared by every model pipeline:
# standardize the numerical columns and one-hot encode the categorical ones,
# dropping the first level of each category.
numeric_step = (StandardScaler(), numerical_features)
categorical_step = (OneHotEncoder(drop='first'), categorical_features)

preprocess = make_column_transformer(numeric_step, categorical_step)


Below we have the pipelines for the four models we tested, Logistic Regression, Random Forest, and two SVM methods.

### Logistic Regression ###

In [None]:
# Logistic Regression pipeline: shared preprocessing followed by the classifier.
# The grid below searches both L1 and L2 penalties, so the solver is pinned to
# 'liblinear', which supports both. The library default solver in recent
# scikit-learn releases ('lbfgs') rejects penalty='l1', which would make every
# L1 candidate in the grid search fail.
pipe_lr = Pipeline([('ct', preprocess),
                    ('clf', LogisticRegression(solver='liblinear'))])

# Candidate hyperparameters; the "clf__" prefix routes each one to the
# classifier step of the pipeline.
grid_params_lr = [{'clf__penalty': ['l1', 'l2'],
                   'clf__C': [.001, .01, .1, 1, 10, 100, 1000],
                   }]

# Load the saved cross-validation results rather than re-running the search.
log_results = pd.read_pickle('./CV_Results/Logistic_CV')


### Random Forest ###

In [None]:
# Random Forest pipeline: shared preprocessing followed by the classifier.
rf_steps = [
    ('scl', preprocess),
    ('clf', RandomForestClassifier(oob_score=True, n_jobs=-1, verbose=1)),
]
pipe_rf = Pipeline(steps=rf_steps)

# Candidate hyperparameters; the "clf__" prefix routes each one to the
# classifier step of the pipeline.
grid_params_rf = [dict(
    clf__criterion=['gini'],
    clf__n_estimators=[300, 500],
    clf__min_samples_leaf=[10, 13, 15, 17],
    clf__max_depth=[20, 25, 30],
    clf__max_features=['log2', 'sqrt'],
)]

# Load the saved cross-validation results rather than re-running the search.
rf_results = pd.read_pickle('./CV_Results/Random_Forest_final_CV')


### SVM ###
Our two SVM models below use two different methods to approximate the radial kernel SVM using a linear SVC. One uses the Nystroem method and the other uses a Monte Carlo method to approximate the radial kernel.

In [None]:
# Approximate RBF-kernel SVM: preprocessing, a Nystroem feature map, then a
# linear SVC trained on the mapped features.
pipe_svm_ny = Pipeline(steps=[
    ('ct', preprocess),
    ('feature_map', Nystroem(gamma=.2)),
    ('clf', LinearSVC(verbose=1, max_iter=15000, dual=False)),
])

# Candidate hyperparameters; the gamma values here replace the gamma=.2 set
# above whenever the pipeline is tuned through a grid search.
grid_params_svm_ny = [dict(
    feature_map__gamma=[.005, .01, .1],
    clf__C=[3000, 5000, 7000],
    clf__penalty=['l1'],
)]

# Load the saved cross-validation results rather than re-running the search.
svm_ny_results = pd.read_pickle('./CV_Results/SVM_Ny_RBF_CV')


In [None]:
# Approximate RBF-kernel SVM: preprocessing, an RBFSampler feature map, then a
# linear SVC trained on the mapped features.
svm_mc_steps = [
    ('ct', preprocess),
    ('feature_map', RBFSampler()),
    ('clf', LinearSVC(verbose=1, max_iter=15000, dual=False)),
]
pipe_svm_mc = Pipeline(steps=svm_mc_steps)

# Candidate hyperparameters for the feature map ("feature_map__") and the
# classifier ("clf__").
grid_params_svm_mc = [dict(
    feature_map__gamma=[.005, .01, .02],
    clf__C=[1000, 2000, 3000],
    clf__penalty=['l1'],
)]

# Load the saved cross-validation results rather than re-running the search.
svm_mc_results = pd.read_pickle('./CV_Results/SVM_MC_RBF_3_CV')


## Hyperparameter Tuning ##

This executes the grid search over all the choices of parameters that have been specified in the pipeline. To run the search, set estimator to the pipeline you want, set param_grid to the corresponding param grid dictionary, set cv to the number of cross-validation folds, and set scoring equal to the scoring metric you would like to use in cross-validation. Depending on the choice of model and hyperparameters this grid search may take a very long time to run, so we have left it commented out. Below we have summaries of the grid search results for each model. The full results of the grid searches are saved as pickled dataframes in the CV_Results folder.

In [None]:
# Grid search template, left commented out because a full search can take a
# very long time. To tune a model, pass a matching pipeline / parameter-grid
# pair (shown here for the Random Forest: pipe_rf with grid_params_rf).
# gridsearch = GridSearchCV(estimator=pipe_rf,
#                           param_grid=grid_params_rf,
#                           scoring='accuracy',
#                           return_train_score=True,
#                           cv=5)

# gridsearch.fit(X_train, y_train)


Below we have a list of how the tuned versions of our models performed, with the list of hyper parameters, the mean cv test score and the mean fit time for the model.

In [None]:
# Cross-validation result frames for every tuned model, in display order.
results = [
    rf_results,
    svm_mc_results,
    svm_ny_results,
    log_results,
]

In [None]:
# Widen text columns so the hyperparameter dictionaries are not truncated
# when the summary table is rendered.
pd.options.display.max_colwidth = 150

# Summary table produced by the tn_helper module from the CV result frames.
tn.top_models(results)

As you can see, the Random Forest model performed the best, but after tuning all of the models performed similarly in terms of accuracy. We decided to use Random forest as our model not only because it had the best performance, but because it had the second fastest mean fit time.

## Looking at the Model ##

In [None]:
# Final model. Each hyperparameter value appears in the Random Forest grid
# defined earlier; presumably this is the best-scoring combination.
random_forest = RandomForestClassifier(
    max_depth=30,
    max_features='sqrt',
    min_samples_leaf=10,
    n_estimators=300,
    n_jobs=-1,
    oob_score=True)

# Fit the scaler and one-hot encoder on the training data ONLY, then reuse the
# fitted transformer on the test data. Calling fit_transform on X_test would
# re-estimate the scaling statistics and category levels from the test set,
# which leaks test information and can produce columns that no longer line up
# with the ones the model was trained on.
# NOTE: transform() will raise if X_test contains a category level that was
# never seen in X_train; handle such levels in the cleaning step if that occurs.
t_X_train = preprocess.fit_transform(X_train)
t_X_test = preprocess.transform(X_test)

random_forest.fit(t_X_train, y_train)


We have fit our final model on all of the training data; now we verify it against the held-out testing data.

In [None]:
# training accuracy
# Mean accuracy of the fitted forest on the (transformed) training set.
random_forest.score(X=t_X_train, y=y_train)

In [None]:
# testing accuracy
# Mean accuracy of the fitted forest on the (transformed) test set.
random_forest.score(X=t_X_test, y=y_test)

In [None]:
# Confusion matrix for the fitted forest on the test set.
# The yellowbrick ConfusionMatrix import is commented out in the import cell,
# so the table is built with pandas and drawn with seaborn, both of which the
# notebook already imports.
test_predictions = random_forest.predict(t_X_test)

# np.ravel flattens y_test so a Series, 1-D array, or single-column frame all
# align positionally with the prediction array.
confusion = pd.crosstab(
    np.ravel(y_test),
    test_predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Random Forest confusion matrix (test set)');


The test score for our model on data that has not been used for hyperparameter tuning agrees with our cross-validation results, so we can be fairly confident in the ability of our model to classify unseen data.

### Important Features ###

In [None]:
# Earlier inline construction of the feature-importance table, kept for
# reference. It pairs the forest's feature_importances_ with the numerical
# feature names followed by the one-hot encoded categorical names.
# importances=random_forest.feature_importances_
# ohe=OneHotEncoder(drop='first').fit(X_test[categorical_features])
# categorical_names=ohe.get_feature_names()
# feature_names=numerical_features+list(categorical_names)
# importances_df=pd.concat([pd.DataFrame(importances),pd.DataFrame(feature_names)],axis=1)
# importances_df.columns=['Importance','Feature']
# importances_df.sort_values(by='Importance',ascending=False).head(10)
# NOTE(review): `feature_importance` is not defined or imported in the visible
# notebook cells. Presumably it lives in tn_helper (tn) or Data_Clean_Process
# (dc); confirm and qualify the call so this cell survives Restart & Run All.
# The plotting cell that follows reads the `Feature` and `Importance` columns
# of the returned frame.
importances_df=feature_importance(random_forest, X_train, categorical_features)
importances_df.head(10)

In [None]:
# Bar chart of every feature's importance, also written to features.png.
fig, ax = plt.subplots(figsize=(16, 8))
ax.bar(x=importances_df.Feature, height=importances_df.Importance)

# Rotate the tick labels after the bars (and their ticks) exist, so the
# rotation reliably applies to the feature names.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.set_title('Random Forest feature importances')

# A tight bounding box keeps the vertical feature names from being clipped
# in the saved image.
fig.savefig('features.png', bbox_inches='tight')


In [None]:
# Look up two categorical features by their position in the feature list.
tuple(categorical_features[position] for position in (22, 24))