# Modeling


In [1]:
#Import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score  
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC

from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix, accuracy_score

In [2]:
data = pd.read_csv('data.csv')

In [3]:
data.describe()

Unnamed: 0.1,Unnamed: 0,attributes.Ambience.divey,attributes.Dietary Restrictions.vegan,attributes.Happy Hour,attributes.Order at Counter,attributes.Outdoor Seating,attributes.Alcohol,attributes.Ambience.classy,attributes.Parking.lot,attributes.Ambience.touristy,...,attributes.Ambience.intimate,attributes.Good For.latenight,attributes.Good For.dinner,attributes.Good for Kids,attributes.Parking.validated,attributes.Has TV,attributes.Ambience.casual,attributes.Good For Groups,attributes.Ambience.romantic,attributes.Ambience.upscale
count,18325.0,18325.0,18325.0,18325.0,18325.0,18325.0,18325.0,18325.0,18325.0,18325.0,...,18325.0,18325.0,18325.0,18325.0,18325.0,18325.0,18325.0,18325.0,18325.0,18325.0
mean,9162.0,0.020518,0.003383,0.05708,0.011241,0.327585,0.536426,0.015007,0.539482,0.005184,...,0.005948,0.036344,0.397435,0.809986,0.003547,0.462156,0.502483,0.847749,0.009277,0.00704
std,5290.116177,0.141769,0.05807,0.232003,0.105431,0.469345,0.498685,0.121583,0.498452,0.071816,...,0.076897,0.187149,0.489381,0.392322,0.059453,0.498579,0.500007,0.359274,0.095872,0.083609
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4581.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,9162.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
75%,13743.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
max,18324.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
data['stars'] = data['stars'].apply(np.int64)

In [24]:
data.dtypes

Unnamed: 0                                    int64
attributes.Ambience.divey                     int64
attributes.Dietary Restrictions.vegan         int64
attributes.Happy Hour                         int64
attributes.Order at Counter                   int64
attributes.Outdoor Seating                    int64
attributes.Alcohol                            int64
attributes.Ambience.classy                    int64
attributes.Parking.lot                        int64
attributes.Ambience.touristy                  int64
attributes.Good For.brunch                    int64
attributes.Waiter Service                     int64
attributes.Parking.street                     int64
attributes.Ambience.hipster                   int64
attributes.Music.live                         int64
attributes.Dietary Restrictions.dairy-free    int64
attributes.Music.background_music             int64
attributes.Price Range                        int64
attributes.Good For.breakfast                 int64
attributes.P

In [26]:
data.stars.nunique()

5

In [27]:
#Shuffle dataset

data = data.sample(frac=1, random_state=25).reset_index(drop=True)

In [28]:
#preprocessing & split data into y, X and then into train and test.
def preprocess_inputs(dataframe):
    #Split y, X
    y= dataframe['stars']
    X=dataframe.drop('stars', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7, shuffle=True, random_state=25)
    return X_train, X_test, y_train, y_test

In [29]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [30]:
y_train.value_counts()

3    5658
4    4967
2    1624
5     347
1     231
Name: stars, dtype: int64

In [31]:
y_test.value_counts()

3    2435
4    2115
2     717
5     141
1      90
Name: stars, dtype: int64

#### Logistic Regression

In [32]:
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

y_pred_logistic = logistic_model.decision_function(X_test)
    
print("Logistic Regression:" + " {:.2f}%".format(logistic_model.score(X_test, y_test)*100))


Logistic Regression: 48.74%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
KNN_model = KNeighborsClassifier()
KNN_model.fit(X_train, y_train)

y_pred_KNN = KNN_model.predict_proba(X_test)
    
print("K-Nearest Neighbors:" + " {:.2f}%".format(KNN_model.score(X_test, y_test)*100))


K-Nearest Neighbors: 42.80%


#### Decision Tree

In [34]:
Tree_model = DecisionTreeClassifier()
Tree_model.fit(X_train, y_train)

y_pred_tree = Tree_model.predict(X_test)
    
print("Decision Tree:" + " {:.2f}%".format(Tree_model.score(X_test, y_test)*100))

Decision Tree: 41.51%


In [35]:
print(confusion_matrix(y_test, y_pred_tree))
print(classification_report(y_test, y_pred_tree))

[[   4   23   39   21    3]
 [  31  164  320  179   23]
 [  55  334 1163  821   62]
 [  40  214  860  939   62]
 [   1   20   59   49   12]]
              precision    recall  f1-score   support

           1       0.03      0.04      0.04        90
           2       0.22      0.23      0.22       717
           3       0.48      0.48      0.48      2435
           4       0.47      0.44      0.46      2115
           5       0.07      0.09      0.08       141

    accuracy                           0.42      5498
   macro avg       0.25      0.26      0.25      5498
weighted avg       0.42      0.42      0.42      5498



#### Support Vector MAchine (linear kerenel)

In [36]:
SVM_model = LinearSVC()
SVM_model.fit(X_train, y_train)

y_pred_SVM = SVM_model.decision_function(X_test)
    
print("Support Vector Machine (Linear Kernel):" + " {:.2f}%".format(SVM_model.score(X_test, y_test)*100))


Support Vector Machine (Linear Kernel): 22.12%


