# Yelp Data Analysis

This notebook contains code used to analyze data from Yelp restaurants in the State of Arizona

In [1]:
# Importing the modules
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.cross_validation import train_test_split



In [2]:
# Importing the dataset
df = pd.read_csv('yelp_data.csv') # importing the dataset
df = df*1 # changing boolean variables to 1 and 0

In [3]:
# Separating out to binary variables
alcohol = pd.get_dummies(df['Business - Alcohol'])
attire = pd.get_dummies(df['Business - Attire'])
corkage = pd.get_dummies(df['Business - BYOB/Corkage'])
noise = pd.get_dummies(df['Business - Noise Level'])
smoking = pd.get_dummies(df['Business - Smoking'])
wifi= pd.get_dummies(df['Business - Wi-Fi'])

In [4]:
# Merging the dataframes back together
frames = [df, alcohol, attire, corkage, noise, smoking, wifi]
yelp_test = pd.concat(frames, axis=1)

In [5]:
# Removing the unnecessary columns
del yelp_test['Business - Alcohol']
del yelp_test['Business - Attire']
del yelp_test['Business - BYOB/Corkage']
del yelp_test['Business - Noise Level']
del yelp_test['Business - Smoking']
del yelp_test['Business - Wi-Fi']
yelp_test.fillna(0,inplace=True) # replacing all n/a datapoints to 0

In [6]:
# Cleaning the dataframe's columns
yelp_test.columns.values[30] = 'no_bar'
yelp_test.columns.values[34] = 'no_corkage'
yelp_test.columns.values[36] = 'free_corkage'
yelp_test.columns.values[37] = 'average_noise'
yelp_test.columns.values[41] = 'no_smoking'
yelp_test.columns.values[42] = 'outdoor_smoking'
yelp_test.columns.values[43] = 'yes_smoking'
yelp_test.columns.values[44] = 'free_wifi'
yelp_test.columns.values[45] = 'no_wifi'
yelp_test.columns.values[46] = 'paid_wifi'

In [7]:
# Creating X and Y variables for regression
X = yelp_test.drop('Business - Stars', axis = 1) # removing Y variable
X = X.drop('Business - Id', axis = 1) # removing index
X = X.drop('User - Id', axis = 1) # removing index
Y = yelp_test['Business - Stars']

In [8]:
# Removing dummy variables to avoid multicolinearity
X = X.drop('no_bar', axis = 1)
X = X.drop('formal', axis = 1)
X = X.drop('free_corkage', axis = 1)
X = X.drop('very_loud', axis = 1)
X = X.drop('yes_smoking', axis = 1)
X = X.drop('paid_wifi', axis = 1)

In [9]:
# Module does not take into account a b0 constant need to add an x0
X['Intercept'] = 1

In [10]:
# Prepping for Backward Elmination to Remove Unimportant Variables
# Dropping the variable with the Highest P-Value if it is greater than 0.05
# Running the regression
result = smf.OLS(Y, X.astype(float)).fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:       Business - Stars   R-squared:                       0.326
Model:                            OLS   Adj. R-squared:                  0.326
Method:                 Least Squares   F-statistic:                     3644.
Date:                Sat, 16 Sep 2017   Prob (F-statistic):               0.00
Time:                        13:11:20   Log-Likelihood:            -1.6257e+05
No. Observations:              285764   AIC:                         3.252e+05
Df Residuals:                  285725   BIC:                         3.256e+05
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Business

In [11]:
# Dropping the Business - Takes Reservations Variable
X_drop_reservations = X.drop('Business - Takes Reservations', axis = 1)

# Running the regression
result = smf.OLS(Y, X_drop_reservations.astype(float)).fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:       Business - Stars   R-squared:                       0.326
Model:                            OLS   Adj. R-squared:                  0.326
Method:                 Least Squares   F-statistic:                     3742.
Date:                Sat, 16 Sep 2017   Prob (F-statistic):               0.00
Time:                        13:11:57   Log-Likelihood:            -1.6257e+05
No. Observations:              285764   AIC:                         3.252e+05
Df Residuals:                  285726   BIC:                         3.256e+05
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Business

In [12]:
# Dropping the Corkage Variables
# Please note some variables were removed previously as a result of multicolinarity
X_drop_corkage = X_drop_reservations.drop('no_corkage', axis = 1)
X_drop_corkage = X_drop_corkage.drop('yes_corkage', axis = 1)

# Running the regression
result = smf.OLS(Y, X_drop_corkage.astype(float)).fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:       Business - Stars   R-squared:                       0.326
Model:                            OLS   Adj. R-squared:                  0.326
Method:                 Least Squares   F-statistic:                     3956.
Date:                Sat, 16 Sep 2017   Prob (F-statistic):               0.00
Time:                        13:12:25   Log-Likelihood:            -1.6258e+05
No. Observations:              285764   AIC:                         3.252e+05
Df Residuals:                  285728   BIC:                         3.256e+05
Df Model:                          35                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Business

In [13]:
# Dropping the Outdoor Seating Variable
X_drop_seating = X_drop_corkage.drop('Business - Outdoor Seating', axis = 1)

# Running the regression
result = smf.OLS(Y, X_drop_seating.astype(float)).fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:       Business - Stars   R-squared:                       0.326
Model:                            OLS   Adj. R-squared:                  0.326
Method:                 Least Squares   F-statistic:                     4072.
Date:                Sat, 16 Sep 2017   Prob (F-statistic):               0.00
Time:                        13:13:04   Log-Likelihood:            -1.6258e+05
No. Observations:              285764   AIC:                         3.252e+05
Df Residuals:                  285729   BIC:                         3.256e+05
Df Model:                          34                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Business

In [14]:
# Dropping the Dancing Variable
X_drop_dancing = X_drop_seating.drop('Business - Good For Dancing', axis = 1)

# Running the regression
result = smf.OLS(Y, X_drop_dancing.astype(float)).fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:       Business - Stars   R-squared:                       0.326
Model:                            OLS   Adj. R-squared:                  0.326
Method:                 Least Squares   F-statistic:                     4195.
Date:                Sat, 16 Sep 2017   Prob (F-statistic):               0.00
Time:                        13:13:26   Log-Likelihood:            -1.6258e+05
No. Observations:              285764   AIC:                         3.252e+05
Df Residuals:                  285730   BIC:                         3.256e+05
Df Model:                          33                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Business

In [15]:
# Splitting the dataset into the Training set and Test set
# Note: splitting from latest model
X_train, X_test, y_train, y_test = train_test_split(X_drop_dancing, Y, test_size = 0.2, random_state = 0)

In [31]:
# Fitting Multiple Linear Regression to the Training set
# Same as Simple Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Analyizing how accurate the test was
results = y_pred - y_test
score = sum(abs(results))/57153
print "The average difference between predicted rating and actual rating is %s" % (score)

The average difference between predicted rating and actual rating is 0.403681408357


In [17]:
# Picking which regressions to run
# Need to take the intercept for every model

# Running regression on Users average rating
user_stars = X[['User - Average Stars', 'Intercept']]

# Running the regression
result = smf.OLS(Y, user_stars.astype(float)).fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:       Business - Stars   R-squared:                       0.032
Model:                            OLS   Adj. R-squared:                  0.032
Method:                 Least Squares   F-statistic:                     9416.
Date:                Sat, 16 Sep 2017   Prob (F-statistic):               0.00
Time:                        13:17:00   Log-Likelihood:            -2.1440e+05
No. Observations:              285764   AIC:                         4.288e+05
Df Residuals:                  285762   BIC:                         4.288e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
User - Average Stars     0.1410 

In [18]:
# Testing the ability of the model
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(user_stars, Y, test_size = 0.2, random_state = 0)

# Fitting Multiple Linear Regression to the Training set
# Same as Simple Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Analyizing how accurate the test was
results = y_pred - y_test
score = sum(abs(results))/57153
print "The average difference between predicted rating and actual rating is %s" % (score)

0.41016688784


In [19]:
# Running regression on Dress Code
user_dress_code = X[['dressy', 'casual', 'Intercept']]

# Running the regression
result = smf.OLS(Y, user_dress_code.astype(float)).fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:       Business - Stars   R-squared:                       0.015
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     2157.
Date:                Sat, 16 Sep 2017   Prob (F-statistic):               0.00
Time:                        13:17:32   Log-Likelihood:            -2.1689e+05
No. Observations:              285764   AIC:                         4.338e+05
Df Residuals:                  285761   BIC:                         4.338e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
dressy         0.8212      0.013     64.564      0.0

In [20]:
# Testing the ability of the model
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(user_dress_code, Y, test_size = 0.2, random_state = 0)

# Fitting Multiple Linear Regression to the Training set
# Same as Simple Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Analyizing how accurate the test was
results = y_pred - y_test
score = sum(abs(results))/57153
print "The average difference between predicted rating and actual rating is %s" % (score)

0.417201633986


In [21]:
# Running regression on Business Review Count
user_review_count = X[['Business - Review Count', 'Intercept']]

# Running the regression
result = smf.OLS(Y, user_review_count.astype(float)).fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:       Business - Stars   R-squared:                       0.129
Model:                            OLS   Adj. R-squared:                  0.129
Method:                 Least Squares   F-statistic:                 4.224e+04
Date:                Sat, 16 Sep 2017   Prob (F-statistic):               0.00
Time:                        13:18:07   Log-Likelihood:            -1.9934e+05
No. Observations:              285764   AIC:                         3.987e+05
Df Residuals:                  285762   BIC:                         3.987e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Business - Review Count   

In [22]:
# Testing the ability of the model
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(user_review_count, Y, test_size = 0.2, random_state = 0)

# Fitting Multiple Linear Regression to the Training set
# Same as Simple Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Analyizing how accurate the test was
results = y_pred - y_test
score = sum(abs(results))/57153
print "The average difference between predicted rating and actual rating is %s" % (score)

0.37919333469


In [23]:
# Running regression on Business Price Range
user_price = X[['Business - Price Range', 'Intercept']]

# Running the regression
result = smf.OLS(Y, user_price.astype(float)).fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:       Business - Stars   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     482.7
Date:                Sat, 16 Sep 2017   Prob (F-statistic):          6.82e-107
Time:                        13:18:42   Log-Likelihood:            -2.1879e+05
No. Observations:              285764   AIC:                         4.376e+05
Df Residuals:                  285762   BIC:                         4.376e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Business - Price Range     0

In [24]:
# Testing the ability of the model
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(user_price, Y, test_size = 0.2, random_state = 0)

# Fitting Multiple Linear Regression to the Training set
# Same as Simple Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Analyizing how accurate the test was
results = y_pred - y_test
score = sum(abs(results))/57153
print "The average difference between predicted rating and actual rating is %s" % (score)

0.420801737701


In [25]:
# Running regression on Business Accepts Credit Cards
user_cards = X[['Business - Accepts Credit Cards', 'Intercept']]

# Running the regression
result = smf.OLS(Y, user_cards.astype(float)).fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:       Business - Stars   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     543.7
Date:                Sat, 16 Sep 2017   Prob (F-statistic):          3.85e-120
Time:                        13:19:14   Log-Likelihood:            -2.1876e+05
No. Observations:              285764   AIC:                         4.375e+05
Df Residuals:                  285762   BIC:                         4.376e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Business -

In [26]:
# Testing the ability of the model
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(user_cards, Y, test_size = 0.2, random_state = 0)

# Fitting Multiple Linear Regression to the Training set
# Same as Simple Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Analyizing how accurate the test was
results = y_pred - y_test
score = sum(abs(results))/57153
print "The average difference between predicted rating and actual rating is %s" % (score)

0.421820800631


In [27]:
# Running regression on Personal Choice Variables
user_pc = X[['Business - Accepts Credit Cards', 'Business - Happy Hour',
             'Business - Has TV', 'beer_and_wine','full_bar',
             'no_smoking', 'outdoor_smoking','Intercept']]

# Running the regression
result = smf.OLS(Y, user_pc.astype(float)).fit()
print result.summary()

                            OLS Regression Results                            
Dep. Variable:       Business - Stars   R-squared:                       0.037
Model:                            OLS   Adj. R-squared:                  0.037
Method:                 Least Squares   F-statistic:                     1555.
Date:                Sat, 16 Sep 2017   Prob (F-statistic):               0.00
Time:                        13:19:48   Log-Likelihood:            -2.1369e+05
No. Observations:              285764   AIC:                         4.274e+05
Df Residuals:                  285756   BIC:                         4.275e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Business -

In [29]:
# Testing the ability of the model
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(user_pc, Y, test_size = 0.2, random_state = 0)

# Fitting Multiple Linear Regression to the Training set
# Same as Simple Linear Regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Analyizing how accurate the test was
results = y_pred - y_test
score = sum(abs(results))/57153
print "The average difference between predicted rating and actual rating is %s" % (score)

0.403681408357
