In [7]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [9]:
# Load the four input tables; the first CSV column is used as the row index in each.
# NOTE(review): file contents are not visible here — presumably one row per business
# with a `stars` column plus attribute columns; verify against the CSVs.
df1 = pd.read_csv('df_main.csv', index_col=0)               # used by ANOVA Part1
df_bp = pd.read_csv('df_businessparking.csv', index_col=0)  # used by ANOVA Part2
df_m = pd.read_csv('df_music.csv', index_col=0)             # used by ANOVA Part3
df_a = pd.read_csv('df_ambience.csv', index_col=0)          # used by ANOVA Part4

In [10]:
# ANOVA Part 1: main business attributes (df1).
# Fits an OLS model of `stars` on the attributes below, each treated as categorical via C(...).
# NOTE(review): anova_lm is called without `typ`; statsmodels' default is presumably the
# sequential (Type I) table, in which case each F test depends on the order of the terms
# in the formula — confirm this is intended.
formula = 'stars~C(NoiseLevel)+C(BikeParking)+C(BusinessAcceptsCreditCards)+C(RestaurantsReservations)+C(RestaurantsTakeOut)+C(WiFi)+C(RestaurantsDelivery)+C(HasTV)+C(RestaurantsPriceRange2)+C(Alcohol)+C(RestaurantsGoodForGroups)+C(OutdoorSeating)+C(HappyHour)'
anova1 = anova_lm(
    ols(formula=formula, data=df1).fit()
)
print(anova1)
# Reading of the saved output: NoiseLevel, RestaurantsReservations, RestaurantsTakeOut,
# RestaurantsDelivery and HasTV have large F statistics and are followed up with Tukey
# multiple comparisons in the next cells.
# NOTE(review): the original note also listed BikeParking and HappyHour as significant.
# BikeParking has F = 0.82 in the saved table, which does not look significant, and the
# HappyHour row is cut off in the saved output — re-run and confirm both.

                                  df      sum_sq    mean_sq          F  \
C(NoiseLevel)                    3.0   15.340208   5.113403  27.657016   
C(BikeParking)                   1.0    0.150910   0.150910   0.816232   
C(BusinessAcceptsCreditCards)    1.0    0.506040   0.506040   2.737034   
C(RestaurantsReservations)       1.0    1.777579   1.777579   9.614444   
C(RestaurantsTakeOut)            1.0    1.666594   1.666594   9.014157   
C(WiFi)                          2.0    0.530615   0.265307   1.434977   
C(RestaurantsDelivery)           1.0    1.393333   1.393333   7.536162   
C(HasTV)                         1.0   11.077203  11.077203  59.913602   
C(RestaurantsPriceRange2)        3.0    0.696107   0.232036   1.255019   
C(Alcohol)                       2.0    0.197467   0.098733   0.534022   
C(RestaurantsGoodForGroups)      1.0    0.400753   0.400753   2.167563   
C(OutdoorSeating)                1.0    0.134605   0.134605   0.728041   
C(HappyHour)                     1.0  

In [11]:
# Tukey HSD multiple comparison of mean stars across NoiseLevel groups
NL = pairwise_tukeyhsd(endog=df1['stars'], groups=df1['NoiseLevel'])
print(NL.summary())
# NoiseLevel (saved output): very_loud has fewer stars than average, loud and quiet
# (all three comparisons rejected); loud also has fewer stars than average (p-adj 0.0113).
# average vs quiet and loud vs quiet are not significantly different.

  Multiple Comparison of Means - Tukey HSD, FWER=0.05   
 group1   group2  meandiff p-adj   lower   upper  reject
--------------------------------------------------------
average      loud  -0.1297 0.0113 -0.2379 -0.0215   True
average     quiet   0.0553 0.8269  -0.119  0.2296  False
average very_loud  -0.7823  0.001 -1.0277  -0.537   True
   loud     quiet    0.185 0.0725  -0.011   0.381  False
   loud very_loud  -0.6527  0.001 -0.9138 -0.3915   True
  quiet very_loud  -0.8377  0.001 -1.1324 -0.5429   True
--------------------------------------------------------


In [12]:
# Tukey HSD comparison of mean stars by RestaurantsDelivery
RD = pairwise_tukeyhsd(endog=df1['stars'], groups=df1['RestaurantsDelivery'])
print(RD.summary())
# RestaurantsDelivery (saved output): no significant difference between False and True
# (meandiff 0.0413, p-adj 0.1767, reject=False).
# NOTE(review): this unadjusted pairwise result differs from the ANOVA term, which is
# estimated together with the other attributes — interpret the ANOVA finding with care.

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
 False   True   0.0413 0.1767 -0.0187 0.1013  False
---------------------------------------------------


In [13]:
# Tukey HSD comparison of mean stars by HasTV
HT = pairwise_tukeyhsd(endog=df1['stars'], groups=df1['HasTV'])
print(HT.summary())
# HasTV (saved output): True has fewer stars than False (meandiff -0.2696, p-adj 0.001);
# maybe this is consistent with a preference for quiet places.

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
 False   True  -0.2696 0.001 -0.3326 -0.2065   True
---------------------------------------------------


In [14]:
# Tukey HSD comparison of mean stars by RestaurantsReservations
RR = pairwise_tukeyhsd(endog=df1['stars'], groups=df1['RestaurantsReservations'])
print(RR.summary())
# RestaurantsReservations (saved output): not significant at the 0.05 level
# (meandiff -0.0586, p-adj 0.0618, reject=False); True is only nominally lower than False.

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
 False   True  -0.0586 0.0618 -0.1201 0.0029  False
---------------------------------------------------


In [15]:
# Tukey HSD comparison of mean stars by RestaurantsTakeOut
RT = pairwise_tukeyhsd(endog=df1['stars'], groups=df1['RestaurantsTakeOut'])
print(RT.summary())
# RestaurantsTakeOut (saved output): no significant difference (p-adj 0.0734, reject=False)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper reject
--------------------------------------------------
 False   True  -0.0633 0.0734 -0.1325 0.006  False
--------------------------------------------------


In [16]:
# ANOVA Part 2 (Business Parking): OLS of `stars` on the parking flags in df_bp
formula = 'stars~C(garage)+C(street)+C(valet)+C(validated)+C(lot)'
anova2 = anova_lm(
    ols(formula=formula, data=df_bp).fit()
)
print(anova2)
# Saved output: street (p = 1.0e-08), garage (p = 0.035) and lot (p = 0.034) are
# significant at the 0.05 level; valet and validated are not.
# Only street and garage are followed up with Tukey comparisons below.

                  df      sum_sq   mean_sq          F        PR(>F)
C(garage)        1.0    1.196757  1.196757   4.466788  3.474542e-02
C(street)        1.0    8.897256  8.897256  33.208209  1.027393e-08
C(valet)         1.0    0.629200  0.629200   2.348433  1.256465e-01
C(validated)     1.0    0.090530  0.090530   0.337895  5.611454e-01
C(lot)           1.0    1.208607  1.208607   4.511016  3.386107e-02
Residual      1332.0  356.873951  0.267923        NaN           NaN


In [17]:
# Tukey HSD comparison of mean stars by street parking
S = pairwise_tukeyhsd(endog=df_bp['stars'], groups=df_bp['street'])
print(S.summary())
# street (saved output): False has fewer stars than True (meandiff 0.1585, p-adj 0.001)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj lower  upper  reject
-------------------------------------------------
 False   True   0.1585 0.001 0.1022 0.2148   True
-------------------------------------------------


In [18]:
# Tukey HSD comparison of mean stars by garage parking
G = pairwise_tukeyhsd(endog=df_bp['stars'], groups=df_bp['garage'])
print(G.summary())
# garage (saved output): True has fewer stars than False (meandiff -0.0776, p-adj 0.0372)

Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
 False   True  -0.0776 0.0372 -0.1506 -0.0046   True
----------------------------------------------------


In [19]:
# ANOVA Part 3 (Music): OLS of `stars` on the music flags in df_m
formula = 'stars~C(dj)+C(background_music)+C(no_music)+C(jukebox)+C(live)+C(video)+C(karaoke)'
anova3 = anova_lm(
    ols(formula=formula, data=df_m).fit()
)
print(anova3)
# Saved output: dj, background_music and jukebox are significant at the 0.05 level.
# Next we apply Tukey multiple comparisons to those three.

                         df      sum_sq   mean_sq          F        PR(>F)
C(dj)                   1.0    9.300811  9.300811  34.860620  4.488710e-09
C(background_music)     1.0    1.971737  1.971737   7.390319  6.642696e-03
C(no_music)             1.0    0.003652  0.003652   0.013687  9.068830e-01
C(jukebox)              1.0    1.586050  1.586050   5.944717  1.489145e-02
C(live)                 1.0    0.175717  0.175717   0.658609  4.171963e-01
C(video)                1.0    0.002357  0.002357   0.008834  9.251319e-01
C(karaoke)              1.0    0.483603  0.483603   1.812604  1.784253e-01
Residual             1332.0  355.377513  0.266800        NaN           NaN


In [20]:
# Tukey HSD comparison of mean stars by background_music
BM = pairwise_tukeyhsd(endog=df_m['stars'], groups=df_m['background_music'])
print(BM.summary())
# background_music (saved output): True has fewer stars than False (meandiff -0.3759, p-adj 0.0044)

Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
 False   True  -0.3759 0.0044 -0.6343 -0.1174   True
----------------------------------------------------


In [21]:
# Tukey HSD comparison of mean stars by jukebox
J = pairwise_tukeyhsd(endog=df_m['stars'], groups=df_m['jukebox'])
print(J.summary())
# jukebox (saved output): True has fewer stars than False (meandiff -0.2905, p-adj 0.0234)

Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
 False   True  -0.2905 0.0234 -0.5416 -0.0394   True
----------------------------------------------------


In [22]:
# Tukey HSD comparison of mean stars by dj
DJ = pairwise_tukeyhsd(endog=df_m['stars'], groups=df_m['dj'])
print(DJ.summary())
# dj (saved output): True has fewer stars than False (meandiff -0.5631, p-adj 0.001)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
 False   True  -0.5631 0.001 -0.7511 -0.3752   True
---------------------------------------------------


In [23]:
# ANOVA Part 4 (Ambience): OLS of `stars` on the ambience flags in df_a
formula = 'stars~C(touristy)+C(hipster)+C(romantic)+C(divey)+C(intimate)+C(upscale)+C(classy)+C(casual)'
anova4 = anova_lm(
    ols(formula=formula, data=df_a).fit()
)
print(anova4)
# Saved output: casual, intimate, romantic and hipster are significant at the 0.05 level.
# Next we apply Tukey multiple comparisons to those four.

                 df      sum_sq   mean_sq          F    PR(>F)
C(touristy)     1.0    0.686323  0.686323   2.543483  0.110988
C(hipster)      1.0    1.119910  1.119910   4.150337  0.041824
C(romantic)     1.0    1.265193  1.265193   4.688750  0.030538
C(divey)        1.0    0.053464  0.053464   0.198137  0.656302
C(intimate)     1.0    1.874627  1.874627   6.947284  0.008492
C(upscale)      1.0    0.385845  0.385845   1.429925  0.231990
C(classy)       1.0    0.098645  0.098645   0.365572  0.545531
C(casual)       1.0    4.800268  4.800268  17.789577  0.000026
Residual     1329.0  358.612024  0.269836        NaN       NaN


In [24]:
# Tukey HSD comparison of mean stars by romantic ambience
R = pairwise_tukeyhsd(endog=df_a['stars'], groups=df_a['romantic'])
print(R.summary())
# romantic (saved output): False has fewer stars than True (meandiff 0.1455, p-adj 0.0332)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower  upper  reject
--------------------------------------------------
 False   True   0.1455 0.0332 0.0116 0.2793   True
--------------------------------------------------


In [25]:
# Tukey HSD comparison of mean stars by intimate ambience
I = pairwise_tukeyhsd(endog=df_a['stars'], groups=df_a['intimate'])
print(I.summary())
# intimate (saved output): False has fewer stars than True (meandiff 0.1864, p-adj 0.0013)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower  upper  reject
--------------------------------------------------
 False   True   0.1864 0.0013 0.0731 0.2996   True
--------------------------------------------------


In [26]:
# Tukey HSD comparison of mean stars by hipster ambience
H = pairwise_tukeyhsd(endog=df_a['stars'], groups=df_a['hipster'])
print(H.summary())
# hipster (saved output): False has fewer stars than True (meandiff 0.1009), but only
# marginally significant (p-adj 0.0489, lower confidence bound 0.0005)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower  upper  reject
--------------------------------------------------
 False   True   0.1009 0.0489 0.0005 0.2013   True
--------------------------------------------------


In [27]:
# Tukey HSD comparison of mean stars by casual ambience
C = pairwise_tukeyhsd(endog=df_a['stars'], groups=df_a['casual'])
print(C.summary())
# casual (saved output): True has fewer stars than False (meandiff -0.1154, p-adj 0.001)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
 False   True  -0.1154 0.001 -0.1718 -0.0591   True
---------------------------------------------------


In [28]:
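## Tree-based classifier experiment (disabled): data preparation, then a random forest (not a single decision tree)
## NOTE: train_test_split, RandomForestClassifier, accuracy_score and confusion_matrix are not imported in the visible import cell; add the scikit-learn imports before re-enabling.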
#y = df1['stars'] # labels
#x = df1.drop('stars',axis=1) 
#x = pd.concat([df1.drop('stars',axis=1),df_bp.drop('stars',axis=1),df_m.drop('stars',axis=1),df_a.drop('stars',axis=1)], axis=1) # data
#x = pd.get_dummies(x) # one hot encoding since categorical data cannot be handled well
#x_train , x_test , y_train , y_test = train_test_split(x, y, train_size=0.8, test_size=0.2)
## Random Forest
#forest = RandomForestClassifier()
#forest.fit(x_train , y_train.astype('int'))
## Predict
#y_pred_test = forest.predict(x_test)
## Model Performance
#print("Accuracy:", accuracy_score(y_test.astype('int') , y_pred_test.astype('int'))) 
#print("\nConfusion Matrix:")
#print(confusion_matrix(y_test.astype('int') , y_pred_test.astype('int')))

#Accuracy: 0.6477272727272727

#Confusion Matrix:
#[[ 0  0  1]
# [ 0 15 16]
# [ 0 14 42]]