In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import warnings

In [2]:
df = pd.read_csv('data/Updated_Zomato_Data.csv', header=0, index_col=0)

In [3]:
df.head(20)

Unnamed: 0,online,reservations,rating,votes,location,rest_type,cost_for_two,type
0,Yes,Yes,4.1,775,Banashankari,Casual Dining,800.0,Buffet
1,Yes,No,4.1,787,Banashankari,Casual Dining,800.0,Buffet
2,Yes,No,3.8,918,Banashankari,Other,800.0,Buffet
3,No,No,3.7,88,Banashankari,Quick Bites,300.0,Buffet
4,No,No,3.8,166,Basavanagudi,Casual Dining,600.0,Buffet
5,Yes,No,3.8,286,Basavanagudi,Casual Dining,600.0,Buffet
6,No,No,3.6,8,Mysore Road,Casual Dining,800.0,Buffet
7,Yes,Yes,4.6,2556,Banashankari,Other,600.0,Cafes
8,Yes,No,4.0,324,Banashankari,Cafe,700.0,Cafes
9,Yes,No,4.2,504,Banashankari,Cafe,550.0,Cafes


In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

cat_features = df.select_dtypes(include="object").columns
num_features = df.select_dtypes(exclude="object").columns

for x in cat_features:
    df[x]=le.fit_transform(df[x])
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print("Feature",x)
    print("Mapping", le_name_mapping)

Feature online
Mapping {'No': 0, 'Yes': 1}
Feature reservations
Mapping {'No': 0, 'Yes': 1}
Feature location
Mapping {'BTM': 0, 'Banashankari': 1, 'Banaswadi': 2, 'Bannerghatta Road': 3, 'Basavanagudi': 4, 'Basaveshwara Nagar': 5, 'Bellandur': 6, 'Bommanahalli': 7, 'Brigade Road': 8, 'Brookefield': 9, 'CV Raman Nagar': 10, 'Central Bangalore': 11, 'Church Street': 12, 'City Market': 13, 'Commercial Street': 14, 'Cunningham Road': 15, 'Domlur': 16, 'East Bangalore': 17, 'Ejipura': 18, 'Electronic City': 19, 'Frazer Town': 20, 'HBR Layout': 21, 'HSR': 22, 'Hebbal': 23, 'Hennur': 24, 'Hosur Road': 25, 'ITPL Main Road, Whitefield': 26, 'Indiranagar': 27, 'Infantry Road': 28, 'JP Nagar': 29, 'Jakkur': 30, 'Jalahalli': 31, 'Jayanagar': 32, 'Jeevan Bhima Nagar': 33, 'KR Puram': 34, 'Kaggadasapura': 35, 'Kalyan Nagar': 36, 'Kammanahalli': 37, 'Kanakapura Road': 38, 'Kengeri': 39, 'Koramangala': 40, 'Koramangala 1st Block': 41, 'Koramangala 2nd Block': 42, 'Koramangala 3rd Block': 43, 'Koramang

In [5]:
X = df.drop(columns=['rating'],axis=1)
X

Unnamed: 0,online,reservations,votes,location,rest_type,cost_for_two,type
0,1,1,775,1,2,800.0,0
1,1,0,787,1,2,800.0,0
2,1,0,918,1,6,800.0,0
3,0,0,88,1,7,300.0,0
4,0,0,166,4,2,600.0,0
...,...,...,...,...,...,...,...
51712,0,0,27,89,6,1500.0,6
51713,0,0,0,89,6,600.0,6
51714,0,0,0,89,6,2000.0,6
51715,0,1,236,26,6,2500.0,6


In [6]:
y = df['rating']
y

0        4.100000
1        4.100000
2        3.800000
3        3.700000
4        3.800000
           ...   
51712    3.600000
51713    3.731785
51714    3.731785
51715    4.300000
51716    3.400000
Name: rating, Length: 32080, dtype: float64

In [7]:
print(X.shape)
print(y.shape)

(32080, 7)
(32080,)


In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
print(X)

[[ 0.78353407  2.24202513  0.39002461 ... -1.14501333  0.39576986
  -2.41598024]
 [ 0.78353407 -0.44602533  0.40233543 ... -1.14501333  0.39576986
  -2.41598024]
 [ 0.78353407 -0.44602533  0.53672855 ...  0.52878978  0.39576986
  -2.41598024]
 ...
 [-1.27626868 -0.44602533 -0.4050492  ...  0.52878978  2.9452386
   2.62164124]
 [-1.27626868  2.24202513 -0.16293641 ...  0.52878978  4.00751724
   2.62164124]
 [-1.27626868 -0.44602533 -0.39171248 ...  0.52878978  1.88295996
   2.62164124]]


In [9]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)
print("X_train: ",X_train.shape)
print("X_test: ",X_test.shape)
print("y_train: ",y_train.shape)
print("y_test: ",y_test.shape)

X_train:  (25664, 7)
X_test:  (6416, 7)
y_train:  (25664,)
y_test:  (6416,)


In [10]:
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [11]:
models = {
    "Linear Regression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "K-Neighbors Regressor" : KNeighborsRegressor(),
    "Decision Tree Regressor" : DecisionTreeRegressor(),
    "Random Forest Regressor" : RandomForestRegressor(n_estimators=500, random_state=10),
    "Extra Tree Regressor" : ExtraTreesRegressor(n_estimators=500, random_state=10),
    "XGBRegressor" : XGBRegressor(),
    "CatBoosting Regressor" : CatBoostRegressor(verbose=False),
    "AdaBoost Regressor" : AdaBoostRegressor(), 
    "Support Vector Regressor" : SVR()
}

model_list = []
r2_list = []

len_models = len(list(models))

for i in range(len_models):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model performance for Training set")
    print(" - Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print(" - Mean Absolute Error: {:.4f}".format(model_train_mae))
    print(" - R2 Score: {:.4f}".format(model_train_r2))

    print('-' * 37)

    print("Model performance for Testing set")
    print(" - Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print(" - Mean Absolute Error: {:.4f}".format(model_test_mae))
    print(" - R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('=' * 37)
    print('\n')

Linear Regression
Model performance for Training set
 - Root Mean Squared Error: 0.3649
 - Mean Absolute Error: 0.2813
 - R2 Score: 0.3017
-------------------------------------
Model performance for Testing set
 - Root Mean Squared Error: 0.3637
 - Mean Absolute Error: 0.2800
 - R2 Score: 0.3017


Lasso
Model performance for Training set
 - Root Mean Squared Error: 0.4366
 - Mean Absolute Error: 0.3387
 - R2 Score: 0.0000
-------------------------------------
Model performance for Testing set
 - Root Mean Squared Error: 0.4353
 - Mean Absolute Error: 0.3363
 - R2 Score: -0.0001


Ridge
Model performance for Training set
 - Root Mean Squared Error: 0.3649
 - Mean Absolute Error: 0.2813
 - R2 Score: 0.3017
-------------------------------------
Model performance for Testing set
 - Root Mean Squared Error: 0.3637
 - Mean Absolute Error: 0.2800
 - R2 Score: 0.3017


K-Neighbors Regressor
Model performance for Training set
 - Root Mean Squared Error: 0.2548
 - Mean Absolute Error: 0.1823
 - 

In [12]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=["Model Name", "R2_Score"]).sort_values(by=["R2_Score"], ascending=False)

Unnamed: 0,Model Name,R2_Score
5,Random Forest Regressor,0.843409
6,Extra Tree Regressor,0.838695
4,Decision Tree Regressor,0.782334
7,XGBRegressor,0.678474
8,CatBoosting Regressor,0.629098
3,K-Neighbors Regressor,0.453953
10,Support Vector Regressor,0.421014
0,Linear Regression,0.301678
2,Ridge,0.301677
9,AdaBoost Regressor,0.231766


In [13]:
model_DT = DecisionTreeRegressor()
model_DT.fit(X_train,y_train)
y_pred = model_DT.predict(X_test)
print(list(zip(df.columns[:-1],model_DT.feature_importances_)))

[('online', 0.02335506642104331), ('reservations', 0.0238678085617598), ('rating', 0.5997014612887414), ('votes', 0.16095621529569673), ('location', 0.06736955653139423), ('rest_type', 0.11950583143030429), ('cost_for_two', 0.005244060471060191)]
