# End-to-End Deployment of Zomato Restaurant Ratings

## 2-) MODELLING PART

In [53]:
import numpy as np 
import pandas as pd 

from pycaret.regression import *

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [2]:
data = pd.read_csv('Final_Zomato.csv')

In [3]:
data.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
data.head() # rate is our target variable.

Unnamed: 0,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,menu_item
0,1,1,4.1,775,1,20,1386,800.0,5047
1,1,0,4.1,787,1,20,594,800.0,5047
2,1,0,3.8,918,1,16,484,800.0,5047
3,0,0,3.7,88,1,62,1587,300.0,5047
4,0,0,3.8,166,4,20,1406,600.0,5047


In [31]:
X = data.drop('rate',axis=1)
y = data['rate']

#### Auto-ML with Pycaret
- The reason that using Pycaret module ,I want to see which model can fit better on dataset.
- This split is for Pycaret, thats what Pycaret wants

In [8]:
train, test = train_test_split(data, test_size = .3, random_state=1)

##### Let's create regression object

In [9]:
reg = setup(data=train, target = 'rate',
            fold_strategy= 'kfold',
            fold =10,
            silent = True
           )

Unnamed: 0,Description,Value
0,session_id,4512
1,Target,rate
2,Original Data,"(16273, 9)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,2
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(11391, 8)"


In [10]:
best_model = compare_models(n_select = 3) # this will bring us best top 3 models with their parameter values (default)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0591,0.0255,0.1594,0.8603,0.0362,0.017,1.796
rf,Random Forest Regressor,0.0872,0.0317,0.1778,0.8263,0.0404,0.0249,2.525
dt,Decision Tree Regressor,0.0759,0.0546,0.2331,0.7011,0.0528,0.0214,0.043
lightgbm,Light Gradient Boosting Machine,0.1888,0.0739,0.2718,0.5945,0.0615,0.0533,0.207
knn,K Neighbors Regressor,0.1985,0.0957,0.3091,0.475,0.0692,0.056,0.076
gbr,Gradient Boosting Regressor,0.2263,0.1044,0.3229,0.4278,0.0728,0.0643,0.906
lr,Linear Regression,0.265,0.1415,0.3759,0.2249,0.0834,0.0748,0.966
ridge,Ridge Regression,0.265,0.1415,0.3759,0.2249,0.0834,0.0748,0.028
lar,Least Angle Regression,0.265,0.1415,0.3759,0.2249,0.0834,0.0748,0.033
br,Bayesian Ridge,0.2649,0.1415,0.3759,0.2249,0.0834,0.0748,0.023


In [11]:
best_model

[ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     max_samples=None, min_impurity_decrease=0.0,
                     min_impurity_split=None, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=100, n_jobs=-1, oob_score=False,
                     random_state=4512, verbose=0, warm_start=False),
 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=-1, oob_score=False,
                       random_state=4512, verbose=0, warm_start=False),
 Decisi

### Model Selection with GridSearchCV and Sklearn 

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= .3, random_state = 1) 

In [14]:
def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'ExtraTreesRegressor' : {
            'model': ExtraTreesRegressor(random_state=1),
            'params': {
                'n_estimators': [120, 180, 250, 300],    
            }
        },
        'RandomForestRegressor': {
            'model': RandomForestRegressor(random_state=1),
            'params': {
                'n_estimators': [10, 100, 150, 200],
                'max_features' : ['auto', 'sqrt']
            }
        }
    }
    
    scores = []
    for algo_name, config in algos.items():
        model_selection =  GridSearchCV(config['model'], config['params'], cv=10, return_train_score=False)
        model_selection.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': model_selection.best_score_,
            'best_params': model_selection.best_params_
        })
         
    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

In [15]:
outcome = find_best_model_using_gridsearchcv(X,y)

In [16]:
outcome # Best train R2 score is %87

Unnamed: 0,model,best_score,best_params
0,ExtraTreesRegressor,0.877389,{'n_estimators': 300}
1,RandomForestRegressor,0.856764,"{'max_features': 'sqrt', 'n_estimators': 200}"


#### Model Building with Extra Trees Regressiom 

In [39]:
ET_model = ExtraTreesRegressor(n_estimators= 300, random_state=1)

In [40]:
ET_model.fit(X_train, y_train)

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=300, n_jobs=None, oob_score=False,
                    random_state=1, verbose=0, warm_start=False)

In [48]:
ET_model.predict(X_test[:1])

array([3.69866667])

In [49]:
y_test[:1]

18612    3.7
Name: rate, dtype: float64

In [46]:
from sklearn.metrics import r2_score
y_pred=ET_model.predict(X_test)
r2_score(y_test,y_pred) # Test Score

0.9332909033477896

### Model Saving 

In [51]:
import pickle 
# Saving model to disk
pickle.dump(ET_model, open('model.pkl','wb'))
model=pickle.load(open('model.pkl','rb'))