In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from dython import nominal
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.display import HTML    # HTML display object; used below to inject CSS aimed at centering PNG plot output
HTML("""
<style>
.output_png {
    display: table-cell;
    text-align: center;
    vertical-align: middle;
}
</style>
""")

In [None]:
df = pd.read_csv('zomato.csv', encoding='ISO-8859-1') # Explicit encoding: per the original note, reading this file without it raises a UTF decoding error

In [None]:
df.shape

In [None]:
df.columns

In [None]:
pd.set_option('display.max_columns',21)
df.head()


In [None]:

df.info()

In [None]:
df.describe() # Looking at just the numerical columns

In [None]:
# Per-column null counts drawn as a one-column heatmap (one row per DataFrame column).
sns.heatmap(
    df.isnull().sum().values.reshape(-1,1),
    annot=True,
    cmap=plt.cm.Blues,
    yticklabels=df.columns,
)
plt.xlabel('Null Values')
plt.show()

In [None]:
# Drop every row that contains at least one null value; df itself is modified in place.
df.dropna(inplace=True)

In [None]:
# Same per-column null-count heatmap as before, re-drawn on the current state of df.
sns.heatmap(
    df.isnull().sum().values.reshape(-1,1),
    annot=True,
    cmap=plt.cm.Blues,
    yticklabels=df.columns,
)
plt.xlabel('Null Values')
plt.show()

In [None]:
df['Switch to order menu']

In [None]:
df['Switch to order menu'].value_counts()

In [None]:
# Remove the 'Switch to order menu' column from df in place (re-running this cell raises KeyError once it is gone).
df.drop('Switch to order menu', axis=1, inplace = True)

In [None]:

df.columns

In [None]:
df['Restaurant Name'].value_counts()

In [None]:
df.Locality.value_counts().value_counts() # Remember, we can specify a column both as df['column'] and df.column

In [None]:
df['Has Table booking'].value_counts()

In [None]:
df['Has Online delivery'].value_counts()

In [None]:
df['Is delivering now'].value_counts()

In [None]:
df.City.value_counts()

In [None]:
nominal.associations(df,figsize=(20,10),mark_columns=True,title="Correlation Matrix") # correlation matrix
plt.show()

In [None]:
# Number of distinct restaurant names, and how many of those names occur exactly once.
print(f"Total number of restaurants:    {len(df['Restaurant Name'].value_counts())}")
print(f"Restaurants with 1 value count: {(df['Restaurant Name'].value_counts() == 1).sum()}")

In [None]:
df['Restaurant Name'].value_counts().head(10)

In [None]:
def dummy(rest_name,column):
    """Add a 0/1 indicator column to the global ``df`` for one restaurant name.

    Each 'Restaurant Name' value is converted to ``str`` and stripped of
    surrounding whitespace; the new column holds 1 where the result equals
    ``rest_name`` exactly and 0 otherwise.

    Parameters
    ----------
    rest_name : str
        Restaurant name to match exactly (after stripping).
    column : str
        Name of the indicator column written into ``df``.

    Notes
    -----
    Mutates the module-level ``df``. A later cell defines another function
    with the same name ``dummy`` but a different signature, which replaces
    this one in the kernel namespace.
    """
    def _is_match(raw_name):
        # int(True) == 1 and int(False) == 0, i.e. the 1/0 flag values.
        return int(str(raw_name).strip() == rest_name)

    df[column] = df['Restaurant Name'].map(_is_match)

In [None]:
dummy('Cafe Coffee Day','cafe_coffee_day')

In [None]:
df.loc[df['cafe_coffee_day']==1].head(3)

In [None]:
def dum_col(x):
    """Turn a display name into a column-style identifier.

    Strips surrounding whitespace, lower-cases the text, and replaces every
    space character with an underscore (consecutive spaces each become an
    underscore).
    """
    normalized = x.strip().lower()
    return normalized.replace(' ', '_')

def dummy(lst,column):
    """Add one boolean indicator column to the global ``df`` per label in ``lst.index``.

    For every label in ``lst.index`` a column named ``dum_col(label)`` is
    written into ``df``. Its value is ``label in cell`` for each cell of
    ``df[column]``, i.e. a substring test when the cells are strings, so
    names that merely contain the label are flagged too.

    Parameters
    ----------
    lst : object with an ``index`` attribute
        Only ``lst.index`` is read; its entries are the labels to flag
        (e.g. the result of ``value_counts()``).
    column : str
        Column of ``df`` whose cells are searched.

    Notes
    -----
    Mutates the module-level ``df`` and overwrites any existing column with
    the same generated name.
    """
    for label in lst.index:
        # Bind the current label as a default argument so the callable does
        # not depend on the loop variable after this iteration.
        flags = df[column].map(lambda cell, needle=label: needle in cell)
        df[dum_col(label)] = flags

In [None]:
restaurants = df['Restaurant Name'].value_counts().head(10)
dummy(restaurants,'Restaurant Name')

In [None]:
df.head()

In [None]:
# Count the flagged rows. DataFrame.size would report rows * columns of the
# filtered frame, which overstates the number of restaurants.
print(f"Number of Cafe Coffee Day's: {(df['cafe_coffee_day']==1).sum()}")

In [None]:
df.shape

In [None]:
features = ['Price range','Votes','Country Code','Restaurant ID','Longitude',
            'Has Table booking','Has Online delivery','cafe_coffee_day',
            "domino's_pizza",'subway','green_chick_chop',"mcdonald's",'keventers',
            'pizza_hut','giani','baskin_robbins','barbeque_nation',
            'Aggregate rating']# --> Only added to see correlation, must be removed later

In [None]:
nominal.associations(df[features],figsize=(20,10),mark_columns=True,\
                     title="Correlation Matrix (features)")
plt.show()

In [None]:
features = ['Price range','Votes','Country Code','Restaurant ID','Longitude',
            'Has Table booking','Has Online delivery','barbeque_nation']

In [None]:
X = pd.get_dummies(df[features])
X

In [None]:

y = df['Aggregate rating']

In [None]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 2)

In [None]:
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

In [None]:
from sklearn import metrics

# Error metrics for the default-hyperparameter forest on the held-out split.
mse = metrics.mean_squared_error(y_test, y_pred)
# RMSE is derived from the MSE instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and later removed, so using it fails on
# current releases. The value is the same.
rmse = np.sqrt(mse)
mae = metrics.mean_absolute_error(y_test, y_pred)
medae = metrics.median_absolute_error(y_test, y_pred)


print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Median Absolute Error (MEDAE): {medae}")
# Variance of the targets, printed alongside for comparison with the MSE.
print(f'Test variance: {np.var(y_test)}')

In [None]:
residuals = y_test - y_pred
# plot the residuals
# The x positions are evenly spaced slots over [0, 5], one per test sample in
# test-set order. Their count is taken from the data instead of a hard-coded
# 1909, so the plot still works if the split size changes.
# NOTE(review): the x label says 'Aggregate Rating' and the colorbar says
# "Quality", but x is the sample position and colour encodes the residual.
plt.scatter(np.linspace(0,5,len(residuals)), residuals,c=residuals,cmap='magma', edgecolors='black', linewidths=.1)
plt.colorbar(label="Quality", orientation="vertical")
# plot a horizontal line at y = 0
plt.hlines(y = 0, xmin = 0, xmax=5, linestyle='--',colors='black')
# set xlim
plt.xlim((0, 5))
plt.xlabel('Aggregate Rating'); plt.ylabel('Residuals')
plt.show()

In [None]:
print(f"Error range: {residuals.max()-residuals.min()}")

In [None]:
# from sklearn.model_selection import RandomizedSearchCV

# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# max_features = ['auto', 'sqrt']
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# min_samples_split = [2, 5, 10]
# min_samples_leaf = [1, 2, 4]
# bootstrap = [True, False]

# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

# rf2 = RandomForestRegressor(random_state=2)

# rf_rscv = RandomizedSearchCV(estimator=rf2, param_distributions=random_grid,\
#                              n_iter = 100, cv = 3, verbose=2, random_state=2, n_jobs = -1)
# rf_rscv.fit(X_train,y_train)
# print(rf_rscv.best_params_)

# Output:
#      n_estimators= 1200,
#      min_samples_split= 10,
#      min_samples_leaf= 1,
#      max_depth = 30,
#      bootstrap= True,
#      random_state=2

In [None]:
# Second forest, built with the hyperparameters described in the original as the
# best RandomizedSearch parameters. max_features='sqrt' is set here although it
# does not appear in the search output recorded in the commented-out cell above.
rf_random = RandomForestRegressor(
    n_estimators=1200,
    max_depth=30,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=2,
)

# Train on the training split, then predict the held-out split.
rf_random.fit(X_train, y_train)
random_pred = rf_random.predict(X_test)

In [None]:
# Error metrics for the predictions of rf_random on the held-out split.
random_mse = metrics.mean_squared_error(y_test, random_pred)
# RMSE is derived from the MSE instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and later removed, so using it fails on
# current releases. The value is the same.
random_rmse = np.sqrt(random_mse)
random_mae = metrics.mean_absolute_error(y_test, random_pred)
random_medae = metrics.median_absolute_error(y_test, random_pred)

print(f"Mean Squared Error (MSE): {random_mse}")
print(f"Root Mean Squared Error (RMSE): {random_rmse}")
print(f"Mean Absolute Error (MAE): {random_mae}")
print(f"Median Absolute Error (MEDAE): {random_medae}")
# Variance of the targets, printed alongside for comparison with the MSE.
print(f'Test variance: {np.var(y_test)}')

In [None]:
# Side-by-side comparison: first-model metric => rf_random metric.
print('Improvements:')
print(f"Mean Squared Error (MSE):       {mse} => {random_mse}")
print(f"Root Mean Squared Error (RMSE): {rmse} => {random_rmse}")
print(f"Mean Absolute Error (MAE):      {mae} => {random_mae}")
# The left-hand value must be the first model's *median* absolute error
# (medae); the mean absolute error (mae) was printed here by mistake.
print(f"Median Absolute Error (MEDAE):  {medae} => {random_medae}")
print(f'Test variance: {np.var(y_test)}')

In [None]:
f_residuals = y_test - random_pred
# plot the residuals
# The x positions are evenly spaced slots over [0, 5], one per test sample in
# test-set order. Their count is taken from the data instead of a hard-coded
# 1909, so the plot still works if the split size changes.
# NOTE(review): the x label says 'Aggregate Rating' and the colorbar says
# "Quality", but x is the sample position and colour encodes the residual.
plt.scatter(np.linspace(0,5,len(f_residuals)), f_residuals, c = f_residuals, cmap='magma', edgecolors='black', linewidths=.1)
plt.colorbar(label = "Quality", orientation = "vertical")
# plot a horizontal line at y = 0
plt.hlines(y = 0, xmin = 0, xmax = 5, linestyle = '--', colors = 'black')
# set xlim
plt.xlim((0, 5))
plt.xlabel('Aggregate Rating'); plt.ylabel('Residuals')
plt.show()