#                            RESTAURANT RATE PREDICTIONS

# Importing important Libraries

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')


In [2]:
import os
import tensorflow as tf
import h5py
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from sklearn.metrics import accuracy_score
from sys import getsizeof

ModuleNotFoundError: No module named 'tensorflow'

# Loading the data and Getting an idea about the Data

In [None]:
df = pd.read_csv("zomato.csv")

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.info()

# Cleaning the Data

#### Deleting Unnecessary Columns

In [None]:
# Remove the url, dish_liked and phone columns from the frame.
df = df.drop(columns=['url', 'dish_liked', 'phone'])

##We can delete the address and location columns as well, since a city column is already there

In [None]:
df.head()

#### Removing the Duplicates

In [None]:
df.duplicated().sum()

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.shape

#### Remove the Null values from the dataset

In [None]:
df.isnull().sum()

In [None]:
# Names of the columns that contain at least one missing value, in column order.
feature_na = df.columns[df.isnull().any(axis=0).to_numpy()].tolist()
feature_na

In [None]:
# Print, for every column with gaps, the percentage of rows that are missing
# (rounded to 4 decimals).
for feature in feature_na:
    print(f'{feature} has {np.round(df[feature].isnull().sum() / len(df) * 100, 4)} % missing values')

In [None]:
df.dropna(how='any',inplace=True)
df.info()

#### Changing the Columns Names

In [None]:
df.columns

In [None]:
df = df.rename(columns={'approx_cost(for two people)':'cost','listed_in(type)':'type','listed_in(city)':'city'})
df.columns

#### Changing the data type of Cost

In [None]:
df.cost

In [None]:
# Strip the commas out of the cost strings, then cast the column to float.
df['cost'] = (
    df['cost']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)
df.info()

#### Removing '/5' from Rates

In [None]:
df['rate'].unique()

In [None]:
def split(x):
    """Return the part of ``x`` that comes before its first '/'.

    When ``x`` contains no '/', the whole string is returned unchanged.
    """
    head, _sep, _tail = x.partition('/')
    return head

In [None]:
df['rate']=df['rate'].apply(split)

In [None]:
# Replace the non-numeric rating markers 'NEW' and '-' with zero.
# The replacement is restricted to the 'rate' column: a DataFrame-wide
# replace would also overwrite any cell in another column whose entire
# value happens to be 'NEW' or '-'.
df['rate'] = df['rate'].replace({'NEW': 0, '-': 0})

In [None]:
#changing the data type to float
df['rate']=df['rate'].astype(float)

In [None]:
df['rate'].unique()

#### Changing the online_order and book_table data to boolean values

In [None]:
# Convert the 'Yes'/'No' flags in online_order and book_table to booleans.
# The result is assigned back to the frame: calling
# replace(..., inplace=True) on an attribute-accessed column operates on an
# intermediate Series, which is not guaranteed to update df (and does not
# under pandas Copy-on-Write).
df['online_order'] = df['online_order'].replace({'Yes': True, 'No': False})
df['book_table'] = df['book_table'].replace({'Yes': True, 'No': False})

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df['location'].equals(df['city'])

# Data Visualization and EDA

#### Restaurants delivering Online or not

In [None]:
# Number of restaurants per online_order value.
plt.figure(figsize=(11, 7))
# Name the column by keyword (same form as the location plot further down):
# newer seaborn releases interpret a positionally passed Series as `data`
# rather than as the x variable.
sns.countplot(x='online_order', data=df)
plt.title('Restaurants delivering online or Not')

###### Most of the restaurants allow online delivery of food

In [None]:
df.groupby(df.online_order).rate.mean()

#### Restaurants allowing table booking or not

In [None]:
# Number of restaurants per book_table value.
plt.figure(figsize=(11, 7))
# Name the column by keyword (same form as the location plot further down):
# newer seaborn releases interpret a positionally passed Series as `data`
# rather than as the x variable.
sns.countplot(x='book_table', data=df)
plt.title('Restaurants allowing table booking or not')

###### Most of the restaurants do not allow table booking, and the restaurants that do allow table booking have a better rating

In [None]:
df.groupby(df.book_table).rate.mean()

#### Number of restaurant in a particular location

In [None]:
# Number of restaurants per city.
plt.figure(figsize=(14, 7))
# Draw the count plot once and keep its Axes; the previous version drew the
# same plot three times on top of itself just to rotate the tick labels.
ax = sns.countplot(x='city', data=df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
plt.title('Location')

In [None]:
df.groupby(df.location).rate.mean()

#### Restaurant Type

In [None]:
# Number of restaurants per rest_type value.
plt.figure(figsize=(14, 7))
# Draw the count plot once and keep its Axes; the previous version drew the
# same plot three times on top of itself just to rotate the tick labels.
ax = sns.countplot(x='rest_type', data=df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
plt.title('Restuarant Type')

In [None]:
df.groupby(df.rest_type).rate.mean()

##### Bakery and beverage shops have the lowest rating

#### Types of Services

In [None]:
# Types of Services: number of restaurants per value of the 'type' column.
plt.figure(figsize=(14, 7))
# Draw the count plot once and keep its Axes; the previous version drew the
# same plot three times on top of itself just to rotate the tick labels.
ax = sns.countplot(x='type', data=df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
plt.title('Type of Service')

In [None]:
df.groupby(df.type).rate.mean()

#### Cost of Restaurant

In [None]:
# Number of restaurants per distinct cost value.
plt.figure(figsize=(14, 7))
# Draw the count plot once and keep its Axes; the previous version drew the
# same plot three times on top of itself just to rotate the tick labels.
ax = sns.countplot(x='cost', data=df)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="right")
plt.title('Cost of Restuarant')

In [None]:
# Box plot of the cost column. The Series is passed as `x` explicitly so the
# plot does not depend on how the installed seaborn version interprets a
# positional argument (x in older releases, `data` in newer ones).
sns.boxplot(x=df['cost'])

In [None]:
# Distribution of the cost column.
plt.figure(figsize=(6,6))
# NOTE(review): sns.distplot has been deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the minimum seaborn version for this
# notebook is confirmed.
sns.distplot(df['cost'])
plt.show()

In [None]:
df.groupby(df.type).rate.mean()

#### No. of restaurants in a Location

In [None]:
# Count plot of restaurants per location, with labelled axes.
fig = plt.figure(figsize=(20, 7))
loc = sns.countplot(data=df, x="location", palette="Set1")
# Rotate the tick labels by 90 degrees.
loc.set_xticklabels(loc.get_xticklabels(), ha="right", rotation=90)
plt.xlabel("Location", size=18)
plt.ylabel("Frequency", size=15)
plt.title('NO. of restaurants in a Location', pad=20, size=20)

In [None]:
df.groupby(df.location).rate.mean()

#### Most famous restaurant chains in Bengaluru

In [None]:
# Horizontal bar chart of the 20 most frequent restaurant names.
plt.figure(figsize=(15, 7))
chains = df['name'].value_counts().head(20)
sns.barplot(y=chains.index, x=chains, palette='Set1')
plt.title("Most famous restaurant chains in Bangaluru", pad=20, size=20)
plt.xlabel("Number of outlets", size=15)

In [None]:
sns.boxplot(x='online_order',y='votes',data=df)

In [None]:
df.cost.unique()

###### The major types of restaurants are:
       1.Casual Dining
       2.Quick bites
       3.Cafes

In [None]:
df['rate'].skew()

In [None]:
# Distribution of the rate column.
# NOTE(review): sns.distplot has been deprecated since seaborn 0.11; consider
# sns.histplot(..., kde=True) once the minimum seaborn version is confirmed.
sns.distplot(df.rate)

In [None]:
df.head()

In [None]:
df.cost

## Data Pre Processing

In [None]:
#Encode the input Variables
def Encode(df):
    """Replace every column except 'rate', 'cost' and 'votes' by integer codes.

    Each affected column is factorized, i.e. its values are swapped for the
    integer code of their first appearance (0, 1, 2, ...). The frame that is
    passed in is modified in place and also returned, so callers that want
    to keep the original should pass a copy.
    """
    untouched = ('rate', 'cost', 'votes')
    for name in [col for col in df.columns if col not in untouched]:
        codes, _uniques = df[name].factorize()
        df[name] = codes
    return df

df_en = Encode(df.copy())

In [None]:
df_en.head()

In [None]:
#Get Correlation between different variables
# Kendall rank correlation between the columns of the encoded frame.
# NOTE(review): the factorized columns hold arbitrary integer codes, so their
# correlation values depend on the order in which categories first appear.
corr = df_en.corr(method='kendall')
plt.figure(figsize=(15,8))
# Annotated heatmap of the correlation matrix.
sns.heatmap(corr, annot=True)
# Last expression of the cell: shows the column names of the encoded frame.
df_en.columns

# Model Building

#### Defining Independent and Dependent Variables

In [None]:
from sklearn.model_selection import train_test_split #Splitting of Dataset
#Defining the independent variables and dependent variables
# NOTE(review): the features are selected by column position, so the result
# depends on the column order left by the earlier drop/rename cells; confirm
# which columns indices 2,3,5,6,7,8,9,11 refer to.
x = df_en.iloc[:,[2,3,5,6,7,8,9,11]]
# Target: the numeric rating.
y = df_en['rate']
#Getting Test and Training Set
# 10% of the rows are held out for testing; random_state fixes the shuffle.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.1,random_state=353)
# Only the last expression of a notebook cell is displayed, so
# x_train.head() produces no visible output here.
x_train.head()
y_train.head()

#### Importing libraries needed for model building

In [None]:
from sklearn.linear_model import LogisticRegression #Logistic Regression is a Machine Learning classification algorithm
from sklearn.linear_model import LinearRegression #Linear Regression is a Machine Learning regression algorithm (predicts a continuous target)
from sklearn.model_selection import train_test_split #Splitting of Dataset
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score

#### Linear Regression Analysis

In [None]:
# Fit a linear regression on the training split and report R^2 on the test split.
from sklearn.metrics import r2_score
reg = LinearRegression().fit(x_train, y_train)
y_pred = reg.predict(x_test)
r2_score(y_test, y_pred)

#### Decision Tree Regression

In [None]:
#Preparing a Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor
# NOTE(review): this re-splits the data with a different seed (105) than the
# first split (353). When the cells run in order, this model and the later
# ones that reuse x_train/x_test are scored on a different hold-out set than
# the linear regression.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.1,random_state=105)
# A float min_samples_leaf is interpreted by scikit-learn as a fraction of
# the training samples required in each leaf.
DTree=DecisionTreeRegressor(min_samples_leaf=.0001)
DTree.fit(x_train,y_train)
y_predict=DTree.predict(x_test)
from sklearn.metrics import r2_score
# R^2 of the tree on the held-out rows.
r2_score(y_test,y_predict)

#### Random Forest Regression

In [None]:
# Fit a 500-tree random forest on the training split and report R^2 on the test split.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
RForest = RandomForestRegressor(
    n_estimators=500,
    min_samples_leaf=.0001,
    random_state=329,
)
RForest.fit(x_train, y_train)
y_predict = RForest.predict(x_test)
r2_score(y_test, y_predict)

#### Extra Tree Regression

In [None]:
#Preparing Extra Tree Regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score
# random_state is pinned (same seed as the random forest) so that the reported
# score, and the model that is pickled later, are reproducible across runs.
# Without it every run builds a different ensemble.
ETree = ExtraTreesRegressor(n_estimators=100, random_state=329)
ETree.fit(x_train, y_train)
y_predict = ETree.predict(x_test)
# R^2 of the ensemble on the held-out rows.
r2_score(y_test, y_predict)

# Saving the cleaned data for model building

In [None]:
df_en.head()

In [None]:
df_en.shape


In [None]:
my_data=df_en.iloc[:,[2,3,4,5,6,7,8,9,11]]
my_data.head()

In [None]:
# Write the selected columns to zomato_df.csv in the working directory.
# With to_csv's default index=True the row index is written as an extra
# first column, so readers of this file see one additional unnamed column.
my_data.to_csv('zomato_df.csv')

In [None]:
x=df_en.iloc[:,[2,3,5,6,7,8,9,11]]
x.head()

In [None]:
y=df["rate"]
y.head()

In [None]:
# Divide every feature value by 255.
# NOTE(review): 255 is the usual scale for 8-bit image pixels; x_train/x_test
# here are tabular columns, so this looks copied from an image tutorial.
# Confirm it is intended. The scaled frames are not used by the scikit-learn
# models fitted above.
x_train = x_train / 255
x_test = x_test / 255

In [None]:
# Small fully connected Keras network.
# NOTE(review): the input shape (28, 28) and the 10 output units do not match
# the feature frame built in this notebook (8 selected columns, one numeric
# target); confirm whether this model is meant for this data at all. It is
# never fitted in the visible cells.
model = keras.Sequential([
    Flatten(input_shape=(28, 28)),
    Dense(128, activation='relu'),
    Dense(10)
])


In [None]:
# Configure training with the Adam optimizer, a sparse categorical
# cross-entropy loss on logits, and accuracy as the metric.
# NOTE(review): this is a classification setup (integer class labels),
# whereas the target used elsewhere in this notebook is the float 'rate'
# column scored with R^2. Confirm the intended task.
model.compile(optimizer='adam',
              loss= SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# model.save("./saved_model/")

In [None]:
# Convert the in-memory Keras model to TensorFlow Lite.
# The export to "./saved_model" is commented out in this notebook, so loading
# from that directory would fail (or pick up a stale export from an earlier
# run). Converting the live `model` object removes that dependency on disk
# state.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

In [None]:
# converter = tf.lite.TFLiteConverter.from_saved_model("./saved_model")
# converter.optimizations = [tf.lite.Optimize.DEFAULT]
# tflite_quant_model = converter.convert()

In [None]:
# with open("tflite_model.tflite", "wb") as f:
#     f.write(tflite_model)

In [None]:
# with open("tflite_quant_model.tflite", "wb") as f:
#     f.write(tflite_quant_model)

# Saving Model To Disk

#### Using Pickle

In [None]:
import bz2
import pickle
import _pickle as cPickle

In [None]:
import pickle

# Persist the fitted Extra Trees model to model.pkl and load it back.
# The context managers guarantee that the file is flushed and closed before
# it is reopened for reading, and that no file handle is left open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(ETree, model_file)

# NOTE: only unpickle files you created yourself; pickle.load can execute
# arbitrary code contained in the file.
# This rebinds `model`, which an earlier cell assigned to the Keras network.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)