# Predicting the success of a Zomato Restaurant

In [None]:
!pip install wordcloud

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objs as go

from plotly.offline import iplot
pyo.init_notebook_mode()

from geopy.geocoders import Nominatim

import folium
from folium.plugins import HeatMap

from wordcloud import WordCloud, STOPWORDS

import re

import warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
import os

# Location of the Zomato CSV. The default is the original local path; set the
# ZOMATO_CSV environment variable to run the notebook on another machine
# without editing this cell.
ZOMATO_CSV = os.environ.get(
    'ZOMATO_CSV',
    '/Users/yogakshijaiman/Desktop/Projects/Zomato-Restaurant-Success-Prediction/zomato.csv',
)
df = pd.read_csv(ZOMATO_CSV)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

# Step 1:
Inspect the missing (NaN) values, the data types of the features, and a general overview of the data.

In [None]:
df.isnull().sum()

In [None]:
# List comprehension collecting every feature that has at least one NaN value
# (a feature with exactly one NaN counts too)
[feature for feature in df.columns if df[feature].isnull().any()]

In [None]:
# Percentage of missing values for every feature that has at least one NaN
feature_na = [feature for feature in df.columns if df[feature].isnull().any()]

for feature in feature_na:
    # share of rows where this feature is missing, as a percentage rounded to 3 decimals
    missing_pct = np.round(df[feature].isnull().sum()/len(df)*100, 3)
    print('{} has {} % missing values.'.format(feature, missing_pct))

In [None]:
df.info()

# Step 2:
Perform data cleaning on approx_cost feature.

In [None]:
df['approx_cost(for two people)'].dtype

In [None]:
# Passing a filter in the data frame to display all NaN values in approx_cost features
df[df['approx_cost(for two people)'].isnull()]

In [None]:
# Check if there is any special character that is causing the data type to become an object
df['approx_cost(for two people)'].unique()
# We can observe that the ',' is causing the data type to be an object

In [None]:
# Remove the ',' separator from every cost value (values are first cast to str)
def _strip_commas(text):
    return text.replace(',', '')

df['approx_cost(for two people)'] = df['approx_cost(for two people)'].astype(str).apply(_strip_commas)

# Aliter
# def remove_comma(x):
#     return x.replace(',', '')
# df['approx_cost(for two people)'].astype(str).apply(remove_comma)

In [None]:
df['approx_cost(for two people)'].unique()
# We can now easily convert the feature into float

In [None]:
df['approx_cost(for two people)'] = df['approx_cost(for two people)'].astype(float)

In [None]:
df['approx_cost(for two people)'].dtype

# Step 3:
Perform data cleaning on rate feature.

In [None]:
df['rate'].dtype

In [None]:
df['rate'].unique()
# We can observe that values containing '/' and entries such as 'NEW' cause the data type to be object

In [None]:
df['rate'].isnull().sum()

In [None]:
def split(x):
    """Return the part of the string ``x`` that precedes its first '/'.

    When ``x`` contains no '/', the whole string is returned unchanged.
    """
    head, _, _ = x.partition('/')
    return head

In [None]:
df['rate'] = df['rate'].astype(str).apply(split)

In [None]:
# '-' and 'NEW' are placeholders rather than numeric ratings; encode both as 0.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df['rate'] operates on an intermediate object and is not guaranteed to
# update df itself (chained assignment under pandas Copy-on-Write).
df['rate'] = df['rate'].replace({'-': 0, 'NEW': 0})

In [None]:
df['rate'].unique()
# We can now easily convert the feature into float

In [None]:
df['rate'] = df['rate'].astype(float)

In [None]:
df['rate'].unique()

# Step 4:
Types of restaurants.

In [None]:
plt.figure(figsize=(20,12))
df['rest_type'].value_counts().nlargest(20).plot.bar(color='purple')

In [None]:
def mark(x):
    """Group 'Quick Bites' and 'Casual Dining' under one label.

    Returns 'Quick Bites + Casual Dining' when ``x`` is exactly one of those two
    values, and 'others' for anything else.
    """
    grouped_types = ('Quick Bites', 'Casual Dining')
    return 'Quick Bites + Casual Dining' if x in grouped_types else 'others'

In [None]:
df['Top_types']= df['rest_type'].apply(mark)

In [None]:
df.head() 

In [None]:
# Count the Top_types categories once and expose the counts and their labels
top_type_counts = df['Top_types'].value_counts()
values, labels = top_type_counts.values, top_type_counts.index

In [None]:
fig=px.pie(data_frame=df, names=labels, values=values)
fig.show()

# Step 5:
Create a new dataframe with votes, cost and rating of each restaurant.

In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
rest=df.groupby('name').agg({'votes': 'sum', 'url': 'count', 'approx_cost(for two people)': 'mean', 'rate': 'mean'}).reset_index()
rest

In [None]:
rest.columns=['name', 'total_votes', 'total_unities', 'avg_approx_cost', 'mean_rating']
rest.head()

In [None]:
rest['votes_per_unity'] = rest['total_votes']/rest['total_unities']
rest.head()

In [None]:
popular = rest.sort_values(by='total_unities', ascending=False)
popular

In [None]:
popular.shape

In [None]:
popular['name'].nunique()

# Step 6:
Restaurant overview analysis.

In [None]:
# Three stacked panels: headline average, most voted and least voted restaurants
fig, (ax1, ax2, ax3) = plt.subplots(3,1, figsize=(20,30))

# Panel 1 is text only, so its axes are hidden
mean_votes = int(popular['total_votes'].mean())
ax1.text(0.50, 0.35, mean_votes, fontsize = 40, ha='center')
ax1.text(0.50, 0.25, 'is the average of votes received by the restaurants', fontsize=25,ha='center')
ax1.axis('off')

# Rank once by votes (descending), keeping only restaurants with at least one vote
voted = popular.sort_values(by='total_votes', ascending=False).query('total_votes>0')

sns.barplot(x='total_votes', y='name', data=voted.head(5), ax=ax2, palette='plasma')
ax2.set_title('Top 5 most voted restaurants:')

sns.barplot(x='total_votes', y='name', data=voted.tail(5), ax=ax3, palette='plasma')
ax3.set_title('Top 5 least voted restaurants:')

# Step 7
Analyse the most expensive and the cheapest restaurants.

In [None]:
# Three stacked panels: headline mean cost, most expensive and cheapest restaurants
fig, (ax1, ax2, ax3) = plt.subplots(3,1, figsize=(20,30))

# Panel 1 is text only, so its axes are hidden
mean_cost = int(popular['avg_approx_cost'].mean())
ax1.text(0.50, 0.35, mean_cost, fontsize = 40, ha='center')
ax1.text(0.50, 0.25, 'is the mean approx cost for Bengaluru Restaurants', fontsize=25,ha='center')
ax1.axis('off')

# Rank once by average cost (descending), keeping only rows with a positive cost
priced = popular.sort_values(by='avg_approx_cost', ascending=False).query('avg_approx_cost>0')

sns.barplot(x='avg_approx_cost', y='name', data=priced.head(5), ax=ax2, palette='plasma')
ax2.set_title('Top 5 most expensive restaurants:')

sns.barplot(x='avg_approx_cost', y='name', data=priced.tail(5), ax=ax3, palette='plasma')
ax3.set_title('Top 5 cheapest restaurants:')

# Step 8
Analyse restaurants that offer Table Booking and Online Order services.

In [None]:
x = df['book_table'].value_counts()
# value_counts() orders its result by frequency, so a positional label list only
# matches by coincidence. Build each label from the value it actually counts.
# NOTE(review): assumes the column holds 'Yes'/'No' - any other value keeps its raw name.
booking_labels = {'No': 'No booking available', 'Yes': 'Booking available'}
labels = [booking_labels.get(value, value) for value in x.index]

In [None]:
trace = go.Pie(labels=labels, values=x, hoverinfo='label+percent', textinfo='percent')
iplot([trace])

In [None]:
x = df['online_order'].value_counts()
# value_counts() orders its result by frequency, so a positional label list would
# mislabel the slices whenever 'Yes' is the more frequent value. Build each label
# from the value it actually counts.
# NOTE(review): assumes the column holds 'Yes'/'No' - any other value keeps its raw name.
online_order_labels = {'No': 'No online order available', 'Yes': 'Online order available'}
labels = [online_order_labels.get(value, value) for value in x.index]

In [None]:
fig=px.pie(df, values=x, names=labels, title='Pie Chart')
fig.show()

# Step 9
Find best budget restaurants in any location.

In [None]:
# Boolean mask: cost for two of at most 400, located in BTM, rated above 4, type Quick Bites.
# Named budget_mask rather than `filter` so the built-in filter() is not shadowed.
budget_mask = (
    (df['approx_cost(for two people)']<=400)
    & (df['location']=='BTM')
    & (df['rate']>4)
    & (df['rest_type']=='Quick Bites')
)
budget = df[budget_mask]
budget.head()

In [None]:
budget['name'].unique()

In [None]:
def return_budget(location, restaurant_type, max_cost=400, min_rating=4):
    """Return the unique names of budget restaurants of one type in one location.

    Rows are taken from the module-level ``df``.

    Parameters
    ----------
    location : value matched exactly against the 'location' column.
    restaurant_type : value matched exactly against the 'rest_type' column.
    max_cost : inclusive upper bound on 'approx_cost(for two people)'.
        Defaults to 400, the value previously hard-coded.
    min_rating : exclusive lower bound on 'rate'.
        Defaults to 4, the value previously hard-coded.

    Returns
    -------
    The result of ``.unique()`` on the 'name' column of the matching rows.
    """
    mask = (
        (df['approx_cost(for two people)'] <= max_cost)
        & (df['location'] == location)
        & (df['rate'] > min_rating)
        & (df['rest_type'] == restaurant_type)
    )
    return df.loc[mask, 'name'].unique()

In [None]:
return_budget('BTM',"Quick Bites")

# Step 10
Extract latitudes and longitudes for geographical data analysis.

In [None]:
geolocator=Nominatim(user_agent='app')

In [None]:
data=df['name'][0]
data

In [None]:
data=geolocator.geocode(data)

In [None]:
data.latitude

In [None]:
data.longitude

In [None]:
# Should find coordinates only for the unique locations, not for every entry in the data frame
locations=pd.DataFrame({'Name':df['location'].unique()})
locations

In [None]:
from geopy.extra.rate_limiter import RateLimiter

# Throttle the lookups to one request per second, as the public Nominatim
# service expects. RateLimiter also retries failed calls and, by default,
# returns None when a lookup still fails.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

lat = []
long = []

for place in locations['Name']:
    # A missing location name cannot be geocoded, so no request is sent for it
    result = None if pd.isna(place) else geocode(place)
    if result is None:
        # Unknown or failed lookups are recorded as NaN coordinates
        lat.append(np.nan)
        long.append(np.nan)
    else:
        lat.append(result.latitude)
        long.append(result.longitude)

In [None]:
locations['latitude']=lat
locations['longitude']=long

In [None]:
locations.head()

# Step 11:
Perform spatial analysis to find where most of the restaurants are situated.

In [None]:
Rest_locations = df['location'].value_counts().reset_index()
Rest_locations.columns=['Name', 'count']
Rest_locations

In [None]:
# Merge locations and Rest_locations on the basis of common column name
Restaurant_locations = Rest_locations.merge(locations, on='Name').dropna()
Restaurant_locations

In [None]:
basemap = folium.Map(location=[12.97, 77.59])
basemap

In [None]:
HeatMap(data=Restaurant_locations[['latitude', 'longitude', 'count']]).add_to(basemap)

In [None]:
basemap

# Step 12:
Analysing customer preferences.

In [None]:
data = df[df['rest_type']=='Quick Bites']
data.head()

In [None]:
data=df[df['rest_type']=='Quick Bites']
data.head()

In [None]:
# Build one long string of liked dishes for the word cloud: every non-missing
# 'dish_liked' entry is split on whitespace, lower-cased word by word, re-joined
# with single spaces and followed by one trailing space.
# str.join assembles the text in one pass instead of repeated string concatenation.
dishes = ''.join(
    ' '.join(token.lower() for token in entry.split()) + ' '
    for entry in data['dish_liked'].dropna()
)

In [None]:
data['dish_liked'].isnull().sum()

In [None]:
dishes

In [None]:
stopwords = set(STOPWORDS)
wordcloud = WordCloud(stopwords=stopwords, width=1500, height=1500).generate(dishes)

In [None]:
plt.imshow(wordcloud)
plt.axis('off')

# Step 13
Analyse customer reviews.

In [None]:
data1 = df['reviews_list'][0]
data1=data1.lower()
data1

In [None]:
data2 = re.sub('[^a-zA-Z]', ' ', data1)
data2

In [None]:
data3 = re.sub('rated', ' ', data2)
data3

In [None]:
data4 = re.sub('x', ' ', data3)
data4

In [None]:
data5 = re.sub(' +',' ', data4)
data5

In [None]:
dataset = df[df['rest_type'] == 'Quick Bites']
dataset.head()

In [None]:
# Patterns are compiled once and reused for every review.
# Literal "\xNN" escape sequences - presumably what the original blanket removal
# of every 'x' was targeting; NOTE(review): confirm against the raw reviews.
_HEX_ESCAPE = re.compile(r'\\x[0-9a-f]{2}')
_NON_LETTERS = re.compile('[^a-zA-Z]')
# Whole-word matches only, so words such as "overrated" or "excellent" stay intact
_RATED_WORD = re.compile(r'\brated\b')
_LONE_X = re.compile(r'\bx\b')
_MULTI_SPACE = re.compile(' +')


def _clean_review(review):
    """Return ``review`` lower-cased and reduced to space-separated letter-only words.

    Escape residue, the standalone words 'rated' and 'x', and every non-letter
    character are replaced by spaces; runs of spaces are then collapsed.
    """
    review = review.lower()
    review = _HEX_ESCAPE.sub(' ', review)
    review = _NON_LETTERS.sub(' ', review)
    review = _RATED_WORD.sub(' ', review)
    review = _LONE_X.sub(' ', review)
    return _MULTI_SPACE.sub(' ', review)


# Join with a space so the last word of one review never fuses with the first
# word of the next; str.join also avoids quadratic string concatenation.
total_review = ' ' + ' '.join(_clean_review(review) for review in dataset['reviews_list'])
 

In [None]:
wordcloud = WordCloud(stopwords = stopwords, width=1500, height=1500).generate(total_review) 

In [None]:
plt.figure(figsize=(12,8))
plt.imshow(wordcloud)
plt.axis('off')

In [None]:
df.head() 

# Step 14
Split restaurants into 2 categories to prepare it for modelling purpose:
1. New restaurants (with zero rating)
2. Restaurants for training data

In [None]:
def assign(x):
    """Return 1 when ``x`` is greater than zero, otherwise 0."""
    return 1 if x > 0 else 0

In [None]:
df['rated'] = df['rate'].apply(assign)

In [None]:
df.columns

In [None]:
df['rated'].unique()

In [None]:
# Split on the 'rated' flag. Explicit copies are taken so that columns can be
# added to either subset later without pandas treating the assignment as a
# write to a slice of df (SettingWithCopy ambiguity).
new_restaurants = df[df['rated']==0].copy()
train_val_restaurants = df.query('rated==1 ').copy()

In [None]:
train_val_restaurants.head()

In [None]:
train_val_restaurants['rate'].unique()

# Step 15
Create a target variable/feature that we will have to predict.
According to domain expertise, a restaurant is good if its rating is at least 3.75 (the code uses `>=`), otherwise it is bad.

In [None]:
threshold = 3.75
# target is 1 where the rating reaches the threshold and 0 otherwise
meets_threshold = train_val_restaurants['rate'] >= threshold
train_val_restaurants['target'] = meets_threshold.astype('int64')

In [None]:
train_val_restaurants.head()

In [None]:
x = train_val_restaurants['target'].value_counts()
x

In [None]:
labels = x.index
labels

In [None]:
plt.pie(x, labels=labels)
# Almost 1:1 ratio hence no case of imbalance data

# Step 16
Performing feature extraction/importance on data.

In [None]:
train_val_restaurants.columns

In [None]:
# Select k features from n features that will play a major role/ important features in model building
train_val_restaurants.head()

In [None]:
def count(x):
    """Return the number of comma-separated items in the string ``x``.

    A string without commas (including the empty string) counts as one item.
    """
    # n separators always delimit n + 1 pieces
    return x.count(',') + 1

In [None]:
train_val_restaurants['total_cuisines'] = train_val_restaurants['cuisines'].astype(str).apply(count)

In [None]:
train_val_restaurants['multiple_types'] = train_val_restaurants['rest_type'].astype(str).apply(count)

In [None]:
train_val_restaurants.columns

In [None]:
train_val_restaurants.columns

In [None]:
imp_features = ['online_order', 'book_table','location', 'rest_type',
       'approx_cost(for two people)',
       'listed_in(type)', 'listed_in(city)',
       'target', 'total_cuisines', 'multiple_types']

In [None]:
# Keep only the selected features. An independent copy is taken because later
# cells drop rows in place and overwrite columns of `data`; those writes should
# not be made against a slice of train_val_restaurants.
data = train_val_restaurants[imp_features].copy()

In [None]:
data

In [None]:
data.shape

# Step 17
Finding missing values.

In [None]:
data.isnull().sum()

In [None]:
data.dropna(how='any', inplace=True)

In [None]:
data.shape

# Step 18
Separate categorical and numerical data.

In [None]:
data.dtypes

In [None]:
# Partition the columns in a single pass: object-dtype columns are categorical,
# every other column is numerical
cat_features, num_features = [], []
for column in data.columns:
    if data[column].dtype == 'O':
        cat_features.append(column)
    else:
        num_features.append(column)

In [None]:
for feature in cat_features:
    print('{} has total {} unique features.'.format(feature, data[feature].nunique()))

# Step 19
Feature encoding.

In [None]:
# Avoiding one-hot encoding directly, as there are up to 92 unique categories
data['location'].nunique()

In [None]:
# Check the contribution of each and every category
values=(data['location'].value_counts()/len(data))*100
values

In [None]:
threshold = 0.4
imp=values[values>threshold]
imp

In [None]:
imp.index

In [None]:
# Number of locations above the threshold. imp.nunique() would count distinct
# percentage values, so locations with an identical share would be counted once.
len(imp)

In [None]:
data['location'].nunique()

In [None]:
# Every location that is not in imp.index is pooled into the single category 'other'
rare_location = ~data['location'].isin(imp.index)
data['location'] = np.where(rare_location, 'other', data['location'])
#using lambda
#data['location'].apply(lambda x:'other' if x not in imp.index else x)

In [None]:
data['location'].nunique()

In [None]:
# Check the contribution of each and every category
values2=(data['rest_type'].value_counts()/len(data))*100
values2

In [None]:
threshold = 1.5
imp2=values2[values2>threshold]
imp2

In [None]:
len(imp2)

In [None]:
# Apply one-hot encoding
data['rest_type'].head(20)

In [None]:
# Every restaurant type that is not in imp2.index is pooled into the single category 'other'
rare_rest_type = ~data['rest_type'].isin(imp2.index)
data['rest_type'] = np.where(rare_rest_type, 'other', data['rest_type'])
data['rest_type'].head(20)

In [None]:
for feature in cat_features:
    print('{} has total {} unique features.'.format(feature, data[feature].nunique()))

In [None]:
# Can use one hot encoding now
data_cat = data[cat_features]

In [None]:
data_cat.head()

In [None]:
# One-hot encode all categorical columns in a single call instead of
# concatenating and dropping inside a loop. Each listed column is replaced by
# its dummy columns, prefixed with the column name; drop_first=True omits the
# first level of every feature.
data_cat = pd.get_dummies(data_cat, columns=cat_features, drop_first=True)
    

In [None]:
data_cat.shape

In [None]:
data_cat.head()