In [156]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [134]:
# Load the Zomato restaurant listings from disk and preview the first rows.
source_csv = 'data/zomato.csv'
df = pd.read_csv(source_csv)
df.head(5)

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [135]:
# (rows, columns) of the frame.
df.shape

(51717, 17)

In [136]:
# Count the missing entries in each column.
df.isna().sum()

url                                0
address                            0
name                               0
online_order                       0
book_table                         0
rate                            7775
votes                              0
phone                           1208
location                          21
rest_type                        227
dish_liked                     28078
cuisines                          45
approx_cost(for two people)      346
reviews_list                       0
menu_item                          0
listed_in(type)                    0
listed_in(city)                    0
dtype: int64

In [137]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   url                          51717 non-null  object
 1   address                      51717 non-null  object
 2   name                         51717 non-null  object
 3   online_order                 51717 non-null  object
 4   book_table                   51717 non-null  object
 5   rate                         43942 non-null  object
 6   votes                        51717 non-null  int64 
 7   phone                        50509 non-null  object
 8   location                     51696 non-null  object
 9   rest_type                    51490 non-null  object
 10  dish_liked                   23639 non-null  object
 11  cuisines                     51672 non-null  object
 12  approx_cost(for two people)  51371 non-null  object
 13  reviews_list                 51

In [138]:
# Summary statistics; describe() covers numeric columns by default, which here is only `votes`.
df.describe()

Unnamed: 0,votes
count,51717.0
mean,283.697527
std,803.838853
min,0.0
25%,7.0
50%,41.0
75%,198.0
max,16832.0


Data Cleaning

In [139]:
# Keep a reference to the full set of column labels.
# NOTE(review): `columns` is not referenced again in the cells visible here — confirm it is still needed.
columns = df.columns

In [140]:
# Discard the columns that are unwanted for this analysis.
unwanted_columns = ['url', 'phone', 'location', 'reviews_list', 'rest_type']
df = df.drop(columns=unwanted_columns)

In [141]:
# Clean 'approx_cost(for two people)': the column is read as text, so remove
# the comma separators and convert it to float.
# The dtype guard makes the cell safe to re-run: once the column is numeric the
# `.str` accessor would raise AttributeError.
cost_col = 'approx_cost(for two people)'
if not pd.api.types.is_numeric_dtype(df[cost_col]):
    # regex=False: the comma is a literal, independent of the pandas default.
    df[cost_col] = df[cost_col].str.replace(',', '', regex=False).astype('float')



In [142]:
# menu_item is not required; remove it from the frame in place.
df.drop(columns='menu_item', inplace=True)

In [143]:
# Replace a missing dish_liked with an empty string, which keeps those rows
# from being removed by the later dropna().
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column selection: that chained in-place form is deprecated in pandas 2.x
# and does not modify `df` when Copy-on-Write is enabled.
df["dish_liked"] = df["dish_liked"].fillna("")


In [144]:
# Remove every row that still contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [145]:
# Drop rows that are exact duplicates, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

Data Preprocessing

In [153]:
# Convert 'rate' from text such as "4.1/5" to a float: keep the part before the
# slash, after replacing the placeholders "NEW" and "-" with "0".
# NOTE(review): placeholder ratings become 0.0 and are then indistinguishable
# from a genuine zero rating — confirm this is intended.
# Vectorised .str methods let missing values pass through as NaN instead of
# raising inside a per-row lambda, and the dtype guard makes the cell safe to
# re-run after the column is already numeric.
if not pd.api.types.is_numeric_dtype(df['rate']):
    rate_text = (
        df['rate']
        .str.replace('NEW', '0', regex=False)
        .str.replace('-', '0', regex=False)
        .str.split('/')
        .str[0]
        .str.strip(' ')
    )
    df['rate'] = rate_text.astype('float')

In [158]:
# Encode the online_order and book_table columns as integer labels.
# The same encoder instance is refit for each column in turn, so afterwards it
# holds the classes of the last column only.
encoder = LabelEncoder()
for flag_column in ('online_order', 'book_table'):
    df[flag_column] = encoder.fit_transform(df[flag_column])

In [162]:
# Frequency of each distinct dish_liked string; rows whose dish_liked was
# filled with "" are counted under the empty string.
df['dish_liked'].value_counts()

dish_liked
                                                                                                      20178
Biryani                                                                                                 181
Chicken Biryani                                                                                          73
Friendly Staff                                                                                           69
Waffles                                                                                                  68
                                                                                                      ...  
Coffee, Chicken Sandwich, Brownie, Hot Chocolate, Cappuccino, Corn Sandwich                               1
English Breakfast, Waffles, Chicken Sandwich, Pancakes, Pork Sausage, Eggs Benedict, Hot Chocolate        1
Pizza, Seafood Pasta, Panna Cotta, Ravioli, Lasagne, Tiramisu, Virgin Sangria                             1
Butter Chicken, T

In [160]:
# Display the frame to inspect the result of the cleaning and encoding steps.
df

Unnamed: 0,address,name,online_order,book_table,rate,votes,dish_liked,cuisines,approx_cost(for two people),listed_in(type),listed_in(city)
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,1,1,4.1,775,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800.0,Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,1,0,4.1,787,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800.0,Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,1,0,3.8,918,"Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800.0,Buffet,Banashankari
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,0,0,3.7,88,Masala Dosa,"South Indian, North Indian",300.0,Buffet,Banashankari
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,0,0,3.8,166,"Panipuri, Gol Gappe","North Indian, Rajasthani",600.0,Buffet,Banashankari
...,...,...,...,...,...,...,...,...,...,...,...
51709,"136, SAP Labs India, KIADB Export Promotion In...",The Farm House Bar n Grill,0,0,3.7,34,,"North Indian, Continental",800.0,Pubs and bars,Whitefield
51711,"139/C1, Next To GR Tech Park, Pattandur Agraha...",Bhagini,0,0,2.5,81,"Biryani, Andhra Meal","Andhra, South Indian, Chinese, North Indian",800.0,Pubs and bars,Whitefield
51712,"Four Points by Sheraton Bengaluru, 43/3, White...",Best Brews - Four Points by Sheraton Bengaluru...,0,0,3.6,27,,Continental,1500.0,Pubs and bars,Whitefield
51715,Sheraton Grand Bengaluru Whitefield Hotel & Co...,Chime - Sheraton Grand Bengaluru Whitefield Ho...,0,1,4.3,236,"Cocktails, Pizza, Buttermilk",Finger Food,2500.0,Pubs and bars,Whitefield


In [148]:
# NOTE(review): dead cell — the same encoding is done by the live LabelEncoder cell earlier in the notebook; consider deleting.
# # Convert the online_order and book_table columns into integers
# from sklearn.preprocessing import LabelEncoder
# encoder = LabelEncoder()
# df['online_order'].value_counts()
# df['book_table'].value_counts()
# df['online_order'] = encoder.fit_transform(df['online_order'])
# df['book_table'] = encoder.fit_transform(df['book_table'])

In [149]:
# NOTE(review): dead cell — superseded by the live 'rate' cleaning cell earlier in the notebook; consider deleting.
# # Clean the rate column, passing missing values through unchanged
# def rate(x):
#     if pd.isna(x):
#         return x
#     else:
#         return x.split('/')[0].strip(' ')

# df['rate'] = df['rate'].apply(lambda x:rate(x))

In [150]:
# df