In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Use matplotlib's 'dark_background' style sheet for the figures drawn in this notebook
plt.style.use('dark_background')

In [None]:
# Load the raw Zomato CSV into a DataFrame.
# NOTE(review): hard-coded absolute path -- confirm the file really lives under
# /contents (Google Colab's default working directory is /content).
df = pd.read_csv('/contents/zomato.csv')

In [None]:
# (rows, columns) of the raw data
df.shape

In [None]:
# List the column names to decide which ones to keep
df.columns

In [None]:
# Discard the columns that are not used in the cells below.
# NOTE(review): confirm 'dish_linked' matches the name shown by df.columns
# above -- drop() raises KeyError for a label that does not exist.
unwanted_columns = ['url', 'address', 'phone', 'menu_item', 'dish_linked', 'reviews_list']
df = df.drop(columns=unwanted_columns)
df.head()

In [None]:
# Column dtypes and non-null counts of the remaining columns
df.info()

In [None]:
# Remove exact duplicate rows, then check how many rows remain.
# (The pandas method is `drop_duplicates`, plural.)
df = df.drop_duplicates()
df.shape

In [None]:
# Data cleaning: inspect the distinct raw values of the 'rate' column
df['rate'].unique()

In [None]:
# Remove the '/5' suffix from the rate column
def hadlerate(value):
  """Convert a raw rating such as '4.1/5' to a float.

  Parameters:
    value: raw cell value; a string like '4.1/5', one of the placeholders
      'NEW' / '-', or a missing value (None / NaN).

  Returns:
    The number before the '/' as a float, or np.nan when the value is a
    placeholder, empty, or missing.

  Raises:
    ValueError: if the text before the '/' is not a valid number.
  """
  if value is None:
    return np.nan
  # Strip first so that padded placeholders such as ' NEW ' are recognised too
  text = str(value).strip()
  if text in ('NEW', '-', ''):
    return np.nan
  # float() tolerates surrounding whitespace, so '4.1 /5' parses as 4.1;
  # a NaN input becomes the string 'nan' and parses back to NaN
  return float(text.split('/')[0])
# Convert the whole 'rate' column with hadlerate and preview the result
df['rate'] = df['rate'].apply(hadlerate)
df.head()

In [None]:
# Handling null values: count the missing entries in each column.
# isnull is a method and has to be called before .sum() can be applied.
df.isnull().sum()

In [None]:
# Fill missing ratings with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the df['rate'] selection is chained assignment, which recent pandas versions
# warn about and which does not update df under copy-on-write.
df['rate'] = df['rate'].fillna(df['rate'].mean())
# Should now be 0
df['rate'].isnull().sum()

In [None]:
# Re-check non-null counts per column after filling 'rate'
df.info()

In [None]:
# Drop every row that still has a missing value in any column
df = df.dropna(axis=0, how='any')
df.head()

In [None]:
# Give the two long column names shorter aliases
short_names = {
    'approx_cost(for two people)': 'Cost2plates',
    'listed_in(type)': 'Type',
}
df = df.rename(columns=short_names)

In [None]:
# Compare the distinct values of the two location-like columns.
# The column is named 'location' (singular), as used by the later cells.
print(df['location'].unique())
print(df['listed_in(city)'].unique())

In [None]:
# Drop the 'listed_in(city)' column.
# drop() returns a new DataFrame, so the result must be assigned back for the
# column to actually disappear from df.
df = df.drop(columns='listed_in(city)')
df.head()

In [None]:
# Strip thousands separators from the cost values
def commaHandle(value):
  """Return `value` as a float after removing every ',' from its string form."""
  return float(str(value).replace(',', ''))

# Convert the whole cost column to floats and list the distinct values
df['Cost2plates'] = df['Cost2plates'].apply(commaHandle)
df['Cost2plates'].unique()

In [None]:
# Cleaning the rest_type column: how often does each restaurant type occur?
df['rest_type'].value_counts()

In [None]:
# Keep the per-type counts (sorted from most to least frequent) for the clustering below
rest_types = df['rest_type'].value_counts(ascending=False)
rest_types

In [None]:
# Select the restaurant types that occur fewer than 1000 times; these are the
# ones grouped into a single bucket in the next step
rest_types_lessthan1000 = rest_types[rest_types<1000]
rest_types_lessthan1000

In [None]:
# Group every restaurant type that occurs fewer than 1000 times
def handle_rest_type(value):
  """Return 'others' if `value` is a label of rest_types_lessthan1000, otherwise `value` itself."""
  # `in` on a pandas Series tests the index labels, i.e. the restaurant type names
  return 'others' if value in rest_types_lessthan1000 else value

# Apply the grouping to the whole column and check the new distribution
df['rest_type'] = df['rest_type'].apply(handle_rest_type)
df['rest_type'].value_counts()

In [None]:
# Location column: number of rows per location
df['location'].value_counts()

In [None]:
# Clustering: every location that appears in fewer than 300 rows becomes 'others'
location = df['location'].value_counts(ascending=False)
location_lessthan300 = location[location < 300]

def location_handler(value):
  """Return 'others' if `value` is a label of location_lessthan300, otherwise `value` itself."""
  # `in` on a pandas Series tests the index labels, i.e. the location names
  return 'others' if value in location_lessthan300 else value

df['location'] = df['location'].apply(location_handler)

In [None]:
# Cuisines are clustered too: every cuisine combination that appears in fewer
# than 100 rows becomes 'others'
cuisines = df['cuisines'].value_counts(ascending=False)
cuisines_lessthan100 = cuisines[cuisines<100]

def cuisines_handler(value):
  """Return 'others' if `value` is a label of cuisines_lessthan100, otherwise `value` itself."""
  # `in` on a pandas Series tests the index labels, i.e. the cuisine strings
  if(value in cuisines_lessthan100):
    return 'others'
  else:
    return value

# Apply the cuisines handler to the cuisines column (the location column is
# handled in its own cell and must not be re-processed here)
df['cuisines'] = df['cuisines'].apply(cuisines_handler)


In [None]:
# Preview the cleaned data before plotting
df.head()

#### **Visualisation**

In [None]:
# Which location is good for opening a restaurant: number of restaurants per location.
# The column is passed by keyword (x=..., data=...): since seaborn 0.12 the
# only positional argument is `data`, so a positional Series is no longer
# interpreted as the x variable.
plt.figure(figsize=(16,10))
ax = sns.countplot(x='location', data=df)
plt.xticks(rotation=90)

In [None]:
# How many restaurants offer online ordering.
# The column is passed by keyword (x=..., data=...): since seaborn 0.12 the
# only positional argument is `data`.
plt.figure(figsize=(6,6))
sns.countplot(x='online_order', data=df, palette='inferno')

In [None]:
# How many restaurants offer table booking.
# The column is passed by keyword (x=..., data=...): since seaborn 0.12 the
# only positional argument is `data`.
plt.figure(figsize=(6,6))
sns.countplot(x='book_table', data=df, palette='rainbow')

In [None]:
# Rating distribution, split by whether online ordering is offered
plt.figure(figsize=(6, 6))
sns.boxplot(data=df, x='online_order', y='rate')

In [None]:
# Rating distribution, split by whether table booking is offered
plt.figure(figsize=(6, 6))
sns.boxplot(data=df, x='book_table', y='rate')

In [None]:
# Count restaurants for every (location, online_order) pair
df1 = df.groupby(['location','online_order'])['name'].count()
# Export the counts (file kept as an artifact of the analysis)
df1.to_csv('location_online.csv')
# reset_index() turns the group keys back into ordinary columns; this yields
# the same frame that reading the CSV back would, without the disk round-trip
df1 = df1.reset_index()
# One row per location, one column per online_order value; the keyword is
# `aggfunc`, and the string 'sum' is the form pandas recommends over np.sum
df1 = pd.pivot_table(df1, values=None, index=['location'], columns=['online_order'], fill_value=0, aggfunc='sum')
df1

In [None]:
# The pivot table above has one row per location and two count columns:
# location | does not offer online ordering | offers online ordering
# It shows how many restaurants in each area do / do not offer online ordering

In [None]:
# Bar chart of the pivot table in df1
df1.plot(kind = 'bar',figsize = (15,8))

In [None]:
# Count restaurants for every (location, book_table) pair
df1 = df.groupby(['location','book_table'])['name'].count()
# Export the counts (file kept as an artifact of the analysis)
df1.to_csv('location_book.csv')
# reset_index() turns the group keys back into ordinary columns. Building the
# frame from the counts just computed guarantees the 'book_table' column is
# present (the online-order CSV does not contain it).
df1 = df1.reset_index()
# One row per location, one column per book_table value; the keyword is
# `aggfunc`, and the string 'sum' is the form pandas recommends over np.sum
df1 = pd.pivot_table(df1, values=None, index=['location'], columns=['book_table'], fill_value=0, aggfunc='sum')
df1

In [None]:
# Bar chart of the pivot table in df1
df1.plot(kind = 'bar',figsize = (15,8))

In [None]:
# Count restaurants for every (Type, rate) pair
df1 = df.groupby(['Type','rate'])['name'].count()
# Export the counts (file kept as an artifact of the analysis)
df1.to_csv('Type_rate.csv')
# reset_index() turns the group keys back into ordinary columns; this yields
# the same frame that reading the CSV back would, without the disk round-trip
df1 = df1.reset_index()
# One row per Type, one column per rating value; the keyword is `aggfunc`,
# and the string 'sum' is the form pandas recommends over np.sum
df1 = pd.pivot_table(df1, values=None, index=['Type'], columns=['rate'], fill_value=0, aggfunc='sum')
df1

In [None]:
# Bar chart of the Type x rate pivot table.
# DataFrame.boxplot() has no `kind` argument; a bar chart is drawn with
# DataFrame.plot(kind='bar'), as for the other pivot tables in this notebook.
df1.plot(kind = 'bar',figsize = (15,8))