In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

For some reason, I could not read the original file shared by the author, so I removed some garbled characters from it and dropped the columns "reviews_list" and "menu_item" (which we will not use). The readable file was then saved as "modified_zomato.csv".

In [36]:
# Load "modified_zomato.csv" (latin1-encoded) and give three of its columns
# shorter names in a single chained expression.
zomato = (
    pd.read_csv('modified_zomato.csv', encoding='latin1')
    .rename(columns={
        'listed_in(type)': 'meal_type',
        'listed_in(city)': 'locality',
        'approx_cost(for two people)': 'cost',
    })
)

In [37]:
# Drop more columns that will not be used, then remove duplicate rows that
# share the same name and locality (pandas keeps the first occurrence by
# default).
# `columns=` already identifies the axis, so the redundant `axis=1` that used
# to accompany it has been removed.
zomato = (
    zomato
    .drop(columns=['url', 'address', 'phone', 'location', 'dish_liked', 'meal_type'])
    .drop_duplicates(subset=['name', 'locality'])
)

In [38]:
# Reduce "rate" to a single value per row: keep only the text before the
# first "/" and strip any surrounding whitespace.
zomato['rate'] = (
    zomato['rate']
    .str.split('/')
    .str.get(0)
    .str.strip()
)
zomato.head()

Unnamed: 0,name,online_order,book_table,rate,votes,rest_type,cuisines,cost,locality
0,Jalsa,Yes,Yes,4.1,775,Casual Dining,"North Indian, Mughlai, Chinese",800,Banashankari
1,Spice Elephant,Yes,No,4.1,787,Casual Dining,"Chinese, North Indian, Thai",800,Banashankari
2,San Churro Cafe,Yes,No,3.8,918,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,Banashankari
3,Addhuri Udupi Bhojana,No,No,3.7,88,Quick Bites,"South Indian, North Indian",300,Banashankari
4,Grand Village,No,No,3.8,166,Casual Dining,"North Indian, Rajasthani",600,Banashankari


In [39]:
# The non-numeric entries "NEW" and "-" in the "rate" column are treated as
# missing values; both are replaced with NaN in one pass.
zomato['rate'] = zomato['rate'].replace(['NEW', '-'], np.nan)
zomato.head()

Unnamed: 0,name,online_order,book_table,rate,votes,rest_type,cuisines,cost,locality
0,Jalsa,Yes,Yes,4.1,775,Casual Dining,"North Indian, Mughlai, Chinese",800,Banashankari
1,Spice Elephant,Yes,No,4.1,787,Casual Dining,"Chinese, North Indian, Thai",800,Banashankari
2,San Churro Cafe,Yes,No,3.8,918,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,Banashankari
3,Addhuri Udupi Bhojana,No,No,3.7,88,Quick Bites,"South Indian, North Indian",300,Banashankari
4,Grand Village,No,No,3.8,166,Casual Dining,"North Indian, Rajasthani",600,Banashankari


In [40]:
# Give the numeric-looking columns numeric dtypes.
zomato['rate'] = pd.to_numeric(zomato['rate'])
# "cost" uses "," as a thousands separator, which to_numeric cannot parse,
# so the commas are stripped before converting.
zomato['cost'] = pd.to_numeric(zomato['cost'].str.replace(',', ''))
zomato.dtypes

name             object
online_order     object
book_table       object
rate            float64
votes             int64
rest_type        object
cuisines         object
cost            float64
locality         object
dtype: object

In [10]:
# Reset the index (discarding the old labels) and save the cleaned data
# frame to 'cleaned.csv' without writing the index as a column.
# The frame is rebound instead of using `inplace=True`, matching the
# assignment style of the other cells and avoiding in-place mutation.
zomato = zomato.reset_index(drop=True)
zomato.to_csv('cleaned.csv', index=False)

In [43]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical
# columns in the cleaned data frame.
zomato.describe()

Unnamed: 0,rate,votes,cost
count,24194.0,31159.0,30958.0
mean,3.657551,190.827113,503.353091
std,0.419364,560.93649,413.865232
min,1.8,0.0,40.0
25%,3.4,5.0,300.0
50%,3.7,30.0,400.0
75%,4.0,140.0,600.0
max,4.9,16345.0,6000.0
