# Assignment 2: Exploratory Data Analysis

**Sayali Patil - 111903092, Vedant Bairagi - 111903121, Vishwajit Kadam - 111903128**

## Importing libraries and Dataset

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [4]:
# Load the raw restaurant dataset from the notebook's input directory.
DATA_PATH = '../input/project-data/resta_data.csv'
data = pd.read_csv(DATA_PATH)

## Inspecting the data

In [5]:
# Preview the first five rows to check that the file parsed as expected.
data.head()

In [6]:
# (number of rows, number of columns) of the loaded frame.
data.shape

*We have 9090 records and 17 attributes*

In [7]:
# Column dtypes as inferred by read_csv; object columns hold text.
data.dtypes

## Handling Missing Values and Column Data Types

In [8]:
# Number of missing values in each column.
data.isnull().sum()

***Percentage of missing values***

In [9]:
# Share of missing values per column, in percent (mean of the null mask = nulls / rows).
data.isna().mean() * 100

In [10]:
import re
def price_extractor(price_str):
    """Extract the numeric price from a ``cost_for_two`` string.

    The text up to the first space is taken as the amount; the rupee sign and
    thousands separators are removed before conversion.

    Parameters
    ----------
    price_str : str
        Raw cost text, e.g. ``"₹1,200 for two"`` or just ``"₹500"``.

    Returns
    -------
    int
        The amount as an integer.

    Raises
    ------
    ValueError
        If the leading token is not an integer once the symbols are removed.
    """
    # partition() returns the whole string when no space is present. Slicing
    # up to find(" ") used -1 in that case and silently dropped the last digit.
    amount = price_str.strip().partition(" ")[0]
    amount = re.sub("[₹,]", "", amount)
    return int(amount)

In [11]:
# Parse the amount out of every non-null cost_for_two entry. Rows with a
# missing cost are skipped here and become NaN when the result is aligned
# back onto the frame.
parsed_prices = data['cost_for_two'].dropna().map(price_extractor)
data['price'] = parsed_prices

In [12]:
# dtype of the newly created price column.
data['price'].dtype

In [13]:
def _expand_thousands(count):
    """Expand a trailing 'K' (thousands) suffix into a plain number string.

    Non-string values and strings without the suffix are returned unchanged.
    """
    if not isinstance(count, str):
        return count
    text = count.strip()
    if text.endswith(('K', 'k')):
        # Scale the numeric part. Appending '000' would turn "1.2K" into
        # "1.2000", which parses as 1.2 rather than 1200.
        return str(round(float(text[:-1]) * 1000))
    return count


# Missing delivery review counts are filled with '0' before the suffix is
# expanded, so the column has no nulls left after this cell.
data['delivery_review_count'] = (
    data['delivery_review_count'].fillna('0').map(_expand_thousands)
)

Columns like **people_say, top_dishes** have more than 50% missing data and contain free text, so those columns should be dropped.  
  
  
Columns like dining_rating, dining_review_count, delivery_rating and delivery_review_count can be filled with either the mean or the median of the respective attribute. For the timing column we can fill with the mode of the column.  

Here we have chosen the median for the first four columns and the mode for the latter column, i.e. timing.

In [14]:
# Remove the two mostly-empty free-text columns discussed above.
data = data.drop(columns=['people_say', 'top_dishes'])

In [15]:
# Convert the cleaned count strings to floats and store the result. The bare
# apply() only displayed the converted values and left the column as text.
data['delivery_review_count'] = data['delivery_review_count'].astype(float)
data['delivery_review_count'].head()

In [16]:
# Impute each rating / review-count column with its own median.
for column in ('dining_rating', 'dining_review_count',
               'delivery_rating', 'delivery_review_count'):
    column_median = data[column].median()
    data[column] = data[column].fillna(column_median)

In [17]:
# Replace aggregate ratings of 0 with the column mean. The result is assigned
# back to the frame: an in-place call on the attribute accessor can operate on
# a temporary object and leave `data` untouched, depending on pandas copy semantics.
# NOTE(review): the mean is taken over all rows, zeros included; confirm that is intended.
rating_mean = data['aggregate_rating'].mean()
data['aggregate_rating'] = data['aggregate_rating'].replace(0, rating_mean)

In [18]:
# Re-check the per-column missing counts after the fills above.
data.isnull().sum()

In [19]:
# Series.mode() returns a Series (there may be ties), so take the first entry
# as a scalar. Passing the Series itself to fillna() aligns on index labels
# and would only fill rows whose label matches a position in the mode result.
timing_modes = data['timing'].mode()
if not timing_modes.empty:
    fill = timing_modes.iloc[0]
    data['timing'] = data['timing'].fillna(fill)

In [20]:
# The raw text column is no longer needed once `price` has been derived from it.
data = data.drop(columns='cost_for_two')

In [21]:
# Fill missing prices with the mean price. The result is assigned back to the
# frame instead of calling fillna(inplace=True) on the attribute accessor,
# which may act on a temporary and leave `data` unchanged.
data['price'] = data['price'].fillna(data['price'].mean())

In [22]:
# Final check of per-column missing counts before saving and plotting.
data.isnull().sum()

In [23]:
# Persist the cleaned frame; the row index is written as the first column.
CLEAN_DATA_PATH = 'final_data.csv'
data.to_csv(CLEAN_DATA_PATH)

**Now that there are no Null Values we can move onto plotting i.e. graphical analysis part**

## Graphical Analysis

In [25]:
# Pairwise scatter plots of the three rating columns.
rating_columns = ['aggregate_rating', 'dining_rating', 'delivery_rating']
ratings_df = data[rating_columns]
pd.plotting.scatter_matrix(ratings_df, figsize=(15, 8))
plt.suptitle('Relations between various rating values', fontdict={'weight':'bold'})

In [26]:
# Correlation matrix of the numeric features, drawn as an annotated heatmap.
numeric_columns = [
    'price',
    'aggregate_rating',
    'rating_votes',
    'dining_rating',
    'delivery_rating',
    'dining_review_count',
    'delivery_review_count',
]
nums = data[numeric_columns]
corr = nums.corr()
sns.heatmap(corr, annot=True)

In [27]:
# Box plot of the price column to show its spread and outliers.
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(y=data['price'], ax=ax)
ax.set_title('Price Values Boxplot')
ax.set_ylabel('Price')

In [28]:
from wordcloud import WordCloud, STOPWORDS

In [29]:
# Join the per-restaurant cuisine strings with a space. With an empty
# separator, the last word of one row fuses with the first word of the next
# and the cloud counts words that do not exist. Missing entries are skipped.
text = ' '.join(data['cuisine'].dropna().astype(str))
wc = WordCloud(width=1200, height=800, stopwords=STOPWORDS).generate(text)
plt.figure(figsize=(12,8))
plt.axis('off')
plt.imshow(wc)
plt.title('Cuisines of Pune', fontdict={'fontsize':20., 'fontweight':'bold', 'color':'steelblue'})

In [30]:
# Horizontal bar chart of the 30 most frequent restaurant names.
chains = data['name'].value_counts().head(30)
fig, ax = plt.subplots(figsize=(15, 15))
sns.barplot(x=chains, y=chains.index, palette='Set1', ax=ax)
ax.set_title("Most famous restaurant chains in Pune", size=20, pad=20)
ax.set_xlabel("Number of outlets", size=15)
In [31]:
# Distribution of the approximate cost for one person (price is for two).
# histplot(kde=True, stat='density') replaces the deprecated distplot and is
# drawn on the explicitly created axes.
fig, ax = plt.subplots(figsize=[15,7])
sns.histplot(data['price'] / 2, kde=True, stat='density', color="magenta", ax=ax)
ax.set_title('Approx cost for a person distribution', size=20, pad=15)
ax.set_xlabel('Approx cost for one person', size=15)
# The y axis is a probability density, not a percentage of restaurants.
ax.set_ylabel('Density', size=15)

In [32]:
# Empirical cumulative distribution of the price column.
sns.ecdfplot(x='price', data=data)

In [33]:
from scipy import stats


# Overlay the empirical density of dining review counts with a normal PDF.
# NOTE(review): the reference curve is the standard normal (norm.pdf defaults
# to loc=0, scale=1) and the bin edges are fixed to [-25, 25]; samples outside
# that range are not counted. Confirm this is the intended comparison.
samples = data.dining_review_count

# 30 evenly spaced edges give 29 bins; density=True normalises to unit area.
histogram, bins = np.histogram(samples, bins=np.linspace(-25, 25, 30), density=True)

# Midpoint of every bin, used as the x position for both curves.
bin_centers = (bins[:-1] + bins[1:]) / 2

pdf = stats.norm.pdf(bin_centers)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(bin_centers, histogram, label="Dining Reviews")
ax.plot(bin_centers, pdf, label="PDF")
ax.legend()
plt.show()

In [34]:
# Make sure the delivery review counts are numeric before plotting them.
data['delivery_review_count'] = data['delivery_review_count'].astype(float)

In [35]:
# Overlay the empirical density of delivery review counts with a normal PDF.
# NOTE(review): the reference curve is the standard normal (norm.pdf defaults
# to loc=0, scale=1) and the bin edges are fixed to [-10, 10]; samples outside
# that range are not counted. Confirm this is the intended comparison.
samples = data.delivery_review_count

# 30 evenly spaced edges give 29 bins; density=True normalises to unit area.
histogram, bins = np.histogram(samples, bins=np.linspace(-10, 10, 30), density=True)

# Midpoint of every bin, used as the x position for both curves.
bin_centers = (bins[:-1] + bins[1:]) / 2

pdf = stats.norm.pdf(bin_centers)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(bin_centers, histogram, label="Delivery Reviews")
ax.plot(bin_centers, pdf, label="PDF")
ax.legend()
plt.show()

In [36]:
# Overlay the empirical density of overall rating vote counts with a normal PDF.
# NOTE(review): the reference curve is the standard normal (norm.pdf defaults
# to loc=0, scale=1) and the bin edges are fixed to [-10, 10]; samples outside
# that range are not counted. Confirm this is the intended comparison.
samples = data.rating_votes

# 30 evenly spaced edges give 29 bins; density=True normalises to unit area.
histogram, bins = np.histogram(samples, bins=np.linspace(-10, 10, 30), density=True)

# Midpoint of every bin, used as the x position for both curves.
bin_centers = (bins[:-1] + bins[1:]) / 2

pdf = stats.norm.pdf(bin_centers)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(bin_centers, histogram, label="Overall Ratings Count")
ax.plot(bin_centers, pdf, label="PDF")
ax.legend()
plt.show()

We can see that Chinese restaurants are predominant in Pune, followed by North Indian and Street Food. From this we can say that Punekars prefer these types of food more, and as the dataset is scraped from Zomato, a food delivery service, these types of food are ordered more.  

There are restaurants serving only some specific food items like Momos, Biryani indicating their popularity.

In [54]:
# Horizontal bar chart of the 15 most common opening-hours strings.
timings = data['timing'].value_counts().head(15)
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=timings, y=timings.index, color='#8722e4', ax=ax)
ax.set_title('Timings of restaurants of Pune', size=20, pad=15)
ax.set_xlabel('No. of restaurants', size=15)

From the above plot we can interpret that the opening time of restaurants varies, but the closing time is usually 11 PM or 12 midnight. For some restaurants only the opening time is provided.

In [55]:
# Horizontal bar chart of the 15 most common cuisine strings.
# The unused `colors` palette slice was removed (it was never passed to the
# plot), and the x axis is labelled like the other count plots.
plt.figure(figsize=(15,8))
cuisines = data['cuisine'].value_counts()[:15]
sns.barplot(x=cuisines, y=cuisines.index)
plt.title('Top cuisines of Pune', size = 20, pad = 15)
plt.xlabel('No. of restaurants', size = 15)

In [56]:
from sklearn.preprocessing import LabelEncoder
# Map every distinct cuisine string to an integer code and display the codes.
le = LabelEncoder()
cuisine_codes = le.fit_transform(data['cuisine'])
cuisine_codes

In [67]:
import folium

# Create an empty map; the markers are added in a later cell.
this_map = folium.Map(prefer_canvas=True)

# Collect latitude, longitude and name for every restaurant.
longi = data.location_longitude
latis = data.location_latitude
name = data.name
# Materialise the triples. A bare zip() is a one-shot iterator, so re-running
# the marker cell would find it exhausted and add no markers.
both = list(zip(latis, longi, name))

The rating counts for overall, dining and delivery have a high standard deviation, so we need to apply feature scaling; otherwise they may introduce discrepancies in the model.

In [70]:
from html import escape

# Add one circle marker per restaurant. The tooltip is rendered as HTML, so
# the name is escaped first; characters such as '<' or '&' in a name would
# otherwise be interpreted as markup.
for latitude, longitude, label in list(both):
    folium.CircleMarker(location=[latitude, longitude],
                        tooltip=f"<b>{escape(str(label))}</b>", radius=5).add_to(this_map)

In [71]:
# Zoom the viewport to the extent of everything added to the map.
bounds = this_map.get_bounds()
this_map.fit_bounds(bounds)

# Write a standalone HTML copy of the map.
this_map.save('simple_dot_plot.html')

# Last expression of the cell: render the map inline.
this_map

In [72]:
# Load the second dataset and preview its first rows.
EXTRA_DATA_PATH = '../input/seconddata/new_data.csv'
extra = pd.read_csv(EXTRA_DATA_PATH)
extra.head(5)

In [115]:
# Keep only the columns used by the locality and payment-mode plots.
data2 = extra.loc[:, ['name', 'locality', 'payment_modes']]

**Top 20 Localities in Pune**

In [138]:
# Count restaurants per locality and keep the 20 most frequent.
# `data2` (the name/locality/payment_modes subset of the second dataset) is
# used because `new_d` is not defined anywhere in this notebook, so the cell
# failed on a fresh kernel.
# NOTE(review): confirm `data2` is the frame `new_d` was meant to be.
localities = data2.locality.value_counts()[:20]

# Default figure size for plots that do not set their own.
plt.rcParams['figure.figsize'] = 8,8

# Horizontal bar chart of restaurant counts per locality.
ax = localities.plot(kind='barh')
ax.set_title('Top restaurant localities of Pune', size=15, pad=20)
ax.set_xlabel('No. of restaurants')

In [133]:
# Three most common payment-mode values; missing entries are ignored.
# `data2` (the name/locality/payment_modes subset of the second dataset) is
# used because `new_d` is not defined anywhere in this notebook.
# NOTE(review): confirm `data2` is the frame `new_d` was meant to be.
payment_modes = data2.payment_modes.dropna().value_counts()[:3]

# Default figure size for plots that do not set their own.
plt.rcParams['figure.figsize'] = 8,8

# Wedge labels and wedge sizes, in matching order.
labels = payment_modes.index.tolist()
sizes = payment_modes.tolist()

# Offset every wedge slightly from the centre. Both lists are sized from the
# data so the cell still runs when fewer than three categories exist.
explode = [0.1] * len(sizes)
colors = ['lightblue', 'yellow', 'red'][:len(sizes)]

# autopct prints each wedge's share with two decimals; textprops sets the label font size.
plt.pie(sizes, labels=labels, autopct='%0.2f%%',startangle=450, textprops={'fontsize': 14}, explode=explode, colors=colors)
plt.title('Payment Modes Accepted', size=15, pad=20)