In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.cm as cm
import seaborn as sns
import math
import time
from wordcloud import WordCloud

from scipy.stats import norm
from scipy import stats
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#importing kmeans
from sklearn.cluster import KMeans

#importing random forest and XgB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

#Non-negative matrix Factorization
from sklearn.decomposition import NMF


from sklearn.naive_bayes import MultinomialNB

#principal component analysis
from sklearn.decomposition import PCA


#silhouette score
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid

#importing stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
#for tokenization
from nltk.tokenize import word_tokenize
# for POS tagging(Part of speech in NLP sentiment analysis)
nltk.download('averaged_perceptron_tagger')

#import stemmer
from nltk.stem.snowball import SnowballStemmer

#import tfidf
from sklearn.feature_extraction.text import TfidfVectorizer

#LDA
import pyLDAvis.lda_model
pyLDAvis.lda_model.prepare
from sklearn.decomposition import LatentDirichletAllocation


import gensim
from gensim import corpora
!pip install shap
import shap 

#download small spacy model
# !python -m spacy download en_core_web_sm
# import spacy

# The following lines adjust the granularity of reporting. 
pd.options.display.float_format = "{:.2f}".format

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vipulpandey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vipulpandey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vipulpandey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/vipulpandey/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vipulpandey/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
  pid, fd = os.forkpty()


Collecting shap
  Downloading shap-0.47.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Using cached slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.47.2-cp312-cp312-macosx_11_0_arm64.whl (546 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m546.9/546.9 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.47.2 slicer-0.0.8


In [13]:
hotel_df=pd.read_csv('/Users/vipulpandey/Documents/ZomatoRestaurantClustering/data/Zomato Restaurant names and Metadata.csv')
review_df=pd.read_csv('/Users/vipulpandey/Documents/ZomatoRestaurantClustering/data/Zomato Restaurant reviews.csv')

In [14]:
hotel_df.head()

Unnamed: 0,Name,Links,Cost,Collections,Cuisines,Timings
0,Beyond Flavours,https://www.zomato.com/hyderabad/beyond-flavou...,800,"Food Hygiene Rated Restaurants in Hyderabad, C...","Chinese, Continental, Kebab, European, South I...","12noon to 3:30pm, 6:30pm to 11:30pm (Mon-Sun)"
1,Paradise,https://www.zomato.com/hyderabad/paradise-gach...,800,Hyderabad's Hottest,"Biryani, North Indian, Chinese",11 AM to 11 PM
2,Flechazo,https://www.zomato.com/hyderabad/flechazo-gach...,1300,"Great Buffets, Hyderabad's Hottest","Asian, Mediterranean, North Indian, Desserts","11:30 AM to 4:30 PM, 6:30 PM to 11 PM"
3,Shah Ghouse Hotel & Restaurant,https://www.zomato.com/hyderabad/shah-ghouse-h...,800,Late Night Restaurants,"Biryani, North Indian, Chinese, Seafood, Bever...",12 Noon to 2 AM
4,Over The Moon Brew Company,https://www.zomato.com/hyderabad/over-the-moon...,1200,"Best Bars & Pubs, Food Hygiene Rated Restauran...","Asian, Continental, North Indian, Chinese, Med...","12noon to 11pm (Mon, Tue, Wed, Thu, Sun), 12no..."


In [15]:
review_df.head()

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0
2,Beyond Flavours,Ashok Shekhawat,A must try.. great food great ambience. Thnx f...,5,"2 Reviews , 3 Followers",5/24/2019 22:54,0
3,Beyond Flavours,Swapnil Sarkar,Soumen das and Arun was a great guy. Only beca...,5,"1 Review , 1 Follower",5/24/2019 22:11,0
4,Beyond Flavours,Dileep,Food is good.we ordered Kodi drumsticks and ba...,5,"3 Reviews , 2 Followers",5/24/2019 21:37,0


In [16]:
print(f"shape of hotel_data is {hotel_df.shape}")
print(f"shape of hotel_data is {review_df.shape}")

shape of hotel_data is (105, 6)
shape of hotel_data is (10000, 7)


In [17]:
print('Restaurant Info')
print('\n')
hotel_df.info()
print('='*120)
print('\n')
print('Review Info')
print('\n')
review_df.info()

Restaurant Info


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         105 non-null    object
 1   Links        105 non-null    object
 2   Cost         105 non-null    object
 3   Collections  51 non-null     object
 4   Cuisines     105 non-null    object
 5   Timings      104 non-null    object
dtypes: object(6)
memory usage: 5.1+ KB


Review Info


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Restaurant  10000 non-null  object
 1   Reviewer    9962 non-null   object
 2   Review      9955 non-null   object
 3   Rating      9962 non-null   object
 4   Metadata    9962 non-null   object
 5   Time        9962 non-null   object
 6   Pictures    10000 non-null  int64 
dtypes: int64(1), object(6)

In [19]:
hotel_df[hotel_df.duplicated()]

Unnamed: 0,Name,Links,Cost,Collections,Cuisines,Timings


In [21]:
review_df[review_df.duplicated()].count()

Restaurant    36
Reviewer       0
Review         0
Rating         0
Metadata       0
Time           0
Pictures      36
dtype: int64

In [22]:
review_df[review_df.duplicated()]

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures
8778,American Wild Wings,,,,,,0
8779,American Wild Wings,,,,,,0
8780,American Wild Wings,,,,,,0
8781,American Wild Wings,,,,,,0
8782,American Wild Wings,,,,,,0
8783,American Wild Wings,,,,,,0
8784,American Wild Wings,,,,,,0
8785,American Wild Wings,,,,,,0
8786,American Wild Wings,,,,,,0
8787,American Wild Wings,,,,,,0


In [28]:
hotel_df.isnull().sum()

Name            0
Links           0
Cost            0
Collections    54
Cuisines        0
Timings         1
dtype: int64

In [29]:
review_df.isnull().sum()

Restaurant     0
Reviewer      38
Review        45
Rating        38
Metadata      38
Time          38
Pictures       0
dtype: int64

In [37]:
hotel_df['Cost'].unique()

array(['800', '1,300', '1,200', '1,500', '500', '300', '1,000', '350',
       '400', '1,600', '750', '550', '1,900', '450', '150', '1,400',
       '1,100', '600', '200', '900', '700', '1,700', '2,500', '850',
       '650', '1,800', '2,800', '1,750', '250'], dtype=object)

Understanding the data

In [38]:
print(f'Features in hotel df are {hotel_df.columns.to_list()}')

Features in hotel df are ['Name', 'Links', 'Cost', 'Collections', 'Cuisines', 'Timings']


In [39]:
print(f'Review in hotel df are {review_df.columns.to_list()}')

Review in hotel df are ['Restaurant', 'Reviewer', 'Review', 'Rating', 'Metadata', 'Time', 'Pictures']


In [41]:
hotel_df.describe().T

Unnamed: 0,count,unique,top,freq
Name,105,105,Beyond Flavours,1
Links,105,105,https://www.zomato.com/hyderabad/beyond-flavou...,1
Cost,105,29,500,13
Collections,51,42,Food Hygiene Rated Restaurants in Hyderabad,4
Cuisines,105,92,"North Indian, Chinese",4
Timings,104,77,11 AM to 11 PM,6


In [43]:
review_df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Restaurant,10000.0,100.0,Beyond Flavours,100.0,,,,,,,
Reviewer,9962.0,7446.0,Parijat Ray,13.0,,,,,,,
Review,9955.0,9364.0,good,237.0,,,,,,,
Rating,9962.0,10.0,5,3832.0,,,,,,,
Metadata,9962.0,2477.0,1 Review,919.0,,,,,,,
Time,9962.0,9782.0,7/29/2018 20:34,3.0,,,,,,,
Pictures,10000.0,,,,0.75,2.57,0.0,0.0,0.0,0.0,64.0


In [48]:
for i in hotel_df.columns.tolist():
    print("No of unique values in", i, "is", hotel_df[i].nunique())

No of unique values in Name is 105
No of unique values in Links is 105
No of unique values in Cost is 29
No of unique values in Collections is 42
No of unique values in Cuisines is 92
No of unique values in Timings is 77


In [50]:
hotel_df.head(2)

Unnamed: 0,Name,Links,Cost,Collections,Cuisines,Timings
0,Beyond Flavours,https://www.zomato.com/hyderabad/beyond-flavou...,800,"Food Hygiene Rated Restaurants in Hyderabad, C...","Chinese, Continental, Kebab, European, South I...","12noon to 3:30pm, 6:30pm to 11:30pm (Mon-Sun)"
1,Paradise,https://www.zomato.com/hyderabad/paradise-gach...,800,Hyderabad's Hottest,"Biryani, North Indian, Chinese",11 AM to 11 PM


In [51]:
for i in review_df.columns.tolist():
    print("No of unique values in", i, "is", review_df[i].nunique())

No of unique values in Restaurant is 100
No of unique values in Reviewer is 7446
No of unique values in Review is 9364
No of unique values in Rating is 10
No of unique values in Metadata is 2477
No of unique values in Time is 9782
No of unique values in Pictures is 36


In [57]:
review_df.head(2)

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures
0,Beyond Flavours,Rusha Chakraborty,"The ambience was good, food was quite good . h...",5,"1 Review , 2 Followers",5/25/2019 15:54,0
1,Beyond Flavours,Anusha Tirumalaneedi,Ambience is too good for a pleasant evening. S...,5,"3 Reviews , 2 Followers",5/25/2019 14:20,0


In [56]:
review_df[review_df['Review'].duplicated()]

Unnamed: 0,Restaurant,Reviewer,Review,Rating,Metadata,Time,Pictures
743,Shah Ghouse Spl Shawarma,Satish Gowlikar,delivered on time,5,"6 Reviews , 2 Followers",2/9/2019 19:12,0
797,Shah Ghouse Spl Shawarma,Sadiq,good,5,1 Review,9/4/2018 1:31,0
989,Cream Stone,Yugandhar,good,5,2 Reviews,10/10/2018 23:10,0
990,Cream Stone,Vivek Thakur,excellent service,5,2 Reviews,10/10/2018 22:07,0
1068,Sardarji's Chaats & More,Jayaram Boyina,excellent delivery,5,"17 Reviews , 13 Followers",11/13/2018 14:41,0
...,...,...,...,...,...,...,...
9883,Triptify,Manonit Singh,Good,5,1 Review,7/22/2018 22:27,0
9888,Triptify,Anand Chiruguri,late,2,1 Review,7/17/2018 19:16,0
9890,Triptify,Archit Saxena,fast delivery,5,"1 Review , 1 Follower",7/16/2018 14:54,0
9891,Triptify,Saurabh Kumar,Food was awesome,5,"1 Review , 12 Followers",7/15/2018 16:23,0


In [58]:
hotel=hotel_df.copy()
review=review_df.copy()

In [59]:
hotel['Cost'].unique()

array(['800', '1,300', '1,200', '1,500', '500', '300', '1,000', '350',
       '400', '1,600', '750', '550', '1,900', '450', '150', '1,400',
       '1,100', '600', '200', '900', '700', '1,700', '2,500', '850',
       '650', '1,800', '2,800', '1,750', '250'], dtype=object)

In [60]:
# in cost we have commas ,we need to remove them
hotel['Cost']=hotel['Cost'].str.replace(',','').astype(int)

In [62]:
hotel.head(2)

Unnamed: 0,Name,Links,Cost,Collections,Cuisines,Timings
0,Beyond Flavours,https://www.zomato.com/hyderabad/beyond-flavou...,800,"Food Hygiene Rated Restaurants in Hyderabad, C...","Chinese, Continental, Kebab, European, South I...","12noon to 3:30pm, 6:30pm to 11:30pm (Mon-Sun)"
1,Paradise,https://www.zomato.com/hyderabad/paradise-gach...,800,Hyderabad's Hottest,"Biryani, North Indian, Chinese",11 AM to 11 PM


In [64]:
#top 10 costlier restaurant
hotel.sort_values('Cost', ascending = False)[['Name','Cost']][:10]

Unnamed: 0,Name,Cost
92,Collage - Hyatt Hyderabad Gachibowli,2800
56,Feast - Sheraton Hyderabad Hotel,2500
21,Jonathan's Kitchen - Holiday Inn Express & Suites,1900
18,10 Downing Street,1900
91,Cascade - Radisson Hyderabad Hitec City,1800
97,Zega - Sheraton Hyderabad Hotel,1750
104,Republic Of Noodles - Lemon Tree Hotel,1700
34,Mazzo - Marriott Executive Apartments,1700
90,Arena Eleven,1600
11,Barbeque Nation,1600


In [68]:
#top 5 budget friendly restaurant
hotel.sort_values('Cost', ascending = True)[['Name','Cost']][:5]

Unnamed: 0,Name,Cost
89,Mohammedia Shawarma,150
23,Amul,150
54,Asian Meal Box,200
101,Sweet Basket,200
59,KS Bakers,200


In [77]:
#  finding hotel with same price
hotel_price={}
amount=hotel.Cost.values.tolist()
for price in amount:
    rows=hotel[hotel['Cost']==price]
    hotel_price[price]=rows['Name'].tolist()
same_price=pd.DataFrame.from_dict([hotel_price]).transpose().reset_index().rename(
    columns={'index':'Cost',0:'Name of Restaurants'})


In [79]:
same_price

Unnamed: 0,Cost,Name of Restaurants
0,800,"[Beyond Flavours, Paradise, Shah Ghouse Hotel ..."
1,1300,"[Flechazo, The Lal Street - Bar Exchange, Must..."
2,1200,"[Over The Moon Brew Company, The Glass Onion, ..."
3,1500,"[The Fisherman's Wharf, AB's - Absolute Barbec..."
4,500,"[eat.fit, KFC, Kritunga Restaurant, Karachi Ba..."
5,300,[Shah Ghouse Spl Shawarma]
6,1000,"[Hyper Local, Tiki Shack, Pista House, La La L..."
7,350,"[Cream Stone, Tempteys, The Old Madras Baking ..."
8,400,"[Sardarji's Chaats & More, Hotel Zara Hi-Fi, P..."
9,1600,"[Barbeque Nation, B-Dubs, Arena Eleven]"


In [81]:
hotel_count=hotel.groupby('Cost')['Name'].count().reset_index().sort_values(by='Cost',ascending=False)
same_price=same_price.merge(hotel_count,on='Cost',how='inner').rename(columns={'Name':'Total_Restaurants'})
same_price.sort_values(by='Total_Restaurants',ascending=False)[:5]

Unnamed: 0,Cost,Name of Restaurants,Total_Restaurants
4,500,"[eat.fit, KFC, Kritunga Restaurant, Karachi Ba...",13
17,600,"[Behrouz Biryani, Karachi Cafe, Hyderabad Chef...",10
20,700,"[Marsala Food Company, Green Bawarchi Restaura...",8
2,1200,"[Over The Moon Brew Company, The Glass Onion, ...",7
8,400,"[Sardarji's Chaats & More, Hotel Zara Hi-Fi, P...",6


In [82]:
# max price hotels
same_price.sort_values(by='Cost', ascending=False)[:5]

Unnamed: 0,Cost,Name of Restaurants,Total_Restaurants
26,2800,[Collage - Hyatt Hyderabad Gachibowli],1
22,2500,[Feast - Sheraton Hyderabad Hotel],1
12,1900,"[10 Downing Street, Jonathan's Kitchen - Holid...",2
25,1800,[Cascade - Radisson Hyderabad Hitec City],1
27,1750,[Zega - Sheraton Hyderabad Hotel],1


In [84]:
cuisine_list=hotel['Cuisines'].str.split(',')
cuisine_list

0      [Chinese,  Continental,  Kebab,  European,  So...
1                     [Biryani,  North Indian,  Chinese]
2      [Asian,  Mediterranean,  North Indian,  Desserts]
3      [Biryani,  North Indian,  Chinese,  Seafood,  ...
4      [Asian,  Continental,  North Indian,  Chinese,...
                             ...                        
100                                  [Fast Food,  Salad]
101                                    [Bakery,  Mithai]
102                   [North Indian,  Biryani,  Chinese]
103                                          [Fast Food]
104                 [Thai,  Asian,  Chinese,  Malaysian]
Name: Cuisines, Length: 105, dtype: object

In [85]:
cuisine_dict={}
for cuisine_name in cuisine_list:
    for cuisine in cuisine_name:
        if(cuisine in cuisine_dict):
            cuisine_dict[cuisine]+=1
        else:
            cuisine_dict[cuisine]=1

In [86]:
cuisine_dict

{'Chinese': 7,
 ' Continental': 17,
 ' Kebab': 5,
 ' European': 2,
 ' South Indian': 7,
 ' North Indian': 28,
 'Biryani': 4,
 ' Chinese': 36,
 'Asian': 5,
 ' Mediterranean': 4,
 ' Desserts': 11,
 ' Seafood': 3,
 ' Beverages': 5,
 'Seafood': 1,
 ' Goan': 1,
 ' Asian': 10,
 'Healthy Food': 1,
 'Lebanese': 1,
 'American': 4,
 'Ice Cream': 2,
 'Street Food': 2,
 ' Fast Food': 10,
 'Mediterranean': 1,
 ' BBQ': 1,
 'Continental': 4,
 ' American': 2,
 'North Indian': 33,
 ' Italian': 12,
 ' Finger Food': 1,
 'European': 2,
 'Burger': 2,
 ' Biryani': 12,
 ' Japanese': 2,
 ' Salad': 5,
 ' Sushi': 4,
 'Mexican': 1,
 'Mughlai': 1,
 'Fast Food': 5,
 'Andhra': 3,
 'Bakery': 6,
 ' Mughlai': 5,
 ' Juices': 1,
 'Arabian': 1,
 'Italian': 2,
 ' Andhra': 3,
 ' Hyderabadi': 3,
 'Cafe': 5,
 ' Spanish': 1,
 ' Wraps': 1,
 'Finger Food': 1,
 ' Thai': 2,
 ' Indonesian': 1,
 'South Indian': 2,
 ' Bakery': 1,
 'Modern Indian': 1,
 'Desserts': 2,
 'Kebab': 1,
 ' Momos': 3,
 'BBQ': 1,
 ' Modern Indian': 1,
 ' Burg

In [89]:
cuisine_df=pd.DataFrame.from_dict([cuisine_dict]).transpose().reset_index().rename(columns={'index':'cuisine',0:'Number of Restaurants'})
cuisine_df.sort_values(by='Number of Restaurants',ascending=False)[:5]

Unnamed: 0,cuisine,Number of Restaurants
7,Chinese,36
26,North Indian,33
5,North Indian,28
1,Continental,17
31,Biryani,12


In [90]:
Collections_value_list = hotel.Collections.dropna().str.split(', ')

In [91]:
Collections_dict = {}
for collection in Collections_value_list:
    for col_name in collection:
        if (col_name in Collections_dict):
            Collections_dict[col_name]+=1
        else:  
            Collections_dict[col_name]=1 

In [92]:
Collections_df=pd.DataFrame.from_dict([Collections_dict]).transpose().reset_index().rename(
    columns={'index':'Tags',0:'Number of Restaurants'})

In [93]:
Collections_df.sort_values('Number of Restaurants', ascending =False)[:5]

Unnamed: 0,Tags,Number of Restaurants
2,Great Buffets,11
0,Food Hygiene Rated Restaurants in Hyderabad,8
5,Live Sports Screenings,7
6,Hyderabad's Hottest,7
1,Corporate Favorites,6
