In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle
import json


In [3]:
#https://www.kaggle.com/datasets/himanshupoddar/zomato-bangalore-restaurants?datasetId=153420&searchQuery=Clust
df = pd.read_csv("zomato.csv")

**Clean data**

In [6]:
df.drop(['url','address','online_order','book_table','phone','dish_liked','menu_item','reviews_list','listed_in(city)','votes','rest_type'],axis=1,inplace=True)
df['rate']=np.where(df.rate=='NEW',np.nan,df.rate)
df['rate']=np.where(df.rate=='-',np.nan,df.rate)
df['rate']=np.where(df.rate!=np.nan,df.rate.apply(lambda x : str(x).split('/')[0]),df.rate)
df.rate=df.rate.astype('float')
df.rate.fillna(df.rate.mean(),inplace=True)
df.dropna(inplace=True)
df.rename({'approx_cost(for two people)':'cost_for_two'},axis=1,inplace=True)
df.rename({'listed_in(type)':'type'},axis=1,inplace=True)
df.cost_for_two=df.cost_for_two.apply(lambda x: str(x).replace(',',''))
df.cost_for_two= df.cost_for_two.astype('int64')
df.rate=df.rate.apply(lambda x : round(x,1))
low_outlier = df.rate.mean()-3*df.rate.std()
df = df[df.rate>=low_outlier]
df = df.drop_duplicates()
locations_to_drop = df.location.value_counts()[df.location.value_counts()<100].index
df = df[~df.location.isin(locations_to_drop)]
df.set_index(np.arange(len(df)),inplace=True)

In [146]:
df

Unnamed: 0,name,rate,location,cuisines,cost_for_two,type
0,"Jalsa,Banashankari",4.1,Banashankari,"North Indian, Mughlai, Chinese",800,Buffet
1,"Spice Elephant,Banashankari",4.1,Banashankari,"Chinese, North Indian, Thai",800,Buffet
2,"San Churro Cafe,Banashankari",3.8,Banashankari,"Cafe, Mexican, Italian",800,Buffet
3,"Addhuri Udupi Bhojana,Banashankari",3.7,Banashankari,"South Indian, North Indian",300,Buffet
4,"Grand Village,Basavanagudi",3.8,Basavanagudi,"North Indian, Rajasthani",600,Buffet
...,...,...,...,...,...,...
21953,"The Farm House Bar n Grill,Whitefield",3.7,Whitefield,"North Indian, Continental",800,Pubs and bars
21954,"Topsy Turvey,Whitefield",3.7,Whitefield,Finger Food,900,Pubs and bars
21955,Best Brews - Four Points by Sheraton Bengaluru...,3.6,Whitefield,Continental,1500,Pubs and bars
21956,"Vinod Bar And Restaurant,Whitefield",3.7,Whitefield,Finger Food,600,Pubs and bars


In [10]:
df.to_csv('clean_zomato_data.csv',index=False)

In [138]:
df =pd.read_csv('clean_zomato_data.csv',encoding ='latin-1')

In [6]:
cv = CountVectorizer()
mn = MinMaxScaler()
vcuisines =cv.fit_transform(df.cuisines)
vcuisines = pd.DataFrame(vcuisines.toarray(), columns=cv.get_feature_names_out())
df = pd.concat([df,vcuisines],axis=1)
df.drop('cuisines',axis=1,inplace=True)
df = pd.concat([df,pd.get_dummies(df.type,drop_first=True,dtype='int64')],axis=1)
df.drop('type',inplace=True,axis=1)

In [140]:
df.name=df.name+','+df.location

In [153]:
df[df.name=='Jalsa,Banashankari']

Unnamed: 0,name,rate,location,cuisines,cost_for_two,type
0,"Jalsa,Banashankari",4.1,Banashankari,"North Indian, Mughlai, Chinese",800,Buffet
426,"Jalsa,Banashankari",4.1,Banashankari,"North Indian, Mughlai, Chinese",800,Delivery
528,"Jalsa,Banashankari",4.1,Banashankari,"North Indian, Mughlai, Chinese",800,Dine-out


In [154]:
df.type.unique()

array(['Buffet', 'Cafes', 'Delivery', 'Desserts', 'Dine-out',
       'Drinks & nightlife', 'Pubs and bars'], dtype=object)

In [156]:
df.cuisines.value_counts()

cuisines
North Indian                                             1216
North Indian, Chinese                                     994
South Indian                                              810
Bakery, Desserts                                          420
Biryani                                                   372
                                                         ... 
North Indian, Chettinad, Chinese                            1
Fast Food, Andhra                                           1
Bakery, Fast Food, Burger, Rolls, Beverages                 1
Fast Food, Kerala, Mangalorean, Seafood, South Indian       1
Chinese, Mughlai                                            1
Name: count, Length: 2556, dtype: int64

In [157]:
df

Unnamed: 0,name,rate,location,cuisines,cost_for_two,type
0,"Jalsa,Banashankari",4.1,Banashankari,"North Indian, Mughlai, Chinese",800,Buffet
1,"Spice Elephant,Banashankari",4.1,Banashankari,"Chinese, North Indian, Thai",800,Buffet
2,"San Churro Cafe,Banashankari",3.8,Banashankari,"Cafe, Mexican, Italian",800,Buffet
3,"Addhuri Udupi Bhojana,Banashankari",3.7,Banashankari,"South Indian, North Indian",300,Buffet
4,"Grand Village,Basavanagudi",3.8,Basavanagudi,"North Indian, Rajasthani",600,Buffet
...,...,...,...,...,...,...
21953,"The Farm House Bar n Grill,Whitefield",3.7,Whitefield,"North Indian, Continental",800,Pubs and bars
21954,"Topsy Turvey,Whitefield",3.7,Whitefield,Finger Food,900,Pubs and bars
21955,Best Brews - Four Points by Sheraton Bengaluru...,3.6,Whitefield,Continental,1500,Pubs and bars
21956,"Vinod Bar And Restaurant,Whitefield",3.7,Whitefield,Finger Food,600,Pubs and bars


In [167]:
def combine_values(series):
    return ', '.join(map(str, series))

grouped = df.groupby('name')['type'].agg(combine_values)

In [172]:
grouped = pd.DataFrame(grouped)

In [193]:
grouped.reset_index(inplace=True)

In [203]:
df_grouped = pd.merge(df,grouped,on='name',how='inner')

In [204]:
df_grouped.drop('type_x',axis=1,inplace=True)

In [208]:
df_grouped = df_grouped[~df_grouped.duplicated()]

In [212]:
df_grouped.set_index(np.arange(len(df_grouped)),inplace=True)

In [240]:
pd.set_option('display.max_colwidth', 100)

df_grouped[df_grouped.name=='Pathaan Sir,Indiranagar']


Unnamed: 0,name,rate,location,cuisines,cost_for_two,type_y
7624,"Pathaan Sir,Indiranagar",3.3,Indiranagar,"Fast Food, Rolls, Momos",250,"Delivery, Delivery, Dine-out, Dine-out, Delivery, Dine-out"
7625,"Pathaan Sir,Indiranagar",3.2,Indiranagar,"Fast Food, Momos, Rolls",250,"Delivery, Delivery, Dine-out, Dine-out, Delivery, Dine-out"
7626,"Pathaan Sir,Indiranagar",3.4,Indiranagar,"Fast Food, Rolls, Momos",250,"Delivery, Delivery, Dine-out, Dine-out, Delivery, Dine-out"
7627,"Pathaan Sir,Indiranagar",3.4,Indiranagar,"Rolls, Chinese",300,"Delivery, Delivery, Dine-out, Dine-out, Delivery, Dine-out"
7628,"Pathaan Sir,Indiranagar",3.3,Indiranagar,"Fast Food, Momos, Rolls",250,"Delivery, Delivery, Dine-out, Dine-out, Delivery, Dine-out"


In [243]:
df_grouped=df_grouped.drop_duplicates(subset='name')

In [250]:
df_grouped.cuisines=df_grouped.cuisines+', '+df_grouped.type_y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_grouped.cuisines=df_grouped.cuisines+', '+df_grouped.type_y


In [254]:
df_grouped.drop('type_y',inplace=True,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_grouped.drop('type_y',inplace=True,axis=1)


In [259]:
df_grouped.to_csv('RemovedDuplicatesAndCombinedColumns.csv',index=False)

In [8]:
df_vectorized = pd.DataFrame(mn.fit_transform(df.drop(['name','location'],axis=1)))

In [84]:
nbrs = NearestNeighbors(n_neighbors=10).fit(df_vectorized)

In [55]:
df_vectorized

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,112,113,114,115,116,117,118,119,120,121
0,0.652174,0.127517,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343421,0.166612
1,0.652174,0.127517,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343421,0.166612
2,0.521739,0.127517,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343421,0.166612
3,0.478261,0.043624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343421,0.166612
4,0.521739,0.093960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.480397,0.175515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21953,0.478261,0.127517,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.624622,1.000000
21954,0.478261,0.144295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.624622,1.000000
21955,0.434783,0.244966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.624622,1.000000
21956,0.478261,0.093960,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.624622,1.000000


In [85]:
with open('zomato_NearestNeighbour_model.pkl','wb') as f:
    pickle.dump(nbrs,f)

In [28]:
columns = {'column_names':[col.lower() for col in df.columns]}
with open('columns.json','w') as f:
    f.write(json.dumps(columns))