# **Model Desription:**
The Collaborative-Based Restaurant Recommendation System is designed to provide personalized dining suggestions by leveraging user interaction data. This system uses collaborative filtering, a technique that predicts a user's preferences by analyzing the preferences of similar users. By collecting user ratings and reviews of various restaurants, the model identifies patterns and similarities between users' tastes.

# 1. Import Libraries

In [26]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# 2. Import Dataset
This dataset is imported from kaggle website. This dataset contains restaurant name, ratings, review list, cuisines. This helps to model get each feature seperately in a precised way.

# Reference:
https://www.kaggle.com/datasets/absin7/zomato-bangalore-dataset

In [27]:
dataset=pd.read_csv("zomato.csv")
dataset.head()           

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


These are the complete columns in our datasets.

# 3. Preprocess data

Let's drop url,phone number, dishes liked because they may not effect the recommendation to a user.

In [28]:
dataset=dataset.drop(['url','phone','dish_liked'],axis=1)
print(len(dataset))
dataset.head()

51717


Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,Banashankari,Quick Bites,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


Remove duplicates and NaN values rows containing in our dataset.

In [29]:
dataset.drop_duplicates(inplace=True)

In [30]:
dataset=dataset.dropna()

Let's change column names approx_cost(for two people) to cost ,listed_in(type) to type....etc for our convinience. And then preprocess the data if needed like to change str to float and removing the rows which makes noise in dataset which are unintended characters.

Let's also create a new column for mean rating for every particular restaurant. Then normalize the mean ratings.

In [31]:
dataset=dataset.rename(columns={'approx_cost(for two people)':'cost','listed_in(type)':'type','listed_in(city)':'city'})
dataset['cost']=dataset['cost'].astype(str)
dataset['cost']=dataset['cost'].apply(lambda x:x.replace(',','.'))
dataset['cost']=dataset['cost'].astype(float)
dataset=dataset.loc[dataset.rate!='NEW']
dataset=dataset.loc[dataset.rate!='-'].reset_index(drop=True)
remove_slash = lambda x: x.replace('/5', '') if isinstance(x, str) else x
dataset.rate=dataset.rate.apply(remove_slash).str.strip().astype('float')
dataset.name = dataset.name.apply(lambda x:x.title())
dataset.online_order.replace(('Yes','No'),(True, False),inplace=True)
dataset.book_table.replace(('Yes','No'),(True, False),inplace=True)
restaurants = list(dataset['name'].unique())
dataset['Mean Rating'] = 0

for i in range(len(restaurants)):
    dataset['Mean Rating'][dataset['name'] == restaurants[i]] = dataset['rate'][dataset['name'] == restaurants[i]].mean()
    
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5))
dataset[['Mean Rating']] = scaler.fit_transform(dataset[['Mean Rating']]).round(2)


In [32]:
dataset.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city,Mean Rating
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari,3.99
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,True,False,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari,3.97
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,True,False,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari,3.58
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,False,False,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari,3.45
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,False,False,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari,3.58


This will be the dataset after adding mean rating column

Let's remove punctuations,remove stop words,remove url's and make restaurant names as title as only first letter of name will be capital and remaining and lower.

In [33]:
dataset['reviews_list']=dataset['reviews_list'].str.lower()
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
dataset['reviews_list']=dataset['reviews_list'].apply(lambda text: remove_punctuation(text))
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])
dataset['reviews_list']=dataset['reviews_list'].apply(lambda text: remove_stopwords(text))
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
dataset['reviews_list']=dataset['reviews_list'].apply(lambda text: remove_urls(text))

In [34]:
dataset[['reviews_list', 'cuisines']].sample(5)

Unnamed: 0,reviews_list,cuisines
18505,rated 50 ratedn wonderful restaurant jp nagar ...,South Indian
21560,rated 40 ratedn wasnt long time pick quick cup...,"Cafe, American, Continental, Italian"
31554,rated 50 ratedn place amazing affordable food ...,"Cafe, Continental, Fast Food, Pizza, Italian, ..."
14933,rated 10 ratedn ordered dal makhani got gravy ...,North Indian
13345,rated 40 ratedn visit bangalore stayed ibis ho...,"Asian, Continental, European, Fast Food, North..."


Let's inspect review list and cuisines.

# 4. Build Recommendation system.

Let's build bag of words, apply vectorizer and word frequency. Let's also drop remaining columns.

In [35]:
restaurant_names=list(dataset['name'].unique())
def get_top_words(column,top_nu_of_words,nu_of_word):
    vec=CountVectorizer(ngram_range=nu_of_word,stopwords='english')
    bag_of_words=vec.fit_transform(column)
    sum_words=bag_of_words.sum(axis=0)
    words_freq=[(word,sum_words[0,idx]) for word,idx in vec.vocabulary_.items()]
    words_freq=sorted(words_freq,key=lambda x:x[1],reverse=True)
    return words_freq[:top_nu_of_words]
dataset=dataset.drop(['address','rest_type','type','menu_item'],axis=1)
df_percent = dataset.sample(frac=0.5)

# 5. Vectorization 

Let's apply vectorization and cosine_similarities to find similarities among restaurants.

In [36]:
df_percent.set_index('name', inplace=True)
indices = pd.Series(df_percent.index)

tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# 6. Build a Recommendation function.

In [40]:
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'Mean Rating', 'cost'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = pd.concat([df_new,df_percent[['cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()])
        #df_new = pd.concat([df_new, df_percent[['cuisines', 'Mean Rating', 'cost']][df_percent.index == each].sample()])
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','Mean Rating', 'cost'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new
recommend('Burma Burma')

TOP 5 RESTAURANTS LIKE Burma Burma WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost
Chinita Real Mexican Food,Mexican,4.47,1.2
Foxtrot - House Of Subculture,"Cafe, American, Asian, North Indian",4.35,1.0
Nando'S,"Portuguese, African",4.13,1.2
Ilyazsab The House Of Chicken,"Rolls, Kebab",3.84,250.0
1992 Chats - Space,Street Food,3.45,200.0


These are the 5 restaurants that are recommended for user along with **Burma Burma** restaurant.