In [88]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [89]:
file_path = "../data exploration/explored_data.csv"  
data = pd.read_csv(file_path)

print(data.head())

        Restaurant              Reviewer  \
0  Beyond Flavours     Rusha Chakraborty   
1  Beyond Flavours  Anusha Tirumalaneedi   
2  Beyond Flavours       Ashok Shekhawat   
3  Beyond Flavours        Swapnil Sarkar   
4  Beyond Flavours                Dileep   

                                              Review  Rating  \
0  The ambience was good, food was quite good . h...     5.0   
1  Ambience is too good for a pleasant evening. S...     5.0   
2  A must try.. great food great ambience. Thnx f...     5.0   
3  Soumen das and Arun was a great guy. Only beca...     5.0   
4  Food is good.we ordered Kodi drumsticks and ba...     5.0   

                  Time  Pictures  Review_Count  Follower_Count  Review_Length  
0  2019-05-25 15:54:00         0           1.0             2.0            222  
1  2019-05-25 14:20:00         0           3.0             2.0            144  
2  2019-05-24 22:54:00         0           2.0             3.0            189  
3  2019-05-24 22:11:00        

In [90]:
corpus = data.groupby("Restaurant")["Review"].apply(lambda x: " ".join(x.dropna().astype(str))).reset_index()
corpus

Unnamed: 0,Restaurant,Review
0,10 Downing Street,I've been to this place about two times and i ...
1,13 Dhaba,I didn't go and eat at the Dhaba.\nI had order...
2,"3B's - Buddies, Bar & Barbecue",We go their for a team dinner.The name of the ...
3,AB's - Absolute Barbecues,It was excellent experience spiced thank Krish...
4,Absolute Sizzlers,Service was pathetic. Ordered a sizzler with l...
...,...,...
95,Urban Asia - Kitchen & Bar,This place is highly recommended. It is workin...
96,Yum Yum Tree - The Arabian Food Court,It is at 6th floor of Act Boutique building th...
97,Zega - Sheraton Hyderabad Hotel,"My husband and I, visited Zega for their dimsu..."
98,Zing's Northeast Kitchen,The food is toooooooooo good. The interior and...


In [91]:
def preprocess_text(text):
    text = text.lower() 
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    return text

corpus["Review"] = corpus["Review"].apply(preprocess_text)
corpus

Unnamed: 0,Restaurant,Review
0,10 Downing Street,ive been to this place about two times and i r...
1,13 Dhaba,i didnt go and eat at the dhaba\ni had ordered...
2,"3B's - Buddies, Bar & Barbecue",we go their for a team dinnerthe name of the g...
3,AB's - Absolute Barbecues,it was excellent experience spiced thank krish...
4,Absolute Sizzlers,service was pathetic ordered a sizzler with la...
...,...,...
95,Urban Asia - Kitchen & Bar,this place is highly recommended it is working...
96,Yum Yum Tree - The Arabian Food Court,it is at 6th floor of act boutique building th...
97,Zega - Sheraton Hyderabad Hotel,my husband and i visited zega for their dimsum...
98,Zing's Northeast Kitchen,the food is toooooooooo good the interior and ...


In [92]:
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(corpus["Review"])
print(vectorizer.get_feature_names_out())
print(tfidf_matrix.toarray())
print("\nThe shape of the TF-IDF matrix is: ", tfidf_matrix.shape)

['015' '03am' '03feb2019' ... 'zoomato' 'zucchini' 'zyada']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

The shape of the TF-IDF matrix is:  (100, 18580)


In [93]:
similarity_matrix = cosine_similarity(tfidf_matrix)
print(similarity_matrix)

[[1.         0.30756349 0.27412256 ... 0.49502869 0.36070033 0.29913375]
 [0.30756349 1.         0.16753217 ... 0.26303299 0.2451759  0.34587796]
 [0.27412256 0.16753217 1.         ... 0.26795469 0.19825933 0.1685686 ]
 ...
 [0.49502869 0.26303299 0.26795469 ... 1.         0.36109702 0.26751618]
 [0.36070033 0.2451759  0.19825933 ... 0.36109702 1.         0.27668356]
 [0.29913375 0.34587796 0.1685686  ... 0.26751618 0.27668356 1.        ]]


In [94]:
def recommend_similar_restaurants(restaurant_name, data, similarity_matrix, top_n=5):
    if restaurant_name not in data["Restaurant"].values:
        return f"Restaurant '{restaurant_name}' not found in dataset."

    restaurant_index = data[data["Restaurant"] == restaurant_name].index[0]

    similar_restaurants = list(enumerate(similarity_matrix[restaurant_index]))
    similar_restaurants = sorted(similar_restaurants, key=lambda x: x[1], reverse=True)[1:top_n+1]

    recommendations = [(data["Restaurant"].iloc[i[0]], i[1]) for i in similar_restaurants]

    print(f"\nRestaurants similar to '{restaurant_name}':\n")
    for rec, score in recommendations:
        print(f"{rec} - Similarity Score: {score:.4f}")

    return recommendations

In [95]:
restaurant_name = "Mohammedia Shawarma"  # Change to any restaurant in dataset
recommendations = recommend_similar_restaurants(restaurant_name, corpus, similarity_matrix)


Restaurants similar to 'Mohammedia Shawarma':

Shah Ghouse Spl Shawarma - Similarity Score: 0.8570
The Foodie Monster Kitchen - Similarity Score: 0.4973
Hotel Zara Hi-Fi - Similarity Score: 0.2624
Being Hungry - Similarity Score: 0.2428
Tandoori Food Works - Similarity Score: 0.2350
