In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [4]:
# Load the pretrained sentence-embedding checkpoint that every `model.encode`
# call in this notebook relies on.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]



In [13]:
# Read the merged restaurant/review dataset. `low_memory=False` makes pandas
# process the file as a whole so column dtypes are not inferred chunk-by-chunk.
merged_df = pd.read_csv(
    'output/responses/merged_dataset.csv',
    low_memory=False,
)

# Work on a copy so the frame loaded from disk stays untouched.
restaurants_and_reviews = merged_df.copy()

In [14]:
# Columns to carry forward for the keyword search.
# NOTE(review): 'longtitude' is presumably the spelling used in the CSV header —
# verify; `filter(items=...)` skips labels that do not exist instead of raising.
wanted_columns = [
    'place_id', 'name', 'review_text', 'main_rating',
    'address', 'link', 'review_photos', 'cuisine', 'latitude', 'longtitude',
]
restaurant_reviews = restaurants_and_reviews.filter(items=wanted_columns)

In [15]:
# Preview the raw review text column (pandas elides the middle of long Series).
restaurant_reviews.loc[:, 'review_text']

0                                               Food is Wow
1         I always stay at the Grand Copthorne Waterfron...
2         My family and I had a gathering at Red House S...
3         Celebrated my MIL 70th birthday and pre-coordi...
4         Had a wonderful dinner here tonight. The highl...
                                ...                        
619625    Almost 2 hours waiting time, order taken at 6....
619626    2nd time trying the fish head steamboat. Its t...
619627    The food here is most excellent and the servic...
619628    reserved in advance so wait wasnt long. added ...
619629    Tucked away in this industrial estate is a coa...
Name: review_text, Length: 619630, dtype: object

In [20]:
# Collapse the per-review rows into one row per restaurant, identified by
# (name, latitude, longtitude); `review_text` becomes a Python list holding
# every review of that restaurant.
combined_reviews = (
    restaurant_reviews
    .groupby(['name', 'latitude', 'longtitude'])['review_text']
    .apply(list)
    .reset_index()
)

In [None]:
# Encode each restaurant's reviews into one sentence embedding.
#
# `combined_reviews['review_text']` holds a *list* of reviews per restaurant, but
# `model.encode` expects one string per item. A list of lists is not treated as
# "many documents": sentence-transformers tokenizes non-string items as text
# pairs, so only the first two reviews would be used and a restaurant with a
# single review would fail. Missing reviews (NaN floats) would break it as well.
# Build one document per restaurant by joining its non-null reviews first.
restaurant_documents = [
    ' '.join(str(review) for review in reviews if pd.notna(review))
    for reviews in combined_reviews['review_text']
]

# NOTE(review): the model truncates inputs longer than its maximum sequence
# length, so for restaurants with many reviews only the beginning of the joined
# text contributes to the embedding — consider encoding reviews individually and
# averaging if that matters.
review_embeddings = model.encode(restaurant_documents, convert_to_tensor=True)

In [None]:
# Free-text search query; it is embedded with the same model and compared
# against the review embeddings in the following cells.
input_keywords = 'spicy butter chicken'

In [None]:
# Embed the query with the same model used for the reviews so both vectors can
# be compared directly; the bare name on the last line displays the tensor.
keyword_embedding = model.encode(
    input_keywords,
    convert_to_tensor=True,
)
keyword_embedding

In [None]:
# Cosine similarity of the query against every review embedding. The helper
# returns a matrix with one row per query; there is a single query, so take row 0.
similarity_matrix = util.pytorch_cos_sim(keyword_embedding, review_embeddings)
cosine_scores = similarity_matrix[0]

In [None]:
# NumPy cannot read tensors living on an accelerator device, so copy the scores
# to host memory first and then expose them as an ndarray.
scores_on_cpu = cosine_scores.cpu()
cosine_scores_cpu = scores_on_cpu.numpy()

In [None]:
# Row positions of the five highest-scoring restaurants, best match first:
# argsort is ascending, so reverse it and keep the leading five entries.
ranked_indices = np.argsort(cosine_scores_cpu)[::-1]
top_5_indices = ranked_indices[:5]

In [None]:
# Pull the matching rows (all columns) by position, preserving the ranking order.
top_5_restaurants = combined_reviews.iloc[top_5_indices, :]

## Sample Code for Reference below

In [3]:
# Toy dataset for the reference walkthrough: one (restaurant, review) record per row.
df = pd.DataFrame(
    [
        ('Restaurant A', 'The food was amazing and the service was excellent!'),
        ('Restaurant B', 'A lovely place with great ambiance, but the food was average.'),
        ('Restaurant C', 'Terrible service, but the food was decent.'),
        ('Restaurant D', 'The best restaurant experience I have ever had!'),
        ('Restaurant E', 'Good food, but the place was too crowded.'),
    ],
    columns=['restaurant_name', 'review_text'],
)

In [None]:
# Load the pretrained sentence-embedding checkpoint used by the sample walkthrough.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [5]:
# Embed every sample review; each item passed to the model is a plain string.
sample_texts = df['review_text'].tolist()
review_embeddings = model.encode(sample_texts, convert_to_tensor=True)

In [6]:
# Free-text query for the sample walkthrough; it is embedded and scored against
# the sample review embeddings in the next cells.
input_keywords = 'amazing service and great food'

In [7]:
# Embed the query with the same model as the reviews; the trailing bare name
# renders the resulting tensor as the cell output.
keyword_embedding = model.encode(
    input_keywords,
    convert_to_tensor=True,
)
keyword_embedding

tensor([-5.9450e-02,  2.8937e-02,  6.1454e-02, -1.9401e-02, -9.2930e-02,
         2.2476e-02,  3.0218e-02, -4.8323e-02, -6.0863e-03, -1.3512e-03,
         7.3016e-02,  3.7726e-02,  3.1857e-02,  4.6288e-03, -1.8009e-02,
        -9.7719e-02,  1.4599e-01, -9.7150e-02, -3.5542e-02, -1.4181e-01,
        -6.4401e-02, -2.3209e-03,  2.8313e-02, -1.1180e-02, -9.2467e-02,
         6.4219e-02, -3.7828e-02,  2.1000e-02, -7.2896e-03, -9.7328e-02,
        -3.1132e-02, -8.4543e-03,  1.1060e-02,  3.2587e-02, -1.0543e-02,
         6.6061e-02,  1.0183e-01, -1.3654e-01,  3.1303e-02,  1.8902e-02,
        -7.2216e-03, -2.4764e-02,  2.5153e-02, -1.7342e-02, -1.7283e-02,
         1.0748e-02, -2.4848e-02,  1.2393e-02,  1.0617e-01,  1.8149e-03,
        -3.2808e-02, -2.6896e-02,  4.1898e-02, -1.9722e-02,  2.2352e-02,
         3.5966e-02, -7.8120e-02, -7.1197e-02, -8.1964e-02, -1.7621e-02,
         8.6390e-03,  2.7901e-02, -1.1474e-02, -8.9679e-03, -4.2940e-05,
        -8.0144e-02, -1.1277e-01,  1.3987e-02, -3.5

In [8]:
# Score the query against each sample review. The helper returns one row per
# query, and there is only one query, so row 0 holds all the scores.
similarity_matrix = util.pytorch_cos_sim(keyword_embedding, review_embeddings)
cosine_scores = similarity_matrix[0]
cosine_scores

tensor([0.8547, 0.5265, 0.6614, 0.6682, 0.4443], device='mps:0')

In [10]:
# The scores live on an accelerator device (see the `device=` in the output
# above); copy them to host memory before handing them to NumPy.
scores_on_cpu = cosine_scores.cpu()
cosine_scores_cpu = scores_on_cpu.numpy()

In [11]:
# Rank the sample reviews from best to worst match: argsort is ascending, so
# reverse it and keep (at most) the first five positions.
ranked_indices = np.argsort(cosine_scores_cpu)[::-1]
top_5_indices = ranked_indices[:5]

# Select the corresponding rows, keeping the ranking order.
top_5_restaurants = df.iloc[top_5_indices, :]

In [12]:
# Show the ranked matches as plain text, restricted to the two descriptive columns.
columns_to_show = ['restaurant_name', 'review_text']
print(top_5_restaurants[columns_to_show])

  restaurant_name                                        review_text
0    Restaurant A  The food was amazing and the service was excel...
3    Restaurant D    The best restaurant experience I have ever had!
2    Restaurant C         Terrible service, but the food was decent.
1    Restaurant B  A lovely place with great ambiance, but the fo...
4    Restaurant E          Good food, but the place was too crowded.
