In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import ast
from datetime import datetime
import string
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [107]:
import gensim.downloader

In [2]:
restaurants_df = pd.read_csv('output/responses/cleaned_all_restaurants_dataset.csv')

In [3]:
reviews_df = pd.read_csv('output/responses/cleaned_all_reviews_dataset.csv')

In [4]:
merged_df = pd.read_csv('output/responses/merged_dataset.csv', low_memory=False)

In [5]:
restaurants = restaurants_df.copy()
reviews = reviews_df.copy()
restaurants_and_reviews = merged_df.copy()

## DO NOT RUN BELOW DATA CLEANING CELLS

In [5]:
# Keep well-rated restaurants whose review count lies inside a bounded window.
# The previous condition tested `reviews >= 75` AND `reviews >= 2000`, which
# made the lower bound dead code; the second comparison is treated here as the
# intended upper bound.
# NOTE(review): confirm the intended review-count window before re-running.
restaurants = restaurants[
    (restaurants['main_rating'] >= 3.7)
    & (restaurants['reviews'] >= 75)
    & (restaurants['reviews'] <= 2000)
]

In [15]:
reviews.published_at_date.value_counts()

2020-08-29 18:27:04.491038    2
2022-08-29 18:27:02.553693    2
2022-08-29 18:27:02.066803    2
2022-08-29 18:27:02.070242    2
2022-08-29 18:27:02.073277    2
                             ..
2023-11-30 04:11:57.529088    1
2023-11-30 04:11:57.530974    1
2023-11-30 04:11:57.977561    1
2023-11-30 04:11:57.981611    1
2014-08-30 15:41:51.347637    1
Name: published_at_date, Length: 4147993, dtype: int64

In [25]:
# Clean the raw reviews in one pass: remove exact duplicate rows, discard the
# columns that are not used later, and require both a review text and a
# publication timestamp.
reviews = (
    reviews
    .drop_duplicates()
    .drop(columns=[
        'review_translated_text',
        'response_from_owner_translated_text',
        'response_from_owner_ago',
        'response_from_owner_date',
        'published_at',
    ])
    .dropna(subset=['review_text', 'published_at_date'])
)

# Replace the remaining NaN values with a neutral default per column:
# False for the flag, zero for the counters, empty string for the owner reply.
fill_defaults = {
    'is_local_guide': False,
    'total_number_of_photos_by_reviewer': 0,
    'total_number_of_reviews_by_reviewer': 0,
    'response_from_owner_text': "",
}
for column, default in fill_defaults.items():
    reviews[column] = reviews[column].fillna(default)

In [26]:
# Establish the consideration set - only keep reviews with published_at date from 1 Jan 2023
# Convert the 'published_at_date' column to datetime and simplify to just date
reviews['published_at_date'] = pd.to_datetime(reviews['published_at_date'])

In [28]:
reviews['published_at_date'] = reviews['published_at_date'].dt.date

In [36]:
# Keep only the reviews published on or after 1 January 2023.
cut_off_date = datetime(2023, 1, 1).date()
reviews = reviews.loc[reviews['published_at_date'] >= cut_off_date]

In [37]:
reviews[reviews.published_at_date >= datetime.strptime("2023-01-01", '%Y-%m-%d').date()].sort_values(by='published_at_date')

Unnamed: 0,place_id,name,review_id,rating,review_text,published_at_date,review_likes_count,response_from_owner_text,total_number_of_reviews_by_reviewer,total_number_of_photos_by_reviewer,is_local_guide
367665,ChIJGXoEwloZ2jERIUSR4jA7ogQ,Ah Lock & Co,ChdDSUhNMG9nS0VJQ0FnSUNoelozdzJnRRAB,2,"Tried the Hakka signature rice bowl, one word ...",2023-08-28,2,,266.0,454.0,True
518593,ChIJZcQkYHIY2jERsDS_lBU0k_Y,Baba Chews,ChZDSUhNMG9nS0VJQ0FnSUN4dVlPVGVnEAE,2,Can be better !!!!,2023-08-28,0,,4.0,0.0,False
518594,ChIJZcQkYHIY2jERsDS_lBU0k_Y,Baba Chews,ChdDSUhNMG9nS0VJQ0FnSUN4aWRmU2tBRRAB,5,Take chance to appreciate the service n patien...,2023-08-28,0,,3.0,0.0,False
518595,ChIJZcQkYHIY2jERsDS_lBU0k_Y,Baba Chews,ChdDSUhNMG9nS0VJQ0FnSUN4N3JHTW1BRRAB,4,"located just opposite starbucks, the restauran...",2023-08-28,0,,1.0,0.0,False
518596,ChIJZcQkYHIY2jERsDS_lBU0k_Y,Baba Chews,ChdDSUhNMG9nS0VJQ0FnSUN4dXJESHBnRRAB,4,"As a guest of Hotel Indigo, we enjoyed their b...",2023-08-28,0,,39.0,94.0,True
...,...,...,...,...,...,...,...,...,...,...,...
3096648,ChIJWy4FfLsZ2jER6QS262kPoBY,Victor's Kitchen,ChZDSUhNMG9nS0VJQ0FnSUNIZ0lyNk1REAE,5,Great Dim Sum!,2024-08-31,0,,52.0,97.0,True
3117552,ChIJye26KJIZ2jERQ-cRCW2All0,Imperial Treasure Super Peking Duck,ChdDSUhNMG9nS0VJQ0FnSUNIZ0xxU2hnRRAB,3,Food : come here for the peiking duck. It is o...,2024-08-31,0,,23.0,19.0,True
2840966,ChIJsdmes4wZ2jEROYrF2YG1bO0,The Orchard Cafe,ChZDSUhNMG9nS0VJQ0FnSUNIZ09XLU5REAE,5,Celebrate father bday at orchard cafe dinner b...,2024-08-31,0,,8.0,3.0,False
3459084,ChIJuXzAbfIZ2jERy-bl07cYgWU,Xin Wang Hong Kong Café,ChZDSUhNMG9nS0VJQ0FnSUNIZ0w3b1JnEAE,5,I m a regular here and the main reason for me ...,2024-08-31,0,,7.0,0.0,False


### Merge restaurants and reviews

In [40]:
# Remove the review-side columns that are not wanted in the merged table.
reviews = reviews.drop(columns=['name', 'review_id'])

# Inner join on place_id keeps only reviews whose restaurant is present in
# `restaurants`; exact duplicate rows are removed afterwards.
restaurants_and_reviews = pd.merge(
    restaurants, reviews, how='inner', on='place_id'
).drop_duplicates()

In [41]:
restaurants_and_reviews

Unnamed: 0,place_id,name,reviews,main_category,categories,main_rating,address,link,review_photos,coordinates,...,district_code,region,rating,review_text,published_at_date,review_likes_count,response_from_owner_text,total_number_of_reviews_by_reviewer,total_number_of_photos_by_reviewer,is_local_guide
0,ChIJYR6HCJ4Z2jERChIQxjl2MhI,Red House Seafood Grand Copthorne,1110,Seafood,"['Seafood restaurant', 'Asian restaurant', 'Di...",4.2,"392 Havelock Rd, Level 2 Grand Copthorne Water...",https://www.google.com/maps/place/Red+House+Se...,['https://lh5.googleusercontent.com/p/AF1QipNp...,"[1.2903687, 103.8352515]",...,3,South,5,Food is Wow,2024-08-28,0,,461.0,21.0,True
1,ChIJYR6HCJ4Z2jERChIQxjl2MhI,Red House Seafood Grand Copthorne,1110,Seafood,"['Seafood restaurant', 'Asian restaurant', 'Di...",4.2,"392 Havelock Rd, Level 2 Grand Copthorne Water...",https://www.google.com/maps/place/Red+House+Se...,['https://lh5.googleusercontent.com/p/AF1QipNp...,"[1.2903687, 103.8352515]",...,3,South,4,I always stay at the Grand Copthorne Waterfron...,2024-08-23,0,,333.0,792.0,True
2,ChIJYR6HCJ4Z2jERChIQxjl2MhI,Red House Seafood Grand Copthorne,1110,Seafood,"['Seafood restaurant', 'Asian restaurant', 'Di...",4.2,"392 Havelock Rd, Level 2 Grand Copthorne Water...",https://www.google.com/maps/place/Red+House+Se...,['https://lh5.googleusercontent.com/p/AF1QipNp...,"[1.2903687, 103.8352515]",...,3,South,5,My family and I had a gathering at Red House S...,2024-08-16,0,,5.0,0.0,False
3,ChIJYR6HCJ4Z2jERChIQxjl2MhI,Red House Seafood Grand Copthorne,1110,Seafood,"['Seafood restaurant', 'Asian restaurant', 'Di...",4.2,"392 Havelock Rd, Level 2 Grand Copthorne Water...",https://www.google.com/maps/place/Red+House+Se...,['https://lh5.googleusercontent.com/p/AF1QipNp...,"[1.2903687, 103.8352515]",...,3,South,5,Celebrated my MIL 70th birthday and pre-coordi...,2024-08-16,0,,3.0,14.0,False
4,ChIJYR6HCJ4Z2jERChIQxjl2MhI,Red House Seafood Grand Copthorne,1110,Seafood,"['Seafood restaurant', 'Asian restaurant', 'Di...",4.2,"392 Havelock Rd, Level 2 Grand Copthorne Water...",https://www.google.com/maps/place/Red+House+Se...,['https://lh5.googleusercontent.com/p/AF1QipNp...,"[1.2903687, 103.8352515]",...,3,South,5,Had a wonderful dinner here tonight. The highl...,2024-07-30,0,,714.0,2924.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1570458,ChIJT49EoY0R2jERS0Ll4u0WxzU,HuaLong Fishhead Steamboat,45,Steamboat,"['Steamboat restaurant', 'Cafe', 'Coffee shop']",3.7,"155 Bukit Batok Street 11, #01-324, Singapore ...",https://www.google.com/maps/place/HuaLong+Fish...,['https://lh5.googleusercontent.com/p/AF1QipPQ...,"[1.3482129, 103.7435755]",...,23,West,5,Having early CNY dinner with parents. Dishes t...,2023-08-29,0,,45.0,329.0,True
1570459,ChIJT49EoY0R2jERS0Ll4u0WxzU,HuaLong Fishhead Steamboat,45,Steamboat,"['Steamboat restaurant', 'Cafe', 'Coffee shop']",3.7,"155 Bukit Batok Street 11, #01-324, Singapore ...",https://www.google.com/maps/place/HuaLong+Fish...,['https://lh5.googleusercontent.com/p/AF1QipPQ...,"[1.3482129, 103.7435755]",...,23,West,4,"Among the zhichar restaurants, so far this is ...",2023-08-29,0,,99.0,960.0,True
1570460,ChIJT49EoY0R2jERS0Ll4u0WxzU,HuaLong Fishhead Steamboat,45,Steamboat,"['Steamboat restaurant', 'Cafe', 'Coffee shop']",3.7,"155 Bukit Batok Street 11, #01-324, Singapore ...",https://www.google.com/maps/place/HuaLong+Fish...,['https://lh5.googleusercontent.com/p/AF1QipPQ...,"[1.3482129, 103.7435755]",...,23,West,4,Nice and delicious food,2023-08-29,0,,1298.0,7767.0,True
1570461,ChIJT49EoY0R2jERS0Ll4u0WxzU,HuaLong Fishhead Steamboat,45,Steamboat,"['Steamboat restaurant', 'Cafe', 'Coffee shop']",3.7,"155 Bukit Batok Street 11, #01-324, Singapore ...",https://www.google.com/maps/place/HuaLong+Fish...,['https://lh5.googleusercontent.com/p/AF1QipPQ...,"[1.3482129, 103.7435755]",...,23,West,3,23 Apr 21 Having late dinner here; my wife ord...,2023-08-29,1,,604.0,1627.0,True


## Preprocessing for Text - RUN FROM HERE

In [121]:
restaurants_and_reviews.shape

(619630, 24)

### Drop unnecessary columns

In [139]:
# Columns carried through the text pipeline and shown with recommendations.
# 'longtitude' is spelled the way the column is named in the dataset.
columns_to_keep = [
    'place_id', 'name', 'review_text', 'main_rating', 'address',
    'link', 'review_photos', 'cuisine', 'latitude', 'longtitude',
]
restaurant_reviews = restaurants_and_reviews.filter(items=columns_to_keep)

In [140]:
restaurant_reviews

Unnamed: 0,place_id,name,review_text,main_rating,address,link,review_photos,cuisine,latitude,longtitude
0,ChIJYR6HCJ4Z2jERChIQxjl2MhI,Red House Seafood Grand Copthorne,Food is Wow,4.2,"392 Havelock Rd, Level 2 Grand Copthorne Water...",https://www.google.com/maps/place/Red+House+Se...,['https://lh5.googleusercontent.com/p/AF1QipNp...,Seafood,1.290369,103.835251
1,ChIJYR6HCJ4Z2jERChIQxjl2MhI,Red House Seafood Grand Copthorne,I always stay at the Grand Copthorne Waterfron...,4.2,"392 Havelock Rd, Level 2 Grand Copthorne Water...",https://www.google.com/maps/place/Red+House+Se...,['https://lh5.googleusercontent.com/p/AF1QipNp...,Seafood,1.290369,103.835251
2,ChIJYR6HCJ4Z2jERChIQxjl2MhI,Red House Seafood Grand Copthorne,My family and I had a gathering at Red House S...,4.2,"392 Havelock Rd, Level 2 Grand Copthorne Water...",https://www.google.com/maps/place/Red+House+Se...,['https://lh5.googleusercontent.com/p/AF1QipNp...,Seafood,1.290369,103.835251
3,ChIJYR6HCJ4Z2jERChIQxjl2MhI,Red House Seafood Grand Copthorne,Celebrated my MIL 70th birthday and pre-coordi...,4.2,"392 Havelock Rd, Level 2 Grand Copthorne Water...",https://www.google.com/maps/place/Red+House+Se...,['https://lh5.googleusercontent.com/p/AF1QipNp...,Seafood,1.290369,103.835251
4,ChIJYR6HCJ4Z2jERChIQxjl2MhI,Red House Seafood Grand Copthorne,Had a wonderful dinner here tonight. The highl...,4.2,"392 Havelock Rd, Level 2 Grand Copthorne Water...",https://www.google.com/maps/place/Red+House+Se...,['https://lh5.googleusercontent.com/p/AF1QipNp...,Seafood,1.290369,103.835251
...,...,...,...,...,...,...,...,...,...,...
619625,ChIJ0T6YJPkZ2jER4SLNFf4Zthg,炭香 Charcoal Fish Head Steamboat Kallang,"Almost 2 hours waiting time, order taken at 6....",4.1,"5 Kallang Pl, Singapore 339152",https://www.google.com/maps/place/%E7%82%AD%E9...,['https://lh5.googleusercontent.com/p/AF1QipM_...,Chinese,1.317298,103.867597
619626,ChIJ0T6YJPkZ2jER4SLNFf4Zthg,炭香 Charcoal Fish Head Steamboat Kallang,2nd time trying the fish head steamboat. Its t...,4.1,"5 Kallang Pl, Singapore 339152",https://www.google.com/maps/place/%E7%82%AD%E9...,['https://lh5.googleusercontent.com/p/AF1QipM_...,Chinese,1.317298,103.867597
619627,ChIJ0T6YJPkZ2jER4SLNFf4Zthg,炭香 Charcoal Fish Head Steamboat Kallang,The food here is most excellent and the servic...,4.1,"5 Kallang Pl, Singapore 339152",https://www.google.com/maps/place/%E7%82%AD%E9...,['https://lh5.googleusercontent.com/p/AF1QipM_...,Chinese,1.317298,103.867597
619628,ChIJ0T6YJPkZ2jER4SLNFf4Zthg,炭香 Charcoal Fish Head Steamboat Kallang,reserved in advance so wait wasnt long. added ...,4.1,"5 Kallang Pl, Singapore 339152",https://www.google.com/maps/place/%E7%82%AD%E9...,['https://lh5.googleusercontent.com/p/AF1QipM_...,Chinese,1.317298,103.867597


### Tokenize, remove stopwords, lemmatize

In [141]:
# Tokenise every review text with gensim's simple_preprocess.
restaurant_reviews['review_text_processed'] = restaurant_reviews['review_text'].map(simple_preprocess)

In [143]:
# Custom stop words added on top of NLTK's English list
custom_stopwords = {'good', 'food', 'service', 'great', 'nice', 'delicious', 'restaurant'}
# Complete stop-word set used for both the reviews and the user queries
stop_words = set(stopwords.words('english')) | custom_stopwords

In [144]:
# Filter the stop words out of every token list
restaurant_reviews['review_text_processed'] = [
    [token for token in tokens if token not in stop_words]
    for tokens in restaurant_reviews['review_text_processed']
]

In [145]:
# Reduce every remaining token to its lemma (default part of speech)
lemmatizer = WordNetLemmatizer()
restaurant_reviews['review_text_processed'] = [
    list(map(lemmatizer.lemmatize, tokens))
    for tokens in restaurant_reviews['review_text_processed']
]

### Keep only reviews with more than 1 and fewer than 2000 tokens (counted after stop-word removal and lemmatization)

In [146]:
# Number of tokens left in each review after preprocessing
token_counts = restaurant_reviews['review_text_processed'].map(len)
# Keep reviews with at least 2 and at most 1999 tokens
restaurant_reviews = restaurant_reviews[(token_counts > 1) & (token_counts < 2000)]

### WORD2VEC MODEL

In [148]:
# Train a Word2Vec model on the preprocessed reviews (one token list per review).
# NOTE(review): no hyper-parameters or seed are passed, so the library defaults
# apply and repeated runs may not produce identical vectors — confirm that this
# is acceptable for the recommendations below.
word2vec = Word2Vec(sentences=restaurant_reviews['review_text_processed'])
# Keyed vectors of the trained model; used for every word look-up in this notebook.
wv = word2vec.wv

In [149]:
word2vec.save("models/word2vec.model")

In [150]:
wv['steak']

array([ 1.2201053 ,  2.35349   , -1.7830801 , -2.3830266 , -1.8147022 ,
       -1.770769  , -1.0562875 ,  0.1649537 ,  0.47707847, -1.546321  ,
        0.19266199, -1.647951  , -0.67391646,  2.9131725 ,  1.2714548 ,
        0.6648646 , -2.0167744 ,  2.26974   ,  1.0821315 ,  1.1124654 ,
       -2.4741848 , -0.0400463 ,  1.0611336 , -0.37188995,  0.25694212,
        2.8762496 ,  0.02421719,  2.7301853 ,  0.95608544,  0.25084925,
       -3.924364  , -0.35590678,  3.887823  , -0.96357745, -0.3022145 ,
       -0.81779116,  0.8199119 ,  2.8712213 ,  0.56661516, -0.05339861,
        0.86431414,  0.9845978 , -1.8242651 ,  0.31040648, -0.6776101 ,
        1.5195351 , -1.3565949 , -3.679097  ,  1.0681506 ,  0.8809399 ,
       -1.5221616 , -2.001241  , -2.2756796 ,  1.0850888 ,  0.5696699 ,
        0.08361354,  0.21478595,  0.10023685,  0.09228053,  0.11309874,
        0.19037418,  2.3282592 , -2.725697  ,  0.20653066,  2.6366673 ,
       -0.8244838 ,  0.07522777, -1.2109039 , -1.6649383 ,  0.57

### Vectorize each review

In [151]:
def vectorize_reviews(df, vectors=None):
    """Build one embedding per review by averaging its word vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'name' column (strings) and a 'review_text_processed' column
        holding one list of tokens per row.
    vectors : mapping, optional
        Word-to-vector lookup that supports ``word in vectors`` and
        ``vectors[word]``. When omitted, the notebook-level ``wv`` is used,
        which is what the original implementation always did.

    Returns
    -------
    dict
        Maps ``name + str(index label)`` to the mean vector of the review's
        in-vocabulary tokens. Reviews without any in-vocabulary token are
        left out.
    """
    # Resolve the lookup lazily so the global is only touched when needed.
    lookup = wv if vectors is None else vectors
    place_vectors = {}

    for index, name, tokens in zip(df.index, df['name'], df['review_text_processed']):
        review_vectors = [lookup[word] for word in tokens if word in lookup]

        if review_vectors:
            # The index label is appended so several reviews of one
            # restaurant do not overwrite each other.
            place_vectors[name + str(index)] = np.mean(review_vectors, axis=0)

    return place_vectors

In [153]:
def get_restaurant(user_input, num, place_vectors):
    """Recommend the entries of ``place_vectors`` closest to a free-text query.

    The query goes through the same steps as the reviews (tokenise, remove
    stop words, lemmatise) so that its tokens match the model vocabulary, and
    the vectors of the known tokens are averaged into one query vector.

    Parameters
    ----------
    user_input : str
        Free-text request, e.g. "romantic dinner".
    num : int
        Maximum number of recommendations to return.
    place_vectors : dict
        Maps a label to its embedding (as produced by ``vectorize_reviews``).

    Returns
    -------
    dict
        Up to ``num`` entries mapping label to cosine similarity, ordered from
        most to least similar. Empty when no word of the query is known to
        the model, or when no similarity could be computed.
    """
    query_tokens = [
        lemmatizer.lemmatize(token)
        for token in simple_preprocess(user_input)
        if token not in stop_words
    ]
    input_vectors = [wv[token] for token in query_tokens if token in wv]

    # Without any known word there is no query vector; a zero vector would
    # turn every cosine similarity into NaN and make the ranking arbitrary.
    if not input_vectors:
        print("No recommendation: none of the words in the request is known to the model")
        return {}

    # Single vector representing the whole query
    input_vector = np.mean(input_vectors, axis=0)
    input_norm = np.linalg.norm(input_vector)  # loop-invariant

    # Cosine similarity against every candidate
    similarities = {}
    for name, place_vector in place_vectors.items():
        denominator = input_norm * np.linalg.norm(place_vector)
        if denominator == 0:
            # Cosine similarity is undefined for a zero-length vector.
            continue
        similarities[name] = np.dot(input_vector, place_vector) / denominator

    sorted_similarities = dict(sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:num])
    if not sorted_similarities:
        print("No recommendation: no similarity could be computed")
        return {}

    print(f"We recommend {', '.join(list(sorted_similarities.keys()))}")
    return sorted_similarities

In [155]:
place_vectors_by_review = vectorize_reviews(restaurant_reviews)

In [156]:
place_vectors_by_review

{'Red House Seafood Grand Copthorne1': array([ 0.7732335 , -0.06895153, -0.11595353, -0.5128661 , -0.0727202 ,
         0.23044397, -0.3439871 , -0.15140437,  0.11664367, -0.0236863 ,
         0.29932517,  0.2502884 ,  0.04261218, -0.5755049 , -0.08180389,
         0.080883  , -0.8845954 , -0.36068448,  0.16023181,  0.76421225,
         0.08970787,  0.03539443,  0.03400542, -0.17087202,  0.16507845,
        -0.7127745 , -0.0311998 , -0.19947971,  0.4669078 , -0.00479673,
        -0.20553936,  0.3319146 , -0.16641298,  0.39316523, -0.03458189,
         0.01369126,  0.30290022,  0.5352536 , -0.14952472,  0.64393294,
         0.22493602,  0.3619201 ,  0.2892058 ,  0.26512843,  0.19477199,
         0.88698745,  0.44278026,  0.64511216,  0.4907834 , -0.3354026 ,
         0.39718044, -0.45983925, -0.33837834, -0.5880849 , -0.01255637,
        -0.24340512,  0.9295671 ,  0.07291658,  0.1259548 , -0.24155952,
         0.8753986 ,  0.13770942,  0.26413405, -0.1961914 ,  0.21485525,
        -0.83

### Vectorize reviews as a whole for each restaurant

In [157]:
restaurant_reviews.columns

Index(['place_id', 'name', 'review_text', 'main_rating', 'address', 'link',
       'review_photos', 'cuisine', 'latitude', 'longtitude',
       'review_text_processed'],
      dtype='object')

In [158]:
# Pool the tokens of all reviews that belong to one outlet. Outlets are keyed
# by name plus coordinates because several outlets can share a name.
# A flat comprehension is used instead of sum(x, []): sum re-copies the
# accumulated list for every review, which is quadratic in the number of tokens.
combined_reviews = (
    restaurant_reviews
    .groupby(['name', 'latitude', 'longtitude'])['review_text_processed']
    .apply(lambda token_lists: [token for tokens in token_lists for token in tokens])
    .reset_index()
)

In [167]:
combined_reviews[combined_reviews['name'].duplicated(keep=False)].head(30)

Unnamed: 0,name,latitude,longtitude,review_text_processed
34,88 Hong Kong Roast Meat Specialist,1.283639,103.847859,"[roasted, meat, melt, mouth, char, siew, crisp..."
35,88 Hong Kong Roast Meat Specialist,1.315132,103.859742,"[char, siew, roast, pork, awesome, glaze, char..."
36,88 Hong Kong Roast Meat Specialist,1.315159,103.859753,"[place, dine, ate, year, back, remember, impre..."
78,Ah Yat Seafood Restaurant,1.288468,103.837375,"[chili, crab, terrible, crab, watery, asked, w..."
79,Ah Yat Seafood Restaurant,1.312151,103.874694,"[yummy, dim, sum, always, hong, kong, style, r..."
126,Anglo Indian Cafe & Bar,1.276801,103.852396,"[ambiance, fast, ordered, sangria, drink, jann..."
127,Anglo Indian Cafe & Bar,1.279125,103.850295,"[amazing, manpreet, greattt, dal, makhani, awe..."
128,Anglo Indian Cafe & Bar,1.294853,103.852751,"[excellent, special, shout, ali, super, friend..."
163,Ayam Penyet President,1.303581,103.872356,"[served, fast, tasted, ok, fantastic, environm..."
164,Ayam Penyet President,1.429265,103.836121,"[suck, outlet, greet, smile, thank, staff, aya..."


In [217]:
def vectorize_reviews_by_restaurant(df, vectors=None):
    """Average the word vectors of each row's pooled review tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'review_text_processed' column holding one token list per row.
    vectors : mapping, optional
        Word-to-vector lookup that supports ``word in vectors`` and
        ``vectors[word]``. When omitted, the notebook-level ``wv`` is used,
        which is what the original implementation always did.

    Returns
    -------
    list
        One mean vector per row that has at least one in-vocabulary token, in
        row order. Rows without such a token are reported on stdout and
        skipped, so the list can be shorter than ``df``; assigning it back as
        a column only works when nothing was skipped.
    """
    # Resolve the lookup lazily so the global is only touched when needed.
    lookup = wv if vectors is None else vectors
    place_vectors = []

    for index, tokens in df['review_text_processed'].items():
        review_vectors = [lookup[word] for word in tokens if word in lookup]

        if review_vectors:
            place_vectors.append(np.mean(review_vectors, axis=0))
        else:
            print(f"No review vectors generated for {index}")

    return place_vectors

In [218]:
place_vectors_by_restaurant = vectorize_reviews_by_restaurant(combined_reviews)

In [221]:
combined_reviews['review_vectors'] = place_vectors_by_restaurant

In [297]:
def get_restaurant_by_all_reviews(user_input, num, df):
    """Recommend the restaurants whose pooled reviews best match a query.

    The query goes through the same steps as the reviews (tokenise, remove
    stop words, lemmatise) so that its tokens match the model vocabulary, and
    the vectors of the known tokens are averaged into one query vector. Every
    row of ``df`` is then scored by cosine similarity.

    Parameters
    ----------
    user_input : str
        Free-text request, e.g. "spicy butter chicken".
    num : int
        Maximum number of recommendations to return.
    df : pandas.DataFrame
        Needs the columns 'name', 'latitude', 'longtitude' and
        'review_vectors' (one embedding per row).

    Returns
    -------
    pandas.DataFrame
        Columns 'name', 'similarity', 'latitude', 'longitude', ordered from
        most to least similar. Empty (same columns) when no word of the query
        is known to the model.
    """
    result_columns = ['name', 'similarity', 'latitude', 'longitude']

    query_tokens = [
        lemmatizer.lemmatize(token)
        for token in simple_preprocess(user_input)
        if token not in stop_words
    ]
    input_vectors = [wv[token] for token in query_tokens if token in wv]

    # Without any known word there is no query vector; a zero vector would
    # turn every cosine similarity into NaN and make the ranking arbitrary.
    if not input_vectors:
        print("\nNo recommendation: none of the words in the request is known to the model")
        return pd.DataFrame(columns=result_columns)

    # Single vector representing the whole query
    input_vector = np.mean(input_vectors, axis=0)
    input_norm = np.linalg.norm(input_vector)  # loop-invariant

    # Cosine similarity against every restaurant, keyed by index label
    similarities = {}
    for index, review_vector in df['review_vectors'].items():
        denominator = input_norm * np.linalg.norm(review_vector)
        if denominator == 0:
            # Cosine similarity is undefined for a zero-length vector.
            continue
        similarities[index] = np.dot(input_vector, review_vector) / denominator

    top_matches = sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:num]

    # Build the result in one go instead of concatenating row by row.
    similarity_df = pd.DataFrame(
        [
            {
                'name': df.loc[key, 'name'],
                'similarity': value,
                'latitude': df.loc[key, 'latitude'],
                # The source column is spelled 'longtitude' in the dataset.
                'longitude': df.loc[key, 'longtitude'],
            }
            for key, value in top_matches
        ],
        columns=result_columns,
    )

    print("\nWe recommend:")
    for position, place_name in enumerate(similarity_df['name'], start=1):
        print(f"{position}. {place_name}")

    return similarity_df

### Get Restaurant by Keywords

In [298]:
%time
get_restaurant_by_all_reviews("Banh Mi", 10, combined_reviews)

CPU times: user 4 μs, sys: 1 μs, total: 5 μs
Wall time: 10 μs

We recommend:
1. Joo Chiat Banh Mi Ca Phe
2. Banh Mi 233
3. The Viet Roti @ Joo Chiat
4. Nhung Kitchen - Vietnamese Banh Mi
5. Banh Mi Thit by Star Baguette
6. Banh Mi Saigon
7. Miss Saigon Singapore - Orchard Plaza
8. Bami Express
9. Co Hai Banh Mi & Phở Vietnamese Restaurant
10. La Saigon


Unnamed: 0,name,similarity,latitude,longitude
0,Joo Chiat Banh Mi Ca Phe,0.68523,1.310403,103.901791
1,Banh Mi 233,0.680937,1.312772,103.900092
2,The Viet Roti @ Joo Chiat,0.656748,1.310314,103.901645
3,Nhung Kitchen - Vietnamese Banh Mi,0.656565,1.322767,103.851969
4,Banh Mi Thit by Star Baguette,0.653965,1.313885,103.885366
5,Banh Mi Saigon,0.648717,1.374948,103.849804
6,Miss Saigon Singapore - Orchard Plaza,0.641778,1.301025,103.841099
7,Bami Express,0.611388,1.276257,103.842931
8,Co Hai Banh Mi & Phở Vietnamese Restaurant,0.593714,1.301626,103.861901
9,La Saigon,0.582709,1.312237,103.924362


In [299]:
%time
get_restaurant_by_all_reviews("romantic dinner", 10, combined_reviews)

CPU times: user 3 μs, sys: 1e+03 ns, total: 4 μs
Wall time: 8.82 μs

We recommend:
1. The Fullerton Pavillion
2. OUE Tower
3. Las Palmas Rooftop Bar
4. Boulevard ASQ
5. Lin Rooftop Bar
6. Artemis Grill & Sky Bar
7. FRY Rooftop Bistro & Bar
8. Flutes
9. SKAI Bar
10. Dolce Vita


Unnamed: 0,name,similarity,latitude,longitude
0,The Fullerton Pavillion,0.638229,1.284579,103.853965
1,OUE Tower,0.416677,1.283501,103.853136
2,Las Palmas Rooftop Bar,0.372081,1.320585,103.843232
3,Boulevard ASQ,0.366021,1.278341,103.850884
4,Lin Rooftop Bar,0.363451,1.285053,103.834125
5,Artemis Grill & Sky Bar,0.361073,1.281803,103.850125
6,FRY Rooftop Bistro & Bar,0.351301,1.281642,103.845484
7,Flutes,0.350947,1.297346,103.857383
8,SKAI Bar,0.327739,1.29336,103.853104
9,Dolce Vita,0.322796,1.290691,103.858311


In [300]:
%time
get_restaurant_by_all_reviews("good chilli chicken rice", 10, combined_reviews)

CPU times: user 3 μs, sys: 1 μs, total: 4 μs
Wall time: 9.06 μs

We recommend:
1. Feng Sheng Kampong Chicken Rice & Steamboat (24 Hours) | 豐盛滑鸡火锅
2. Wee Nam Kee Hainanese Chicken Rice Restaurant
3. Kap Kun Kap Thai & Seafood
4. Jade’s Chicken (옥‘s Chicken)
5. SoupeRich
6. Selera Restaurant
7. Yong Kee Seafood Restaurant
8. 窝窝店 Wo Wo Dian
9. 8 degrees 8度空間-人文茶館 @Jln Besar
10. 3 Treasures Vegetarian AMK 三寶齋素食 @ 422 宏茂桥3道


Unnamed: 0,name,similarity,latitude,longitude
0,Feng Sheng Kampong Chicken Rice & Steamboat (2...,0.800857,1.302037,103.850361
1,Wee Nam Kee Hainanese Chicken Rice Restaurant,0.79059,1.317031,103.843873
2,Kap Kun Kap Thai & Seafood,0.777574,1.484843,103.946783
3,Jade’s Chicken (옥‘s Chicken),0.769814,1.338124,103.845794
4,SoupeRich,0.765672,1.279671,103.846773
5,Selera Restaurant,0.76291,1.304411,103.849797
6,Yong Kee Seafood Restaurant,0.755746,1.304693,103.854723
7,窝窝店 Wo Wo Dian,0.754488,1.294283,103.852949
8,8 degrees 8度空間-人文茶館 @Jln Besar,0.749993,1.312144,103.859939
9,3 Treasures Vegetarian AMK 三寶齋素食 @ 422 宏茂桥3道,0.749363,1.367875,103.851215


In [301]:
%time
get_restaurant_by_all_reviews("spicy butter chicken", 10, combined_reviews)

CPU times: user 3 μs, sys: 2 μs, total: 5 μs
Wall time: 10 μs

We recommend:
1. Jade’s Chicken (옥‘s Chicken)
2. Chicken Up Korean Restaurant
3. Abang Dol
4. Arcade Fish Soup
5. Tong Fu Ju Sichuan Restaurant
6. Feng Sheng Kampong Chicken Rice & Steamboat (24 Hours) | 豐盛滑鸡火锅
7. Fi Woodfire Thai | Thai Restaurant (Robertson Quay) | Signature "Moo Ping" Smoked Pork Rib
8. Kap Kun Kap Thai & Seafood
9. Daawat Tandoori
10. basdban


Unnamed: 0,name,similarity,latitude,longitude
0,Jade’s Chicken (옥‘s Chicken),0.774087,1.338124,103.845794
1,Chicken Up Korean Restaurant,0.772028,1.279275,103.843749
2,Abang Dol,0.761768,1.315717,103.897703
3,Arcade Fish Soup,0.756483,1.277507,103.852584
4,Tong Fu Ju Sichuan Restaurant,0.753414,1.286991,103.849401
5,Feng Sheng Kampong Chicken Rice & Steamboat (2...,0.747128,1.302037,103.850361
6,Fi Woodfire Thai | Thai Restaurant (Robertson ...,0.74372,1.29199,103.841688
7,Kap Kun Kap Thai & Seafood,0.736464,1.484843,103.946783
8,Daawat Tandoori,0.736091,1.352068,103.835182
9,basdban,0.729389,1.282996,103.849145


### Run the cells below to compare against the earlier method, which scores each review individually

In [199]:
%time
get_restaurant("Banh Mi", 10, place_vectors_by_review)

CPU times: user 3 μs, sys: 1 μs, total: 4 μs
Wall time: 8.11 μs
We recommend Viet & Beans Bistro129851, La Saigon310232, Banh Mi Saigon366104, Banh Mi Saigon366679, Joo Chiat Banh Mi Ca Phe393558, Joo Chiat Banh Mi Ca Phe393849, Nhung Kitchen - Vietnamese Banh Mi393891, Nhung Kitchen - Vietnamese Banh Mi393925, Nhung Kitchen - Vietnamese Banh Mi393998, Nhung Kitchen - Vietnamese Banh Mi394047


{'Viet & Beans Bistro129851': 1.0,
 'La Saigon310232': 1.0,
 'Banh Mi Saigon366104': 1.0,
 'Banh Mi Saigon366679': 1.0,
 'Joo Chiat Banh Mi Ca Phe393558': 1.0,
 'Joo Chiat Banh Mi Ca Phe393849': 1.0,
 'Nhung Kitchen - Vietnamese Banh Mi393891': 1.0,
 'Nhung Kitchen - Vietnamese Banh Mi393925': 1.0,
 'Nhung Kitchen - Vietnamese Banh Mi393998': 1.0,
 'Nhung Kitchen - Vietnamese Banh Mi394047': 1.0}

In [101]:
%time
get_restaurant("romantic dinner", 10, place_vectors_by_review)

CPU times: user 3 μs, sys: 1e+03 ns, total: 4 μs
Wall time: 10 μs
We recommend Shisen Hanten by Chen Kentaro38798, elemen 元素598270, elemen 元素598422, Restaurant Espoir521907, Restaurant Espoir522177, D.O.P Mozzarella Bar & Restaurant475308, The Curry Club Signature175403, The Curry Club Signature175844, SKAI Restaurant5296, CANCHITA Peruvian Cuisine at Dempsey Hill - Best Ceviche in Singapore415726


{'Shisen Hanten by Chen Kentaro38798': 0.8566773,
 'elemen 元素598270': 0.8448358,
 'elemen 元素598422': 0.8448358,
 'Restaurant Espoir521907': 0.8387911,
 'Restaurant Espoir522177': 0.8387911,
 'D.O.P Mozzarella Bar & Restaurant475308': 0.838257,
 'The Curry Club Signature175403': 0.8378647,
 'The Curry Club Signature175844': 0.8378647,
 'SKAI Restaurant5296': 0.8309262,
 'CANCHITA Peruvian Cuisine at Dempsey Hill - Best Ceviche in Singapore415726': 0.8290441}

In [102]:
%time
get_restaurant("romantic", 10, place_vectors_by_review)

CPU times: user 4 μs, sys: 1 μs, total: 5 μs
Wall time: 10 μs
We recommend La Braceria Pizza & Grill215870, Le Jardin: Café | Pets friendly | Fort canning | Tree Tunnel | Events Space | Flowers | Coffee221501, Le Jardin: Café | Pets friendly | Fort canning | Tree Tunnel | Events Space | Flowers | Coffee221897, Café de Paris422102, Bella Pizza371704, Kuan Zhai Alley 宽窄巷子 (Szechuan Cuisine Restaurant）516492, Kuan Zhai Alley 宽窄巷子 (Szechuan Cuisine Restaurant）517308, Spago Dining Room52137, Arab Street Turkish & Western Restaurant342210, Arab Street Turkish & Western Restaurant342850


{'La Braceria Pizza & Grill215870': 0.77871037,
 'Le Jardin: Café | Pets friendly | Fort canning | Tree Tunnel | Events Space | Flowers | Coffee221501': 0.7753472,
 'Le Jardin: Café | Pets friendly | Fort canning | Tree Tunnel | Events Space | Flowers | Coffee221897': 0.7753472,
 'Café de Paris422102': 0.7720444,
 'Bella Pizza371704': 0.7669637,
 'Kuan Zhai Alley 宽窄巷子 (Szechuan Cuisine Restaurant）516492': 0.76167345,
 'Kuan Zhai Alley 宽窄巷子 (Szechuan Cuisine Restaurant）517308': 0.76167345,
 'Spago Dining Room52137': 0.75450766,
 'Arab Street Turkish & Western Restaurant342210': 0.74698985,
 'Arab Street Turkish & Western Restaurant342850': 0.74698985}

## Ignore below

In [None]:
# Scratch version of the per-review vectoriser, keyed by the bare restaurant name.
# NOTE(review): because the key is only the name, every later review of the same
# restaurant overwrites the earlier entry, so a single review survives per name.
place_vectors = {}

for index, row in restaurants_and_reviews.iterrows():
    name = row['name']
    # Vectors of the tokens of this review that exist in the model vocabulary.
    # NOTE(review): 'review_text_processed' is created on restaurant_reviews in
    # this notebook; verify that restaurants_and_reviews carries it as well.
    review_vectors = [wv[word] for word in row['review_text_processed'] if word in wv]

    # Rows without any in-vocabulary token are skipped.
    if review_vectors:
        mean_review_vectors = np.mean(review_vectors, axis=0)
        place_vectors[name] = mean_review_vectors

In [49]:
user_input = "I want to have nice juicy steak"
tokenized_input = simple_preprocess(user_input)

In [51]:
tokenized_input = [w for w in tokenized_input if not w in stop_words]

In [52]:
tokenized_input

['want', 'nice', 'juicy', 'steak']

In [59]:
input_vectors = []
for word in tokenized_input:
    if word in wv:
        input_vectors.append(wv[word])

input_vectors

[array([ 1.676361  , -0.6445835 , -4.2489634 ,  0.98864573, -0.50131005,
         1.0577774 ,  0.13475527, -0.03096577, -0.7752183 , -1.1102387 ,
        -0.11977655, -1.9664255 ,  2.4896073 , -0.5031362 , -1.3480862 ,
        -0.22032644, -0.06110421, -0.5028213 ,  2.0624235 ,  2.606833  ,
         2.2621415 ,  0.02872669, -1.3078108 , -1.5321403 , -0.64283395,
         1.093639  ,  0.26739365, -1.3188336 ,  2.4456618 , -1.6789458 ,
        -0.10863965, -2.1880002 , -2.1737626 , -0.5732178 , -2.0356066 ,
         0.56295925, -0.7061878 ,  0.75368524, -0.23289582, -0.8373885 ,
        -1.2836169 ,  3.1915872 , -0.13366322, -3.2336376 ,  1.8992177 ,
         2.0433745 , -0.18698043,  0.7435844 ,  1.0948602 , -0.16972427,
        -0.3165312 , -1.7217358 ,  0.46773317,  1.1183101 ,  1.8281848 ,
         0.7061278 , -0.45334908, -2.8871098 ,  0.23750968, -0.26083842,
        -2.5709796 , -1.8439145 ,  2.1566699 ,  0.34540388,  0.8704072 ,
        -1.5830318 , -2.3998954 ,  0.19767314,  0.1

In [63]:
# Get a single vector that represents the entire sentence by averaging the embeddings of the individual words
if input_vectors:
    input_vector = np.mean(input_vectors, axis=0)
else:
    input_vector = np.zeros(word2vec.vector_size)

In [83]:
# Cosine similarity between the query vector and every entry of place_vectors.
# NOTE(review): if input_vector is the all-zero fallback from the previous cell,
# the denominator is 0 and every similarity becomes NaN.
similarities = {}
for name, place_vector in place_vectors.items():
    similarity = np.dot(input_vector, place_vector) / (np.linalg.norm(input_vector) * np.linalg.norm(place_vector))
    similarities[name] = similarity

# Key with the highest similarity value
most_similar_place = max(similarities, key=similarities.get)

print(f"Recommended place: {most_similar_place}")

Recommended place: Kazu Sumiyaki Restaurant


{'Red House Seafood Grand Copthorne': 0.102719545,
 'Red House Seafood Nanyang (Clarke Quay)': 0.05724991,
 'Red House Seafood at Esplanade': 0.16979772,
 'Red Sparrow': 0.6065481,
 'Red Star Restaurant': 0.09804,
 'RedChillies Indian Cuisine': -0.14032133,
 'Regal Golden Steamboat': 0.50380224,
 'Regal Restaurant Group': 0.29674304,
 'Rempah by Chilli Padi': 0.1207284,
 'Rempapa': 0.39039922,
 'Rendezvous Restaurant Hock Lock Kee': 0.3123872,
 'Restaurant Aisyah Halal Chinese XinJiang Cuisine 西北香 中国新疆餐厅': 0.40816465,
 'Restaurant Born': 0.50175446,
 'Restaurant Chaleur': -0.13032255,
 'Restaurant Espoir': 0.51592237,
 'Restaurant Euphoria': 0.057768695,
 'Restaurant Fiz': 0.24262887,
 'Restaurant Ibid': 0.1262138,
 'Restaurant Imbue': 0.30925405,
 'Restaurant Início': 0.32492417,
 'Restaurant JAG': -0.07638022,
 'Restaurant Khiri': 0.18158919,
 'Restaurant Matera': 0.25427827,
 'Restaurant PeraMakan @ Owen Road': 0.41795507,
 'Restaurant Poise': 0.5430799,
 'Restaurant Reve - Singapor

In [94]:
sorted_similarities = dict(sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:10])
print(f"We recommend {', '.join(list(sorted_similarities.keys()))}")

We recommend Kazu Sumiyaki Restaurant, Fatt Choy Eating House [發財餐馆]: Modern Singapore Comfort Food (Bugis - Haji Lane), Waa Cow! Marina One, Josh's Grill - Bugis Junction, Fat Cow - Japanese Wagyu @ Camden Medical Centre, All Things Delicious., Aburi-EN, Ikkiro SG, Two Blur Guys, Turkish Kebab & Grill


In [None]:
# Scratch attempt at collapsing consecutive reviews of one restaurant into a
# single token list while dropping the now-redundant rows.
# NOTE(review): `reviews` is rebound to a list here, replacing the reviews
# DataFrame of the same name created earlier in the notebook.
reviews = []
name = restaurants_and_reviews.iloc[0]['name'] # set this as the first name in the dataset
df = restaurant_reviews[:100000]
beginning_index = 0

for index, row in restaurant_reviews[:100000].iterrows():
    if name == row['name']:
        print(f"Now adding a review for {index} - {row['name']}")
        reviews += row['review_text_processed']
        
        # Need to catch the last row
        # NOTE(review): the tokens of this row were already appended just above,
        # so the last row is added a second time here.
        if index == df.index[-1]:
            print(f"Now adding LAST review set for {index} - {row['name']}")
            reviews += row['review_text_processed']
    else:
        # Set reviews as the value of row['review_text_processed']
        # NOTE(review): this assigns to the row object yielded by iterrows;
        # presumably that is a copy, so `df` would not be updated — verify.
        print(f"Now saving all reviews for {row['name']} at location {index}")
        row['review_text_processed'] = reviews
        
        # Drop rows up to the index
        # NOTE(review): the list holds two labels (first and last), not the
        # whole range between them, so only those two rows are dropped.
        print(f"Now dropping rows for {row['name']} from {beginning_index} to {index-1}")
        df = df.drop(index=[beginning_index, index-1])
        
        # Reset name
        name = row['name']        
        # Reset reviews
        # NOTE(review): the tokens of the current row (first review of the new
        # name) are not added to the fresh list.
        reviews = []
        # Set beginning_index
        beginning_index = index + 1

In [None]:
def vectorize_by_place():
    """Average the word vectors of consecutive reviews that share a name.

    Reads the first 10000 rows of the notebook-level ``restaurants_and_reviews``
    frame and uses the notebook-level ``wv`` for word look-ups. Rows are
    grouped by runs of identical 'name' values, so the frame is expected to be
    ordered by restaurant; if a name appears in several separate runs, the
    last run wins.

    Compared with the first draft, the review on which the name changes is
    now kept (it used to be discarded), a restaurant that starts on the final
    row is processed, and the per-row debug printing is gone.

    Returns
    -------
    dict
        Maps restaurant name to the mean vector of all in-vocabulary tokens of
        its run of reviews. Names without any in-vocabulary token are left out.
    """
    place_vectors = {}
    df = restaurants_and_reviews[:10000]

    def _store(place_name, tokens):
        # Average the vectors of the in-vocabulary tokens; skip empty results.
        review_vectors = [wv[word] for word in tokens if word in wv]
        if review_vectors:
            place_vectors[place_name] = np.mean(review_vectors, axis=0)

    name = None
    reviews = []

    for index, row in df.iterrows():
        if row['name'] != name:
            # A new restaurant starts: flush the tokens collected so far.
            if name is not None:
                _store(name, reviews)
            name = row['name']
            reviews = []
        reviews += row['review_text_processed']

    # Flush the final restaurant, which has no following row to trigger it.
    if name is not None:
        _store(name, reviews)

    return place_vectors