# Sketch Notebook

In [53]:
import json
import numpy as np
import pandas as pd

# Parse the JSON response: collect each review's text snippet and its rating
# into two parallel lists that keep the order of the file.
# NOTE(review): assumes the file holds a JSON array of objects that each have
# 'snippet' and 'rating' keys (a missing key raises KeyError) — confirm schema.
with open('./json/butcher-reviews.json', encoding='utf-8') as f:
    # Explicit UTF-8: the snippets contain non-ASCII punctuation (e.g. ’) and
    # the platform default encoding is not UTF-8 everywhere.
    reviews_temp = json.load(f)

# The file is fully parsed at this point, so the handle is already closed
# while the two lists are built.
reviews = [review['snippet'] for review in reviews_temp]
ratings = [review['rating'] for review in reviews_temp]

In [54]:
# Sanity check: print both list lengths (they should match), then peek at the
# first three review snippets, each followed by a blank line.
print(len(reviews), len(ratings), sep='\n')
for review in reviews[:3]:
    print(review + '\n')

176
176
I don’t know where to begin. The staff is extremely friendly and professional. Very knowledgeable and helpful. We are not very accustomed to this kind of restaurant, and they helped us pick items and the right wine. Steaks were perfectly cooked. Sides were phenomenal. The cheesecake is to die for. Highly recommend for a special night out.

Trendy and chic, incredibly decadent and lavish. Great place to celebrate a special occasion. Of course amazing steaks as you’d expect. The ceviche was amazing, and the cheesecake dessert was a work of art! Amazing cocktail options and incredible variety of bourbon. Lots of rare options that you can’t find elsewhere.

Fantastic Steak and Seafood Restaurant. Service was impeccable, food was outstanding. Drinks and wine were top notch. One of the best restaurants in Toronto. It's a hidden gem in the city. Minutes away from Scotiabank Arena at Bay and Harbour Streets. Great for business meetings, date nights or out with friends. They have an out

In [55]:
from turtle import distance
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Settings for candidate extraction
stop_words = "english"
n_gram_range = (1, 1)  # (lower bound, upper bound) of the n-gram size; (1, 1) = single words only

# Build the candidate vocabulary: every n-gram found in the reviews that
# survives the stop-word filter.
count = CountVectorizer(stop_words=stop_words, ngram_range=n_gram_range)
count.fit(reviews)
candidates = count.get_feature_names_out()

# Custom keywords; these can be passed from the FE in the future.
custom_kws = ['quite', 'intimate', 'dim', 'waygu']

# Turn both the reviews and the candidate keywords/keyphrases into numerical
# vectors with a pre-trained BERT sentence encoder, so they can be compared.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
review_embeddings = model.encode(reviews)
candidate_embeddings = model.encode(candidates)

In [68]:
# Rank every candidate keyword by its average cosine similarity to the reviews.
top_n = 10

# One row per candidate, one column per review. Despite the variable name,
# these values are similarities (higher = closer), not distances.
distances = cosine_similarity(candidate_embeddings, review_embeddings)

# Average over the review axis -> a single score per candidate keyword.
mean_distances = distances.mean(axis=1)

# argsort is ascending, so the last top_n positions are the best-scoring
# candidates; the strongest match ends up last in the list.
keywords = [candidates[index] for index in np.argsort(mean_distances)[-top_n:]]
print(f'top {top_n} keywords extracted:\n {keywords}')

top 10 keywords extracted:
 ['chef', 'appetizers', 'dinner', 'deliciousness', 'culinary', 'chefs', 'dinners', 'flavoursome', 'flavorful', 'tasty']


In [57]:
# Diversify the returned keywords with Maximal Marginal Relevance (MMR)
def mmr(doc_embeddingss, word_embeddings, words, top_n, diversity):
    '''Select up to ``top_n`` keywords with Maximal Marginal Relevance (MMR).

    The first keyword is the word with the highest mean cosine similarity to
    the documents. Every further pick maximises
    ``(1 - diversity) * relevance - diversity * redundancy``, where relevance
    is the word's mean similarity to the documents and redundancy is its
    highest similarity to any keyword already chosen.

    Parameters
    ----------
    doc_embeddingss : 2-D array, one row per document (review). The doubled
        "s" in the name is kept because callers pass it by keyword.
    word_embeddings : 2-D array, one row per entry of ``words``.
    words : sequence of candidate keywords, aligned with ``word_embeddings``.
    top_n : number of keywords wanted. Values larger than ``len(words)`` are
        clamped; values <= 0 yield an empty list.
    diversity : weight of the redundancy penalty. Higher diversity (0 - 1) =
        more diverse.

    Returns
    -------
    list
        The selected keywords in the order they were picked.
    '''
    n_words = len(words)
    # Never try to pick more keywords than there are candidates: the selection
    # loop below would otherwise call max/argmax on an empty array and raise.
    n_select = min(top_n, n_words)
    if n_select <= 0:
        return []

    # Relevance of each word to the documents, and word-to-word similarity
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embeddingss)
    word_doc_similarity_mean = np.mean(word_doc_similarity, axis=1)
    word_similarity = cosine_similarity(word_embeddings)

    # Seed the selection with the single most relevant word
    keywords_idx = [int(np.argmax(word_doc_similarity_mean))]
    candidates_idx = [i for i in range(n_words) if i != keywords_idx[0]]

    for _ in range(n_select - 1):
        # Relevance of the remaining candidates, and how close each one is to
        # its nearest already-selected keyword
        candidate_similarities = word_doc_similarity_mean[candidates_idx]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # MMR score; named mmr_scores so the function name is not shadowed
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities
        mmr_idx = candidates_idx[int(np.argmax(mmr_scores))]

        # Move the winner from the candidate pool to the selection
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

# Request the notebook-level top_n (instead of a separate literal) so the
# header printed below always matches the number of keywords asked for.
kws = mmr(
    doc_embeddingss=review_embeddings,
    word_embeddings=candidate_embeddings,
    words=candidates,
    top_n=top_n,
    diversity=0.2,
)
print(f'top {top_n} keywords extracted:\n {kws}')

top 10 keywords extracted:
 ['tasty', 'steakhouses', 'waiter', 'foodie', 'sauvignon', 'dinners', 'rich', 'chefs', 'flavoursome', 'appetizers']


In [58]:
# Add the custom keywords (FE user input) to the extracted keyword list,
# skipping any that are already present.
for kw in custom_kws:
    if kw not in kws:
        kws.append(kw)

# Count how often each keyword occurs across all reviews and store the counts
# in a dict keyed by the keyword as given.
# The review text is lower-cased, so the keyword is lower-cased for matching
# too; otherwise a user-supplied keyword containing capitals would always
# count 0.
# Note: str.count() counts substring occurrences, so a keyword also matches
# inside longer words.
concat_reviews = ' '.join(reviews).lower()
kws_cnt = {kw: concat_reviews.count(kw.lower()) for kw in kws}

kws_cnt



{'tasty': 4,
 'steakhouses': 3,
 'waiter': 7,
 'foodie': 2,
 'sauvignon': 1,
 'dinners': 1,
 'rich': 2,
 'chefs': 3,
 'flavoursome': 1,
 'appetizers': 4,
 'quite': 4,
 'intimate': 7,
 'dim': 0,
 'waygu': 2}

In [94]:
def find_similar_kw(kw, doc_embeddings, word_embeddings, words, top_n=100, n_similar=10, encoder=None) -> list:
    '''
    Find the keywords closest in meaning to ``kw`` among the ``top_n``
    candidate words that are most representative of the documents.

    Parameters
    ----------
    kw : the input keyword (a string) to find neighbours for.
    doc_embeddings : 2-D array, one row per document (review).
    word_embeddings : 2-D array, one row per entry of ``words``.
    words : sequence of candidate keywords, aligned with ``word_embeddings``.
    top_n : how many of the most document-representative words to search in.
        Values <= 0 yield an empty list.
    n_similar : maximum number of similar keywords to return (default 10).
        Values <= 0 yield an empty list.
    encoder : object with an ``encode(list_of_str)`` method used to embed
        ``kw`` and the shortlisted words. Defaults to the notebook-level
        ``model``.

    Returns
    -------
    list
        Up to ``n_similar`` words in ascending order of similarity to ``kw``,
        i.e. the closest match is the last element.
    '''
    # Guard the degenerate cases explicitly: a slice such as [-0:] would
    # otherwise return every element instead of none.
    if top_n <= 0 or n_similar <= 0 or len(words) == 0:
        return []
    if encoder is None:
        encoder = model  # fall back to the encoder defined at notebook level

    # Mean similarity between every candidate word and the documents
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embeddings)
    word_doc_similarity_mean = np.mean(word_doc_similarity, axis=1)

    # Shortlist the top_n words most representative of the documents
    keywords = [words[index] for index in word_doc_similarity_mean.argsort()[-top_n:]]

    # Embed the input keyword and compare it with every shortlisted word
    kw_emb = encoder.encode([kw])
    kw_similarity = cosine_similarity(encoder.encode(keywords), kw_emb).reshape(-1,)

    # argsort is ascending, so the tail holds the most similar words
    similar_kw = [keywords[index] for index in kw_similarity.argsort()[-n_similar:]]

    return similar_kw

# Look up the keywords closest in meaning to the custom keyword 'dim' among
# the 100 candidates most representative of the reviews.
find_similar_kw(
    kw='dim',
    doc_embeddings=review_embeddings,
    word_embeddings=candidate_embeddings,
    words=candidates,
    top_n=100,
)

['pleasure',
 'flavour',
 'complimentary',
 'tasted',
 'concise',
 'tastes',
 'ambience',
 'impeccable',
 'timely',
 'seasoned']