In [5]:
import numpy as np
import pandas as pd 
import scipy.sparse
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize

## Load the Yelp review and business CSVs (from the Kaggle Yelp dataset)

In [7]:
# TODO: double-check that the ids in these CSVs match our own dataset.
# The two reads are independent; business metadata is loaded first because
# it is the frame previewed at the end of the cell.
business_df = pd.read_csv('yelp-dataset/yelp_business.csv')
review_df = pd.read_csv('yelp-dataset/yelp_review.csv')
business_df.head()

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",,"""4855 E Warner Rd, Ste B9""",Ahwatukee,AZ,85044,33.33069,-111.978599,4.0,22,1,Dentists;General Dentistry;Health & Medical;Or...
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",,"""3101 Washington Rd""",McMurray,PA,15317,40.291685,-80.1049,3.0,11,1,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",,"""6025 N 27th Ave, Ste 1""",Phoenix,AZ,85017,33.524903,-112.11531,1.5,18,1,Departments of Motor Vehicles;Public Services ...
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",,"""5000 Arizona Mills Cr, Ste 435""",Tempe,AZ,85282,33.383147,-111.964725,3.0,9,0,Sporting Goods;Shopping
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",,"""581 Howe Ave""",Cuyahoga Falls,OH,44221,41.119535,-81.47569,3.5,116,1,American (New);Nightlife;Bars;Sandwiches;Ameri...


## Limit to Arizona Restaurants

In [8]:
# Keep only businesses located in Arizona.
arizona_business_df = business_df[business_df['state'] == 'AZ']

# Keep only businesses whose category string mentions "Restaurant".
# `na=False` treats a missing `categories` value as "not a restaurant";
# without it the mask contains NaN and boolean indexing raises.
is_restaurant = arizona_business_df['categories'].str.contains("Restaurant", na=False)
arizona_rest_df = arizona_business_df[is_restaurant]
arizona_rest_df.head()

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
45,rDMptJYWtnMhpQu_rRXHng,"""McDonald's""",,"""719 E Thunderbird Rd""",Phoenix,AZ,85022,33.60707,-112.064382,1.0,10,1,Fast Food;Burgers;Restaurants
46,1WBkAuQg81kokZIPMpn9Zg,"""Charr An American Burger Bar""",,"""777 E Thunderbird Rd, Ste 107""",Phoenix,AZ,85022,33.60731,-112.063404,3.0,232,1,Burgers;Restaurants
72,iPa__LOhse-hobC2Xmp-Kw,"""McDonald's""",,"""1635 E Camelback Rd""",Phoenix,AZ,85016,33.508765,-112.04624,3.0,34,1,Restaurants;Burgers;Fast Food
80,kKx8iCJkomVQBdWHnmmOiA,"""Little Caesars Pizza""",,"""10720 E Southern Ave""",Mesa,AZ,85209,33.394877,-111.600194,2.5,4,1,Restaurants;Pizza
88,YhV93k9uiMdr3FlV4FHjwA,"""Caviness Studio""",,"""""",Phoenix,AZ,85001,33.449967,-112.070223,5.0,4,1,Marketing;Men's Clothing;Restaurants;Graphic D...


## Concatenate all reviews for each restaurant

In [11]:
# Attach every review to its Arizona restaurant (inner join on business_id),
# then collapse all review texts of a restaurant into one space-separated string.
az_review_df = arizona_rest_df.merge(review_df, how='inner', on='business_id')
texts_by_business = az_review_df.groupby('business_id')['text']
review_grouped = texts_by_business.apply(' '.join).reset_index()

## Add business names to dataframe

In [12]:
# Look-up table of business_id -> name, joined onto the concatenated reviews.
name_df = arizona_rest_df[['business_id', 'name']]
review_with_names = review_grouped.merge(name_df, how='inner', on='business_id')
review_with_names.head()

Unnamed: 0,business_id,text,name
0,--g-a85VwrdZJNf0R95GcQ,"My Daughter, grandsons and I were looking for ...","""Kabab House"""
1,-01XupAWZEXbdNbxNg5mEg,Horrible! Had the happy hour nachos. The che...,"""18 Degrees Neighborhood Grill"""
2,-050d_XIor1NpCuWkbIVaQ,So until today I didn't know there was a lot o...,"""Matt's Big Breakfast"""
3,-092wE7j5HZOogMLAh40zA,The food here is kinda mediocre. Not horrible ...,"""Wong's Jr"""
4,-0Sgh0QlUKVsWosCWJzGqQ,Despite the doubts you may have about Domino's...,"""Domino's Pizza"""


In [13]:
# Shared NLP helpers: an English Snowball stemmer and a tokenizer that keeps
# runs of ASCII letters and apostrophes.
snowball = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def tokenize(text):
    """Lower-case `text`, split it with `tokenizer`, and stem every token.

    Returns a list with one stemmed string per token, in order of appearance.
    """
    stems = []
    for token in tokenizer.tokenize(text.lower()):
        stems.append(snowball.stem(token))
    return stems

In [14]:
def vectorize_reviews(reviews):
    """Fit a TF-IDF model on `reviews` and return the matrix and vocabulary.

    The vectorizer uses the notebook's stemming `tokenize` function, the
    built-in English stop-word list, and keeps at most 1000 features.

    Parameters
    ----------
    reviews : iterable of str
        One document (concatenated review text) per business.

    Returns
    -------
    tfidf_matrix
        The result of `TfidfVectorizer.fit_transform(reviews)`.
    words : list
        Feature names, in the column order of `tfidf_matrix`.
    """
    vectorizer = TfidfVectorizer(stop_words = 'english', tokenizer = tokenize,  max_features = 1000)
    tfidf_matrix = vectorizer.fit_transform(reviews)
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the newer
    # accessor when present and fall back for old installs. The result is
    # normalised to a list so callers see the same type either way.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = list(vectorizer.get_feature_names_out())
    else:
        words = vectorizer.get_feature_names()
    return tfidf_matrix, words

In [22]:
def get_indices(cosine_similarity, top_n=5):
    """Return the positions of the `top_n` largest similarity scores.

    Parameters
    ----------
    cosine_similarity : numpy.ndarray
        1-D array of similarity scores.
    top_n : int, optional
        How many indices to return (default 5, the previously hard-coded value).

    Returns
    -------
    numpy.ndarray
        Indices ordered from highest to lowest score. If the input has fewer
        than `top_n` entries, all indices are returned.
    """
    # argsort is ascending, so reverse it and keep the leading `top_n` entries.
    return cosine_similarity.argsort()[::-1][:top_n]

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarities(vector, tfidf_m):
    """Compute the cosine similarity of `vector` against every row of `tfidf_m`.

    Parameters
    ----------
    vector
        A single-row matrix (e.g. `tfidf_matrix[i:i+1]`).
    tfidf_m
        The matrix whose rows are compared against `vector`.

    Returns
    -------
    numpy.ndarray
        Flattened array of similarity scores, one per row of `tfidf_m`.
    """
    # Use the matrix that was passed in; the previous version ignored
    # `tfidf_m` and silently read the notebook-global `tfidf_matrix`.
    return cosine_similarity(vector, tfidf_m).flatten()

In [32]:
def get_recommendations(indices):
    """Look up the business id and name for each TF-IDF row index.

    Reads the notebook-level `reviews_index` frame. For every value in
    `indices` it selects the rows whose 'index' column equals that value and
    collects their ['business_id', 'name'] values.

    Returns a list with one 2-D array per requested index, in the same order
    as `indices`.
    """
    wanted_columns = ['business_id', 'name']
    return [
        reviews_index[reviews_index['index'] == row_idx][wanted_columns].values
        for row_idx in indices
    ]

## Example: Use only 10 businesses

In [18]:
# Tiny dataset: the first 10 businesses. reset_index() keeps the old row
# labels in an 'index' column, which a later cell uses as the TF-IDF row id.
small = review_grouped.iloc[:10].reset_index()
# Mapping from business id to its row position in tfidf_matrix.
matrix_dict = {
    business_id: position
    for business_id, position in zip(small.business_id, small.index)
}
tfidf_matrix, words = vectorize_reviews(small['text'])
# Persist the sparse TF-IDF matrix to disk.
scipy.sparse.save_npz('tfidf_matrix.npz', tfidf_matrix)

## Map tfidf index with business_id

In [36]:
# Left-join the TF-IDF row ids onto the named reviews; businesses that are not
# part of `small` keep a missing 'index' value.
index_lookup = small[['business_id', 'index']]
reviews_index = review_with_names.merge(index_lookup, how='left', on='business_id')
index_table = reviews_index[['index', 'business_id']]
reviews_index.head()

Unnamed: 0,business_id,text,name,index
0,--g-a85VwrdZJNf0R95GcQ,"My Daughter, grandsons and I were looking for ...","""Kabab House""",0.0
1,-01XupAWZEXbdNbxNg5mEg,Horrible! Had the happy hour nachos. The che...,"""18 Degrees Neighborhood Grill""",1.0
2,-050d_XIor1NpCuWkbIVaQ,So until today I didn't know there was a lot o...,"""Matt's Big Breakfast""",2.0
3,-092wE7j5HZOogMLAh40zA,The food here is kinda mediocre. Not horrible ...,"""Wong's Jr""",3.0
4,-0Sgh0QlUKVsWosCWJzGqQ,Despite the doubts you may have about Domino's...,"""Domino's Pizza""",4.0


## Save Business-Index Mapping as SQL table

In [40]:
import mysql.connector
from sqlalchemy import create_engine
# Uncomment the two lines below to save `index_table` to the MySQL table
# 'business_index' in the local `yelp_db` database.
# NOTE(review): do not commit real credentials in the notebook; substitute the
# <user>/<password> placeholders at run time (e.g. from environment variables).
#engine = create_engine('mysql+mysqlconnector://<user>:<password>@localhost/yelp_db', echo=False)
#index_table.to_sql(name='business_index', con=engine)

## Grill Recommendation

In [38]:
# Row 1 of the TF-IDF matrix is used as the query (kept 2-D via slicing).
grill = cosine_similarities(vector=tfidf_matrix[1:2], tfidf_m=tfidf_matrix)
# Rank all businesses by similarity to the query and resolve ids/names.
grill_indices = get_indices(cosine_similarity=grill)
get_recommendations(indices=grill_indices)

[array([['-01XupAWZEXbdNbxNg5mEg', '"18 Degrees Neighborhood Grill"']], dtype=object),
 array([['-0WegMt6Cy966qlDKhu6jA', '"Game Seven Grill"']], dtype=object),
 array([['-0tgMGl7D9B10YjSN2ujLA', '"Dubliner"']], dtype=object),
 array([['--g-a85VwrdZJNf0R95GcQ', '"Kabab House"']], dtype=object),
 array([['-1UMR00eXtwaeh59pEiDjA', '"Matt\'s Big Breakfast"']], dtype=object)]

## Breakfast Recommendation

In [39]:
# Row 2 of the TF-IDF matrix is used as the query (kept 2-D via slicing).
breakfast = cosine_similarities(tfidf_matrix[2:3], tfidf_matrix)
breakfast_indices = get_indices(breakfast)
# Call-style print works on both Python 2 and Python 3; the bare
# `print x` statement is a SyntaxError on Python 3.
print(get_recommendations(breakfast_indices))

[array([['-050d_XIor1NpCuWkbIVaQ', '"Matt\'s Big Breakfast"']], dtype=object), array([['-1UMR00eXtwaeh59pEiDjA', '"Matt\'s Big Breakfast"']], dtype=object), array([['-01XupAWZEXbdNbxNg5mEg', '"18 Degrees Neighborhood Grill"']], dtype=object), array([['--g-a85VwrdZJNf0R95GcQ', '"Kabab House"']], dtype=object), array([['-092wE7j5HZOogMLAh40zA', '"Wong\'s Jr"']], dtype=object)]
