# 04_Yelp_Restaruant_Recommendation_System

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('C:/Users/ymp/Yelp_Business_Proj/df_AZ_final.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,business_id,name,latitude,longitude,review_count,city,average_stars,categories,funny,user_id,review_id,text,stars,date,useful,cool
0,24,-01XupAWZEXbdNbxNg5mEg,18 Degrees Neighborhood Grill,33.639205,-111.881925,77,Scottsdale,3.0,"['Sports Bars', 'Nightlife', 'Restaurants', 'A...",1,-XoCb6sUMa7NoFayUW0FlA,Z7ufB8dFZ_9VjkcntOEsig,Horrible! Had the happy hour nachos. The che...,1,2016-09-03,2,0
1,26,-01XupAWZEXbdNbxNg5mEg,18 Degrees Neighborhood Grill,33.639205,-111.881925,77,Scottsdale,3.0,"['Sports Bars', 'Nightlife', 'Restaurants', 'A...",0,TqZOtEPp28bHyHyfkXonNw,pK6q9LKp1W1qQ0boE6kcsA,not bad for a hockey rink.....very good service,4,2017-04-05,0,0
2,27,-01XupAWZEXbdNbxNg5mEg,18 Degrees Neighborhood Grill,33.639205,-111.881925,77,Scottsdale,3.0,"['Sports Bars', 'Nightlife', 'Restaurants', 'A...",0,o0yk9cD97rPqsNCcb4DBFQ,RjIl7CW18ZYoUZkAdLHYig,"Went to this little place on Groupon, not one ...",1,2017-04-12,0,0


In [None]:
#Build the data

In [3]:
recommender_df = df[['business_id', 'user_id', 'stars']]
reviews_count_df = recommender_df.groupby('user_id')['stars'].count()

In [4]:
print('Max reviews: %s, Min reviews: %s' % (max(reviews_count_df), min(reviews_count_df)))
print('Median reviews: %s, Mean reviews: %s' % (np.median(reviews_count_df), round(np.mean(reviews_count_df),2)))
print('25%% reviews: %d,  75%% reviews: %d' % (np.percentile(reviews_count_df, 25), np.percentile(reviews_count_df, 75)))
print('Number of unique business: %d' % (len(set(recommender_df['business_id']))))

Max reviews: 404, Min reviews: 1
Median reviews: 1.0, Mean reviews: 2.78
25% reviews: 1,  75% reviews: 3
Number of unique business: 8479


In [5]:
#reviews' business_id, user_id and stars
active_user = list(reviews_count_df[reviews_count_df >= 10].index)
mask = [user in active_user for user in recommender_df['user_id']]
active_user_df = recommender_df[mask]
active_user_df.head(3)

Unnamed: 0,business_id,user_id,stars
0,-01XupAWZEXbdNbxNg5mEg,-XoCb6sUMa7NoFayUW0FlA,1
4,-01XupAWZEXbdNbxNg5mEg,JKDamUXVPAgIV2Hhb97ncw,4
6,-01XupAWZEXbdNbxNg5mEg,4SxcUfHAVHzMBqb6TtNPQw,3


In [6]:
print('active users: ', (len(active_user_df.groupby('user_id')['stars'].count())))
print('active users records:', (len(active_user_df)))

active users:  8078
active users records: 170088


In [8]:
# Create utility matrix
from scipy import sparse
highest_user_id = len(set(active_user_df['user_id']))
highest_movie_id = len(set(active_user_df['business_id']))
ratings_mat = sparse.lil_matrix((highest_user_id, highest_movie_id))
ratings_mat

<8078x7871 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in LInked List format>

In [9]:
#Fill the rate matrix based on table
user_id = list(set(active_user_df['user_id']))
business_id = list(set(active_user_df['business_id']))
for _, row in active_user_df.iterrows():
    ratings_mat[user_id.index(row.user_id), business_id.index(row.business_id)] = row.stars
ratings_mat

<8078x7871 sparse matrix of type '<class 'numpy.float64'>'
	with 170088 stored elements in LInked List format>

# Item - Item Collaborative Filter Recommender


In [10]:
from sklearn.metrics.pairwise import cosine_similarity
from time import time
class ItemItemRecommender(object):

    def __init__(self, neighborhood_size):
        self.neighborhood_size = neighborhood_size

    def fit(self, ratings_mat):
        self.ratings_mat = ratings_mat
        self.n_users = ratings_mat.shape[0]
        self.n_items = ratings_mat.shape[1]
        self.item_sim_mat = cosine_similarity(self.ratings_mat.T)
        self._set_neighborhoods()

    def _set_neighborhoods(self):
        least_to_most_sim_indexes = np.argsort(self.item_sim_mat, 1)
        self.neighborhoods = least_to_most_sim_indexes[:, -self.neighborhood_size:]

    def pred_one_user(self, user_id, report_run_time=False):
        start_time = time()
        items_rated_by_this_user = self.ratings_mat[user_id].nonzero()[1]
        # Just initializing so I have somewhere to put rating preds
        out = np.zeros(self.n_items)
        for item_to_rate in range(self.n_items):
            relevant_items = np.intersect1d(self.neighborhoods[item_to_rate],
                                            items_rated_by_this_user,
                                            assume_unique=True)  # assume_unique speeds up intersection op
            out[item_to_rate] = self.ratings_mat[user_id, relevant_items] * \
                self.item_sim_mat[item_to_rate, relevant_items] / \
                self.item_sim_mat[item_to_rate, relevant_items].sum()
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        cleaned_out = np.nan_to_num(out)
        return cleaned_out

    def pred_all_users(self, report_run_time=False):
        start_time = time()
        all_ratings = [
            self.pred_one_user(user_id) for user_id in range(self.n_users)]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return np.array(all_ratings)

    def top_n_recs(self, user_id, n):
        pred_ratings = self.pred_one_user(user_id)
        item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))
        items_rated_by_this_user = self.ratings_mat[user_id].nonzero()[1]
        unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                        if item not in items_rated_by_this_user]
        return unrated_items_by_pred_rating[-n:], items_rated_by_this_user

In [11]:
my_rec_engine = ItemItemRecommender(neighborhood_size=80)
my_rec_engine.fit(ratings_mat)

In [12]:
lucky_user = np.random.choice(active_user_df['user_id'], 1)[0]
lucky_user_index = user_id.index(lucky_user)
lucky_user_recommend, items_rated_by_this_user = my_rec_engine.top_n_recs(user_id=lucky_user_index, n = 10)



In [13]:
print("The top ten recommendation for user %s are: " % (lucky_user))
print('%s' % (', '.join(list(set(df['name'][df['business_id'] == business_id[i]]))[0] \
                       for i in lucky_user_recommend)))

The top ten recommendation for user dWLdQjvlpNrOREiaO1SWFQ are: 
The Counter, Subway, Angry Crab & BBQ, Domino's Pizza, Samurai Sam's, La Pasadita Hot Dogs, Mariscos La Phoenikera, The Venue Scottsdale, Jack-In-the Box Drive Thru, Subway


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
original_rated_restaurants = [list(set(df['name'][df['business_id'] == business_id[i]]))[0] for i in items_rated_by_this_user]
mask = [name in original_rated_restaurants for name in df['name']]
original_category = df['categories'][mask]
vectorizer = TfidfVectorizer(analyzer = 'word', stop_words = 'english', 
                             lowercase = True
                            )
original_category_vec = vectorizer.fit_transform(original_category).toarray()
original_word = vectorizer.get_feature_names()
print('Categories from user rated restaurants: \n%s' % (','.join(i for i in original_word)))

Categories from user rated restaurants: 
american,asian,bakeries,bar,barbeque,bars,beer,breakfast,brunch,burgers,cafes,caterers,chinese,chips,cocktail,coffee,comfort,cuban,delis,delivery,desserts,eastern,ethnic,event,fast,fish,food,fusion,gastropubs,greek,hawaiian,health,italian,japanese,korean,latin,lebanese,markets,mediterranean,mexican,middle,new,nightlife,pasta,planning,restaurants,salad,sandwiches,seafood,services,shopping,shops,soup,southern,spaces,specialty,sports,steakhouses,street,sushi,tacos,tea,thai,traditional,vegetarian,vendors,venues,vietnamese,wine


In [16]:
recommend_res = [list(set(df['name'][df['business_id'] == business_id[i]]))[0] \
                 for i in lucky_user_recommend]
mask = [name in recommend_res for name in df['name']]
recommend_category = df['categories'][mask]
vectorizer = TfidfVectorizer(analyzer = 'word', stop_words = 'english', 
                             lowercase = True
                            )
recommend_category_vec = vectorizer.fit_transform(recommend_category).toarray()
recommend_word = vectorizer.get_feature_names()
print('Categories from recommend restaurants: \n%s' % (','.join(i for i in recommend_word)))

Categories from recommend restaurants: 
american,barbeque,bars,breakfast,brunch,burgers,cajun,caterers,chicken,creole,delis,delivery,dogs,event,fast,food,hot,italian,japanese,mexican,new,nightlife,party,pizza,planning,restaurants,salad,sandwiches,seafood,services,spaces,tapas,traditional,venues,wings


In [17]:
#Check the common labels
print("Common labels are: \n%s" % (', '.join(word for word in recommend_word if word in original_word)))

Common labels are: 
american, barbeque, bars, breakfast, brunch, burgers, caterers, delis, delivery, event, fast, food, italian, japanese, mexican, new, nightlife, planning, restaurants, salad, sandwiches, seafood, services, spaces, traditional, venues


# Matrix Factorization recommender (NMF)

In [18]:
from sklearn.decomposition import NMF
class NMF_Recommender(object):

    def __init__(self, n_components):
        self.n_components = n_components

    def fit(self, ratings_mat):
        self.ratings_mat = ratings_mat
        self.n_users = ratings_mat.shape[0]
        self.n_items = ratings_mat.shape[1]
        nmf = NMF(n_components = 200)
        nmf.fit(ratings_mat)
        self.W = nmf.transform(ratings_mat)
        self.H = nmf.components_
        self.error = nmf.reconstruction_err_
        self.ratings_mat_fitted = self.W.dot(self.H)

    def get_error(self):
        return self.error
        
    def pred_one_user(self, user_id, report_run_time=False):
        start_time = time()
        cleaned_out = self.ratings_mat_fitted[user_id,:]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return cleaned_out

    def pred_all_users(self, report_run_time=False):
        start_time = time()
        all_ratings = [
            self.pred_one_user(user_id) for user_id in range(self.n_users)]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return np.array(all_ratings)

    def top_n_recs(self, user_id, n):
        pred_ratings = self.pred_one_user(user_id)
        item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))
        items_rated_by_this_user = self.ratings_mat[user_id].nonzero()[1]
        unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                        if item not in items_rated_by_this_user]
        return unrated_items_by_pred_rating[-n:], items_rated_by_this_user

In [19]:
# get recommendations for the same lucky user
my_rec_engine = NMF_Recommender(n_components=200)
my_rec_engine.fit(ratings_mat)
lucky_user_recommend, items_rated_by_this_user = my_rec_engine.top_n_recs(user_id=lucky_user_index, n = 10)
print("The top ten recommendation for user %s are: " % (lucky_user))
print('%s' % (', '.join(list(set(df['name'][df['business_id'] == business_id[i]]))[0] \
                       for i in lucky_user_recommend)))

The top ten recommendation for user dWLdQjvlpNrOREiaO1SWFQ are: 
Rudy's Country Store and Bar-B-Q, Angry Crab Shack, Fiesta Mexicana, The Haymaker Goodyear, Ada's Fish Fry, MOD Pizza, Black Bear Diner, Oregano's Pizza Bistro, Pho Thanh, Saddle Mountain Brewing Company


In [20]:
print("The users original rated resturants are :\n %s" % (','.join(list(set(df['name'][df['business_id'] == business_id[i]]))[0] \
                       for i in items_rated_by_this_user)))

The users original rated resturants are :
 Jim's Burgers and Eggs,The Habit Burger Grill,Firehouse Subs,Ah-So Sushi & Steak,Ruby Tuesday,DaVang's,KiKu Revolving Sushi,Dillon's KC BBQ,Sinbad's Restaurant,Pita Wraps,Yard House,Aloha Kitchen,Biscuits Cafe,Zeta's Grill,Cracker Barrel Old Country Store,Mandy's Fish & Chips,Red's Bar & Grill,Olive Garden Italian Restaurant,Mr Mesquite Taqueria,Heidi's Brooklyn Deli,Little Saigon,China 7 Chinese Cuisine,Wildflower Bread Company,Cuban Foods Bakery & Restaurant,Park Cafe,Benihana,Cheddar's Scratch Kitchen,Mariscos Playa Hermosa,La Barquita Restaurant,Red Lobster,Pita Kitchen - Avondale,Tomo Japanese Cuisine,Benihana,Royal Jasmine Thai Restaurant,Rubio's Coastal Grill,Don Pancho Mexican Food,Subway,Ah Hai Sushi & Grill,Subway,El Original Tacos Jalisco,Firebirds Wood Fired Grill,Lone Spur Cafe,Cafe Rio Mexican Grill,Osaka Japanese Steakhouse,Bluewater Grill,Luci's Healthy Marketplace


In [21]:
original_rated_restaurants = [list(set(df['name'][df['business_id'] == business_id[i]]))[0] for i in items_rated_by_this_user]
mask = [name in original_rated_restaurants for name in df['name']]
original_category = df['categories'][mask]
vectorizer = TfidfVectorizer(analyzer = 'word', stop_words = 'english', 
                             lowercase = True
                            )
original_category_vec = vectorizer.fit_transform(original_category).toarray()
original_word = vectorizer.get_feature_names()
print('Categories from user rated restaurants: \n%s' % (','.join(i for i in original_word)))

Categories from user rated restaurants: 
american,asian,bakeries,bar,barbeque,bars,beer,breakfast,brunch,burgers,cafes,caterers,chinese,chips,cocktail,coffee,comfort,cuban,delis,delivery,desserts,eastern,ethnic,event,fast,fish,food,fusion,gastropubs,greek,hawaiian,health,italian,japanese,korean,latin,lebanese,markets,mediterranean,mexican,middle,new,nightlife,pasta,planning,restaurants,salad,sandwiches,seafood,services,shopping,shops,soup,southern,spaces,specialty,sports,steakhouses,street,sushi,tacos,tea,thai,traditional,vegetarian,vendors,venues,vietnamese,wine


In [22]:
recommend_res = [list(set(df['name'][df['business_id'] == business_id[i]]))[0] \
                 for i in lucky_user_recommend]
mask = [name in recommend_res for name in df['name']]
recommend_category = df['categories'][mask]
vectorizer = TfidfVectorizer(analyzer = 'word', stop_words = 'english', 
                             lowercase = True
                            )
recommend_category_vec = vectorizer.fit_transform(recommend_category).toarray()
recommend_word = vectorizer.get_feature_names()
print('Categories from recommend restaurants: \n%s' % (','.join(i for i in recommend_word)))

Categories from recommend restaurants: 
american,automotive,barbeque,bars,breakfast,breweries,brunch,cajun,chicken,chips,creole,diners,fast,fish,food,gas,gastropubs,italian,mexican,new,nightlife,pasta,pizza,restaurants,salad,sandwiches,seafood,shops,soup,specialty,stations,traditional,vietnamese,wings


In [23]:
#Check the common labels
print("Common labels are: \n%s" % (', '.join(word for word in recommend_word if word in original_word)))

Common labels are: 
american, barbeque, bars, breakfast, brunch, chips, fast, fish, food, gastropubs, italian, mexican, new, nightlife, pasta, restaurants, salad, sandwiches, seafood, shops, soup, specialty, traditional, vietnamese


In [24]:
#get the number of labels 
mask = [business in business_id for business in df['business_id']]
category = df['categories'][mask]
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer = 'word', stop_words = 'english', 
                             lowercase = True
                            )
category_vec = vectorizer.fit_transform(category).toarray()
words = vectorizer.get_feature_names()
#This is the number of unique categories
print('The total number of restaurant labels is %d' % (len(words))) 

The total number of restaurant labels is 445


In [25]:
from sklearn.decomposition import TruncatedSVD
class SVD_Recommender(object):

    def __init__(self):
        self.n_components = 361 #the number of labels

    def fit(self, ratings_mat):
        self.ratings_mat = ratings_mat
        self.n_users = ratings_mat.shape[0]
        self.n_items = ratings_mat.shape[1]
        svd = TruncatedSVD(n_components=self.n_components, n_iter=7, random_state=1)
        svd.fit(ratings_mat)
        self.V = svd.components_
        self.U = svd.transform(ratings_mat)
        self.ratings_mat_fitted = self.U.dot(self.V)

    def get_error(self):
        return ((self.ratings_mat_fitted - self.ratings_mat)**2).mean(axis=None)
        
    def pred_one_user(self, user_id, report_run_time=False):
        start_time = time()
        cleaned_out = self.ratings_mat_fitted[user_id,:]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return cleaned_out

    def pred_all_users(self, report_run_time=False):
        start_time = time()
        all_ratings = [
            self.pred_one_user(user_id) for user_id in range(self.n_users)]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return np.array(all_ratings)

    def top_n_recs(self, user_id, n):
        pred_ratings = self.pred_one_user(user_id)
        item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))
        items_rated_by_this_user = self.ratings_mat[user_id].nonzero()[1]
        unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                        if item not in items_rated_by_this_user]
        return unrated_items_by_pred_rating[-n:], items_rated_by_this_user

In [26]:
# get recommendations for the same lucky user
my_rec_engine = SVD_Recommender()
my_rec_engine.fit(ratings_mat)
lucky_user_recommend, items_rated_by_this_user = my_rec_engine.top_n_recs(user_id=lucky_user_index, n = 10)
print("The top ten recommendation for user %s are: " % (lucky_user))
print('%s' % (', '.join(list(set(df['name'][df['business_id'] == business_id[i]]))[0] \
                       for i in lucky_user_recommend)))

The top ten recommendation for user dWLdQjvlpNrOREiaO1SWFQ are: 
Mora Italian, Black Bear Diner, TEXAZ Grill, T.C. Eggington's, MOD Pizza, El Chullo Peruvian Restaurant & Bar, Rudy's Country Store and Bar-B-Q, Fiesta Mexicana, Greek Gyro Express, PT Noodles


In [27]:
original_rated_restaurants = [list(set(df['name'][df['business_id'] == business_id[i]]))[0] for i in items_rated_by_this_user]
mask = [name in original_rated_restaurants for name in df['name']]
original_category = df['categories'][mask]
vectorizer = TfidfVectorizer(analyzer = 'word', stop_words = 'english', 
                             lowercase = True
                            )
original_category_vec = vectorizer.fit_transform(original_category).toarray()
original_word = vectorizer.get_feature_names()
print('Categories from user rated restaurants: \n%s' % (','.join(i for i in original_word)))

Categories from user rated restaurants: 
american,asian,bakeries,bar,barbeque,bars,beer,breakfast,brunch,burgers,cafes,caterers,chinese,chips,cocktail,coffee,comfort,cuban,delis,delivery,desserts,eastern,ethnic,event,fast,fish,food,fusion,gastropubs,greek,hawaiian,health,italian,japanese,korean,latin,lebanese,markets,mediterranean,mexican,middle,new,nightlife,pasta,planning,restaurants,salad,sandwiches,seafood,services,shopping,shops,soup,southern,spaces,specialty,sports,steakhouses,street,sushi,tacos,tea,thai,traditional,vegetarian,vendors,venues,vietnamese,wine


In [28]:
recommend_res = [list(set(df['name'][df['business_id'] == business_id[i]]))[0] \
                 for i in lucky_user_recommend]
mask = [name in recommend_res for name in df['name']]
recommend_category = df['categories'][mask]
vectorizer = TfidfVectorizer(analyzer = 'word', stop_words = 'english', 
                             lowercase = True
                            )
recommend_category_vec = vectorizer.fit_transform(recommend_category).toarray()
recommend_word = vectorizer.get_feature_names()
print('Categories from recommend restaurants: \n%s' % (','.join(i for i in recommend_word)))

Categories from recommend restaurants: 
american,automotive,barbeque,bars,breakfast,brunch,diners,fast,food,gas,greek,italian,juice,mexican,new,noodles,peruvian,pizza,restaurants,salad,sandwiches,smoothies,soup,stations,steakhouses,traditional,vietnamese


In [29]:
#Check the common labels
print("Common labels are: \n%s" % (', '.join(word for word in recommend_word if word in original_word)))

Common labels are: 
american, barbeque, bars, breakfast, brunch, fast, food, greek, italian, mexican, new, restaurants, salad, sandwiches, soup, steakhouses, traditional, vietnamese
