# Building the Restaurant Recommender System

The purpose of this workbook is to carry forward the work I completed in developing the user-item and item-item filtering systems, and to develop it into a model/function that makes actual predictions for any given user.

Here I create my final function, `restaurant_predictions`, which, for any given user profile and GPS location, returns the restaurants within a chosen GPS range, sorted by the rating predicted for that user profile (highest first).

***

In [1]:
# Import Python libraries as needed
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.model_selection import train_test_split
# import matplotlib.pyplot as plt ~ not utilized in this workbook

In [2]:
# Load the ratings_matrix data-frame as created in a previous workbook
# (rows are users, columns are businesses, NaN means "no rating given")
# NOTE: pickle files should only be loaded from a trusted source
ratings_matrix = pd.read_pickle('data/user/ratings_matrix.pkl')

In [3]:
# Load the business table and the unique business / unique user lookup tables as created in the data processing workbook
# business: one record per business, including its GPS coordinates
# unique_business: used further below to look up the business number (ratings matrix column) for a business_id
# unique_user: used further below to look up the user number (ratings matrix row) for a user_id
business = pd.read_pickle('data/user/business.pkl')
unique_business = pd.read_pickle('data/user/unique_business.pkl')
unique_user = pd.read_pickle('data/user/unique_user.pkl')

In [4]:
# Review the ratings matrix (one row per user, one column per business, NaN where no rating was given)
ratings_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6548,6549,6550,6551,6552,6553,6554,6555,6556,6557
0,4.0,,,,,,,,,,...,,,,,,,,,,
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,1.0,,,,,,,,,,...,,,,,,,,,,
3,4.0,,,,,,,,,,...,,,,,,,,,,
4,1.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96527,,,,,,,,,,,...,,,,,,,,,,
96528,,,,,,,,,,,...,,,,,,,,,,
96529,,,,,,,,,,,...,,,,,,,,,,5.0
96530,,,,,,,,,,,...,,,,,,,,,,


In [5]:
def find_user_similarity(userA, userB, ratings_matrix):
    """Return the cosine similarity between the ratings of two users.

    Only the businesses that were rated by both users take part in the comparison.

    Parameters:
    userA, userB: row labels of the two users in ratings_matrix
    ratings_matrix: data-frame with one row per user and one column per business,
        where NaN marks a business the user has not rated

    Returns the cosine similarity as a single number.
    Callers in this notebook expect a ValueError when the two users have no
    business in common.
    """
    # Pull out the full row of ratings for each of the two users
    row_of_userA = ratings_matrix.loc[userA, :]
    row_of_userB = ratings_matrix.loc[userB, :]

    # True only for those businesses where both users have provided a rating
    rated_by_both = row_of_userA.notna() & row_of_userB.notna()

    # Keep the co-rated businesses only and shape each set of ratings as a single-row 2D array,
    # which is the format the cosine_similarity function works with
    vector_of_userA = row_of_userA.loc[rated_by_both].to_numpy().reshape(1, -1)
    vector_of_userB = row_of_userB.loc[rated_by_both].to_numpy().reshape(1, -1)

    # cosine_similarity returns a 1x1 matrix here, so unwrap its only entry
    return cosine_similarity(vector_of_userA, vector_of_userB)[0][0]

In [6]:
def user_item_rating_prediction(target_user, target_business, ratings_matrix):
    """Predict the rating target_user would give target_business (user-item approach).

    The prediction is the average of the ratings that other users gave to the
    target business, weighted by the cosine similarity between each of those
    users and the target user.

    Parameters:
    target_user: row label in ratings_matrix of the user the rating is predicted for
    target_business: column label in ratings_matrix of the business the rating is predicted for
    ratings_matrix: data-frame with one row per user and one column per business,
        where NaN marks a business the user has not rated

    Returns the weighted average rating, or NaN when there is nothing to average
    (nobody rated the target business, or none of the users who did share a
    rated business with the target user).

    NOTE(review): if target_user has already rated target_business, their own
    rating takes part in the average; the notebook assumes this is not the case.
    """
    # All existing ratings for the target business, indexed by the user who gave them
    # (label-based look-up, consistent with find_user_similarity)
    ratings_for_target_business = ratings_matrix.loc[:, target_business].dropna()

    # Parallel lists: similarity of each usable user to the target user, and that user's rating
    similarities_to_target_user = []
    ratings_given_to_target_business = []

    for other_user, rating in ratings_for_target_business.items():
        try:
            similarity = find_user_similarity(target_user, other_user, ratings_matrix)
        except ValueError:
            # Raised when the two users have no business that they have both rated;
            # such a user cannot be weighted, so leave them out of the calculation.
            # Any other error is a genuine problem and is allowed to surface.
            continue
        similarities_to_target_user.append(similarity)
        ratings_given_to_target_business.append(rating)

    # Nothing to average: return NaN explicitly instead of dividing 0 by 0
    if not similarities_to_target_user:
        return np.nan

    # Similarity-weighted average of the collected ratings
    return np.dot(ratings_given_to_target_business, similarities_to_target_user)/np.sum(similarities_to_target_user)

In [7]:
def find_business_similarity(businessA, businessB, ratings_matrix):
    """Return the cosine similarity between the ratings received by two businesses.

    Only the users that rated both businesses take part in the comparison.

    Parameters:
    businessA, businessB: column labels of the two businesses in ratings_matrix
    ratings_matrix: data-frame with one row per user and one column per business,
        where NaN marks a business the user has not rated

    Returns the cosine similarity as a single number.
    Callers in this notebook expect a ValueError when no user has rated both
    businesses.
    """
    # Pull out the full column of ratings for each of the two businesses
    column_of_businessA = ratings_matrix.loc[:, businessA]
    column_of_businessB = ratings_matrix.loc[:, businessB]

    # True only for those users who have provided a rating for both businesses
    rated_both = column_of_businessA.notna() & column_of_businessB.notna()

    # Keep the shared users only and shape each set of ratings as a single-row 2D array,
    # which is the format the cosine_similarity function works with
    vector_of_businessA = column_of_businessA.loc[rated_both].to_numpy().reshape(1, -1)
    vector_of_businessB = column_of_businessB.loc[rated_both].to_numpy().reshape(1, -1)

    # cosine_similarity returns a 1x1 matrix here, so unwrap its only entry
    return cosine_similarity(vector_of_businessA, vector_of_businessB)[0][0]

In [8]:
def item_item_rating_prediction(target_user, target_business, ratings_matrix):
    """Predict the rating target_user would give target_business (item-item approach).

    The prediction is the average of the ratings that the target user gave to
    other businesses, weighted by the cosine similarity between each of those
    businesses and the target business.

    Parameters:
    target_user: row label in ratings_matrix of the user the rating is predicted for
    target_business: column label in ratings_matrix of the business the rating is predicted for
    ratings_matrix: data-frame with one row per user and one column per business,
        where NaN marks a business the user has not rated

    Returns the weighted average rating, or NaN when there is nothing to average
    (the target user has rated nothing, or none of the businesses they rated
    share a rating user with the target business).

    NOTE(review): if target_user has already rated target_business, that rating
    takes part in the average; the notebook assumes this is not the case.
    """
    # All existing ratings given by the target user, indexed by the business they were given to
    # (label-based look-up, consistent with find_business_similarity)
    ratings_by_target_user = ratings_matrix.loc[target_user, :].dropna()

    # Parallel lists: similarity of each usable business to the target business,
    # and the rating the target user gave to that business
    similarities_to_target_business = []
    ratings_given_by_target_user = []

    for other_business, rating in ratings_by_target_user.items():
        try:
            similarity = find_business_similarity(target_business, other_business, ratings_matrix)
        except ValueError:
            # Raised when no user has rated both businesses;
            # such a business cannot be weighted, so leave it out of the calculation.
            # Any other error is a genuine problem and is allowed to surface.
            continue
        similarities_to_target_business.append(similarity)
        ratings_given_by_target_user.append(rating)

    # Nothing to average: return NaN explicitly instead of dividing 0 by 0
    if not similarities_to_target_business:
        return np.nan

    # Similarity-weighted average of the collected ratings
    return np.dot(ratings_given_by_target_user, similarities_to_target_business)/np.sum(similarities_to_target_business)

In [9]:
# Confirm the formula is working as intended (as tested in previous notebook)
# Predicts the rating of user 0 for business 1 using the user-item approach
user_item_rating_prediction(0, 1, ratings_matrix)

3.7220670351652867

In [10]:
# Confirm the formula is working as intended (as tested in previous notebook)
# Predicts the rating of user 0 for business 1 using the item-item approach
item_item_rating_prediction(0, 1, ratings_matrix)

3.7917695651231935

***

## Explanation for Decimal Degrees

Decimal degrees is the standard unit of measurement for latitude and longitude geographic coordinates.

After some research and experimentation, I decided to use a decimal degree distance of 0.015 to determine the range of restaurants nearby to limit the scope of my recommender system.

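This distance represents a straight-line distance of around 1.75 km in any one direction, i.e. a radius of 1.75 km, which translates to an area just under 10 km^2. (Note that the `restaurant_predictions` function below applies this distance as a latitude/longitude bounding box rather than as a true circle, so the area actually searched is only approximately this size.)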

However, due to the reality of road structures and transit networks, this translates to approximately:
- 30-40 min of total walking time
- 20-30 min of total transit time 
- 5-10 min of total driving time

Reference link to USGS.gov website:
https://www.usgs.gov/faqs/how-much-distance-does-a-degree-minute-and-second-cover-your-maps?qt-news_science_products=0#qt-news_science_products

In [11]:
# Example inputs for our final recommender system

# Centre point of the search, as GPS coordinates (randomly selected)
ex_lat_gps = 43.760204
ex_lon_gps = -79.331793

# Search distance around the centre point (in Decimal Degrees)* see explanation above
gps_distance = 0.015

# Example user for whom we want to generate ratings (randomly selected)
ex_user_id = 'utjGh0V0XweisHuHM6Wn3Q'

# Look up the unique user number (as used for the index in our ratings matrix) of the example user:
# keep the matching record(s) of unique_user, then take the first column of the first match
ex_user_num = unique_user.loc[unique_user['user_id'] == ex_user_id].iloc[0, 0]

In [13]:
def restaurant_predictions(user_num, user_lat, user_lon, gps_distance):
    """Return the restaurants near a location, sorted by the rating predicted for a user.

    Parameters:
    user_num: the user's number, i.e. the matching index from our ratings matrix
    user_lat, user_lon: chosen geographic location to centre our recommendations around
    gps_distance: user inputted distance to determine geographic range, in decimal degrees;
        it is applied as a bounding box of +/- gps_distance around the centre point

    Returns a data-frame with one record per business inside the range, holding the
    columns of the business table, a 'user_rating' column with the predicted rating
    and the columns merged in from unique_business, sorted by 'user_rating' from
    highest to lowest.

    Relies on the notebook-level data-frames business, unique_business and
    ratings_matrix, and on the two rating prediction functions defined above.
    """
    # Hybrid approach as determined in the previous notebook:
    # weighted average using 80% of the item-item rating and 20% of the user-item rating
    item_item_weight = 0.8
    user_item_weight = 0.2

    # Using the given GPS coordinates and GPS distance, calculate the bounding box to search restaurants within
    min_lat = user_lat - gps_distance
    max_lat = user_lat + gps_distance
    min_lon = user_lon - gps_distance
    max_lon = user_lon + gps_distance

    # Select all businesses strictly inside the bounding box in one vectorised step,
    # referring to the coordinates by column name rather than by column position
    latitudes = business['latitude']
    longitudes = business['longitude']
    is_in_range = (
        (latitudes > min_lat) & (latitudes < max_lat)
        & (longitudes > min_lon) & (longitudes < max_lon)
    )

    # Work on an explicit copy so that the business table itself is never modified
    businesses_in_range = business.loc[is_in_range, :].copy()

    # Insert a new column called 'user_rating' filled with null values, directly after the latitude column
    rating_position = businesses_in_range.columns.get_loc('latitude') + 1
    businesses_in_range.insert(rating_position, 'user_rating', np.nan)

    # Pull in the business_num for each unique business as per the values used in our ratings matrix
    businesses_in_range = pd.merge(businesses_in_range, unique_business, on = 'business_id', how = 'inner')

    # Predict the rating of the given user for every selected business
    predicted_ratings = [
        item_item_rating_prediction(user_num, business_num, ratings_matrix)*item_item_weight
        + user_item_rating_prediction(user_num, business_num, ratings_matrix)*user_item_weight
        for business_num in businesses_in_range['business_num']
    ]

    # Capture the predicted ratings into our df (an explicit float dtype keeps the column numeric even when no business is in range)
    businesses_in_range['user_rating'] = pd.Series(predicted_ratings, index = businesses_in_range.index, dtype = float)

    # Return the selected restaurants sorted by the predicted user rating, highest first
    return businesses_in_range.sort_values('user_rating', ascending = False)

In [18]:
%%time
# Cell execution time is roughly 30 sec. on average per query

# Use the function created above to make recommendations using the example parameters we set above
# The result holds every restaurant within the GPS range, sorted by the predicted user rating
recommendations = restaurant_predictions(ex_user_num, ex_lat_gps, ex_lon_gps, gps_distance)



CPU times: user 31.3 s, sys: 244 ms, total: 31.5 s
Wall time: 31.5 s


In [19]:
# Show only the top 10 restaurants with the highest predicted user rating for the selected user (in our example scenario)
# The recommendations are already sorted from highest to lowest predicted rating, so the first 10 records are the top 10
recommendations.head(10)

Unnamed: 0,business_id,name,address,city,postal_code,latitude,user_rating,longitude,stars,review_count,attributes,categories,hours,business_num
12,cHvjCa4Inun0ctWZNwQJkg,Hot Thai House,2579 Victoria Park Avenue,Scarborough,M1T 1A4,43.774356,4.9,-79.322104,3.0,12,"{'GoodForMeal': '{'dessert': False, 'latenight...","Restaurants, Thai","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'...",1629
30,2Gd-Gbi--RlOp6NmLbFMfQ,Grande Burrito,3-2555 Victoria Park Avenue,Toronto,M1T 1A3,43.772187,3.93176,-79.321186,5.0,10,"{'RestaurantsReservations': 'False', 'Corkage'...","Mexican, Food, Ice Cream & Frozen Yogurt, Tex-...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",4533
4,SC3jI19OD8z6hrqBZd1Ccw,Tim Hortons,2075 Sheppard Avenue E,North York,M2J 1W6,43.774812,3.405422,-79.334278,2.5,7,"{'RestaurantsDelivery': 'False', 'BikeParking'...","Breakfast & Brunch, Restaurants, Coffee & Tea,...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",1152
25,6NVkLU4d556yKNUPqDjjlg,Athens Pastries,2567 Victoria Park Avenue,Toronto,M1T 1A4,43.773974,3.389924,-79.321933,4.0,27,"{'RestaurantsPriceRange2': '1', 'GoodForKids':...","Food, Bakeries, Restaurants, Greek","{'Monday': '9:0-22:0', 'Tuesday': '9:0-22:0', ...",3376
26,BLgwFGqvAIiG9J2ID47YyA,Beijing Hot Pot Restaurant,"107 Parkway Forest Drive, Unit 1",Toronto,M2J 1L8,43.773994,3.330693,-79.340131,4.0,75,"{'HasTV': 'False', 'DogsAllowed': 'False', 'Re...","Barbeque, Restaurants, Hot Pot, Chinese","{'Monday': '16:0-0:0', 'Tuesday': '16:0-0:0', ...",3395
7,YIez_A3WOt9J2SXN7OMa2Q,Allwyn's Bakery,81 Underhill Drive,Toronto,M3A 1K8,43.745928,3.31306,-79.324623,4.0,122,"{'OutdoorSeating': 'False', 'Alcohol': ''none'...","Restaurants, Bakeries, Caribbean, Food","{'Monday': '11:30-22:0', 'Tuesday': '11:30-22:...",783
42,r6bvqwhWy73SgyK_w8Y5Lg,Starbucks,2555 Victoria Park Avenue,Toronto,M1T 1A3,43.772208,3.309083,-79.321208,3.5,24,"{'RestaurantsAttire': 'u'casual'', 'Restaurant...","Coffee & Tea, Food, Restaurants, Sandwiches","{'Monday': '6:0-22:0', 'Tuesday': '6:0-22:0', ...",6265
27,yflKHx4_Dc4enRvJYWxTLg,Philthy Philly's,2573 Victoria Park Avenue,Toronto,M1T 1A4,43.774176,3.282827,-79.322024,4.0,47,"{'RestaurantsAttire': ''casual'', 'WiFi': ''no...","Poutineries, Sandwiches, Cheesesteaks, Restaur...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",3379
33,lBuvkPla_tRrPgNjNWwLIA,High Street Fish and Chips,55 Underhill Drive,North York,M3A 2J7,43.745306,3.279403,-79.324959,4.0,53,"{'RestaurantsReservations': 'False', 'Restaura...","Restaurants, Seafood, Fish & Chips","{'Tuesday': '11:30-20:0', 'Wednesday': '11:30-...",4659
24,IuHEiRQaX_RmBmEIoE1AAQ,Yan Yu,237 Consumers Road,Toronto,M2J 0E9,43.77057,3.275638,-79.332033,4.0,31,"{'WiFi': 'u'free'', 'RestaurantsDelivery': 'Tr...","Restaurants, Chinese, Dim Sum","{'Monday': '10:0-22:30', 'Tuesday': '10:0-22:3...",3876


In [23]:
# Save the full list of selected restaurants (not only the top 10 shown above) to a .csv file for visualization
recommendations.to_csv('data/user/final_recommendations.csv')

In [24]:
# Save the complete business table (the list of all businesses in Toronto) to a .csv file for visualization
business.to_csv('data/user/business.csv')