In [3]:
# Recommendation System for Madison Square Garden venue food ordering mobile app
# by Gregory Rzeczko

# pandas for data manipulation
import pandas as pd
from collections import defaultdict 
# read excel with customer orders data
df = pd.read_excel('menu-items-events-companies.xlsx', index_col=0)

In [4]:
df.head()

Unnamed: 0_level_0,event_id,menu_item_name,quantity,menu_item_id,unit_price,total_price,event_name,company_name,suite,customer_name,category_name
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,61,ARANCINI BROTHERS RICE BALL SAMPLER,1,135,119.0,119.0,RANGERS VS COLUMBUS,Willow Lifestyle Clubs,MLS-715,Alexander Kharlamov,Appetizer
1,61,BRISKET SLIDERS,1,148,163.0,163.0,RANGERS VS COLUMBUS,Willow Lifestyle Clubs,MLS-715,Alexander Kharlamov,Sandwiches
2,1,GRILLED CHEESE SANDWICHES WITH TOMATO “SOUP” DIP,1,48,127.0,127.0,AMC PREMIERE - TWD,Willow Lifestyle Clubs,MLS-715,Richard Barrow,Sandwiches
2,1,CHEESE RAVIOLI,4,147,177.0,708.0,AMC PREMIERE - TWD,Willow Lifestyle Clubs,MLS-715,Richard Barrow,Entrée
3,2,GUINNESS (4 PACK),1,91,40.0,40.0,BILLY JOEL,Willow Lifestyle Clubs,MLS-715,Alexander Kharlamov,Beer


In [5]:
df.describe()

Unnamed: 0,event_id,quantity,menu_item_id,unit_price,total_price
count,387018.0,387018.0,387018.0,387018.0,387018.0
mean,702.370357,1.086942,185.474161,100.378481,105.381521
std,337.755437,0.382249,158.648329,59.238552,64.161264
min,1.0,0.0,1.0,0.0,0.0
25%,445.0,1.0,60.0,53.0,66.0
50%,701.0,1.0,139.0,96.0,101.0
75%,987.0,1.0,268.0,133.0,134.0
max,1320.0,16.0,631.0,2200.0,4400.0


In [6]:
df.shape

(387018, 11)

In [7]:
df.dtypes

event_id            int64
menu_item_name     object
quantity            int64
menu_item_id        int64
unit_price        float64
total_price       float64
event_name         object
company_name       object
suite              object
customer_name      object
category_name      object
dtype: object

In [8]:
df.isnull().sum()

event_id          0
menu_item_name    0
quantity          0
menu_item_id      0
unit_price        0
total_price       0
event_name        0
company_name      0
suite             0
customer_name     0
category_name     0
dtype: int64

In [9]:
df.isin([0]).sum()

event_id           0
menu_item_name     0
quantity           8
menu_item_id       0
unit_price        25
total_price       25
event_name         0
company_name       0
suite              0
customer_name      0
category_name      0
dtype: int64

In [10]:
from collections import OrderedDict 
# printing original list 
# print("The original list : " + str(df["customer_name"]))
customers = df["customer_name"].tolist()

In [11]:
# using list comprehension + defaultdict + lambda 
# assigning ids to values 
temp = defaultdict(lambda: len(temp)) 
res = [temp[ele] for ele in customers] 
  
# print result 
print("The ids of assigned values is : " + str(res)) 

# we associate IDs with customers with the above code

The ids of assigned values is : [0, 0, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11, 11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,

In [12]:
df['customer_id'] = res

In [13]:
df.head()

Unnamed: 0_level_0,event_id,menu_item_name,quantity,menu_item_id,unit_price,total_price,event_name,company_name,suite,customer_name,category_name,customer_id
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,61,ARANCINI BROTHERS RICE BALL SAMPLER,1,135,119.0,119.0,RANGERS VS COLUMBUS,Willow Lifestyle Clubs,MLS-715,Alexander Kharlamov,Appetizer,0
1,61,BRISKET SLIDERS,1,148,163.0,163.0,RANGERS VS COLUMBUS,Willow Lifestyle Clubs,MLS-715,Alexander Kharlamov,Sandwiches,0
2,1,GRILLED CHEESE SANDWICHES WITH TOMATO “SOUP” DIP,1,48,127.0,127.0,AMC PREMIERE - TWD,Willow Lifestyle Clubs,MLS-715,Richard Barrow,Sandwiches,1
2,1,CHEESE RAVIOLI,4,147,177.0,708.0,AMC PREMIERE - TWD,Willow Lifestyle Clubs,MLS-715,Richard Barrow,Entrée,1
3,2,GUINNESS (4 PACK),1,91,40.0,40.0,BILLY JOEL,Willow Lifestyle Clubs,MLS-715,Alexander Kharlamov,Beer,0


In [14]:
# Creating a Dummy

# Dummy for marking whether a customer bought that item or not.
# If one buys an item, then purchase_dummy are marked as 1

def create_data_dummy(df):
    data_dummy = df.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy

data_dummy = create_data_dummy(df)
data_dummy.head()

'''
Why create a dummy instead of normalizing it? 
Normalizing the quantity, say by each user, would not work because customers 
with different buying frequencies don’t have the same taste. However, 
we can normalize items by purchase frequency across all users, 
which is done below.
'''

Unnamed: 0_level_0,event_id,menu_item_name,quantity,menu_item_id,unit_price,total_price,event_name,company_name,suite,customer_name,category_name,customer_id,purchase_dummy
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,61,ARANCINI BROTHERS RICE BALL SAMPLER,1,135,119.0,119.0,RANGERS VS COLUMBUS,Willow Lifestyle Clubs,MLS-715,Alexander Kharlamov,Appetizer,0,1
1,61,BRISKET SLIDERS,1,148,163.0,163.0,RANGERS VS COLUMBUS,Willow Lifestyle Clubs,MLS-715,Alexander Kharlamov,Sandwiches,0,1
2,1,GRILLED CHEESE SANDWICHES WITH TOMATO “SOUP” DIP,1,48,127.0,127.0,AMC PREMIERE - TWD,Willow Lifestyle Clubs,MLS-715,Richard Barrow,Sandwiches,1,1
2,1,CHEESE RAVIOLI,4,147,177.0,708.0,AMC PREMIERE - TWD,Willow Lifestyle Clubs,MLS-715,Richard Barrow,Entrée,1,1
3,2,GUINNESS (4 PACK),1,91,40.0,40.0,BILLY JOEL,Willow Lifestyle Clubs,MLS-715,Alexander Kharlamov,Beer,0,1


In [15]:
# Normalizing item values across users

# To do this, we normalize purchase frequency of each item across users by first creating a user-item matrix as follows

df_matrix = pd.pivot_table(df, values='quantity', index='customer_id', columns='menu_item_id')
df_matrix

menu_item_id,1,2,3,4,5,7,8,9,10,11,...,619,620,621,623,624,625,628,629,630,631
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,1.0,1.0,,1.0,3.833333,1.0,,1.0,1.0,1.0,...,,,,,,,,,,
4,1.0,1.0,,1.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,1.0,,,,,,,,,,...,,,,,,,,,,
477,,,,,,,,,,,...,,,,,,,,,,
478,,,,1.0,,,,,,,...,,,,1.0,,,,,,
479,,,,,,,,,,,...,,,,,,,,,,


In [16]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
df_matrix_norm

menu_item_id,1,2,3,4,5,7,8,9,10,11,...,619,620,621,623,624,625,628,629,630,631
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,0.0,0.0,,0.0,0.944444,0.0,,0.0,0.0,0.0,...,,,,,,,,,,
4,0.0,0.0,,0.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,0.0,,,,,,,,,,...,,,,,,,,,,
477,,,,,,,,,,,...,,,,,,,,,,
478,,,,0.0,,,,,,,...,,,,,,,,,,
479,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# create a table for input to the modeling  
d = df_matrix_norm.reset_index() 
d.index.names = ['scaled_purchase_freq'] 
data_norm = pd.melt(d, id_vars=['customer_id'], value_name='scaled_purchase_freq').dropna()
print(data_norm.shape)
data_norm.head()

'''
In this step, we have normalized their purchase history, 
from 0–1 (with 1 being the most number of purchases for an 
item and 0 being 0 purchase count for that item).
'''

(25834, 3)


Unnamed: 0,customer_id,menu_item_id,scaled_purchase_freq
3,3,1,0.0
4,4,1,0.0
5,5,1,0.0
6,6,1,0.00295
7,7,1,0.0


In [18]:
def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [19]:
import time
# turicreate for performing model selection and evaluation
import turicreate as tc
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")

# Now that we have three datasets with purchase counts, purchase dummy, and scaled purchase counts, 
# we would like to split each for modeling.

train_data, test_data = split_data(df)
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [20]:
# constant variables to define field names include:
user_id = 'customer_id'
item_id = 'menu_item_id'
users_to_recommend = list(df[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

In [21]:
# Define Models using Turicreate library

def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    '''
    The popularity model takes the most popular items for recommendation. 
    These items are products with the highest number of sells across customers.
    '''
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

In [22]:
name = 'popularity'
target = 'quantity'
popularity = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+-------------+--------------+--------------------+------+
| customer_id | menu_item_id |       score        | rank |
+-------------+--------------+--------------------+------+
|      0      |     214      |        2.5         |  1   |
|      0      |     190      |        2.5         |  2   |
|      0      |     118      | 2.272727272727273  |  3   |
|      0      |     432      |        2.0         |  4   |
|      0      |     335      |        2.0         |  5   |
|      0      |      88      | 1.9473684210526316 |  6   |
|      0      |      5       |        1.9         |  7   |
|      0      |     237      | 1.8571428571428572 |  8   |
|      0      |      26      | 1.8116996507566938 |  9   |
|      0      |      29      |  1.77772798565987  |  10  |
|      0      |     214      |        2.5         |  1   |
|      0      |     190      |        2.5         |  2   |
|      0      |     118      | 2.272727272727273  |  3   |
|      0      |     432      |        2.0         |  4  

In [23]:
# using purchase dummy
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+-------------+--------------+-------+------+
| customer_id | menu_item_id | score | rank |
+-------------+--------------+-------+------+
|      0      |     551      |  1.0  |  1   |
|      0      |     285      |  1.0  |  2   |
|      0      |     217      |  1.0  |  3   |
|      0      |     251      |  1.0  |  4   |
|      0      |     268      |  1.0  |  5   |
|      0      |     137      |  1.0  |  6   |
|      0      |      4       |  1.0  |  7   |
|      0      |     144      |  1.0  |  8   |
|      0      |     142      |  1.0  |  9   |
|      0      |      60      |  1.0  |  10  |
|      0      |     551      |  1.0  |  1   |
|      0      |     285      |  1.0  |  2   |
|      0      |     217      |  1.0  |  3   |
|      0      |     251      |  1.0  |  4   |
|      0      |     268      |  1.0  |  5   |
|      0      |     137      |  1.0  |  6   |
|      0      |      4       |  1.0  |  7   |
|      0      |     144      |  1.0  |  8   |
|      0      |     142      |  1.

In [24]:
# using scaled purchase freq
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+-------------+--------------+---------------------+------+
| customer_id | menu_item_id |        score        | rank |
+-------------+--------------+---------------------+------+
|      0      |     192      |         0.5         |  1   |
|      0      |      27      |         0.5         |  2   |
|      0      |     539      |         0.5         |  3   |
|      0      |     470      |  0.4537037037037038 |  4   |
|      0      |     329      | 0.40277777777777785 |  5   |
|      0      |     625      |         0.4         |  6   |
|      0      |     169      | 0.38888888888888884 |  7   |
|      0      |     619      |        0.375        |  8   |
|      0      |     330      | 0.36363636363636365 |  9   |
|      0      |     479      | 0.35671191553544496 |  10  |
|      0      |     192      |         0.5         |  1   |
|      0      |      27      |         0.5         |  2   |
|      0      |     539      |         0.5         |  3   |
|      0      |     470      |  0.453703

In [None]:
'''
Once we created the model, we predicted the recommendation items using scores by popularity. 
As you can tell for each model results above, the rows show the first 30 records from 1000 
users with 10 recommendations. These 30 records include 3 users and their recommended items, 
along with score and descending ranks.

In the result, although different models have different recommendation list, each user is 
recommended the same list of 10 items. This is because popularity is calculated by taking 
the most popular items across all users.
'''

In [None]:
# Collaborative Filtering Model
'''
In collaborative filtering, we would recommend items based on how similar users purchase items. 
For instance, if customer 1 and customer 2 bought similar items, e.g. 1 bought X, Y, Z and 
2 bought X, Y, we would recommend an item Z to customer 2.
'''

In [25]:
name = 'cosine'
target = 'quantity'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+-------------+--------------+---------------------+------+
| customer_id | menu_item_id |        score        | rank |
+-------------+--------------+---------------------+------+
|      0      |     148      | 0.41651538345548844 |  1   |
|      0      |      1       |  0.4136105179786682 |  2   |
|      0      |      59      |  0.3919026123152839 |  3   |
|      0      |     121      | 0.38641321659088135 |  4   |
|      0      |     274      |  0.3851756784651015 |  5   |
|      0      |      4       |  0.3851112127304077 |  6   |
|      0      |     144      |  0.3847915265295241 |  7   |
|      0      |     136      | 0.38207639588250053 |  8   |
|      0      |     128      | 0.37931616438759697 |  9   |
|      0      |     139      |  0.3751484288109673 |  10  |
|      0      |     148      | 0.41651538345548844 |  1   |
|      0      |      1       |  0.4136105179786682 |  2   |
|      0      |      59      |  0.3919026123152839 |  3   |
|      0      |     121      | 0.3864132

In [26]:
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+-------------+--------------+---------------------+------+
| customer_id | menu_item_id |        score        | rank |
+-------------+--------------+---------------------+------+
|      0      |      59      | 0.40167398585213554 |  1   |
|      0      |     253      | 0.39585887061225045 |  2   |
|      0      |     139      | 0.39231093724568683 |  3   |
|      0      |     128      |  0.3916621009508769 |  4   |
|      0      |     144      |  0.3911211093266805 |  5   |
|      0      |      48      |  0.3828655746248033 |  6   |
|      0      |      4       | 0.36120310094621444 |  7   |
|      0      |     142      | 0.36053480042351616 |  8   |
|      0      |     121      |  0.3605281048350864 |  9   |
|      0      |      60      | 0.35970994498994613 |  10  |
|      0      |      59      | 0.40167398585213554 |  1   |
|      0      |     253      | 0.39585887061225045 |  2   |
|      0      |     139      | 0.39231093724568683 |  3   |
|      0      |     128      |  0.391662

In [27]:
name = 'cosine' 
target = 'scaled_purchase_freq' 
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+-------------+--------------+----------------------+------+
| customer_id | menu_item_id |        score         | rank |
+-------------+--------------+----------------------+------+
|      0      |     329      | 0.04306706786155701  |  1   |
|      0      |     304      | 0.022214733064174652 |  2   |
|      0      |     200      | 0.013455837965011597 |  3   |
|      0      |     133      | 0.011378630995750427 |  4   |
|      0      |     186      |  0.0086442232131958  |  5   |
|      0      |     219      | 0.008259765803813934 |  6   |
|      0      |      46      | 0.008259765803813934 |  7   |
|      0      |     289      | 0.007794804871082306 |  8   |
|      0      |     220      | 0.007508881390094757 |  9   |
|      0      |      49      | 0.007508881390094757 |  10  |
|      0      |     329      | 0.04306706786155701  |  1   |
|      0      |     304      | 0.022214733064174652 |  2   |
|      0      |     200      | 0.013455837965011597 |  3   |
|      0      |     133 

In [28]:
name = 'pearson'
target = 'quantity'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+-------------+--------------+--------------------+------+
| customer_id | menu_item_id |       score        | rank |
+-------------+--------------+--------------------+------+
|      0      |     190      |        2.5         |  1   |
|      0      |     214      | 2.499416364563836  |  2   |
|      0      |     332      | 2.3223635421858893 |  3   |
|      0      |     169      | 2.166071573893229  |  4   |
|      0      |     335      |        2.0         |  5   |
|      0      |     432      | 1.999414066473643  |  6   |
|      0      |     625      | 1.798602827390035  |  7   |
|      0      |      26      | 1.7823885216117104 |  8   |
|      0      |      88      | 1.7678398551085057 |  9   |
|      0      |     618      | 1.739935645870134  |  10  |
|      0      |     190      |        2.5         |  1   |
|      0      |     214      | 2.499416364563836  |  2   |
|      0      |     332      | 2.3223635421858893 |  3   |
|      0      |     169      | 2.166071573893229  |  4  

In [29]:
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+-------------+--------------+-------+------+
| customer_id | menu_item_id | score | rank |
+-------------+--------------+-------+------+
|      0      |     551      |  0.0  |  1   |
|      0      |     285      |  0.0  |  2   |
|      0      |     217      |  0.0  |  3   |
|      0      |     251      |  0.0  |  4   |
|      0      |     268      |  0.0  |  5   |
|      0      |     137      |  0.0  |  6   |
|      0      |      4       |  0.0  |  7   |
|      0      |     144      |  0.0  |  8   |
|      0      |     142      |  0.0  |  9   |
|      0      |      60      |  0.0  |  10  |
|      0      |     551      |  0.0  |  1   |
|      0      |     285      |  0.0  |  2   |
|      0      |     217      |  0.0  |  3   |
|      0      |     251      |  0.0  |  4   |
|      0      |     268      |  0.0  |  5   |
|      0      |     137      |  0.0  |  6   |
|      0      |      4       |  0.0  |  7   |
|      0      |     144      |  0.0  |  8   |
|      0      |     142      |  0.

In [30]:
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+-------------+--------------+---------------------+------+
| customer_id | menu_item_id |        score        | rank |
+-------------+--------------+---------------------+------+
|      0      |     539      |  0.5088962614536285 |  1   |
|      0      |      27      | 0.49887265264987946 |  2   |
|      0      |     192      |  0.4864218682050705 |  3   |
|      0      |     470      | 0.45370370370370366 |  4   |
|      0      |     329      |  0.4156651960478888 |  5   |
|      0      |     625      |         0.4         |  6   |
|      0      |     619      |        0.375        |  7   |
|      0      |     169      | 0.37166160427861744 |  8   |
|      0      |     330      | 0.36363636363636365 |  9   |
|      0      |     479      |  0.3583485920257518 |  10  |
|      0      |     539      |  0.5088962614536285 |  1   |
|      0      |      27      | 0.49887265264987946 |  2   |
|      0      |     192      |  0.4864218682050705 |  3   |
|      0      |     470      | 0.4537037

In [None]:
#Model Evaluation
'''
For evaluating recommendation engines, we can use the concept of RMSE and precision-recall.

Measures the error of predicted values
Lesser the RMSE value, better the recommendations
'''

#Recall
'''
What percentage of products that a user buys are actually recommended?
If a customer buys 5 products and the recommendation decided to show 3 of them, then the recall is 0.6
'''

# Precision
'''
Out of all the recommended items, how many did the user actually like?
If 5 products were recommended to the customer out of which he buys 4 of them, then precision is 0.8
'''

# Why are both recall and precision important?
'''
Consider a case where we recommend all products, so our customers will surely cover the items that they liked and bought. 
In this case, we have 100% recall! Does this mean our model is good?

We have to consider precision. If we recommend 300 items but user likes and buys only 3 of them, then precision is 0.1%! 
This very low precision indicates that the model is not great, despite their excellent recall.

So our aim has to be optimizing both recall and precision (to be close to 1 as possible).
'''

In [31]:
# create initial callable variables for model evaluation
models_w_counts = [popularity, cos, pear]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm]

names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts']

In [32]:
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Purchase Counts

Precision and recall summary statistics by cutoff
+--------+-----------------------+------------------------+
| cutoff |     mean_precision    |      mean_recall       |
+--------+-----------------------+------------------------+
|   1    |  0.002178649237472767 | 3.932579851033874e-06  |
|   2    |  0.001089324618736383 | 3.932579851033874e-06  |
|   3    |  0.002904865649963689 |  4.5093958112164e-05   |
|   4    | 0.0032679738562091487 | 6.441726031586982e-05  |
|   5    | 0.0030501089324618744 | 6.536614935658095e-05  |
|   6    |  0.003631082062454612 | 0.00044111393507537117 |
|   7    |  0.003423591658885776 | 0.0005692697725737691  |
|   8    | 0.0032679738562091504 | 0.0005983184290734061  |
|   9    |  0.005809731299927378 | 0.0029903844891640556  |
|   10   |  0.007843137254901962 |  0.005852106226551414  |
+--------+-----------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.34191267


Overall RMSE: 0.6944868547002837

Per User RMSE (best)
+-------------+---------------------+-------+
| customer_id |         rmse        | count |
+-------------+---------------------+-------+
|     468     | 0.23618592818578088 |   1   |
+-------------+---------------------+-------+
[1 rows x 3 columns]


Per User RMSE (worst)
+-------------+------+-------+
| customer_id | rmse | count |
+-------------+------+-------+
|     398     | 1.0  |   1   |
+-------------+------+-------+
[1 rows x 3 columns]


Per Item RMSE (best)
+--------------+--------------------+-------+
| menu_item_id |        rmse        | count |
+--------------+--------------------+-------+
|     128      | 0.5269652253827163 |  1151 |
+--------------+--------------------+-------+
[1 rows x 3 columns]


Per Item RMSE (worst)
+--------------+------+-------+
| menu_item_id | rmse | count |
+--------------+------+-------+
|     628      | 1.0  |   2   |
+--------------+------+-------+
[1 rows x 3 columns]

PROGRESS: Eva

In [33]:
data_norm['purchase_dummy'] = 1
final_model = tc.item_similarity_recommender.create(tc.SFrame(data_norm), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', similarity_type='cosine')
recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+-------------+--------------+---------------------+------+
| customer_id | menu_item_id |        score        | rank |
+-------------+--------------+---------------------+------+
|      0      |      1       |  0.3956666886806488 |  1   |
|      0      |      59      | 0.37350634336471555 |  2   |
|      0      |     253      |  0.3699289083480835 |  3   |
|      0      |     142      |  0.3697036266326904 |  4   |
|      0      |     144      |  0.3671062350273132 |  5   |
|      0      |     139      |  0.3649892330169678 |  6   |
|      0      |      2       | 0.35447336435317994 |  7   |
|      0      |     261      | 0.35387101769447327 |  8   |
|      0      |      48      | 0.35159760117530825 |  9   |
|      0      |     150      | 0.34302805066108705 |  10  |
|      0      |      1       |  0.3956666886806488 |  1   |
|      0      |      59      | 0.37350634336471555 |  2   |
|      0      |     253      |  0.3699289083480835 |  3   |
|      0      |     142      |  0.369703

In [34]:
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(3870180, 4)


Unnamed: 0,customer_id,menu_item_id,score,rank
0,0,1,0.395667,1
1,0,59,0.373506,2
2,0,253,0.369929,3
3,0,142,0.369704,4
4,0,144,0.367106,5


In [35]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['customer_id', 'recommendedProducts']].drop_duplicates() \
        .sort_values('customer_id').set_index('customer_id')
    if print_csv:
        df_output.to_csv('option1_recommendation.csv')
        print("An output file can be found with name 'option1_recommendation.csv'")
    return df_output

In [36]:
df_output = create_output(pear_norm, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found with name 'option1_recommendation.csv'
(481, 1)


Unnamed: 0_level_0,recommendedProducts
customer_id,Unnamed: 1_level_1
0,539|27|192|470|329|625|619|169|330|479|539|27|...
1,192|27|539|470|329|625|169|619|330|479|192|27|...
2,539|27|192|470|329|625|169|619|330|593|539|27|...
3,539|192|470|329|625|169|619|330|479|593|539|19...
4,539|27|192|470|329|625|169|619|330|479|539|27|...


In [37]:
# Input: customer ID
# Returns: ranked list of items (product IDs), that the user is most likely to want to put in his/her (empty) "basket"

def customer_recomendation(customer_id):
    if customer_id not in df_output.index:
        print('Customer not found.')
        return customer_id
    return df_output.loc[customer_id]

In [38]:
customer_recomendation(0)

recommendedProducts    539|27|192|470|329|625|619|169|330|479|539|27|...
Name: 0, dtype: object