In [3]:
import pandas as pd 
import numpy as np 
import time 
import turicreate as  tc 
from sklearn.model_selection import train_test_split 

In [9]:
# Load the retail transactions; the first spreadsheet column becomes the row index.
df = pd.read_excel('filtered_retail.xlsx', index_col=0)
# Row count before cleaning, used later to report how many rows were removed.
pre_len = df.shape[0]
df.head()

Unnamed: 0,Date,Customer_ID,Transaction_ID,SKU_Category_x,SKU,Quantity,Sales_Amount,SKU_Category_y,Menu_Category,Item_ID,Sex,Weather,Zip_cd
0,2016-01-02,3686,3,0H2,CZUZX,1.0,6.35,0H2,Fruit Series,FS1,male,windy,70573
1,2016-01-02,1253,8,0H2,9STQJ,1.0,8.25,0H2,Fruit Series,FS3,female,sunny,70577
2,2016-01-02,7548,10,N8U,UNJKW,1.0,2.11,N8U,Signature Series,SS1,male,sunny,70562
3,2016-01-02,6044,12,N8U,EMJ1S,1.0,3.62,N8U,Signature Series,SS6,male,sunny,70566
4,2016-01-02,592,13,P42,B2IW9,1.0,9.0,P42,Fresh Milk,FM9,male,sunny,70571


In [10]:
# Quantity is the number of items purchased in each transaction line.
# Keep only rows with Quantity >= 1; rows with a smaller (e.g. fractional) or
# missing Quantity fail the comparison and are dropped.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df = df[df['Quantity'] >= 1].copy()
# NOTE: astype(int) truncates, so any remaining fractional value >= 1 would be
# rounded down rather than removed.
df['Quantity'] = df['Quantity'].astype(int)
post_len = len(df)
print('{} Row is deleted because of the float number in Quantity column'
      .format(pre_len-post_len))

10 Row is deleted because of the float number in Quantity column


In [11]:
# Count rows per Customer_ID, then report how many distinct IDs exist and how
# many of them appear at most twice.
C_ID_list = df['Customer_ID'].value_counts()
total_ID_nb = C_ID_list.shape[0]
ID_nb_smaller_2 = int((C_ID_list <= 2).sum())
print('There are {} unique ID in the transaction data'.format(total_ID_nb))
print('There are {} ID order equal or less than twice'.format(ID_nb_smaller_2))

There are 11916 unique ID in the transaction data
There are 8276 ID order equal or less than twice


In [12]:
# Total Quantity per (Customer_ID, Zip_cd, Weather, Menu_Category).
# 'Quantity' is selected before summing so the aggregation touches only the
# column that is needed; summing every column and dropping the extras afterwards
# depends on pandas silently discarding non-numeric columns, which newer pandas
# versions no longer do.
aggregated_table = (
    df.groupby(['Customer_ID', 'Zip_cd', 'Weather', 'Menu_Category'], as_index=False)['Quantity']
      .sum()
)
aggregated_table

Unnamed: 0,Customer_ID,Zip_cd,Weather,Menu_Category,Quantity
0,4,70576,windy,Egg Waffle,1
1,5,70572,windy,Fresh Milk,3
2,5,70573,sunny,Fresh Milk,4
3,5,70574,sunny,Fresh Milk,2
4,5,70577,sunny,Fresh Milk,4
...,...,...,...,...,...
32236,22615,70579,windy,Small Bites,1
32237,22616,70572,windy,Fresh Milk,1
32238,22621,70563,cloudy,Dessert Combo,2
32239,22621,70563,windy,Dessert Combo,1


In [14]:
# the model refer: https://apple.github.io/turicreate/docs/api/generated/turicreate.recommender.util.compare_models.html#turicreate.recommender.util.compare_models
# and https://medium.datadriveninvestor.com/how-to-build-a-recommendation-system-for-purchase-data-step-by-step-d6d7a78800b6
def recommender(data, algo, user_id, item_id, target, n_recom, n_show):
    """Train a Turi Create recommender and produce its top recommendations.

    Parameters
    ----------
    data : training data passed straight to the Turi Create ``create`` call
        (the callers in this notebook pass a ``tc.SFrame``).
    algo : str
        One of ``'popularity'``, ``'cosine'`` or ``'pearson'``. The last two
        train an item-similarity recommender with that similarity type.
    user_id, item_id, target : str
        Column names forwarded to the Turi Create ``create`` call.
    n_recom : int
        Number of recommendations requested (``k`` of ``model.recommend``).
    n_show : int
        Currently unused: the ``print_rows`` call that consumed it is disabled.
        Kept so existing call sites stay valid.

    Returns
    -------
    tuple
        ``(model, recom)`` - the trained model and the result of
        ``model.recommend(k=n_recom)``.

    Raises
    ------
    ValueError
        If ``algo`` is not one of the supported names.
    """
    # Validate first so an unsupported name fails with a clear message instead
    # of an unbound `model` further down.
    if algo == 'popularity':
        model = tc.popularity_recommender.create(data,
                                                 user_id=user_id,
                                                 item_id=item_id,
                                                 target=target)
    elif algo in ('cosine', 'pearson'):
        # Both variants differ only in the similarity measure.
        model = tc.item_similarity_recommender.create(data,
                                                      user_id=user_id,
                                                      item_id=item_id,
                                                      target=target,
                                                      similarity_type=algo)
    else:
        raise ValueError(
            "Unknown algo {!r}; expected 'popularity', 'cosine' or 'pearson'".format(algo)
        )

    recom = model.recommend(k=n_recom)
    return model, recom

In [51]:
# Number of df rows per (Customer_ID, Item_ID) pair.
# Built here so this preview also works on a fresh kernel; the normalisation
# cell further down rebuilds the same table under the same name.
tmp = df.groupby(['Customer_ID', 'Item_ID']).size().reset_index(name = 'Quantity')
tmp.head()

Unnamed: 0,Customer_ID,Item_ID,Quantity
0,4,EW6,1
1,5,FM2,4
2,11,TP7,1
3,13,TP9,1
4,14,F1,1


In [56]:
# Column roles: recommendations are made for `input_col`, out of `output_col`.
input_col = 'Customer_ID'
output_col = 'Item_ID'

# Count df rows per (customer, item) pair and spread them into a
# customer x item matrix (pairs that never occur stay NaN).
tmp = (
    df.groupby([input_col, output_col])
      .size()
      .reset_index(name='Quantity')
)
df_matrix = tmp.pivot_table(values='Quantity', index=input_col, columns=output_col)

# Min-max scale every item column.
df_matrix_norm = (
    (df_matrix - df_matrix.min())
    / (df_matrix.max() - df_matrix.min())
)

# Back to long format: one row per (customer, item) with its scaled value;
# rows whose scaled value is NaN are dropped.
d = df_matrix_norm.reset_index()
d.index.name = 'scaled_Quantity'
data_norm = d.melt(id_vars=[input_col], value_name='scaled_Quantity').dropna()

train, test = train_test_split(data_norm, test_size=0.2, random_state=2022)

def setup_recomm(data, input_col, method, n_r = 1, n_s = 10):
    """Wrap `data` in an SFrame and train one recommender on it.

    The item column is the module-level `output_col` and the target column is
    'scaled_Quantity'. Returns whatever `recommender` returns.
    """
    sframe = tc.SFrame(data)
    return recommender(sframe, method, input_col, output_col, 'scaled_Quantity', n_r, n_s)

# One model (and its recommendations) per algorithm, all trained on `train`.
pop1, pop2 = setup_recomm(train, input_col, method='popularity')
cos1, cos2 = setup_recomm(train, input_col, method='cosine')
ps1, ps2 = setup_recomm(train, input_col, method='pearson')

In [57]:
# Evaluate the three trained recommenders on the held-out split.
models = [pop1, cos1, ps1]
tc.recommender.util.compare_models(
    tc.SFrame(test),
    models,
    model_names=['popularity', 'cosine', 'pearson'],
)

PROGRESS: Evaluate model popularity



Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.002692119432207538 | 0.0020802741067058256 |
|   2    |  0.002202643171806169 |  0.003324359601892644 |
|   3    | 0.0025289606787404144 |  0.005384238864415075 |
|   4    | 0.0030592266275085663 |  0.008941099689998374 |
|   5    | 0.0030347528144884952 |  0.011102953173437756 |
|   6    | 0.0029776472507750023 |  0.013069016152716594 |
|   7    | 0.0027270820222362055 |  0.013844020231685438 |
|   8    | 0.0025085658345570276 |  0.01463941915483766  |
|   9    | 0.0024745744275847027 |  0.01631179637787568  |
|   10   | 0.0023250122369065035 |  0.017168379833578086 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.15698594114285516

Per User RMSE (best)
+-------------+-----------------------+----


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.00905531081742536  | 0.006447101601286625 |
|   2    | 0.015785609397944208 | 0.024042413506437006 |
|   3    | 0.015500081579376766 | 0.03453352135437299  |
|   4    | 0.017192853646598158 | 0.052271597168807186 |
|   5    | 0.02266275085658355  | 0.08713862278473143  |
|   6    | 0.021129058573992516 | 0.09736459765828336  |
|   7    | 0.019369274875882803 | 0.10347897194446382  |
|   8    | 0.018263582966226155 | 0.11135818008064699  |
|   9    | 0.017566759123293692 | 0.11997976054510558  |
|   10   | 0.017205090553108187 |  0.1301445508861076  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.16758427485254948

Per User RMSE (best)
+-------------+------+-------+
| Customer_ID | rmse | count |
+--------


Precision and recall summary statistics by cutoff
+--------+-----------------------+-----------------------+
| cutoff |     mean_precision    |      mean_recall      |
+--------+-----------------------+-----------------------+
|   1    |  0.002936857562408223 | 0.0022026431718061663 |
|   2    | 0.0023250122369065113 | 0.0037934410181106218 |
|   3    | 0.0027736988089411005 |  0.006118453255017128 |
|   4    |  0.003059226627508566 |  0.008941099689998358 |
|   5    |  0.002936857562408223 |  0.010735845978136734 |
|   6    |  0.002977647250775001 |  0.013191385217816938 |
|   7    |  0.002692119432207537 |  0.013762440854951874 |
|   8    | 0.0025391581008321084 |  0.014884157285038332 |
|   9    | 0.0024745744275847014 |   0.016434165442976   |
|   10   |  0.002373959862946645 |  0.017657856093979466 |
+--------+-----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.15636881559735963

Per User RMSE (best)
+-------------+-----------------------+----

[{'precision_recall_by_user': Columns:
  	Customer_ID	int
  	cutoff	int
  	precision	float
  	recall	float
  	count	int
  
  Rows: 73548
  
  Data:
  +-------------+--------+-----------+--------+-------+
  | Customer_ID | cutoff | precision | recall | count |
  +-------------+--------+-----------+--------+-------+
  |    15523    |   1    |    0.0    |  0.0   |   2   |
  |    15523    |   2    |    0.0    |  0.0   |   2   |
  |    15523    |   3    |    0.0    |  0.0   |   2   |
  |    15523    |   4    |    0.0    |  0.0   |   2   |
  |    15523    |   5    |    0.0    |  0.0   |   2   |
  |    15523    |   6    |    0.0    |  0.0   |   2   |
  |    15523    |   7    |    0.0    |  0.0   |   2   |
  |    15523    |   8    |    0.0    |  0.0   |   2   |
  |    15523    |   9    |    0.0    |  0.0   |   2   |
  |    15523    |   10   |    0.0    |  0.0   |   2   |
  +-------------+--------+-----------+--------+-------+
  [73548 rows x 5 columns]
  Note: Only the head of the SFrame is pr

In [45]:
# Ask the pearson model for 5 recommendations and print the first 25 rows.
ps_top5 = ps1.recommend(k=5)
ps_top5.print_rows(25)

+-------------+---------+--------------------+------+
| Customer_ID | Item_ID |       score        | rank |
+-------------+---------+--------------------+------+
|    12844    |    F7   | 2.774923291077485  |  1   |
|    12844    |    T6   | 2.542857142857143  |  2   |
|    12844    |    F5   | 2.1046854635502434 |  3   |
|    12844    |   CYO4  | 1.9807225619043622 |  4   |
|    12844    |    F3   | 1.961654085490354  |  5   |
|    17755    |    F7   | 2.7843814991615914 |  1   |
|    17755    |    T6   | 2.5409425522599904 |  2   |
|    17755    |    F5   | 2.1137079239314334 |  3   |
|    17755    |   CYO4  | 1.9820005709216708 |  4   |
|    17755    |    F3   | 1.9512262548737207 |  5   |
|    15887    |    F7   | 2.7838720284603737 |  1   |
|    15887    |    T6   | 2.5445910568748205 |  2   |
|    15887    |    F5   |  2.11697028482214  |  3   |
|    15887    |   CYO4  | 1.9821428571428572 |  4   |
|    15887    |    F3   | 1.9634338772031148 |  5   |
|     4849    |    F7   | 2.

In [50]:
# The 20 most frequent Item_ID values in df, with their row counts.
item_counts = df['Item_ID'].value_counts()
item_counts.sort_values(ascending=False).head(20)

SS1    2007
SS2     791
FM1     737
SS3     698
FS1     621
SS4     600
SS5     593
EW1     557
FM2     522
SB1     511
EW2     482
SI1     468
SI2     463
SS6     462
F1      452
AD1     446
FS2     413
T1      402
FM3     402
F2      398
Name: Item_ID, dtype: int64

In [213]:
# Scenario definition
# 1. The customer is known (Customer_ID), together with the ZIP code
# 2. The customer is unknown; only the ZIP code is available
# 3. The customer provided no information; recommend based on the weather

# number of recommendations requested, and number of ranking rows to print
# (n_s is forwarded as `n_show`, which `recommender` currently does not use)
n_r = 1
n_s = 10

# split the dataset, then build the coarser inputs for scenarios 2 and 3
train, test = train_test_split(aggregated_table, test_size=0.2, random_state=5)

# scenario 2: Quantity summed over customers, per (Zip_cd, Weather, Menu_Category)
train_2 = train.drop(columns=['Customer_ID']).groupby(['Zip_cd', 'Weather', 'Menu_Category']).sum().reset_index()
test_2 = test.drop(columns=['Customer_ID']).groupby(['Zip_cd', 'Weather', 'Menu_Category']).sum().reset_index()

# scenario 3: Quantity summed over customers and ZIP codes, per (Weather, Menu_Category)
train_3 = train.drop(columns=['Customer_ID', 'Zip_cd']).groupby(['Weather', 'Menu_Category']).sum().reset_index()
test_3 = test.drop(columns=['Customer_ID', 'Zip_cd']).groupby(['Weather', 'Menu_Category']).sum().reset_index()


test = tc.SFrame(test)
# change this to train_2 or train_3 to run the other scenarios
evaluate_set = train


# recommender arguments: data, algo, user_id, item_id, target, n_recom, n_show
# Pick the scenario from the columns of the input.
# NOTE: this is an if/elif chain, so exactly one scenario runs per execution.
# With evaluate_set = train (which has Customer_ID) only scenario 1 is trained;
# the s2/s3 branches are skipped, which is presumably why no output was seen
# from them.
if 'Customer_ID' in evaluate_set.columns:
    # s1
    evaluate_set = tc.SFrame(evaluate_set)
    # each call returns (model, recommendations); keep one list per algorithm
    # instead of overwriting a single variable
    m_1_pop, m_1_pop_list = recommender(evaluate_set, 'popularity', 'Customer_ID', 'Menu_Category',
                          'Quantity', n_r, n_s)
    m_1_cos, m_1_cos_list = recommender(evaluate_set, 'cosine', 'Customer_ID', 'Menu_Category',
                          'Quantity', n_r, n_s)
    m_1_per, m_1_per_list = recommender(evaluate_set, 'pearson', 'Customer_ID', 'Menu_Category',
                          'Quantity', n_r, n_s)

elif 'Zip_cd' in evaluate_set.columns:
    # s2
    evaluate_set = tc.SFrame(evaluate_set)
    m_2_pop, m_2_pop_list = recommender(evaluate_set, 'popularity', 'Zip_cd', 'Menu_Category',
                          'Quantity', n_r, n_s)
    m_2_cos, m_2_cos_list = recommender(evaluate_set, 'cosine', 'Zip_cd', 'Menu_Category',
                          'Quantity', n_r, n_s)
    m_2_per, m_2_per_list = recommender(evaluate_set, 'pearson', 'Zip_cd', 'Menu_Category',
                          'Quantity', n_r, n_s)
elif 'Weather' in evaluate_set.columns:
    # s3
    evaluate_set = tc.SFrame(evaluate_set)
    m_3_pop, m_3_pop_list = recommender(evaluate_set, 'popularity', 'Weather', 'Menu_Category',
                          'Quantity', n_r, n_s)
    m_3_cos, m_3_cos_list = recommender(evaluate_set, 'cosine', 'Weather', 'Menu_Category',
                          'Quantity', n_r, n_s)
    m_3_per, m_3_per_list = recommender(evaluate_set, 'pearson', 'Weather', 'Menu_Category',
                          'Quantity', n_r, n_s)

In [214]:
# Primary evaluation: compare the three scenario-1 models on the held-out split.
eva_model = [m_1_pop, m_1_cos, m_1_per]
eva = tc.recommender.util.compare_models(
    test,
    eva_model,
    model_names=['popularity', 'cosine', 'pearson'],
)

PROGRESS: Evaluate model popularity



Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.026096737907761534 | 0.018793543664184842 |
|   2    | 0.04274465691788524  | 0.06403163890228003  |
|   3    | 0.03547056617922761  |  0.0785144714053601  |
|   4    | 0.03734533183352084  |  0.1113828628564286  |
|   5    | 0.03586051743532058  |  0.1341450175870871  |
|   6    | 0.03367829021372331  | 0.15078918706590252  |
|   7    | 0.03256950024104126  |  0.1709916617565661  |
|   8    | 0.03249718785151858  | 0.19339430785437542  |
|   9    |  0.0358373953255845  | 0.24361820843823095  |
|   10   | 0.037657167854018236 | 0.28427526916278306  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.66041870775386

Per User RMSE (best)
+-------------+---------------------+-------+
| Customer_ID |         rmse


Precision and recall summary statistics by cutoff
+--------+----------------------+---------------------+
| cutoff |    mean_precision    |     mean_recall     |
+--------+----------------------+---------------------+
|   1    | 0.11046119235095614  | 0.08817682130393037 |
|   2    | 0.12002249718785152  | 0.19686502099325487 |
|   3    | 0.10633670791151106  |  0.2573887055326878 |
|   4    | 0.09302587176602921  |  0.2959476356664205 |
|   5    | 0.08386951631046118  | 0.33147748015014594 |
|   6    |  0.0757105361829773  | 0.35736406575551655 |
|   7    | 0.06846858428410726  | 0.37483902424284854 |
|   8    | 0.06351518560179988  | 0.39625474013550493 |
|   9    | 0.060934258217722935 | 0.42894502884941565 |
|   10   | 0.05756717910261227  |  0.4490762624452161 |
+--------+----------------------+---------------------+
[10 rows x 3 columns]


Overall RMSE: 1.4664400913070201

Per User RMSE (best)
+-------------+---------------------+-------+
| Customer_ID |         rmse        | co


Precision and recall summary statistics by cutoff
+--------+----------------------+----------------------+
| cutoff |    mean_precision    |     mean_recall      |
+--------+----------------------+----------------------+
|   1    | 0.025871766029246342 | 0.018568571785669667 |
|   2    | 0.042294713160854924 |  0.0637129287410503  |
|   3    | 0.03599550056242971  | 0.08083918081668359  |
|   4    | 0.03695163104611925  | 0.11007677611727104  |
|   5    | 0.03586051743532064  | 0.13405127930437274  |
|   6    | 0.033903262092238436 | 0.15129162426125303  |
|   7    | 0.03276233327976854  | 0.17087917581730866  |
|   8    | 0.036377952755905565 |  0.2207471387505136  |
|   9    | 0.035912385951756276 | 0.24401190922563254  |
|   10   | 0.03794963129608793  | 0.28575633402967476  |
+--------+----------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.6614224945597964

Per User RMSE (best)
+-------------+----------------------+-------+
| Customer_ID |         r

In [264]:
# s2: for every (Zip_cd, Weather) pair in train_2, keep the n_recomm rows with
# the largest Quantity.
n_recomm = 3
# Collect the per-pair frames and concatenate once: DataFrame.append was removed
# in pandas 2.0, and growing a frame inside a loop copies it on every iteration.
zip_frames = []
for i in train_2.Zip_cd.value_counts().index:
    for j in train_2.Weather.value_counts().index:
        temp = train_2[(train_2.Zip_cd==i) & (train_2.Weather==j)].nlargest(n_recomm, 'Quantity')
        # a (Zip_cd, Weather) pair may not occur in train_2; skip the empty frame
        if not temp.empty:
            zip_frames.append(temp)
# fall back to an empty frame with the right columns when nothing was collected
Zip_result = pd.concat(zip_frames) if zip_frames else train_2.head(0)

In [265]:
# Zip_result is the recommendation result of scenario 2:
# for each (Zip_cd, Weather) pair in train_2, the rows with the 3 largest Quantity values.
Zip_result

Unnamed: 0,Zip_cd,Weather,Menu_Category,Quantity
575,70569,rainy,Almond Drink,129
589,70569,rainy,Tofu Pudding,91
582,70569,rainy,Herbal Teas,62
600,70569,sunny,Shaved Ice,281
595,70569,sunny,Fluffy,243
...,...,...,...,...
866,70574,cloudy,Shaved Ice,21
867,70574,cloudy,Signature Series,20
910,70574,windy,Signature Series,39
909,70574,windy,Shaved Ice,24


In [269]:
# s3: for every Weather value in train_3, keep the n_recomm rows with the
# largest Quantity.
n_recomm = 3
# Collect the per-weather frames and concatenate once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it every time.
weather_frames = []
for j in train_3.Weather.value_counts().index:
    temp = train_3[(train_3.Weather==j)].nlargest(n_recomm, 'Quantity')
    if not temp.empty:
        weather_frames.append(temp)
# fall back to an empty frame with the right columns when nothing was collected
W_result = pd.concat(weather_frames) if weather_frames else train_3.head(0)

In [270]:
# W_result is the recommendation result of scenario 3:
# for each Weather value in train_3, the rows with the 3 largest Quantity values.
W_result

Unnamed: 0,Weather,Menu_Category,Quantity
41,sunny,Shaved Ice,3293
36,sunny,Fluffy,2445
37,sunny,Fresh Milk,1981
30,rainy,Tofu Pudding,1029
16,rainy,Almond Drink,990
26,rainy,Signature Series,828
10,cloudy,Signature Series,922
8,cloudy,Milk Teas,570
2,cloudy,Dessert Combo,541
58,windy,Signature Series,1305
