In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
import turicreate as tc

In [2]:
customers = pd.read_csv('data/recommend_1.csv') 
transactions = pd.read_csv('data/trx_data.csv')

In [3]:
customers.head()

Unnamed: 0,customerId
0,1553
1,20400
2,19750
3,6334
4,27773


In [4]:
transactions.head()

Unnamed: 0,customerId,products
0,0,20
1,1,2|2|23|68|68|111|29|86|107|152
2,2,111|107|29|11|11|11|33|23
3,3,164|227
4,5,2|2


In [5]:
transactions['products'] = transactions['products'].apply(lambda x: [int(i) for i in x.split('|')])

In [6]:
transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index()

Unnamed: 0,customerId,0,1,2,3,4,5,6,7,8,9
0,0,20.0,,,,,,,,,
1,1,2.0,2.0,23.0,68.0,68.0,111.0,29.0,86.0,107.0,152.0


In [7]:
data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), 
             id_vars=['customerId'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})
data['productId'] = data['productId'].astype(np.int64)

In [8]:
data.head()

Unnamed: 0,customerId,productId,purchase_count
0,0,1,2
1,0,13,1
2,0,19,3
3,0,20,1
4,0,31,2


In [9]:
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy

In [10]:
data_dummy = create_data_dummy(data)

In [11]:
df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')
df_matrix.head()

productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,2.0,,,,,,,,,...,,,,,,,,,,
1,,,6.0,,,,,,,,...,,,,1.0,,,1.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [12]:
df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
print(df_matrix_norm.shape)
df_matrix_norm.head()

(24429, 300)


productId,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,0.1,,,,,,,,,...,,,,,,,,,,
1,,,0.166667,,,,,,,,...,,,,0.0,,,0.0,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# create a table for input to the modeling

d = df_matrix_norm.reset_index()
d.index.names = ['scaled_purchase_freq']
data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()
print(data_norm.shape)
data_norm.head()

(133585, 3)


Unnamed: 0,customerId,productId,scaled_purchase_freq
9,9,0,0.133333
25,25,0,0.133333
32,33,0,0.133333
35,36,0,0.133333
43,44,0,0.133333


In [14]:
def normalize_data(data):
    df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    return pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()

In [15]:
train, test = train_test_split(data, test_size = .2)
print(train.shape, test.shape)

((106868, 3), (26717, 3))


In [16]:
# Using turicreate library, we convert dataframe to SFrame - this will be useful in the modeling part

train_data = tc.SFrame(train)
test_data = tc.SFrame(test)

In [17]:
train_data

customerId,productId,purchase_count
16380,193,1
11494,153,1
14779,137,1
306,146,1
22196,52,2
18484,127,2
16739,47,1
11559,44,2
13947,60,3
6706,45,1


In [18]:
test_data

customerId,productId,purchase_count
756,291,1
16653,211,1
11802,236,2
3140,20,1
9808,245,2
4230,230,1
3816,17,2
13880,9,2
5269,194,1
22386,49,1


In [19]:
# We can define a function for this step as follows

def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

In [20]:
# lets try with both dummy table and scaled/normalized purchase table

train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [21]:
# variables to define field names
user_id = 'customerId'
item_id = 'productId'
target = 'purchase_count'
users_to_recommend = list(transactions[user_id])
n_rec = 10 # number of items to recommend
n_display = 30

In [22]:
popularity_model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)

In [23]:
# Get recommendations for a list of users to recommend (from customers file)
# Printed below is head / top 30 rows for first 3 customers with 10 recommendations each

popularity_recomm = popularity_model.recommend(users=users_to_recommend, k=n_rec)
popularity_recomm.print_rows(n_display)

+------------+-----------+---------------+------+
| customerId | productId |     score     | rank |
+------------+-----------+---------------+------+
|     0      |    132    | 3.06666666667 |  1   |
|     0      |     34    | 3.01581027668 |  2   |
|     0      |     37    | 2.97674418605 |  3   |
|     0      |     0     | 2.95242718447 |  4   |
|     0      |     3     | 2.86344537815 |  5   |
|     0      |    248    | 2.84090909091 |  6   |
|     0      |     27    | 2.77419354839 |  7   |
|     0      |    245    | 2.69736842105 |  8   |
|     0      |    110    | 2.68421052632 |  9   |
|     0      |     32    | 2.67961165049 |  10  |
|     1      |    132    | 3.06666666667 |  1   |
|     1      |     34    | 3.01581027668 |  2   |
|     1      |     37    | 2.97674418605 |  3   |
|     1      |     0     | 2.95242718447 |  4   |
|     1      |     3     | 2.86344537815 |  5   |
|     1      |    248    | 2.84090909091 |  6   |
|     1      |     27    | 2.77419354839 |  7   |


In [24]:
# Since turicreate is very accessible library, we can define a model selection function as below

def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

In [25]:
# variables to define field names
# constant variables include:
user_id = 'customerId'
item_id = 'productId'
users_to_recommend = list(customers[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 # to print the head / first few rows in a defined dataset

In [26]:
# these variables will change accordingly
name = 'popularity'
target = 'purchase_dummy'
pop_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
|    1553    |    123    |  1.0  |  1   |
|    1553    |    235    |  1.0  |  2   |
|    1553    |    159    |  1.0  |  3   |
|    1553    |     35    |  1.0  |  4   |
|    1553    |     76    |  1.0  |  5   |
|    1553    |     11    |  1.0  |  6   |
|    1553    |    202    |  1.0  |  7   |
|    1553    |     20    |  1.0  |  8   |
|    1553    |    107    |  1.0  |  9   |
|    1553    |     55    |  1.0  |  10  |
|   20400    |    123    |  1.0  |  1   |
|   20400    |    235    |  1.0  |  2   |
|   20400    |    159    |  1.0  |  3   |
|   20400    |     35    |  1.0  |  4   |
|   20400    |     76    |  1.0  |  5   |
|   20400    |     11    |  1.0  |  6   |
|   20400    |    202    |  1.0  |  7   |
|   20400    |     20    |  1.0  |  8   |
|   20400    |    107    |  1.0  |  9   |
|   20400    |     55    |  1.0  |  10  |
|   19750    |    123    |  1.0  |

In [27]:
name = 'popularity'
target = 'scaled_purchase_freq'
pop_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------+------+
| customerId | productId |     score      | rank |
+------------+-----------+----------------+------+
|    1553    |    226    | 0.778911564626 |  1   |
|    1553    |    247    | 0.328244274809 |  2   |
|    1553    |    230    | 0.324444444444 |  3   |
|    1553    |    125    | 0.266666666667 |  4   |
|    1553    |    248    | 0.26329787234  |  5   |
|    1553    |    294    |   0.2515625    |  6   |
|    1553    |     72    | 0.231884057971 |  7   |
|    1553    |    276    | 0.227112676056 |  8   |
|    1553    |    165    | 0.223350253807 |  9   |
|    1553    |     83    | 0.220238095238 |  10  |
|   20400    |    226    | 0.778911564626 |  1   |
|   20400    |    247    | 0.328244274809 |  2   |
|   20400    |    230    | 0.324444444444 |  3   |
|   20400    |    125    | 0.266666666667 |  4   |
|   20400    |    248    | 0.26329787234  |  5   |
|   20400    |    294    |   0.2515625    |  6   |
|   20400    |     72    | 0.23

In [28]:
train.groupby(by=item_id)['purchase_count'].mean().sort_values(ascending=False).head(20)

productId
132    3.066667
34     3.015810
37     2.976744
0      2.952427
3      2.863445
248    2.840909
27     2.774194
245    2.697368
110    2.684211
32     2.679612
10     2.640118
230    2.621429
82     2.595556
129    2.541899
58     2.526882
83     2.509434
54     2.498233
252    2.473684
226    2.469388
91     2.432653
Name: purchase_count, dtype: float64

In [29]:
# these variables will change accordingly
name = 'cosine'
target = 'purchase_count'
cos = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-----------------+------+
| customerId | productId |      score      | rank |
+------------+-----------+-----------------+------+
|    1553    |     2     |  0.118224442005 |  1   |
|    1553    |     1     | 0.0823344111443 |  2   |
|    1553    |     35    | 0.0754737973213 |  3   |
|    1553    |     5     | 0.0741784930229 |  4   |
|    1553    |     33    | 0.0643378734589 |  5   |
|    1553    |     17    | 0.0597451925278 |  6   |
|    1553    |     8     | 0.0491762399673 |  7   |
|    1553    |     41    | 0.0486076831818 |  8   |
|    1553    |     76    | 0.0463675141335 |  9   |
|    1553    |    167    | 0.0459497451782 |  10  |
|   20400    |    284    | 0.0486576557159 |  1   |
|   20400    |     6     |  0.038588643074 |  2   |
|   20400    |    132    | 0.0359700918198 |  3   |
|   20400    |     26    | 0.0357463955879 |  4   |
|   20400    |    198    | 0.0347434878349 |  5   |
|   20400    |    131    | 0.0344877839088 |  6   |
|   20400   

In [30]:
# these variables will change accordingly
name = 'cosine'
target = 'purchase_dummy'
cos_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-----------------+------+
| customerId | productId |      score      | rank |
+------------+-----------+-----------------+------+
|    1553    |     35    |  0.116672674815 |  1   |
|    1553    |     1     | 0.0806420644124 |  2   |
|    1553    |     5     | 0.0754251082738 |  3   |
|    1553    |     2     | 0.0687856475512 |  4   |
|    1553    |     33    | 0.0600904623667 |  5   |
|    1553    |     21    | 0.0543644825617 |  6   |
|    1553    |     8     | 0.0498030781746 |  7   |
|    1553    |     14    | 0.0490155816078 |  8   |
|    1553    |     20    | 0.0469390352567 |  9   |
|    1553    |     38    | 0.0458361903826 |  10  |
|   20400    |     1     | 0.0459654331207 |  1   |
|   20400    |     27    | 0.0432065725327 |  2   |
|   20400    |     4     | 0.0403519272804 |  3   |
|   20400    |     26    | 0.0385573506355 |  4   |
|   20400    |    246    |  0.03819835186  |  5   |
|   20400    |     58    | 0.0371216535568 |  6   |
|   20400   

In [31]:
name = 'cosine'
target = 'scaled_purchase_freq'
cos_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+------------------+------+
| customerId | productId |      score       | rank |
+------------+-----------+------------------+------+
|    1553    |    157    |       0.0        |  1   |
|    1553    |     47    |       0.0        |  2   |
|    1553    |     24    |       0.0        |  3   |
|    1553    |     13    |       0.0        |  4   |
|    1553    |     89    |       0.0        |  5   |
|    1553    |    258    |       0.0        |  6   |
|    1553    |    259    |       0.0        |  7   |
|    1553    |     60    |       0.0        |  8   |
|    1553    |    268    |       0.0        |  9   |
|    1553    |     80    |       0.0        |  10  |
|   20400    |    157    |       0.0        |  1   |
|   20400    |     47    |       0.0        |  2   |
|   20400    |     24    |       0.0        |  3   |
|   20400    |     13    |       0.0        |  4   |
|   20400    |     89    |       0.0        |  5   |
|   20400    |    258    |       0.0        | 

In [32]:
# these variables will change accordingly
name = 'pearson'
target = 'purchase_count'
pear = model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+---------------+------+
| customerId | productId |     score     | rank |
+------------+-----------+---------------+------+
|    1553    |    132    | 3.06666666667 |  1   |
|    1553    |     34    | 2.99865378302 |  2   |
|    1553    |     37    | 2.97581589139 |  3   |
|    1553    |     0     | 2.95018876779 |  4   |
|    1553    |     3     | 2.86270859322 |  5   |
|    1553    |    248    | 2.83779164878 |  6   |
|    1553    |     27    | 2.77370169086 |  7   |
|    1553    |    245    | 2.69736842105 |  8   |
|    1553    |     32    | 2.67961165049 |  9   |
|    1553    |    110    | 2.67633323607 |  10  |
|   20400    |    132    | 3.05505580107 |  1   |
|   20400    |     34    | 3.01529421967 |  2   |
|   20400    |     37    |  2.9762148372 |  3   |
|   20400    |     0     | 2.95242718447 |  4   |
|   20400    |     3     |  2.8621127375 |  5   |
|   20400    |    248    | 2.84090909091 |  6   |
|   20400    |     27    | 2.77144911212 |  7   |


In [33]:
# these variables will change accordingly
name = 'pearson'
target = 'purchase_dummy'
pear_dummy = model(train_data_dummy, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+-------+------+
| customerId | productId | score | rank |
+------------+-----------+-------+------+
|    1553    |    123    |  0.0  |  1   |
|    1553    |    235    |  0.0  |  2   |
|    1553    |    159    |  0.0  |  3   |
|    1553    |     35    |  0.0  |  4   |
|    1553    |     76    |  0.0  |  5   |
|    1553    |     11    |  0.0  |  6   |
|    1553    |    202    |  0.0  |  7   |
|    1553    |     20    |  0.0  |  8   |
|    1553    |    107    |  0.0  |  9   |
|    1553    |     55    |  0.0  |  10  |
|   20400    |    123    |  0.0  |  1   |
|   20400    |    235    |  0.0  |  2   |
|   20400    |    159    |  0.0  |  3   |
|   20400    |     35    |  0.0  |  4   |
|   20400    |     76    |  0.0  |  5   |
|   20400    |     11    |  0.0  |  6   |
|   20400    |    202    |  0.0  |  7   |
|   20400    |     20    |  0.0  |  8   |
|   20400    |    107    |  0.0  |  9   |
|   20400    |     55    |  0.0  |  10  |
|   19750    |    123    |  0.0  |

In [34]:
name = 'pearson'
target = 'scaled_purchase_freq'
pear_norm = model(train_data_norm, name, user_id, item_id, target, users_to_recommend, n_rec, n_display)

+------------+-----------+----------------+------+
| customerId | productId |     score      | rank |
+------------+-----------+----------------+------+
|    1553    |    226    | 0.778559971727 |  1   |
|    1553    |    247    | 0.328244274809 |  2   |
|    1553    |    230    | 0.324225948718 |  3   |
|    1553    |    125    | 0.266502932707 |  4   |
|    1553    |    248    | 0.26329787234  |  5   |
|    1553    |    294    | 0.251402565837 |  6   |
|    1553    |     72    | 0.231624837371 |  7   |
|    1553    |    276    | 0.22702648774  |  8   |
|    1553    |    165    | 0.223350253807 |  9   |
|    1553    |     83    | 0.219271665528 |  10  |
|   20400    |    226    | 0.778911564626 |  1   |
|   20400    |    247    | 0.328244274809 |  2   |
|   20400    |    230    | 0.324444444444 |  3   |
|   20400    |    125    | 0.265923158328 |  4   |
|   20400    |    248    | 0.26329787234  |  5   |
|   20400    |    294    |   0.2515625    |  6   |
|   20400    |     72    | 0.23

In [35]:
# create initial callable variables

models_w_counts = [popularity_model, cos, pear]
models_w_dummy = [pop_dummy, cos_dummy, pear_dummy]
models_w_norm = [pop_norm, cos_norm, pear_norm]

names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']
names_w_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy', 'Pearson Similarity on Purchase Dummy']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts']

In [36]:
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff |   mean_precision  |    mean_recall    |
+--------+-------------------+-------------------+
|   1    | 0.000942507068803 | 0.000470217812348 |
|   2    |  0.00333502501269 |  0.00358907973842 |
|   3    |  0.00369752773146 |  0.00610159759024 |
|   4    |  0.00775755818169 |   0.01760266067   |
|   5    |  0.00797505981295 |  0.0223399696486  |
|   6    |  0.00673046714517 |  0.0226783055194  |
|   7    |  0.00625576120392 |  0.0244984413677  |
|   8    |  0.00575473066048 |  0.0256597447203  |
|   9    |  0.00548587447739 |  0.0274533132316  |
|   10   |  0.00534329007468 |  0.0299384419891  |
+--------+-------------------+-------------------+
[10 rows x 3 columns]


Overall RMSE: 1.04149382451

Per User RMSE (best)
+------------+-------+------+
| customerId | count | rmse |
+------------+-------+------+
|    3234    |   1   | 0.0  |
+------------+-------+------+
[1 rows x 3


Precision and recall summary statistics by cutoff
+--------+-----------------+-----------------+
| cutoff |  mean_precision |   mean_recall   |
+--------+-----------------+-----------------+
|   1    |  0.113970854781 | 0.0655918419101 |
|   2    |  0.097005727543 |  0.110940480359 |
|   3    | 0.0816356122671 |  0.138026707047 |
|   4    | 0.0713767853259 |  0.159343143762 |
|   5    | 0.0637714782861 |  0.177674091534 |
|   6    | 0.0575895985887 |  0.191748708879 |
|   7    | 0.0529150397199 |  0.205502260002 |
|   8    | 0.0488925541942 |  0.216822297457 |
|   9    | 0.0458847885804 |  0.227601200719 |
|   10   | 0.0432465743493 |  0.237510971527 |
+--------+-----------------+-----------------+
[10 rows x 3 columns]


Overall RMSE: 1.88995579038

Per User RMSE (best)
+------------+-------+---------------+
| customerId | count |      rmse     |
+------------+-------+---------------+
|   19062    |   1   | 0.10503077507 |
+------------+-------+---------------+
[1 rows x 3 columns]




Precision and recall summary statistics by cutoff
+--------+------------------+-------------------+
| cutoff |  mean_precision  |    mean_recall    |
+--------+------------------+-------------------+
|   1    | 0.00130500978757 | 0.000643701256331 |
|   2    | 0.0034800261002  |  0.00378845623374 |
|   3    | 0.00381836197105 |  0.00633722435744 |
|   4    | 0.0078119335895  |  0.0177295366216  |
|   5    | 0.0080475603567  |  0.0226118466877  |
|   6    | 0.00680296768892 |  0.0229743494064  |
|   7    | 0.00631790452714 |  0.0248355688962  |
|   8    | 0.00587254404408 |  0.0263074162445  |
|   9    | 0.00556643063712 |  0.0279378585324  |
|   10   | 0.00535779018343 |  0.0300380439266  |
+--------+------------------+-------------------+
[10 rows x 3 columns]


Overall RMSE: 1.03835316824

Per User RMSE (best)
+------------+-------+------------------+
| customerId | count |       rmse       |
+------------+-------+------------------+
|   13424    |   1   | 4.4408920985e-16 |
+------

In [37]:
eval_dummy = tc.recommender.util.compare_models(test_data_dummy, models_w_dummy, model_names=names_w_dummy)

PROGRESS: Evaluate model Popularity Model on Purchase Dummy



Precision and recall summary statistics by cutoff
+--------+------------------+------------------+
| cutoff |  mean_precision  |   mean_recall    |
+--------+------------------+------------------+
|   1    | 0.00637496377861 | 0.00292967761817 |
|   2    | 0.00525210084034 | 0.0050043868122  |
|   3    | 0.00521587945523 | 0.00779943277046 |
|   4    | 0.00519776876268 | 0.0106225619116  |
|   5    | 0.00482468849609 | 0.0124753388314  |
|   6    | 0.0046242634985  | 0.0146713321712  |
|   7    | 0.0045742434905  | 0.0169870573119  |
|   8    | 0.00448239640684 | 0.0191411948289  |
|   9    | 0.00457999291671 | 0.0220672790557  |
|   10   | 0.00464358157056 | 0.0247319969805  |
+--------+------------------+------------------+
[10 rows x 3 columns]


Overall RMSE: 0.0

Per User RMSE (best)
+------------+-------+------+
| customerId | count | rmse |
+------------+-------+------+
|    3006    |   1   | 0.0  |
+------------+-------+------+
[1 rows x 3 columns]


Per User RMSE (worst)
+---


Precision and recall summary statistics by cutoff
+--------+-----------------+-----------------+
| cutoff |  mean_precision |   mean_recall   |
+--------+-----------------+-----------------+
|   1    |  0.123007823819 | 0.0718950195624 |
|   2    |  0.098630831643 |  0.111618607397 |
|   3    | 0.0823432821404 |  0.137143583081 |
|   4    | 0.0724971022892 |  0.158122499877 |
|   5    | 0.0640683859751 |  0.173501175512 |
|   6    | 0.0580991017096 |  0.188590759405 |
|   7    | 0.0531315974666 |  0.200865957494 |
|   8    | 0.0490980875109 |  0.211680765392 |
|   9    | 0.0459689622976 |  0.222555168053 |
|   10   | 0.0432410895393 |  0.232054052005 |
+--------+-----------------+-----------------+
[10 rows x 3 columns]


Overall RMSE: 0.969255919449

Per User RMSE (best)
+------------+-------+----------------+
| customerId | count |      rmse      |
+------------+-------+----------------+
|   22278    |   1   | 0.530925393105 |
+------------+-------+----------------+
[1 rows x 3 colu


Precision and recall summary statistics by cutoff
+--------+------------------+------------------+
| cutoff |  mean_precision  |   mean_recall    |
+--------+------------------+------------------+
|   1    | 0.00637496377861 | 0.00292967761817 |
|   2    | 0.00525210084034 | 0.0050043868122  |
|   3    | 0.00521587945523 | 0.00779943277046 |
|   4    | 0.00519776876268 | 0.0106225619116  |
|   5    | 0.00482468849609 | 0.0124753388314  |
|   6    | 0.0046242634985  | 0.0146713321712  |
|   7    | 0.0045742434905  | 0.0169870573119  |
|   8    | 0.00448239640684 | 0.0191411948289  |
|   9    | 0.00457999291671 | 0.0220672790557  |
|   10   | 0.00464358157056 | 0.0247319969805  |
+--------+------------------+------------------+
[10 rows x 3 columns]


Overall RMSE: 1.0

Per User RMSE (best)
+------------+-------+------+
| customerId | count | rmse |
+------------+-------+------+
|    3006    |   1   | 1.0  |
+------------+-------+------+
[1 rows x 3 columns]


Per User RMSE (worst)
+---

In [38]:
eval_norm = tc.recommender.util.compare_models(test_data_norm, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Scaled Purchase Counts



Precision and recall summary statistics by cutoff
+--------+------------------+------------------+
| cutoff |  mean_precision  |   mean_recall    |
+--------+------------------+------------------+
|   1    | 0.00238112417923 |  0.001102859067  |
|   2    | 0.00223681362292 | 0.00202838926953 |
|   3    | 0.00223681362292 | 0.00350825966491 |
|   4    | 0.00214661952522 | 0.00491356960619 |
|   5    | 0.0018183130096  | 0.00520047273601 |
|   6    | 0.00202034778844 | 0.00718113512146 |
|   7    | 0.00296867430138 | 0.0118756949571  |
|   8    | 0.00285013348726 | 0.0128336869907  |
|   9    | 0.00281405584819 | 0.0140750969665  |
|   10   | 0.00272025398658 | 0.0151130162832  |
+--------+------------------+------------------+
[10 rows x 3 columns]


Overall RMSE: 0.134791699842

Per User RMSE (best)
+------------+-------+-------------------+
| customerId | count |        rmse       |
+------------+-------+-------------------+
|    8671    |   1   | 0.000384050604314 |
+------------+--


Precision and recall summary statistics by cutoff
+--------+-----------------+-----------------+
| cutoff |  mean_precision |   mean_recall   |
+--------+-----------------+-----------------+
|   1    | 0.0654448372898 | 0.0382332472788 |
|   2    | 0.0523847319431 | 0.0593816341318 |
|   3    | 0.0445438583832 |  0.073133988734 |
|   4    | 0.0397214806263 | 0.0857406778641 |
|   5    | 0.0356158452991 |  0.094765762884 |
|   6    | 0.0324698751714 |  0.103313987234 |
|   7    | 0.0302536773422 |  0.111959020717 |
|   8    | 0.0284381990043 |  0.119915711076 |
|   9    | 0.0268978842469 |  0.127498751758 |
|   10   | 0.0255790461072 |  0.134194515653 |
+--------+-----------------+-----------------+
[10 rows x 3 columns]


Overall RMSE: 0.161978582093

Per User RMSE (best)
+------------+-------+------+
| customerId | count | rmse |
+------------+-------+------+
|    4836    |   1   | 0.0  |
+------------+-------+------+
[1 rows x 3 columns]


Per User RMSE (worst)
+------------+-------


Precision and recall summary statistics by cutoff
+--------+------------------+------------------+
| cutoff |  mean_precision  |   mean_recall    |
+--------+------------------+------------------+
|   1    | 0.00238112417923 |  0.001102859067  |
|   2    | 0.00223681362292 | 0.00202838926953 |
|   3    | 0.00223681362292 | 0.00350825966491 |
|   4    | 0.00214661952522 | 0.00491356960619 |
|   5    | 0.0018183130096  | 0.00520047273601 |
|   6    | 0.00203237366813 | 0.00725329039962 |
|   7    | 0.00299959799202 | 0.0120440572728  |
|   8    | 0.00285915289703 | 0.0129539457877  |
|   9    | 0.00283810760757 | 0.0142735239815  |
|   10   | 0.00275633162566 | 0.0153775856364  |
+--------+------------------+------------------+
[10 rows x 3 columns]


Overall RMSE: 0.134493734496

Per User RMSE (best)
+------------+-------+-------------------+
| customerId | count |        rmse       |
+------------+-------+-------------------+
|    5996    |   1   | 1.56759879118e-06 |
+------------+--

In [39]:
users_to_recommend = list(customers[user_id])

final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', 
                                            similarity_type='cosine')

recom = final_model.recommend(users=users_to_recommend, k=n_rec)
recom.print_rows(n_display)

+------------+-----------+-----------------+------+
| customerId | productId |      score      | rank |
+------------+-----------+-----------------+------+
|    1553    |     1     |  0.103481757641 |  1   |
|    1553    |     2     | 0.0934672474861 |  2   |
|    1553    |     35    | 0.0845762014389 |  3   |
|    1553    |     33    | 0.0668614387512 |  4   |
|    1553    |     61    | 0.0651255607605 |  5   |
|    1553    |     15    | 0.0647641539574 |  6   |
|    1553    |     11    | 0.0546789884567 |  7   |
|    1553    |     5     | 0.0540698170662 |  8   |
|    1553    |     17    | 0.0519999623299 |  9   |
|    1553    |     36    | 0.0504865050316 |  10  |
|   20400    |     26    | 0.0581226944923 |  1   |
|   20400    |     6     | 0.0536174178123 |  2   |
|   20400    |    113    | 0.0531278848648 |  3   |
|   20400    |     1     | 0.0521045923233 |  4   |
|   20400    |     15    | 0.0476838946342 |  5   |
|   20400    |     27    | 0.0446733832359 |  6   |
|   20400   

In [40]:
df_rec = recom.to_dataframe()
print(df_rec.shape)
df_rec.head()

(10000, 4)


Unnamed: 0,customerId,productId,score,rank
0,1553,1,0.103482,1
1,1553,2,0.093467,2
2,1553,35,0.084576,3
3,1553,33,0.066861,4
4,1553,61,0.065126,5


In [41]:
df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id].transform(lambda x: '|'.join(x.astype(str)))
df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates().sort_values('customerId').set_index('customerId')

In [42]:
def create_output(model, users_to_recommend, n_rec, print_csv=True):
    recomendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recomendation.to_dataframe()
    df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id] \
        .transform(lambda x: '|'.join(x.astype(str)))
    df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates() \
        .sort_values('customerId').set_index('customerId')
    if print_csv:
        df_output.to_csv('output/option1_recommendation.csv')
        print("An output file can be found in 'output' folder with name 'option1_recommendation.csv'")
    return df_output

In [44]:
df_output = create_output(pear_norm, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(1000, 1)


Unnamed: 0_level_0,recommendedProducts
customerId,Unnamed: 1_level_1
4,226|247|230|125|248|294|72|276|165|83
11,226|247|230|125|248|294|72|276|165|83
12,226|247|230|125|248|294|72|276|165|83
16,226|247|230|125|248|294|72|276|165|83
21,226|247|230|125|248|294|72|276|165|83


In [45]:
def customer_recomendation(customer_id):
    if customer_id not in df_output.index:
        print('Customer not found.')
        return customer_id
    return df_output.loc[customer_id]

In [46]:
customer_recomendation(4)

recommendedProducts    226|247|230|125|248|294|72|276|165|83
Name: 4, dtype: object

In [47]:
customer_recomendation(21)

recommendedProducts    226|247|230|125|248|294|72|276|165|83
Name: 21, dtype: object