In [1]:
import pandas as pd
import numpy as np
customers=pd.read_csv('recommend_1.csv')
transactions=pd.read_csv('trx_data.csv')

In [2]:
transactions['products'] = transactions['products'].apply(lambda x: [int(i) for i in x.split('|')])
transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index()

Unnamed: 0,customerId,0,1,2,3,4,5,6,7,8,9
0,0,20.0,,,,,,,,,
1,1,2.0,2.0,23.0,68.0,68.0,111.0,29.0,86.0,107.0,152.0


In [3]:
data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), 
             id_vars=['customerId'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})

In [4]:
data.head()

Unnamed: 0,customerId,productId,purchase_count
0,0,1.0,2
1,0,13.0,1
2,0,19.0,3
3,0,20.0,1
4,0,31.0,2


In [5]:
data['productId'] = data['productId'].astype(np.int64)

In [6]:
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy
data_dummy = create_data_dummy(data)

In [7]:
df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')
df_matrix_norm = ((df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min()))
d = df_matrix_norm.reset_index()
d.index.names = ['scaled_purchase_freq']
data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()

In [8]:
data_norm.head()

Unnamed: 0,customerId,productId,scaled_purchase_freq
9,9,0,0.133333
25,25,0,0.133333
32,33,0,0.133333
35,36,0,0.133333
43,44,0,0.133333


In [9]:
from sklearn.model_selection import train_test_split
train,test=train_test_split(data,test_size=0.2,random_state=0)
#train test size (106868, 3) (26717, 3)

In [10]:
import graphlab as gf
train_data = gf.SFrame(train)
test_data = gf.SFrame(test)

This non-commercial license of GraphLab Create for academic use is assigned to viveksood92@gmail.com and will expire on July 21, 2020.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\vivek\AppData\Local\Temp\graphlab_server_1563897271.log.0


In [11]:
train_data

customerId,productId,purchase_count
8342,2,1
11081,255,1
16786,50,1
24759,6,3
15142,1,1
7090,156,1
887,69,2
10694,20,1
529,89,1
23734,33,1


In [14]:
# We can define a function for this step as follows for split data

def split_data(data):
    '''
    Splits dataset into training and test set.
    
    Args:
        data (pandas.DataFrame)
        
    Returns
        train_data (tc.SFrame)
        test_data (tc.SFrame)
    '''
    train, test = train_test_split(data, test_size = .2)
    train_data = gf.SFrame(train)
    test_data = gf.SFrame(test)
    return train_data, test_data

In [15]:
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

In [16]:
train_data_dummy


customerId,productId,purchase_count,purchase_dummy
555,96,2,1
5116,253,1,1
7699,7,2,1
5649,224,1,1
10539,31,2,1
9183,46,1,1
20115,113,1,1
378,20,1,1
4114,86,1,1
1341,79,1,1


In [17]:
train_data_norm #check purpose

customerId,productId,scaled_purchase_freq
1500,19,0.0714285714286
8698,219,0.0
24222,26,0.571428571429
605,26,0.0
3147,240,0.0
9056,11,0.05
9213,0,0.0
20658,235,0.4
8579,84,0.0
4706,49,0.0


In [None]:
'''We use 80:20 ratio for our train-test set size.
Our training portion will be used to develop a predictive model, while the other to evaluate the model's performance.
Now that we have three datasets with purchase counts, purchase dummy, and scaled purchase counts, we would like to split each.'''

In [None]:
''' Using a Popularity model as a baseline 
The popularity model takes the most popular items for recommendation. These items are products with the highest number of sells across customers.
We use graphlab library for running and evaluating both baseline and collaborative filtering models below
Training data is used for model selection


Using purchase counts
'''


In [20]:
user_id = 'customerId'
item_id = 'productId'
target = 'purchase_count'
users_to_recommend = list(transactions[user_id])
n_rec = 10 # number of items to recommend
n_display = 30


In [21]:
popularity_model = gf.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)

In [22]:
# Get recommendations for a list of users to recommend (from customers file)
# Printed below is head / top 30 rows for first 3 customers with 10 recommendations each

popularity_recomm = popularity_model.recommend(users=users_to_recommend, k=n_rec)
popularity_recomm.print_rows(n_display)

+------------+-----------+---------------+------+
| customerId | productId |     score     | rank |
+------------+-----------+---------------+------+
|     0      |     34    | 3.00387596899 |  1   |
|     0      |     0     | 2.99157303371 |  2   |
|     0      |     37    | 2.98062015504 |  3   |
|     0      |    248    |     2.925     |  4   |
|     0      |    132    | 2.90163934426 |  5   |
|     0      |     3     | 2.88065843621 |  6   |
|     0      |     27    | 2.73282442748 |  7   |
|     0      |    110    |      2.7      |  8   |
|     0      |     82    | 2.64317180617 |  9   |
|     0      |     10    | 2.62424242424 |  10  |
|     1      |     34    | 3.00387596899 |  1   |
|     1      |     0     | 2.99157303371 |  2   |
|     1      |     37    | 2.98062015504 |  3   |
|     1      |    248    |     2.925     |  4   |
|     1      |    132    | 2.90163934426 |  5   |
|     1      |     3     | 2.88065843621 |  6   |
|     1      |     27    | 2.73282442748 |  7   |


In [23]:
'''If a grouping example below, products 132, 248, 37, and 34 are the most popular (best-selling) across customers. 
Using their purchase counts divided by the number of customers, 
we see that these products are at least bought 3 times on average in the training set of transactions (same as the first popularity measure on purchase_count variable)'''
train.groupby(by=item_id)['purchase_count'].mean().sort_values(ascending=False).head(20)

productId
34     3.003876
0      2.991573
37     2.980620
248    2.925000
132    2.901639
3      2.880658
27     2.732824
110    2.700000
82     2.643172
10     2.624242
230    2.623077
32     2.611702
226    2.556338
68     2.437229
91     2.408163
252    2.403846
58     2.395604
129    2.393939
83     2.382979
75     2.352941
Name: purchase_count, dtype: float64

In [None]:
#Collaborative Filtering Model
# we use Cosine similarity

In [24]:
cos_model = gf.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='cosine')

In [26]:
cos_recomm = cos_model.recommend(users=users_to_recommend, k=n_rec)
cos_recomm.print_rows(n_display)

+------------+-----------+-----------------+------+
| customerId | productId |      score      | rank |
+------------+-----------+-----------------+------+
|     0      |     2     |  0.114501219988 |  1   |
|     0      |     1     |  0.112510490417 |  2   |
|     0      |     15    | 0.0743314683437 |  3   |
|     0      |     21    | 0.0729395508766 |  4   |
|     0      |     8     | 0.0700700581074 |  5   |
|     0      |     17    | 0.0635743021965 |  6   |
|     0      |    139    | 0.0626840353012 |  7   |
|     0      |     7     | 0.0595255196095 |  8   |
|     0      |     47    | 0.0592169523239 |  9   |
|     0      |     72    |  0.059126073122 |  10  |
|     1      |     1     |  0.165850902305 |  1   |
|     1      |     14    |  0.162978547461 |  2   |
|     1      |     21    |  0.132956497809 |  3   |
|     1      |     11    |  0.111557294341 |  4   |
|     1      |     5     |  0.107090178658 |  5   |
|     1      |     15    |  0.106762584518 |  6   |
|     1     

In [None]:
 Model Evaluation
For evaluating recommendation engines, we can use the concept of precision-recall.

RMSE (Root Mean Squared Errors)
Measures the error of predicted values
Lesser the RMSE value, better the recommendations
Recall
What percentage of products that a user buys are actually recommended?
If a customer buys 5 products and the recommendation decided to show 3 of them, then the recall is 0.6
Precision

Out of all the recommended items, how many the user actually liked?
If 5 products were recommended to the customer out of which he buys 4 of them, then precision is 0.8
Why are both recall and precision important?

Consider a case where we recommend all products, so our customers will surely cover the items that they liked and bought. In this case, we have 100% recall! Does this mean our model is good?
We have to consider precision. If we recommend 300 items but user likes and buys only 3 of them, then precision is 0.1%! This very low precision indicates that the model is not great, despite their excellent recall.
So our aim has to be optimizing both recall and precision (to be close to 1 as possible).
Lets compare all the models we have built based on precision-recall characteristics:

In [32]:
#model_performance = gf.compare(test_data, [popularity_model, cos_model])
eval_counts = gf.recommender.util.compare_models(test_data, [popularity_model, cos_model])

PROGRESS: Evaluate model M0



Precision and recall summary statistics by cutoff
+--------+------------------+------------------+
| cutoff |  mean_precision  |   mean_recall    |
+--------+------------------+------------------+
|   1    | 0.00488681279195 | 0.00263378694209 |
|   2    | 0.0114624505929  | 0.0125172115982  |
|   3    | 0.00876751706791 |  0.014286570917  |
|   4    | 0.00688106360043 | 0.0149872536335  |
|   5    | 0.00579231045634 | 0.0156118179142  |
|   6    | 0.00612049347227 | 0.0193062518988  |
|   7    | 0.00564652738566 | 0.0207330694817  |
|   8    | 0.00530003593245 | 0.0220062758537  |
|   9    | 0.00524613726195 | 0.0248087171542  |
|   10   | 0.00530362917715 | 0.0285181434328  |
+--------+------------------+------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 1.0307393256443131)

Per User RMSE (best)
+------------+-------+------------------+
| customerId | count |       rmse       |
+------------+-------+------------------+
|   22015    |   1   | 0.00387596899225 |
+--------


Precision and recall summary statistics by cutoff
+--------+-----------------+-----------------+
| cutoff |  mean_precision |   mean_recall   |
+--------+-----------------+-----------------+
|   1    |  0.114840100611 | 0.0666975139684 |
|   2    | 0.0935321595401 |  0.107648659581 |
|   3    | 0.0783806443885 |  0.133275467695 |
|   4    | 0.0682357168523 |  0.152600611613 |
|   5    | 0.0606970894718 |  0.168267637125 |
|   6    | 0.0549047790155 |  0.182176877164 |
|   7    | 0.0507058159232 |  0.195084476974 |
|   8    | 0.0470804886813 |  0.20667525594  |
|   9    | 0.0441649698567 |  0.216921284315 |
|   10   |  0.041545095221 |  0.22644746341  |
+--------+-----------------+-----------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 1.8810687607363101)

Per User RMSE (best)
+------------+-------+-----------------+
| customerId | count |       rmse      |
+------------+-------+-----------------+
|   10294    |   1   | 0.0166245698929 |
+------------+-------+-----------------+


In [None]:
Popularity v. Collaborative Filtering: We can see that the collaborative filtering algorithms work better than popularity model for purchase counts. Indeed, popularity model doesn’t give any personalizations as it only gives the same list of recommended items to every user.

In [None]:
 Final Output
In this step, we would like to manipulate format for recommendation output to one we can export to csv, and also a function that will return recommendation list given a customer ID.
We need to first rerun the model using the whole dataset, as we came to a final model using train data and evaluated with test set.

In [33]:
df_rec = cos_recomm.to_dataframe()
print(df_rec.shape)
df_rec.head()

(624830, 4)


Unnamed: 0,customerId,productId,score,rank
0,0,2,0.114501,1
1,0,1,0.11251,2
2,0,15,0.074331,3
3,0,21,0.07294,4
4,0,8,0.07007,5


In [64]:
df_rec['recommendedProducts'] = df_rec.groupby([user_id])[item_id].transform(lambda x: '|'.join(x.astype(str)))

In [62]:
df_rec.head()

Unnamed: 0,customerId,productId,score,rank,recommendedProducts
0,0,2,0.114501,1,2|1|15|21|8|17|139|7|47|72|2|1|15|21|8|17|139|...
1,0,1,0.11251,2,2|1|15|21|8|17|139|7|47|72|2|1|15|21|8|17|139|...
2,0,15,0.074331,3,2|1|15|21|8|17|139|7|47|72|2|1|15|21|8|17|139|...
3,0,21,0.07294,4,2|1|15|21|8|17|139|7|47|72|2|1|15|21|8|17|139|...
4,0,8,0.07007,5,2|1|15|21|8|17|139|7|47|72|2|1|15|21|8|17|139|...


In [66]:
df_output = df_rec[['customerId', 'recommendedProducts']].drop_duplicates().sort_values('customerId').set_index('customerId')

In [67]:
df_output.head()

Unnamed: 0_level_0,recommendedProducts
customerId,Unnamed: 1_level_1
0,2|1|15|21|8|17|139|7|47|72|2|1|15|21|8|17|139|...
1,1|14|21|11|5|15|39|33|8|41|1|14|21|11|5|15|39|...
2,11|2|1|5|61|15|21|9|82|0
3,2|1|8|46|51|16|20|15|38|42|2|1|8|46|51|16|20|1...
4,2|1|14|13|8|38|5|61|20|36|2|1|14|13|8|38|5|61|...


In [38]:
df_output.to_csv('cosine_recommended.csv')

In [39]:
#Customer recommendation function
def customer_recomendation(customer_id):
    if customer_id not in df_output.index:
        print('Customer not found.')
        return customer_id
    return df_output.loc[customer_id]

In [40]:
customer_recomendation(21)

recommendedProducts    48|2|1|38|36|93|9|79|194|144|48|2|1|38|36|93|9...
Name: 21, dtype: object

In [41]:
customer_recomendation(5)


recommendedProducts    2|1|8|0|5|272|38|162|48|72|2|1|8|0|5|272|38|16...
Name: 5, dtype: object

In [68]:
customer_recomendation(0)

recommendedProducts    2|1|15|21|8|17|139|7|47|72|2|1|15|21|8|17|139|...
Name: 0, dtype: object