In [6]:
#import library for the simple recommender system
from lightfm import LightFM
from lightfm.evaluation import auc_score
from scipy.sparse import coo_matrix
from sklearn import preprocessing
import pandas as pd
import numpy as np
from datetime import datetime

In [20]:
#read the transaction csv file for the e-commerce dataset
events = pd.read_csv('events.csv')
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [21]:
#the dataset uses unix timestamp, so we need to process the data
events = events.assign(date=pd.Series(datetime.fromtimestamp(i/1000).date() for i in events.timestamp))
events = events.sort_values('date').reset_index(drop=True)
events = events[['visitorid','itemid','event', 'date']]
#I only use data from 2015-05-03 to 2015-06-03 because my computer is not able to train too much data
events=events[(events.date>=datetime.strptime('2015-5-3','%Y-%m-%d').date())&(events.date<=datetime.strptime('2015-6-3','%Y-%m-%d').date())]
events.head()

Unnamed: 0,visitorid,itemid,event,date
0,910284,331446,view,2015-05-03
1,1324815,292237,view,2015-05-03
2,398588,269842,view,2015-05-03
3,139680,465621,addtocart,2015-05-03
4,108814,247909,view,2015-05-03


In [10]:
events.tail()

Unnamed: 0,visitorid,itemid,event,date
647784,809805,113935,view,2015-06-03
647785,144763,248558,view,2015-06-03
647786,39675,313563,view,2015-06-03
647787,581811,292246,view,2015-06-03
647788,148394,354162,view,2015-06-03


In [22]:
events.to_csv('53to63events.csv')

In [11]:
#split the dataset to prepare for training and testing
split_point = np.int(np.round(events.shape[0]*0.9))
events_train = events.iloc[0:split_point]
events_test = events.iloc[split_point::]
#need to keep the same labels in order to train the lightfm model
events_test = events_test[(events_test['visitorid'].isin(events_train['visitorid'])) &
                          (events_test['itemid'].isin(events_train['itemid']))]

In [12]:
labels=['visitorid','itemid']
train_dict=dict()
test_dict=dict()
event_dict = dict()

#transform the categorical variables
for label in labels:
    encoder=preprocessing.LabelEncoder()
    train_dict[label]=encoder.fit_transform(events_train[label].values)
    test_dict[label]=encoder.transform(events_test[label].values)
    
encoder=preprocessing.LabelEncoder()
event_dict['train'] = encoder.fit_transform(events_train.event)
event_dict['test'] = encoder.transform(events_test.event)

In [13]:
user_num=len(np.unique(train_dict['visitorid']))
item_num=len(np.unique(train_dict['itemid']))

In [14]:
print('number of users: ' + str(user_num))
print('number of items: ' + str(item_num))

number of users: 303521
number of items: 114072


In [15]:
#prepare to train the lightfm model
event_matrix = dict()
event_matrix['train'] = coo_matrix((event_dict['train'], (train_dict['visitorid'], train_dict['itemid'])),
                                   shape=(user_num,item_num))
event_matrix['test'] = coo_matrix((event_dict['test'], (test_dict['visitorid'], test_dict['itemid'])),
                                  shape=(user_num,item_num))

In [17]:
model = LightFM(no_components=10, loss='warp')
model.fit(event_matrix['train'], epochs=20, verbose=True)

Epoch: 100%|██████████| 20/20 [00:26<00:00,  1.31s/it]


<lightfm.lightfm.LightFM at 0x1f48610ecd0>

In [18]:
auc_score(model, event_matrix['test']).mean()

0.82439727

using lightfm model, I was able to achieve 82.4% AUC which a lot greater than 50% if we do not use a recommender system

the result proves that using a recommender system could significantly improve conversion rates because customers will have higher chances of seeing what they like or are interested in.

to dive deeper into the recommendation system, I could try to process data in different ways to train the model