## Recommender engine for e-commerce 

Problem: create a recommender engine to suggest products to the users visiting an e-commerce site

(Some) analysis:

* The dataset consists of a collection of events (namely, 'view', 'addtocart' and 'transaction') describing the actions that visitors to the site carry out with the items offered. It is expected that visitors that buy items with the action 'transaction' have added such items to their cart beforehand 'addtocart' (they may or may have not visited them though).

In [1]:
import bz2
from datetime import datetime
import logging
import os
import time
import tqdm

import numpy as np
import scipy
import scipy.sparse as sparse
import seaborn as sns

os.environ["MODIN_ENGINE"] = "ray"  # Modin will be using the Ray engine
# import pandas as pd
import modin.pandas as pd

import implicit
# import lightfm
import tensorflow as tf

In [2]:
%matplotlib inline
random_seed = 12345
tf.random.set_seed(random_seed)

In [3]:
print("Running TensorFlow {}".format(tf.__version__))

Running TensorFlow 2.2.0


In [4]:
# Some common variables
DATA_DIR = os.path.join(os.getcwd(), 'data')

EVENTS_FILENAME = 'events.csv'
EVENTS_FILE_OPTIONS = {'visitorid': 'string',
                       'itemid': 'string',
                       'event': 'string',
                       'transactionid': 'string'}
CATEGORIES_FILENAME = 'category_tree.csv'
CATEGORIES_FILE_OPTIONS = {'categoryid': 'string',
                           'parentid': 'string'}
ITEMS_PROPERTIES_FILENAME = ['item_properties_part1.csv',
                             'item_properties_part2.csv']
ITEMS_PROPERTIES_FILE_OPTIONS = {'itemid': 'string',
                                 'property': 'string'}

PREDICTION_USERS_FILENAME = 'predictions.csv'
PREDICTION_FILE_OPTIONS = {'visitorid': 'string'}

In [5]:
# Some helper functions
def str_to_date(date, format='%Y-%m-%d'):
    """
    Converts a string date in to its date type equivalent
    """
    return datetime.strptime(date, format).date()

def remap_list(source, init=0):
    """
    Maps an input list of ids to a sequential list of integers.
    Returns the mapping in either 'direction' for convenience
    """
    
    # Remove potential duplicates
    source = list(dict.fromkeys(source))
    
    source_index = {}
    index_source = {}
    count = init
    
    for l in source:
        source_index[l] = count
        index_source[count] = l
        count += 1

    return source_index, index_source

def load_file(filenames, options=None, bzip2=True):
    """
    Loads the data contained in one or more files and returns it as a Pandas variable
    """
    def read_bz2_as_pd(filepath, options=None):
        with bz2.open(filepath, "rt") as f:
            if options:
                data = pd.read_csv(f, dtype=options)
            else:
                data = pd.read_csv(f)
            
        return data
    
    
    if type(filenames) is not list:
        filenames = [filenames]
    
    if bzip2:
        filenames = [filename + '.bz2' if bzip2 else filename for filename in filenames]
        df_files_array = (read_bz2_as_pd(os.path.join(DATA_DIR, filename), options) for filename in filenames)
    else:
        if options:
            df_files_array = (pd.read_csv(os.path.join(DATA_DIR, filename), dtype=options) for filename in filenames)
        else:
            df_files_array = (pd.read_csv(os.path.join(DATA_DIR, filename)) for filename in filenames)

    data = pd.concat(df_files_array, ignore_index=True)
    
    return data

def save_recommendations(recommendations,
                         filename,
                         user_col='visitorid',
                         item_col_prefix='item_',
                         n=100):

    col_names = [item_col_prefix + str(x) for x in range(n)]

    data = pd.DataFrame.from_dict(recommendations, orient='index', columns=col_names)
    data.index.names = [user_col]
    data.reset_index(inplace=True)

    data.to_csv(filename, index=False)

In [6]:
# Load users to eventually predict on
raw_prediction_users = load_file(PREDICTION_USERS_FILENAME, PREDICTION_FILE_OPTIONS, bzip2=False)

In [7]:
raw_prediction_users.head()

Unnamed: 0,visitorid
0,593408
1,71998
2,1403739
3,693797
4,1244757


In [8]:
raw_prediction_users.info()

<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 174956 entries, 0 to 174955
Data columns (total 1 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   visitorid  174956 non-null  string
dtypes: string(1)
memory usage: 1.3 MB


To request implementation, send an email to feature_requests@modin.org.


In [9]:
# Load events
raw_events = load_file(EVENTS_FILENAME, EVENTS_FILE_OPTIONS)        
# Load categories
raw_categories = load_file(CATEGORIES_FILENAME, CATEGORIES_FILE_OPTIONS)
# Load items properties
raw_items_properties = load_file(ITEMS_PROPERTIES_FILENAME, ITEMS_PROPERTIES_FILE_OPTIONS)



In [10]:
# Make timestamps readable where applicable
raw_events.timestamp = pd.to_datetime(raw_events.timestamp,
                                      unit='ms')
raw_items_properties.timestamp = pd.to_datetime(raw_items_properties.timestamp,
                                                unit='ms')

# Transform timestamps and add them as a column
raw_events['date'] = raw_events.timestamp.dt.date
raw_items_properties['date'] = raw_items_properties.timestamp.dt.date



In [11]:
raw_events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,date
0,2015-06-02 05:02:12.117,257597,view,355908,,2015-06-02
1,2015-06-02 05:50:14.164,992329,view,248676,,2015-06-02
2,2015-06-02 05:13:19.827,111016,view,318965,,2015-06-02
3,2015-06-02 05:12:35.914,483717,view,253185,,2015-06-02
4,2015-06-02 05:02:17.106,951259,view,367447,,2015-06-02


In [12]:
raw_events.info()



<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 6 columns):
 #   Column         Dtype         
---  ------         -----         
 0   timestamp      datetime64[ns]
 1   visitorid      string        
 2   event          string        
 3   itemid         string        
 4   transactionid  string        
 5   date           object        
dtypes: datetime64[ns](1), object(1), string(4)
memory usage: 126.2+ MB


In [13]:
raw_categories.head()

Unnamed: 0,categoryid,parentid
0,1016,213
1,809,169
2,570,9
3,1691,885
4,536,1691


In [14]:
raw_categories.info()

<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 1669 entries, 0 to 1668
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   categoryid  1669 non-null   string
 1   parentid    1644 non-null   string
dtypes: string(2)
memory usage: 26.2 KB




In [15]:
raw_items_properties.head()

Unnamed: 0,timestamp,itemid,property,value,date
0,2015-06-28 03:00:00,460429,categoryid,1338,2015-06-28
1,2015-09-06 03:00:00,206783,888,1116713 960601 n277.200,2015-09-06
2,2015-08-09 03:00:00,395014,400,n552.000 639502 n720.000 424566,2015-08-09
3,2015-05-10 03:00:00,59481,790,n15360.000,2015-05-10
4,2015-05-17 03:00:00,156781,917,828513,2015-05-17


In [16]:
raw_items_properties.info()



<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 20275902 entries, 0 to 20275901
Data columns (total 5 columns):
 #   Column     Dtype         
---  ------     -----         
 0   timestamp  datetime64[ns]
 1   itemid     string        
 2   property   string        
 3   value      object        
 4   date       object        
dtypes: datetime64[ns](1), object(2), string(2)
memory usage: 773.5+ MB


In [17]:
event_begin_timestamp = raw_events.timestamp.min()
event_end_timestamp = raw_events.timestamp.max()

num_users = raw_events.visitorid.nunique()
num_items = raw_events.itemid.nunique()

num_users_with_transactions = raw_events[raw_events.event == 'transaction'].visitorid.nunique()
num_items_with_transactions = raw_events[raw_events.event == 'transaction'].itemid.nunique()

item_lowest_price = raw_events[raw_events.event == 'transaction'].transactionid.min()
item_highest_price = raw_events[raw_events.event == 'transaction'].transactionid.max()

prediction_users = list(raw_prediction_users.visitorid.unique())



### A naive recommender

A naive recommender for an item X could be one that suggests a subset of all the items that other customers have bought together with item X.

Such recommender doesn't take into account much more than the chances of items being bought together in the realm of a customer within a considered period.<br>
On the bright side, the recommender is independent of the customer's behaviour (i.e., the suggestions are independent from the number of purchases) because it relies solely on the item-to-item "relationships".

In [18]:
# Naive recommender
# Suggest n random items of a set of all other items bought together with the queried one by all users
# Supports other actions like adding to the cart or viewing an item
def items_to_item(events,
                  itemid,
                  actions='transaction'):
    # Customers who bought some specific itemid: event == 'transaction' && transactionid != NaN
    # Otherwise (event == 'view' || event == 'addtocart') && transactionid == NaN)
    
    def visitors_to_item(events, itemid, actions):
        filtered_events_per_itemid = events[(events.itemid == itemid)]

        if type(actions) is not list:
            actions = [actions]
            
        # Create empty pd.DataFrame WITH structure
        #visitors = filtered_events_per_itemid.iloc[0:0,:].copy()
        visitors = {}
        
        for a in actions:
            if a == 'transaction':
                filtered_events_per_event = filtered_events_per_itemid[(filtered_events_per_itemid.event == a) &
                                                                       (filtered_events_per_itemid.transactionid.notna())]
            else:
                filtered_events_per_event = filtered_events_per_itemid[(filtered_events_per_itemid.event == a) &
                                                                       (filtered_events_per_itemid.transactionid.isna())]

            visitors[a] = list(filtered_events_per_event.visitorid.unique())

        return visitors
    
    visitors_list_per_event = visitors_to_item(events, itemid, actions)

    items_lists = []
    for action, visitors_list in visitors_list_per_event.items():
        filtered_events_by_action = events[(events.event == action)]
        if action == 'transaction':
            filtered_events_by_action = filtered_events_by_action[(filtered_events_by_action.transactionid.notna())]
        else:
            filtered_events_by_action = filtered_events_by_action[(filtered_events_by_action.transactionid.isna())]
            
        items_lists_per_transaction = [list(filtered_events_by_action[(filtered_events_by_action.visitorid == visitorid)].itemid.unique()) for visitorid in visitors_list]
        items_lists += items_lists_per_transaction
    
    items = set().union(*items_lists)
    return list(items)

In [19]:
itemid = '200793'
actions=['transaction', 'addtocart']
filtered_events = raw_events[(raw_events.date < str_to_date('2015-09-01'))]

recs = items_to_item(filtered_events, itemid, actions)
recs

['15335',
 '380775',
 '25353',
 '237753',
 '400969',
 '302422',
 '200793',
 '80582',
 '105792',
 '12836',
 '317178']

### A collaborative filtering recommender

Based in matrix factorization. For the interested ones in a summary: https://www.benfrederickson.com/matrix-factorization/ and improvements: https://www.benfrederickson.com/fast-implicit-matrix-factorization/

In [20]:
# Filtering events by type and choosing those of type 'transaction' (i.e., actual purchase)
# By doing this we ignore those visitors that do simply view and/or add items to the cart.
# Hence we already have a problem of cold start although it is also up to the semantics of the recommendations.
# E.g., "Others customers bought these other items" vs. "Others customers added these items to their cart"
filtered_events_by_date = raw_events[(raw_events.date < str_to_date('2015-09-01'))]
filtered_events_by_action = filtered_events_by_date[(raw_events.event == 'transaction')]

filtered_events = filtered_events_by_action[['visitorid', 'itemid', 'transactionid']]
# And we simply convert the purchases to a binary event
filtered_events.transactionid = filtered_events.transactionid.apply(lambda x: 1 if x else 0)

In [21]:
# Adding 'view' and 'addtocart' events in the events to recommend on based on a normalised weight
# over all the events in the dataset (see https://www.kaggle.com/aafrin/retail-rocket-recommender-system-for-beginners)
def get_normalised_group(group, statistics):
    
    def get_weight(x, statistics):
        statistics_normalised = dict(statistics)
        
        total = sum(list(statistics_normalised.values()))

        for k, v in statistics_normalised.items():
            statistics_normalised[k] = statistics[k]/total
            
        return statistics_normalised[x]
    
    group_events = list(group.event)
    
    rating = 0
    for e in group_events:
        rating += get_weight(e, statistics)

    return rating

filtered_events_statistics = dict(filtered_events_by_date['event'].value_counts())
# filtered_events = filtered_events_by_date.drop_duplicates(['visitorid', 'itemid', 'date']).groupby(['visitorid','itemid']).progress_apply(get_normalised_group, statistics=filtered_events_statistics).reset_index()
# filtered_events = filtered_events_by_date.drop_duplicates(['visitorid', 'itemid', 'date'])
# filtered_events = filtered_events[['visitorid', 'itemid', 'event']]
# filtered_events = filtered_events.groupby(['visitorid','itemid']).apply(get_normalised_group, statistics=filtered_events_statistics).reset_index()



In [22]:
users_index_map, index_users_map = remap_list(list(filtered_events.visitorid.unique()))
items_index_map, index_items_map = remap_list(list(filtered_events.itemid.unique()))

In [23]:
def create_ratings_matrix(events,
                          user_col='visitorid',
                          item_col='itemid',
                          rating_col='transactionid',
                          user_index=None,
                          item_index=None,
                          mode='coo'):

    num_users = events[user_col].nunique()
    num_items = events[item_col].nunique()
    
    if user_index:
        users_index_map = user_index
    else:
        users_index_map, index_users_map = remap_list(list(events[user_col].unique()))
        
    if item_index:
        items_index_map = item_index
    else:
        items_index_map, index_items_map = remap_list(list(events[item_col].unique()))
    
    
    if mode == 'dok':
        ratings = sparse.dok_matrix((num_users, num_items), dtype=events[rating_col].dtype)
        
        for row in events.itertuples(index=False):
            i = users_index_map[row[user_col]]
            j = items_index_map[row[item_col]]
            v = row[raing_col]
    
            ratings[i, j] = v
    elif mode == 'coo':
        i = events[user_col].apply(lambda x: users_index_map[x])
        j = events[item_col].apply(lambda x: items_index_map[x])
        data = events[rating_col].values

        ratings = sparse.coo_matrix((data, (i, j)), shape=(num_users, num_items))
    else:
        raise("Nope")

    return ratings, users_index_map, items_index_map

In [24]:
user_to_item_matrix, _, _ = create_ratings_matrix(filtered_events, user_index=users_index_map, item_index=items_index_map)
user_to_item_matrix.shape

(10500, 11016)

In [25]:
# Initialize the model(s)
cf_als = implicit.als.AlternatingLeastSquares(factors=128,
                                              regularization=1.5,
                                              iterations=100,
                                              calculate_training_loss=True)

cf_bpr = implicit.bpr.BayesianPersonalizedRanking(factors=128,
                                                  learning_rate=1.5,
                                                  regularization=1,
                                                  iterations=100,
                                                  verify_negative_samples=True)

cf_lmf = implicit.lmf.LogisticMatrixFactorization(factors=128,
                                                  learning_rate=0.1,
                                                  regularization=1.5,
                                                  iterations=100)



In [26]:
# Train the model(s) on the transposed sparse matrix of user/item/[confidence|rating weights]
item_to_user_matrix = user_to_item_matrix.T.tocsr()

cf_als.fit(item_to_user_matrix, show_progress=True)
cf_bpr.fit(item_to_user_matrix, show_progress=True)
cf_lmf.fit(item_to_user_matrix, show_progress=True)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [27]:
# Recommend items for one user (note that both the user and the results correspond to the internal indexes)
user_index=4
n=10
recommendations = cf_als.recommend(userid=user_index,
                                   user_items=user_to_item_matrix.tocsr(),
                                   N=n,
                                   filter_already_liked_items=True,
                                   recalculate_user=True)    

In [29]:
rprediction_users.visitorid

['593408',
 '71998',
 '1403739',
 '693797',
 '1244757',
 '1101223',
 '691239',
 '233213',
 '36849',
 '954002',
 '379352',
 '218336',
 '504979',
 '1168835',
 '1383921',
 '952880',
 '1264903',
 '8487',
 '619060',
 '1329500',
 '1322689',
 '289889',
 '53715',
 '1360703',
 '1144229',
 '435381',
 '1375985',
 '1076479',
 '206730',
 '307125',
 '528508',
 '839060',
 '940651',
 '79928',
 '214835',
 '280167',
 '1154684',
 '34732',
 '254911',
 '701625',
 '981780',
 '1094568',
 '988884',
 '1360530',
 '677731',
 '804118',
 '797910',
 '304077',
 '899581',
 '651920',
 '518199',
 '1391087',
 '464899',
 '882246',
 '940851',
 '715981',
 '1140565',
 '875702',
 '413962',
 '671515',
 '1057220',
 '485016',
 '929604',
 '893040',
 '1068814',
 '126728',
 '1147768',
 '678032',
 '847967',
 '584597',
 '104190',
 '597813',
 '1057265',
 '281389',
 '1221967',
 '707021',
 '717120',
 '444923',
 '625199',
 '78794',
 '851833',
 '1033160',
 '457631',
 '13880',
 '745820',
 '191484',
 '990210',
 '905361',
 '546826',
 '12482

In [30]:
def get_implicit_recommendations(users_to_recommend,
                                 trained_model,
                                 users_items_matrix,
                                 users_index_map,
                                 index_items_map,
                                 user_col='visitorid',
                                 n=100):

    num_users_prediction = users_to_recommend[user_col].size
    recommendations = {}
    
    start = time.time()
    with tqdm.tqdm(total=num_users_prediction) as progress:
        for _, user in enumerate(users_to_recommend[user_col]):
            user_recommendations = list()
        
            if user in users_index_map.keys():
                user_index = users_index_map[user]
                user_item_index_recommendations = model.recommend(userid=user_index,
                                                                  user_items=users_items_matrix.tocsr(),
                                                                  N=n,
                                                                  filter_already_liked_items=True,
                                                                  recalculate_user=True)
                user_item_index_recommendations_sorted = sorted(user_item_index_recommendations,
                                                                key=lambda x: x[1],
                                                                reverse=True)
                user_recommendations = list(map(lambda item_index: index_items_map[item_index],
                                            [t[0] for t in user_item_index_recommendations_ranked]))
            recommendations[user] = user_recommendations
        
            progress.update(1)

    return recommendations

In [31]:
recommendations = get_implicit_recommendations(raw_prediction_users,
                                               cf_als,
                                               user_to_item_matrix,
                                               users_index_map,
                                               index_items_map,
                                               n=100)

  0%|          | 256/174956 [00:00<00:10, 16322.23it/s]


In [None]:
# write_recommendations(recommendations, "data/pong.csv")