In [132]:
import bz2
from datetime import datetime
import os
import tqdm

import numpy as np
import scipy
import scipy.sparse as sparse
import seaborn as sns

os.environ["MODIN_ENGINE"] = "ray"  # Modin will be using the Ray engine
# import pandas as pd
import modin.pandas as pd

import implicit
# import lightfm
# import sklearn
import tensorflow as tf

In [4]:
%matplotlib inline
random_seed = 12345
tf.random.set_seed(random_seed)

In [5]:
print("Running TensorFlow {}".format(tf.__version__))

Running TensorFlow 2.2.0


In [6]:
# Some common variables
DATA_DIR = os.path.join(os.getcwd(), 'data')

EVENTS_FILENAME = 'events.csv'
EVENTS_FILE_OPTIONS = {'visitorid': 'string',
                       'itemid': 'string',
                       'event': 'string',
                       'transactionid': 'string'}
CATEGORIES_FILENAME = 'category_tree.csv'
CATEGORIES_FILE_OPTIONS = {'categoryid': 'string',
                           'parentid': 'string'}
ITEMS_PROPERTIES_FILENAME = ['item_properties_part1.csv',
                             'item_properties_part2.csv']
ITEMS_PROPERTIES_FILE_OPTIONS = {'itemid': 'string',
                                 'property': 'string'}

PREDICTION_USERS_FILENAME = 'predictions.csv'
PREDICTION_FILE_OPTIONS = {'visitorid': 'string'}

In [38]:
# Some helper functions
def str_to_date(date):
    return datetime.strptime(date, "%Y-%m-%d").date()

def remap_list(source):
    
    # Remove potential duplicates
    source = list(dict.fromkeys(source))
    
    source_index = {}
    index_source = {}
    count = 0
    
    for l in source:
        source_index[l] = count
        index_source[count] = l
        count += 1

    return source_index, index_source

def load_file(filenames, options=None, bzip2=True):

    def read_bz2_as_pd(filepath, options=None):
        with bz2.open(filepath, "rt") as f:
            if options:
                data = pd.read_csv(f, dtype=options)
            else:
                data = pd.read_csv(f)
            
        return data
    
    
    if type(filenames) is not list:
        filenames = [filenames]
    
    if bzip2:
        filenames = [filename + '.bz2' if bzip2 else filename for filename in filenames]
        df_files_array = (read_bz2_as_pd(os.path.join(DATA_DIR, filename), options) for filename in filenames)
    else:
        if options:
            df_files_array = (pd.read_csv(os.path.join(DATA_DIR, filename), dtype=options) for filename in filenames)
        else:
            df_files_array = (pd.read_csv(os.path.join(DATA_DIR, filename)) for filename in filenames)

    data = pd.concat(df_files_array, ignore_index=True)
    
    return data

In [8]:
# Load users to eventually predict on
raw_prediction_users = load_file(PREDICTION_USERS_FILENAME, PREDICTION_FILE_OPTIONS, bzip2=False)

In [9]:
raw_prediction_users.head()

Unnamed: 0,visitorid
0,593408
1,71998
2,1403739
3,693797
4,1244757


In [10]:
raw_prediction_users.info()

<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 174956 entries, 0 to 174955
Data columns (total 1 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   visitorid  174956 non-null  string
dtypes: string(1)
memory usage: 1.3 MB


To request implementation, send an email to feature_requests@modin.org.


In [11]:
# Load events
raw_events = load_file(EVENTS_FILENAME, EVENTS_FILE_OPTIONS)        
# Load categories
raw_categories = load_file(CATEGORIES_FILENAME, CATEGORIES_FILE_OPTIONS)
# Load items properties
raw_items_properties = load_file(ITEMS_PROPERTIES_FILENAME, ITEMS_PROPERTIES_FILE_OPTIONS)



In [12]:
# Make timestamps readable where applicable
raw_events.timestamp = pd.to_datetime(raw_events.timestamp,
                                      unit='ms')
raw_items_properties.timestamp = pd.to_datetime(raw_items_properties.timestamp,
                                                unit='ms')

# Transform timestamps and add them as a column
raw_events['date'] = raw_events.timestamp.dt.date
raw_items_properties['date'] = raw_items_properties.timestamp.dt.date



In [13]:
raw_events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,date
0,2015-06-02 05:02:12.117,257597,view,355908,,2015-06-02
1,2015-06-02 05:50:14.164,992329,view,248676,,2015-06-02
2,2015-06-02 05:13:19.827,111016,view,318965,,2015-06-02
3,2015-06-02 05:12:35.914,483717,view,253185,,2015-06-02
4,2015-06-02 05:02:17.106,951259,view,367447,,2015-06-02


In [14]:
raw_events.info()



<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 6 columns):
 #   Column         Dtype         
---  ------         -----         
 0   timestamp      datetime64[ns]
 1   visitorid      string        
 2   event          string        
 3   itemid         string        
 4   transactionid  string        
 5   date           object        
dtypes: datetime64[ns](1), object(1), string(4)
memory usage: 126.2+ MB


In [15]:
raw_categories.head()

Unnamed: 0,categoryid,parentid
0,1016,213
1,809,169
2,570,9
3,1691,885
4,536,1691


In [16]:
raw_categories.info()

<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 1669 entries, 0 to 1668
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   categoryid  1669 non-null   string
 1   parentid    1644 non-null   string
dtypes: string(2)
memory usage: 26.2 KB




In [17]:
raw_items_properties.head()

Unnamed: 0,timestamp,itemid,property,value,date
0,2015-06-28 03:00:00,460429,categoryid,1338,2015-06-28
1,2015-09-06 03:00:00,206783,888,1116713 960601 n277.200,2015-09-06
2,2015-08-09 03:00:00,395014,400,n552.000 639502 n720.000 424566,2015-08-09
3,2015-05-10 03:00:00,59481,790,n15360.000,2015-05-10
4,2015-05-17 03:00:00,156781,917,828513,2015-05-17


In [18]:
raw_items_properties.info()



<class 'modin.pandas.dataframe.DataFrame'>
RangeIndex: 20275902 entries, 0 to 20275901
Data columns (total 5 columns):
 #   Column     Dtype         
---  ------     -----         
 0   timestamp  datetime64[ns]
 1   itemid     string        
 2   property   string        
 3   value      object        
 4   date       object        
dtypes: datetime64[ns](1), object(2), string(2)
memory usage: 773.5+ MB


In [19]:
event_begin_timestamp = raw_events.timestamp.min()
event_end_timestamp = raw_events.timestamp.max()

num_users = raw_events.visitorid.nunique()
num_items = raw_events.itemid.nunique()

num_users_with_transactions = raw_events[raw_events.event == 'transaction'].visitorid.nunique()
num_items_with_transactions = raw_events[raw_events.event == 'transaction'].itemid.nunique()

item_lowest_price = raw_events[raw_events.event == 'transaction'].transactionid.min()
item_highest_price = raw_events[raw_events.event == 'transaction'].transactionid.max()

prediction_users = list(raw_prediction_users.visitorid.unique())



### A naive recommender

A naive recommender for an item X could be one that suggests a subset of all the items that other customers have bought together with item X.

Such recommender doesn't take into account much more than the chances of items being bought together in the realm of a customer within a considered period.<br>
On the bright side, the recommender is independent of the customer's behaviour (i.e., the suggestions are independent from the number of purchases) because it relies solely on the item-to-item "relationships".

In [20]:
# Naive recommender
# Suggest n random items of a set of all other items bought together with the queried one by all users
# Supports other actions like adding to the cart or viewing an item
def items_to_item(events,
                  itemid,
                  actions='transaction'):
    # Customers who bought some specific itemid: event == 'transaction' && transactionid != NaN
    # Otherwise (event == 'view' || event == 'addtocart') && transactionid == NaN)
    
    def visitors_to_item(events, itemid, actions):
        filtered_events_per_itemid = events[(events.itemid == itemid)]

        if type(actions) is not list:
            actions = [actions]
            
        # Create empty pd.DataFrame WITH structure
        #visitors = filtered_events_per_itemid.iloc[0:0,:].copy()
        visitors = {}
        
        for a in actions:
            if a == 'transaction':
                filtered_events_per_event = filtered_events_per_itemid[(filtered_events_per_itemid.event == a) &
                                                                       (filtered_events_per_itemid.transactionid.notna())]
            else:
                filtered_events_per_event = filtered_events_per_itemid[(filtered_events_per_itemid.event == a) &
                                                                       (filtered_events_per_itemid.transactionid.isna())]

            visitors[a] = list(filtered_events_per_event.visitorid.unique())

        return visitors
    
    visitors_list_per_event = visitors_to_item(events, itemid, actions)

    items_lists = []
    for action, visitors_list in visitors_list_per_event.items():
        filtered_events_by_action = events[(events.event == action)]
        if action == 'transaction':
            filtered_events_by_action = filtered_events_by_action[(filtered_events_by_action.transactionid.notna())]
        else:
            filtered_events_by_action = filtered_events_by_action[(filtered_events_by_action.transactionid.isna())]
            
        items_lists_per_transaction = [list(filtered_events_by_action[(filtered_events_by_action.visitorid == visitorid)].itemid.unique()) for visitorid in visitors_list]
        items_lists += items_lists_per_transaction
    
    items = set().union(*items_lists)
    return list(items)

In [30]:
itemid = '200793'
actions=['transaction', 'addtocart']
filtered_events = raw_events[(raw_events.date < str_to_date('2015-09-01'))]

recs = items_to_item(filtered_events, itemid, actions)
recs

['317178',
 '400969',
 '12836',
 '15335',
 '25353',
 '380775',
 '237753',
 '80582',
 '105792',
 '200793',
 '302422']

### A collaborative filtering recommender

Based in matrix factorization. For the interested ones in a summary: https://www.benfrederickson.com/matrix-factorization/ and improvements: https://www.benfrederickson.com/fast-implicit-matrix-factorization/

In [95]:
filtered_events_by_date = raw_events[(raw_events.date < str_to_date('2015-09-01'))]
filtered_events_by_action = filtered_events_by_date[(raw_events.event == 'transaction')]

filtered_events = filtered_events_by_action[['visitorid', 'itemid', 'transactionid']]
filtered_events.transactionid = filtered_events.transactionid.apply(lambda x: 1 if x else 0)

In [96]:
users_index_map, index_users_map = remap_list(list(filtered_events.visitorid.unique()))
items_index_map, index_items_map = remap_list(list(filtered_events.itemid.unique()))

In [97]:
def create_ratings_matrix(events,
                          user_col='visitorid',
                          item_col='itemid',
                          rating_col='transactionid',
                          user_index=None,
                          item_index=None,
                          mode='coo'):

    num_users = events[user_col].nunique()
    num_items = events[item_col].nunique()
    
    if user_index:
        users_index_map = user_index
    else:
        users_index_map, index_users_map = remap_list(list(events[user_col].unique()))
        
    if item_index:
        items_index_map = item_index
    else:
        items_index_map, index_items_map = remap_list(list(events[item_col].unique()))
    
    
    if mode == 'dok':
        ratings = sparse.dok_matrix((num_users, num_items), dtype=events[rating_col].dtype)
        
        for row in events.itertuples(index=False):
            i = users_index_map[row[user_col]]
            j = items_index_map[row[item_col]]
            v = row[raing_col]
    
            ratings[i, j] = v
    elif mode == 'coo':
        i = events[user_col].apply(lambda x: users_index_map[x])
        j = events[item_col].apply(lambda x: items_index_map[x])
        data = events[rating_col].values

        ratings = sparse.coo_matrix((data, (i, j)), shape=(num_users, num_items))
    else:
        raise("Nope")

    return ratings, users_index_map, items_index_map

In [103]:
# user_to_item_coo = sparse.coo_matrix((filtered_events.transactionid.values, (filtered_events.visitorid, filtered_events.itemid)))
user_to_item_matrix, _, _ = create_ratings_matrix(filtered_events, user_index=users_index_map, item_index=items_index_map)

In [123]:
# initialize a model
cf_als = implicit.als.AlternatingLeastSquares(factors=128,
                                              regularization=1.5,
                                              iterations=100,
                                              calculate_training_loss=True)

cf_bpr = implicit.bpr.BayesianPersonalizedRanking(factors=128,
                                                  learning_rate=0.1.5,
                                                  regularization=1,
                                                  iterations=100,
                                                  verify_negative_samples=True)
cf_lmf = implicit.lmf.LogisticMatrixFactorization(factors=128,
                                                  learning_rate=0.1,
                                                  regularization=1.5,
                                                  iterations=100)

In [124]:
# train the model on a sparse matrix of item/user/confidence weights
cf_als.fit(user_to_item_matrix, show_progress=True)
cf_bpr.fit(user_to_item_matrix, show_progress=True)
cf_lmf.fit(user_to_item_matrix, show_progress=True)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [131]:
# recommend items for a user
user_index=4
user_items = user_to_item_matrix.T.tocsr()
recommendations = cf_als.recommend(userid=user_index,
                                   user_items=user_items,
                                   N=100,
                                   filter_already_liked_items=True,
                                   recalculate_user=True)
recommendations
#rankings = cf_bpr.rank_items(userid=user_index, user_items=user_items, recalculate_user=True)
#rankings

[(963, 0.0022365980156609476),
 (9665, 0.00015606703520093038),
 (6566, 0.00014496022112939373),
 (5390, 0.0001345236343571695),
 (1583, 0.00011647061983822193),
 (8453, 0.00011304546963300828),
 (1315, 7.94911548469208e-05),
 (7944, 6.985108875129526e-05),
 (3977, 6.779215012561945e-05),
 (156, 6.542576801624856e-05),
 (2547, 6.315737360971968e-05),
 (3837, 6.232464454680244e-05),
 (8670, 5.792081829609382e-05),
 (1208, 5.792081636468579e-05),
 (9271, 5.792080507215131e-05),
 (10406, 5.7920773299005675e-05),
 (10213, 5.1021588741806925e-05),
 (9033, 5.092846787336326e-05),
 (255, 4.8126511137992215e-05),
 (627, 4.791910236337214e-05),
 (1975, 4.791909759419493e-05),
 (5412, 4.791909323496985e-05),
 (6619, 4.791907744022648e-05),
 (6085, 4.791906112001938e-05),
 (153, 4.791905520290194e-05),
 (6642, 4.791904694082243e-05),
 (9866, 4.791904538412941e-05),
 (5235, 4.7919023884886595e-05),
 (9829, 4.790149934892754e-05),
 (259, 4.3445177989644455e-05),
 (3351, 4.303162104339862e-05),
 (19

In [112]:
recommendations

[(8842, 0.026507292),
 (8548, 0.0017611035),
 (722, 0.0015646522),
 (9771, 0.0013502428),
 (7883, 0.0012506607),
 (2547, 0.001158115),
 (646, 0.0011132723),
 (982, 0.0010440261),
 (1834, 0.0009371986),
 (645, 0.0009296304)]

In [None]:
# find related items
related = model.similar_items(itemid)

In [133]:
filtered_events.groupby(['visitorid', 'itemid']).agg({'transactionid': ['count']})



Unnamed: 0_level_0,Unnamed: 1_level_0,transactionid
Unnamed: 0_level_1,Unnamed: 1_level_1,count
visitorid,itemid,Unnamed: 2_level_2
1000057,271494,1
1000093,199101,1
100012,196659,1
100020,70387,1
1000248,37471,1
...,...,...
999814,36130,1
999869,172048,1
999869,47779,1
999869,65391,1


In [None]:
2664312+22457+69332