In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import os
import seaborn as sns
#from implicit.nearest_neighbours import bm25_weight
from implicit.als import AlternatingLeastSquares
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the three raw tables: transactions, article metadata and customers.
# article_id is read as a string in both tables that carry it.
article_id_dtype = {'article_id': 'string'}
df_trans = pd.read_csv(
    '../data/transactions_train.csv',
    parse_dates=[0],
    dtype=article_id_dtype,
)
df_art = pd.read_csv('../data/articles.csv', dtype=article_id_dtype)
df_customers = pd.read_csv('../data/customers.csv')

In [3]:
# Restrict the data to reduce computational expense.
# Earlier variants, kept for reference:
#   everything up to the test period
#df = df_trans.query('t_dat <= "2020-09-15"').copy()
#   September of each year only
#df = df_trans.query('(t_dat >= "2018-09-01" and t_dat < "2018-10-01") or (t_dat >= "2019-09-01" and t_dat < "2019-10-01") or (t_dat >= "2020-09-01" and t_dat < "2020-10-01")').copy()
# Active selection: three date windows (lower bound inclusive, upper bound exclusive)
#   2018-08-26 .. 2018-09-23, 2019-08-26 .. 2019-09-22, 2020-07-25 .. 2020-08-26
sample_windows = '(t_dat >= "2018-08-26" and t_dat < "2018-09-23") or (t_dat >= "2019-08-26" and t_dat < "2019-09-22") or (t_dat >= "2020-07-25" and t_dat < "2020-08-26")'
df = df_trans.query(sample_windows).copy()
# The full history is cut as well, so it holds nothing dated 2020-08-26 or later.
df_trans = df_trans.query('t_dat < "2020-08-26"').copy()
#df = df_trans.copy()

# Disabled filter: exclude users with less than 5 items in history
#n_items_per_user = df.groupby('customer_id').count().article_id.rename('n_items')
#df['n_items'] = df.customer_id.map(n_items_per_user)
#df = df[df.loc[:, 'n_items'] > 4]

# Disabled filter: keep only users that have bought between 10 and 90 % of their items online
#n_items_per_channel = df.groupby(['customer_id', 'sales_channel_id']).count().article_id
#tmp1 = n_items_per_channel.reset_index().query('sales_channel_id == 1').set_index('customer_id').article_id.rename('offline_items')
#tmp2 = n_items_per_channel.reset_index().query('sales_channel_id == 2').set_index('customer_id').article_id.rename('online_items')
#online_fac = pd.concat([tmp1, tmp2], axis=1).fillna(0.0)
#online_fac['online_fac'] = online_fac.online_items/(online_fac.online_items + online_fac.offline_items)
#df['online_fac'] = df.customer_id.map(online_fac.online_fac).fillna(0.0)
#df = df.query('online_fac > 0.1 and online_fac < 0.9').copy()

In [4]:
# number of distinct customers in the sampled transactions
df['customer_id'].nunique()

402190

In [5]:
# summary of the sampled frame: index range, column dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2344774 entries, 0 to 30744933
Data columns (total 5 columns):
 #   Column            Dtype         
---  ------            -----         
 0   t_dat             datetime64[ns]
 1   customer_id       object        
 2   article_id        string        
 3   price             float64       
 4   sales_channel_id  int64         
dtypes: datetime64[ns](1), float64(1), int64(1), object(1), string(1)
memory usage: 107.3+ MB


In [6]:
# Disabled: reduce data further by random sampling of users
#user_sample = df.groupby('customer_id').sum().sample(371827, random_state=42) #371827
#df = df.set_index('customer_id').loc[user_sample.index].reset_index()
# Disabled: earlier way of attaching the interaction count to every transaction row
#_ = df.groupby(['customer_id', 'article_id']).count().price.rename('interactions')
#df['interactions'] = df.set_index(['customer_id', 'article_id']).index.map(_)

# Count how many times each customer bought each article: one row per
# (customer_id, article_id) pair, with the count in the 'interactions' column.
df_int = (
    df.groupby(['customer_id', 'article_id'])['t_dat']
    .count()
    .rename('interactions')
    .reset_index()
)

In [7]:
# Per-(customer, article) purchase counts computed on df_trans; the count
# ends up in the 'interactions' column.
df_int_full = (
    df_trans.groupby(['customer_id', 'article_id'])['t_dat']
    .count()
    .rename('interactions')
    .reset_index()
)

In [8]:
# distinct customers and articles in the full interaction table
n_users_full = df_int_full['customer_id'].nunique()
n_items_full = df_int_full['article_id'].nunique()

In [9]:
# Size of the sampled data: distinct users, distinct items, transaction rows.
n_records = len(df)
n_users = df_int['customer_id'].nunique()
n_items = df_int['article_id'].nunique()
print(f'Sample size: {n_users} user and {n_items} items in {n_records} transactions.')
# interactions are row counts per (customer, article) pair, so their sum is
# expected to match the number of transaction rows printed above
print(f'Sum of interactions: {df_int.interactions.sum()}')

Sample size: 402190 user and 62095 items in 2344774 transactions.
Sum of interactions: 2344774


In [10]:
# Build the utility matrix for the sampled interactions.
# Y has one row per item and one column per user; Y_csr is its transpose
# (one row per user) converted to CSR format.
# Note: users with no transactions and items never sold are not included.
user_ids = df_int['customer_id'].unique()
item_ids = df_int['article_id'].unique()
n_users = df_int['customer_id'].nunique()
n_items = df_int['article_id'].nunique()

# forward lookups (id -> matrix index) and reverse lookups (matrix index -> id);
# indices follow the order in which ids first appear in df_int
item_id_map = {item_id: i for i, item_id in enumerate(item_ids)}
item_id_map_rev = dict(enumerate(item_ids))
user_id_map = {user_id: j for j, user_id in enumerate(user_ids)}
user_id_map_rev = dict(enumerate(user_ids))

# attach the matrix coordinates to every interaction row
df_int['i'] = df_int['article_id'].map(item_id_map)
df_int['j'] = df_int['customer_id'].map(user_id_map)

# sparse item x user matrix holding the interaction counts
Y = coo_matrix(
    (df_int['interactions'], (df_int['i'], df_int['j'])),
    shape=(n_items, n_users),
)
#Y = bm25_weight(Y, K1=100, B=0.8)
Y_csr = Y.T.tocsr()


In [11]:
# Build the utility matrix for the full interaction table.
# Y_full has one row per item and one column per user; Y_full_csr is its
# transpose (one row per user) converted to CSR format.
# Note: users with no transactions and items never sold are not included.
user_ids_full = df_int_full['customer_id'].unique()
item_ids_full = df_int_full['article_id'].unique()
n_users_full = df_int_full['customer_id'].nunique()
n_items_full = df_int_full['article_id'].nunique()

# forward lookups (id -> matrix index) and reverse lookups (matrix index -> id);
# indices follow the order in which ids first appear in df_int_full
item_id_map_full = {item_id: i for i, item_id in enumerate(item_ids_full)}
item_id_map_rev_full = dict(enumerate(item_ids_full))
user_id_map_full = {user_id: j for j, user_id in enumerate(user_ids_full)}
user_id_map_rev_full = dict(enumerate(user_ids_full))

# attach the matrix coordinates to every interaction row
df_int_full['i'] = df_int_full['article_id'].map(item_id_map_full)
df_int_full['j'] = df_int_full['customer_id'].map(user_id_map_full)

# sparse item x user matrix holding the interaction counts
Y_full = coo_matrix(
    (df_int_full['interactions'], (df_int_full['i'], df_int_full['j'])),
    shape=(n_items_full, n_users_full),
)
#Y = bm25_weight(Y, K1=100, B=0.8)
Y_full_csr = Y_full.T.tocsr()


In [12]:
# Share of stored (non-zero) cells in Y, printed in percent.
# The value is nnz / (rows * columns), i.e. a fill ratio, even though the
# printed label calls it sparsity.
n_rows, n_cols = Y.shape
n_total = n_rows * n_cols
n_ratings = Y.nnz
sparsity = n_ratings / n_total
print('Matrix sparsity: ', round(sparsity*100, 2))

Matrix sparsity:  0.01


In [13]:
# Share of stored (non-zero) cells in Y_full, printed in percent.
# The value is nnz / (rows * columns), i.e. a fill ratio, even though the
# printed label calls it sparsity.
n_rows, n_cols = Y_full.shape
n_total = n_rows * n_cols
n_ratings = Y_full.nnz
sparsity = n_ratings / n_total
print('Matrix sparsity: ', round(sparsity*100, 2))

Matrix sparsity:  0.02


In [14]:
# Fit the ALS model on the sampled user x item matrix (Y_csr).
# random_state pins the initialisation of the factor matrices so that a
# Restart & Run All yields comparable recommendations; without it every run
# starts from different random factors.
model = AlternatingLeastSquares(
    factors=1280,
    regularization=0.01,
    num_threads=0,
    iterations=30,
    random_state=42,
)
# NOTE(review): the counts are doubled before fitting — presumably a
# confidence weight for observed interactions; confirm the intended value.
model.fit(2 * Y_csr)

100%|██████████| 30/30 [13:05<00:00, 26.17s/it]


In [15]:
# Fit a second ALS model on the full user x item matrix (Y_full_csr).
# random_state pins the initialisation of the factor matrices so that a
# Restart & Run All yields comparable recommendations; without it every run
# starts from different random factors.
model_full = AlternatingLeastSquares(
    factors=1280,
    regularization=0.01,
    num_threads=0,
    iterations=30,
    random_state=42,
)
# NOTE(review): the counts are doubled before fitting — presumably a
# confidence weight for observed interactions; confirm the intended value.
model_full.fit(2 * Y_full_csr)

100%|██████████| 30/30 [48:56<00:00, 97.90s/it]


In [None]:
#user_id = user_ids[0]
#print(len(user_ids), user_id)
#print(Y_csr.shape, Y_csr[user_id_map[user_id], :])
#ids, scores = model.recommend(user_id_map[user_id], Y_csr[user_id_map[user_id]], N=12, filter_already_liked_items=False)

In [None]:
#items = [item_id_map_rev[idx] for idx in ids]

In [16]:
# Predict for the full batch of sampled users: look up each customer's
# matrix row, then ask the sampled-data model for 12 items per customer
# (items already bought are not filtered out).
user_idx = [user_id_map[customer] for customer in user_ids]
ids, scores = model.recommend(
    user_idx,
    Y_csr[user_idx],
    N=12,
    filter_already_liked_items=False,
)

In [17]:
# Customers present in df_trans but absent from the sampled data (user_ids).
# A boolean mask avoids re-indexing and copying the whole transactions frame
# just to compute a set difference; ids keep their order of first appearance.
user_ids_diff = df_trans.loc[~df_trans['customer_id'].isin(user_ids), 'customer_id'].unique()

In [18]:
# number of customers left to the full model
user_ids_diff.shape[0]

937470

In [19]:
# matrix rows (in the full utility matrix) of the customers outside the sample
user_idx_diff = [user_id_map_full[customer] for customer in user_ids_diff]

In [20]:
# Customers not covered by the "local" (sampled-data) model get their
# 12 recommendations from the full model instead.
ids_diff, scores_diff = model_full.recommend(
    user_idx_diff,
    Y_full_csr[user_idx_diff],
    N=12,
    filter_already_liked_items=False,
)

In [21]:
# Translate the recommended matrix indices back to article ids and join
# each customer's ids into one space-separated string, indexed by customer id.
tmp = pd.Series(
    [' '.join(item_id_map_rev[idx] for idx in row) for row in ids],
    index=user_ids,
)

In [22]:
# Same translation for the full-model recommendations: matrix indices become
# article ids, joined into one space-separated string per customer.
tmp2 = pd.Series(
    [' '.join(item_id_map_rev_full[idx] for idx in row) for row in ids_diff],
    index=user_ids_diff,
)

In [23]:
# stack the sampled-model and full-model prediction strings into one Series
predictions = pd.concat((tmp, tmp2))

In [24]:
# number of distinct customers with an individual prediction
predictions.index.nunique(dropna=True)

1339660

In [25]:
# Build the submission: one row per customer in df_customers, carrying the
# individual recommendation where one exists.
ids_all = np.hstack([user_ids, user_ids_diff])
# sanity check: both numbers match when no customer is listed twice
print(len(ids_all), len(np.unique(ids_all)))
submission = pd.DataFrame({'prediction':predictions}, index=ids_all)
print(submission.index.nunique())
# left join keeps every customer, including those without any prediction
submission = df_customers.join(submission, on='customer_id', how='left').set_index('customer_id')
#submission = pd.concat([df_customers.set_index('customer_id'), submission], axis=1)

# Customers without an individual prediction get the baseline list.
# Only the prediction column is filled: a frame-wide fillna would also write
# the article string into every missing value of the other customer columns.
baseline_prediction = '0706016001 0706016002 0372860001 0610776002 0759871002 0464297007 0372860002 0610776001 0399223001 0706016003 0720125001 0156231001'
submission['prediction'] = submission['prediction'].fillna(baseline_prediction)


1339660 1339660
1339660


In [26]:
# number of distinct customers in the submission frame
submission.index.nunique(dropna=True)

1371980

In [27]:
# write the prediction column (with customer_id as index) to the submission file
submission['prediction'].to_csv('../data/seponly_noweights_lowreg_split-model_test-no-testweeks.csv')

In [None]:
# display results
# Draws a 4x4 grid of article images: the first row shows the article given by
# article_id / article_name, the remaining rows step through similar_items.
# NOTE(review): article_name, article_id, similar_items and item_names are not
# defined in any cell visible above, and this cell has no execution count —
# it presumably relies on state from another session; confirm before running.
fig, axs = plt.subplots(4, 4, figsize=(25, 20))
c = 0  # position of the next entry to take from similar_items
for i in range(4):
    for j in range(4):
        ax = axs[i][j]
        if i == 0:
            # every panel of the first row uses the same article_id / article_name
            ax.set_title(article_name)
            fname = article_id + '.jpg'
            # the image folder is named after the first three characters of the file name
            subdir = fname[0:3]
            full_path = os.path.join('../data', 'images', subdir, fname)
        else:
            # rows 1-3: one entry of similar_items per panel, titled via item_names
            fname = similar_items[c] + '.jpg'
            subdir = fname[0:3]
            full_path = os.path.join('../data', 'images', subdir, fname)
            ax.set_title(item_names[similar_items[c]])
            c += 1
         
        # panels whose image file does not exist get a title but no image
        if os.path.exists(full_path):
            img = mpimg.imread(full_path)
            ax.imshow(img)