https://pub.towardsai.net/recommendation-system-in-depth-tutorial-with-python-for-netflix-using-collaborative-filtering-533ff8a0e444

https://realpython.com/build-recommendation-engine-collaborative-filtering/

https://heartbeat.fritz.ai/recommender-systems-with-python-part-ii-collaborative-filtering-k-nearest-neighbors-algorithm-c8dcd5fd89b2

https://github.com/benfred/implicit

In [None]:
import pandas as pd
import numpy as np

import plotly.express as px
from scipy import sparse

try:
  import implicit
  from implicit.evaluation import ranking_metrics_at_k, train_test_split
except:
  !pip install implicit
  import implicit
  from implicit.evaluation import ranking_metrics_at_k, train_test_split

In [None]:
data = pd.read_csv('https://raw.githubusercontent.com/joaopfonseca/business-cases/master/BC4_recommendation_system/retail.csv')

In [None]:
data = data.dropna()
data = data[['CustomerID', 'StockCode', 'Quantity']]

In [None]:
temp = data.drop_duplicates()

temp = temp.groupby(['CustomerID', 'StockCode']).sum().reset_index()
temp['CustomerID'] = temp['CustomerID'].astype('int').astype('category')
temp['StockCode'] = temp['StockCode'].astype('category')

visitor_cat = dict(zip(temp['CustomerID'], temp['CustomerID'].cat.codes))
item_cat = dict(zip(temp['StockCode'], temp['StockCode'].cat.codes))

inv_visitor_cat = {v: k for k, v in visitor_cat.items()}
inv_item_cat = {v: k for k, v in item_cat.items()}

temp['CustomerID'] = temp['CustomerID'].cat.codes
temp['StockCode'] = temp['StockCode'].cat.codes

temp

Unnamed: 0,CustomerID,StockCode,Quantity
0,0,2001,0
1,1,25,24
2,1,87,36
3,1,130,6
4,1,167,10
...,...,...,...
267610,4371,3087,4
267611,4371,3190,48
267612,4371,3191,120
267613,4371,3193,48


In [None]:
visitor_item_data = sparse.csc_matrix((temp['Quantity'], (temp['CustomerID'], temp['StockCode'])))
item_visitor_data = sparse.csc_matrix((temp['Quantity'], (temp['StockCode'], temp['CustomerID'])))

In [None]:
bpr_model = implicit.bpr.BayesianPersonalizedRanking(factors=200, use_gpu=False)
bpr_model.fit(item_visitor_data)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




In [None]:
visitor_id = list(inv_visitor_cat.keys())[0]
recommendations = bpr_model.recommend(visitor_id, visitor_item_data)
[(inv_item_cat[itemid], value) for itemid,value in recommendations]

[('22961', 1.3660692),
 ('22960', 1.3415065),
 ('22720', 1.3139504),
 ('23165', 1.1852032),
 ('22993', 1.119389),
 ('23166', 1.1050706),
 ('22722', 1.0812507),
 ('23245', 1.0471076),
 ('22666', 1.0451943),
 ('22423', 1.0017123)]

In [None]:
lmf_model = implicit.lmf.LogisticMatrixFactorization(factors=200, use_gpu=False, iterations = 50)
lmf_model.fit(item_visitor_data)

100%|██████████| 50/50 [00:44<00:00,  1.13it/s]


In [None]:
visitor_id = list(visitor_cat.keys())[3]
recommendations = lmf_model.recommend(visitor_cat[visitor_id], visitor_item_data)
[(inv_item_cat[itemid], value) for itemid,value in recommendations]

[('21232', 1593.7103),
 ('84826', 295.27094),
 ('M', 208.86496),
 ('22553', 184.48424),
 ('22557', 177.51266),
 ('22556', 164.24747),
 ('21983', 153.07788),
 ('D', 151.14673),
 ('21980', 149.49841),
 ('21484', 132.21579)]

In [None]:
ranking_metrics_at_k(lmf_model, *train_test_split(visitor_item_data, 0.8), 20)

HBox(children=(FloatProgress(value=0.0, max=4372.0), HTML(value='')))




{'auc': 0.5923336822929973,
 'map': 0.04697532439851182,
 'ndcg': 0.1201930958947562,
 'precision': 0.1175946661237785}

In [None]:
ranking_metrics_at_k(bpr_model, *train_test_split(visitor_item_data, 0.8), 20)

HBox(children=(FloatProgress(value=0.0, max=4372.0), HTML(value='')))




{'auc': 0.6755501153091117,
 'map': 0.19094192993301098,
 'ndcg': 0.32727522402072556,
 'precision': 0.32829490244466725}

In [None]:
def recommend(visitor_id):
  if visitor_id in visitor_cat.keys():
    recommendations = bpr_model.recommend(visitor_cat[visitor_id], visitor_item_data)
    return [inv_item_cat[itemid] for itemid,_ in recommendations]
  else:
    # cold start solution
    data = pd.read_csv('https://raw.githubusercontent.com/joaopfonseca/business-cases/master/BC4_recommendation_system/retail.csv')
    data = data.dropna()
    data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
    data = data.sort_values('InvoiceDate')
    recommendations = data.tail(100).sort_values('Quantity').tail(10)['StockCode'].values.tolist()
    return recommendations

In [None]:
recommend(data.sample(1)['CustomerID'].astype(int).values[0])

['23225',
 '23223',
 '22835',
 '21811',
 '23217',
 '22112',
 '23355',
 '22696',
 '22595',
 '84755']

In [None]:
recommend('Unknown user')

['22704',
 '23350',
 '84692',
 '23343',
 '23199',
 '85038',
 '23581',
 '20725',
 '85038',
 '20832']