In [1]:
import numpy as np
import pandas as pd
from implicit.datasets.lastfm import get_lastfm
from implicit.nearest_neighbours import bm25_weight
from implicit.cpu.bpr import BayesianPersonalizedRanking

  from .autonotebook import tqdm as notebook_tqdm


## Experiment with the given example dataset

In [2]:
# Fetch the Last.fm example dataset: artist labels, user labels, and the sparse
# artist-by-user play matrix (item rows, user columns).
artists, users, artist_user_plays = get_lastfm()

In [3]:
# Inspect the container type of the interaction matrix (a scipy CSR matrix per the saved output).
type(artist_user_plays)

scipy.sparse._csr.csr_matrix

In [4]:
# Apply BM25 weighting to the raw play counts, both to reduce the impact of users
# that have played the same artist thousands of times and to reduce the weight
# given to popular items.
# https://benfred.github.io/implicit/tutorial_lastfm.html#:~:text=The%20first%20step,classic%20information%20retrieval%3A
# https://en.wikipedia.org/wiki/Okapi_BM25
#
# The weighted matrix gets its own name instead of overwriting `artist_user_plays`,
# so re-running this cell never re-weights data that was already weighted.
artist_user_plays_bm25 = bm25_weight(artist_user_plays, K1=100, B=0.8)

# Transpose, since most of the functions in implicit expect (user, item) sparse
# matrices instead of (item, user).
user_plays = artist_user_plays_bm25.T.tocsr()

In [6]:
# Train a BPR model on the (user, item) play matrix.
model = BayesianPersonalizedRanking(
    factors=64,
    regularization=0.01,
    learning_rate=0.1,
    iterations=15,
    num_threads=0,
)
model.fit(user_plays)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [09:12<00:00, 36.86s/it, train_auc=95.25%, skipped=2.18%]


In [7]:
# Get recommendations for a single user.
userid = 12345
ids, scores = model.recommend(userid, user_plays[userid], N=10, filter_already_liked_items=False)

# Show the top-N artists and flag those the user has already played.
# np.isin replaces np.in1d, which NumPy has deprecated; for 1-D `ids` the result is the same.
pd.DataFrame(
    {
        "artist": artists[ids],
        "score": scores,
        "already_liked": np.isin(ids, user_plays[userid].indices),
    }
)

Unnamed: 0,artist,score,already_liked
0,laibach,4.972835,False
1,mortiis,4.710769,True
2,sopor aeternus & the ensemble of shadows,4.707899,False
3,the sisters of mercy,4.629304,False
4,fields of the nephilim,4.560995,False
5,the red army choir,4.531089,False
6,za frûmi,4.49164,False
7,coptic rain,4.488866,False
8,christian death,4.481449,False
9,welle:erdball,4.477931,False


In [8]:
# Get related items for the beatles (itemid = 252512).
ids, scores = model.similar_items(252512)

# Display the results using pandas for nicer formatting.
pd.DataFrame({"artist": artists[ids], "score": scores})

Unnamed: 0,artist,score
0,the beatles,1.0
1,paul mccartney,0.79347
2,the beach boys,0.756167
3,ahoud banai,0.755275
4,the british blues quintet,0.748916
5,orville hammond trio,0.742344
6,movie cast,0.738473
7,lovin spoonful,0.735691
8,aljazeeraenglish,0.7353
9,byron t,0.734515


In [None]:
# Batch mode: request recommendations for user rows 0..999 in a single call,
# then display the returned id array together with its shape.
userids = np.arange(1000)
ids, scores = model.recommend(
    userids,
    user_plays[userids],
)
(ids, ids.shape)

## Experiment with the Amazon Beauty dataset

In [1]:
import numpy as np
import pandas as pd
from implicit.datasets.lastfm import get_lastfm
from implicit.nearest_neighbours import bm25_weight
from implicit.cpu.bpr import BayesianPersonalizedRanking
from utils import pandas_df_to_csr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw Amazon Beauty ratings CSV (path is relative to the notebook's working directory).
amazon_beauty_df = pd.read_csv("ratings_Beauty.csv")

In [3]:
# Build the sparse interaction matrix plus the user/item lookup tables.
# pandas_df_to_csr comes from the local `utils` module, which is not shown in this notebook.
user_map, item_map, amazon_beauty_csr = pandas_df_to_csr(amazon_beauty_df)

In [4]:
# BM25-weight the interactions with the same K1/B values used for the Last.fm experiment.
# NOTE(review): despite the `_csr_` in the name, the result here appears to be a COO matrix
# (the pre-processing section states this, and the following cell converts it with .tocsr()).
amazon_beauty_csr_bm25 = bm25_weight(amazon_beauty_csr, K1=100, B=0.8)

In [5]:
# Convert to CSR so that individual user rows can be sliced in the recommendation cells.
amazon_beauty_csr_bm25 = amazon_beauty_csr_bm25.tocsr()

In [6]:
# Train a BPR model on the BM25-weighted Amazon Beauty matrix.
model = BayesianPersonalizedRanking(
    factors=64,
    regularization=0.01,
    learning_rate=0.1,
    iterations=15,
    num_threads=0,
)
model.fit(amazon_beauty_csr_bm25)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:45<00:00,  3.04s/it, train_auc=91.64%, skipped=0.03%]


In [7]:
# Top-10 recommendations for one user row; already-seen items are not filtered out
# so that they can be flagged in the results table.
userid = 725046
ids, scores = model.recommend(
    userid,
    amazon_beauty_csr_bm25[userid],
    N=10,
    filter_already_liked_items=False,
)

In [8]:
# Map the recommended item indices back to product ids and flag items the user already has.
# np.isin replaces the deprecated np.in1d, and a single .loc[rows, column] lookup replaces
# the chained .loc[...][...] indexing.
pd.DataFrame(
    {
        "ProductId": item_map.loc[ids, "ProductId"],
        "score": scores,
        "already_purchased": np.isin(ids, amazon_beauty_csr_bm25[userid].indices),
    }
)

Unnamed: 0_level_0,ProductId,score,already_purchased
ItemIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
81854,B002OVV7F0,2.696701,True
62199,B001H928KI,2.067228,False
64175,B001KYPZRS,2.022594,False
98082,B003HFSZ3Y,1.952995,False
68981,B001R66FFA,1.947325,False
68342,B001Q88FO8,1.925719,False
113434,B0047PPO0U,1.914744,False
203415,B00B1ZRTP2,1.899756,False
98092,B003HG4VLS,1.854516,False
112906,B0046VEREY,1.842274,False


In [13]:
# Look up the product ids for the current `ids`.
# NOTE(review): this cell was executed as In [13], out of order; the ids in its saved output
# differ from the table shown for the previous cell, so `ids` had changed in between.
item_map.loc[ids]["ProductId"]

ItemIndex
114276    B0049WJA9C
65766     B001MA0QY2
181492    B008O4YM4Y
10253     B0009PVV40
103349    B003S516XO
174368    B0085WHBHU
75617     B002B9DWBC
89252     B00325D0WK
73976     B0027A7CLG
181069    B008MP481M
Name: ProductId, dtype: object

## End of Experiments

In [1]:
import gc
import itertools

import numpy as np
import pandas as pd
from implicit import evaluation
from implicit.cpu.bpr import BayesianPersonalizedRanking
from implicit.datasets.lastfm import get_lastfm
from implicit.nearest_neighbours import bm25_weight

from utils import pandas_df_to_csr

  from .autonotebook import tqdm as notebook_tqdm


## Pre-processing

In [2]:
# Load the raw Amazon Beauty ratings CSV (path is relative to the notebook's working directory).
amazon_beauty_df = pd.read_csv("ratings_Beauty.csv")

In [3]:
# Convert the pandas DataFrame to CSR format.
# user_map and item_map contain the mappings from CSR indices back to the original ids
# for users and items respectively.
user_map, item_map, amazon_beauty_csr = pandas_df_to_csr(amazon_beauty_df)

In [4]:
# Weight the matrix, both to reduce the impact of users that have purchased the same item
# thousands of times and to reduce the weight given to popular items.
# Output is a COO matrix.
amazon_beauty_coo_bm25 = bm25_weight(amazon_beauty_csr, K1=100, B=0.8)

### Train-test split

In [6]:
# Split the weighted interactions 80/20 into train and test, with a fixed random_state,
# then report how many stored entries landed in each part.
train_coo, test_coo = evaluation.train_test_split(
    amazon_beauty_coo_bm25,
    train_percentage=0.8,
    random_state=55,
)
print(f"Train size: {train_coo.size} \n Test size: {test_coo.size}")

Train size: 1618938 
 Test size: 404132


In [7]:
# Both halves of the split are converted from COO to CSR before training and evaluation.
train_csr, test_csr = train_coo.tocsr(), test_coo.tocsr()

## Training

In [8]:
# Train the baseline BPR model on the training split.
# random_state is pinned (same seed as the train/test split) so that the reported metrics
# can be reproduced as far as the library allows; previously training was unseeded.
model = BayesianPersonalizedRanking(
    factors=64,
    regularization=0.01,
    learning_rate=0.1,
    iterations=15,
    num_threads=0,
    random_state=55,
)
model.fit(train_csr)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:39<00:00,  2.62s/it, train_auc=89.26%, skipped=0.03%]


## Evaluation

In [9]:
# Score the trained model against the held-out interactions with the ranking metrics at K=10,
# and display the resulting dictionary.
ranking_metrics_at_10 = evaluation.ranking_metrics_at_k(
    model,
    train_csr,
    test_csr,
    K=10,
    show_progress=True,
    num_threads=0,
)
ranking_metrics_at_10

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 330846/330846 [00:59<00:00, 5559.59it/s]


{'precision': 0.0057939534640418676,
 'map': 0.0023882916029540897,
 'ndcg': 0.0030764243049914842,
 'auc': 0.5021825331755935}

## Hyper-parameter Tuning

In [37]:
# Hyper-parameter grid: each list holds the candidate values for one BPR setting.
# The full grid is 4 * 5 * 6 = 120 combinations.
# NOTE(review): the saved tuning output shows individual runs taking hours; consider
# trimming the grid before a full re-run.
latent_factors = [32, 64, 128, 256]
regularization = [0.001, 0.005, 0.01, 0.05, 0.1]
learning_rate = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0]

In [None]:
# Exhaustive grid search: train one BPR model per hyper-parameter combination and
# record its ranking metrics at K=10 as one tuple in `results`.
results = []

# itertools.product walks the grid in the same order as three nested loops would
# (latent_factors outermost, learning_rate innermost).
for factors, reg, lr in itertools.product(latent_factors, regularization, learning_rate):
    print("Training model with below parameter values --------")
    print(f"latent_factors: {factors}, regularization: {reg}, learning_rate: {lr}")

    # The same seed is used for every configuration so that metric differences come from
    # the hyper-parameters rather than from a different random initialisation.
    model = BayesianPersonalizedRanking(
        factors=factors,
        regularization=reg,
        learning_rate=lr,
        iterations=15,
        num_threads=0,
        random_state=55,
    )
    model.fit(train_csr)

    ranking_metrics_at_10 = evaluation.ranking_metrics_at_k(
        model, train_csr, test_csr, K=10, show_progress=True, num_threads=0
    )
    print("Evaluation results: \n", ranking_metrics_at_10)

    # Tuple order: factors, regularization, learning rate, precision, map, ndcg, auc.
    results.append(
        (
            factors,
            reg,
            lr,
            ranking_metrics_at_10["precision"],
            ranking_metrics_at_10["map"],
            ranking_metrics_at_10["ndcg"],
            ranking_metrics_at_10["auc"],
        )
    )

    # Drop the fitted model and force a collection before the next run.
    del model
    gc.collect()

Training model with below parameter values --------
latent_factors: 32, regularization: 0.001, alpha: 0.5


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [5:11:20<00:00, 1245.35s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 330846/330846 [00:55<00:00, 5932.94it/s]


Evaluation results: 
 {'precision': 0.01006593548933499, 'map': 0.004695287763652912, 'ndcg': 0.006078225427714033, 'auc': 0.504619818152965}
Training model with below parameter values --------
latent_factors: 32, regularization: 0.001, alpha: 1.0


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [12:14:54<00:00, 2939.63s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 330846/330846 [00:54<00:00, 6070.11it/s]


Evaluation results: 
 {'precision': 0.011246646556383936, 'map': 0.005015572210796884, 'ndcg': 0.006602003745638734, 'auc': 0.505113884888769}
Training model with below parameter values --------
latent_factors: 32, regularization: 0.001, alpha: 1.5


 93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 14/15 [1:06:56<04:27, 267.77s/it]

In [None]:
# Collect the grid-search tuples into a table, one row per trained configuration,
# and display it.
results_df = pd.DataFrame(
    results,
    columns=[
        "latent_factors",
        "regularization",
        "learning_rate",
        "precision",
        "map",
        "ndcg",
        "auc",
    ],
)
results_df