# Collaborative filtering model

In [3]:
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from implicit.nearest_neighbours import bm25_weight
from implicit.als import AlternatingLeastSquares
import os

In [4]:
materials_root = "../materials"
working_root = "../working"

## Load data

**Load `userid-artist-counts.csv`** for training the model.

In [5]:
data = pd.read_csv(os.path.join(working_root, "userid-artist-counts.csv"))
data.head()

Unnamed: 0,userid,artist_mbids,count
0,1,000fc734-b7e1-4a01-92d1-f544261b43f5,1
1,1,00eb9d25-0465-49e3-8e7a-3eacbd9ebb0d,3
2,1,012b5f71-b079-48f1-892e-af645c6576cb,1
3,1,01ce0542-1e29-4ab6-bffe-ba0e2fb61ce8,2
4,1,0383dadf-2a4e-4d10-a46a-e9e041da8eb3,1


**Load `musicbrainz_artist.csv` for `artist_mbids` to `name` mapping.**

In [6]:
musicbrainz_artist = pd.read_csv(os.path.join(materials_root, "musicbrainz_artist.csv"))
musicbrainz_artist.set_index("artist_mbid", inplace=True)
musicbrainz_artist.head()

Unnamed: 0_level_0,name
artist_mbid,Unnamed: 1_level_1
fadeb38c-833f-40bc-9d8c-a6383b38b1be,Доктор Сатана
49add228-eac5-4de8-836c-d75cde7369c3,Pete Moutso
165a49a0-2b3b-4078-a3c1-905afdc07c0a,Babyglock
7b4a548e-a01a-49b7-82e7-b49efeb9732c,Aric Leavitt
60aca66f-e91a-4cb5-9308-b6e293cd833e,Fonograff


**Process the date for training**

In [7]:
users = data["userid"].astype("category")
artists = data["artist_mbids"].astype("category")
plays = coo_matrix(
        (data["count"].astype(np.float32),
        (
            users.cat.codes.copy(),
            artists.cat.codes.copy()
        ),
    )
).tocsr()

artist_user_plays = bm25_weight(plays, K1=100, B=0.8)
user_plays = artist_user_plays.T.tocsr()

## Model training

In [8]:
model = AlternatingLeastSquares(factors=64, regularization=0.05, alpha=2.0)
model.fit(artist_user_plays.tocsr())

100%|███████████████████████████████████████████████████████████████████████████████████| 15/15 [00:03<00:00,  4.27it/s]


## Generate recommendations

**Recommendation example 1**

In [9]:
userid = 123
ids, scores = model.recommend(userid, user_plays[userid], N=10)

In [10]:
print(f'Recommendations for user: {userid}')
recommendation = pd.DataFrame({"artist": musicbrainz_artist.loc[artists[ids]]['name'].tolist(), "score": scores})
recommendation.reset_index(drop=True, inplace=True)
recommendation

Recommendations for user: 123


Unnamed: 0,artist,score
0,Siouxsie Sioux,1.227083
1,John Butler Trio,1.185999
2,The Outcasts,1.082013
3,Camouflage,1.060227
4,Sharon Van Etten,1.058135
5,＊(Asterisk),1.02106
6,William Orbit,1.019111
7,KAUAN,1.012212
8,Salut c’est cool,1.009765
9,Paul Johnson,0.998471


**Recommendation example 2**

In [11]:
userid = 1234
ids, scores = model.recommend(userid, user_plays[userid], N=10)

In [12]:
print(f'Recommendations for user: {userid}')
recommendation = pd.DataFrame({"artist": musicbrainz_artist.loc[artists[ids]]['name'].tolist(), "score": scores})
recommendation.reset_index(drop=True, inplace=True)
recommendation

Recommendations for user: 1234


Unnamed: 0,artist,score
0,Jean‐Michel Blais,1.203481
1,Happy Go Lumpy's Bass & Friendship Club,1.173286
2,Wizzard,1.138032
3,New Order,1.135313
4,Godspeed You! Black Emperor,1.125593
5,Terry Riley,1.116322
6,Yung Joc,1.111145
7,PVRIS,1.109415
8,Styx,1.089745
9,Aquilo,1.087665


## Find similar artists

**Similar artists example 1**

In [13]:
artistid = "03f93de6-6d62-4710-bcc7-9b3d7c3d95f5"
artistname = musicbrainz_artist.loc[artistid]['name']
artist_idx = artists.index[artists == artistid].tolist()[0]

In [14]:
ids, scores = model.similar_items(artist_idx, N=10)

In [15]:
print(f'Artists similar to {artistname}')
similar_artists = pd.DataFrame({"artist": musicbrainz_artist.loc[artists[ids]]['name'].tolist(), "score": scores})
similar_artists.reset_index(drop=True, inplace=True)
similar_artists

Artists similar to Z‐Trip


Unnamed: 0,artist,score
0,Z‐Trip,1.0
1,近藤浩治,0.999161
2,Rupie Edwards,0.997485
3,Roxy Music,0.995935
4,Van Halen,0.992427
5,Tantric,0.967676
6,AlgoRythmiK,0.965064
7,Irène Drésel,0.964272
8,Iron & Wine,0.963777
9,Within Temptation,0.96013


## Tests

**Find similar artists to A. R. Rahman `MBID:e0bba708-bdd3-478d-84ea-c706413bedab`**

<img src='../scratch/AR_Rahman_At_The_‘Marvel_Anthem’_Launch_(3x4_cropped).jpg' width=200>

In [16]:
artistid = "e0bba708-bdd3-478d-84ea-c706413bedab"
artistname = musicbrainz_artist.loc[artistid]['name']
artist_idx = artists.index[artists == artistid].tolist()[0]

In [17]:
ids, scores = model.similar_items(artist_idx, N=10)

In [18]:
print(f'Artists similar to {artistname}')
similar_artists = pd.DataFrame({"artist": musicbrainz_artist.loc[artists[ids]]['name'].tolist(), "score": scores})
similar_artists.reset_index(drop=True, inplace=True)
similar_artists

Artists similar to A. R. Rahman


Unnamed: 0,artist,score
0,A. R. Rahman,1.0
1,Limp Bizkit,0.833495
2,Mokhov,0.833336
3,Lustre,0.831047
4,The Merry Thoughts,0.831047
5,JMSN,0.831047
6,Faith No More,0.831047
7,Giraffage,0.831047
8,Katy Carr,0.831047
9,Low Roar,0.831047


**Find similar artists to Goran Bregović `MBID:883ece23-2779-4091-b527-62eb07ee79d4`**

<img src='../scratch/20220710-Rudolstadt-Festival-2022-Goran-Bregovic-7849_(cropped).jpg' width=200>

In [19]:
artistid = "883ece23-2779-4091-b527-62eb07ee79d4"
artistname = musicbrainz_artist.loc[artistid]['name']
artist_idx = artists.index[artists == artistid].tolist()[0]

In [20]:
ids, scores = model.similar_items(artist_idx, N=10)

In [21]:
print(f'Artists similar to {artistname}')
similar_artists = pd.DataFrame({"artist": musicbrainz_artist.loc[artists[ids]]['name'].tolist(), "score": scores})
similar_artists.reset_index(drop=True, inplace=True)
similar_artists

Artists similar to Goran Bregović


Unnamed: 0,artist,score
0,Goran Bregović,1.0
1,T. Rex,0.974756
2,The Mavericks,0.974744
3,Technohead,0.974744
4,Julee Cruise,0.974744
5,DNCE,0.974744
6,Marshall Jefferson,0.974744
7,Pomplamoose,0.974744
8,Waldeck,0.974744
9,Alejandro Escovedo,0.97377


**Find similar artists to Linkin park `MBID:f59c5520-5f46-4d2c-b2c4-822eabf53419`**

<img src='../scratch/1920px-LinkinParkBerlin2010.jpg' width=400>

In [22]:
artistid = "f59c5520-5f46-4d2c-b2c4-822eabf53419"
artistname = musicbrainz_artist.loc[artistid]['name']
artist_idx = artists.index[artists == artistid].tolist()[0]

In [23]:
ids, scores = model.similar_items(artist_idx, N=10)

In [24]:
print(f'Artists similar to {artistname}')
similar_artists = pd.DataFrame({"artist": musicbrainz_artist.loc[artists[ids]]['name'].tolist(), "score": scores})
similar_artists.reset_index(drop=True, inplace=True)
similar_artists

Artists similar to Linkin Park


Unnamed: 0,artist,score
0,Linkin Park,1.0
1,Wau Wau Collectif,0.919157
2,Kid Cudi,0.91299
3,Stone Sour,0.91299
4,Ray Charles,0.873306
5,Elwood,0.664842
6,Jonathan Groff,0.599611
7,Lawrence,0.587182
8,Gorgon,0.565327
9,London Grammar,0.554062


## Database properties

### Number of `userid`

In [30]:
num_users = data["userid"].unique().shape[0]
print(f'There are {num_users} in the database used to train the model')

There are 10826 in the database used to train the model


### Most popular `artists`

In [57]:
popular_artists = pd.DataFrame({"artist_mbids": data['artist_mbids'].tolist()})
popular_artists = popular_artists.groupby(["artist_mbids"]).size().reset_index(name="count")
popular_artists = popular_artists.sort_values(by='count', ascending=False).head(10)
popular_artists["artists"] = musicbrainz_artist.loc[popular_artists["artist_mbids"]]['name'].tolist()
popular_artists = popular_artists.drop("artist_mbids", axis=1)
popular_artists = popular_artists.drop("count", axis=1)
popular_artists.reset_index(drop=True, inplace=True)
popular_artists

Unnamed: 0,artists
0,Linkin Park
1,Radiohead
2,Coldplay
3,Daft Punk
4,Billie Eilish
5,Green Day
6,Kendrick Lamar
7,Nirvana
8,Gorillaz
9,The Cure
