In [30]:
import numpy as np
import pandas as pd

from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.preprocessing import normalize
from tqdm.notebook import tqdm
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
user_plays = pd.read_csv('usersha1-artmbid-artname-plays.tsv', sep='\t', header=None)
user_profiles = pd.read_csv('usersha1-profile.tsv', sep='\t', header=None)

In [3]:
user_plays.columns = ['user-mboxsha1', 'musicbrainz-artist-id', 'artist-name', 'plays']
user_profiles.columns = ['user-mboxsha1', 'gender', 'age', 'country', 'signup']

### Keep only Female user for development

In [4]:
# keep only female users
user_profiles = user_profiles[user_profiles['gender']=='f']

In [5]:
user_plays = user_plays[user_plays['user-mboxsha1'].isin(user_profiles['user-mboxsha1'].unique())]

### Sample 10% of data for development
running the following program on all data takes around 30 minutes. Here we take 10% for demonstration/development purpose.

In [6]:
SAMPLE = True
sample_size = 0.1
if SAMPLE:
    user_profiles = user_profiles.sample(frac=sample_size)
    user_plays = user_plays[user_plays['user-mboxsha1'].isin(user_profiles['user-mboxsha1'].unique())]

### Quick EDA

In [7]:
user_plays.describe()

Unnamed: 0,plays
count,409174.0
mean,191.331
std,544.804
min,1.0
25%,32.0
50%,85.0
75%,198.0
max,85629.0


In [8]:
user_plays.describe(include='O')

Unnamed: 0,user-mboxsha1,musicbrainz-artist-id,artist-name
count,409174,403745,409173
unique,8482,39638,45492
top,4ddafccadfb57dfd1096f615ed67816c41512464,cc197bad-dc9c-440d-a5b5-d52ba2e14234,coldplay
freq,99,1992,1992


there are more unique artist-name than musicbrainz-artist-id. Looks like some artist names share the same musicbrainz-artist-id

In [9]:
grouped = user_plays[['musicbrainz-artist-id', 'artist-name']].groupby(['musicbrainz-artist-id']).nunique()
grouped[grouped['artist-name'] > 2].head()

Unnamed: 0_level_0,artist-name
musicbrainz-artist-id,Unnamed: 1_level_1
01726f9b-6e54-48e5-af05-871678ce7d1c,4
01a547e8-4b6f-4734-a1b7-7b4d767900db,3
01e10b31-ae0f-49a6-b4f4-5c3dcff9788a,3
04124bb0-1f13-468c-817d-af9a0e7d3a50,5
0460fa37-b2e9-4607-b295-5654ff51cd35,4


In [10]:
user_plays[user_plays['musicbrainz-artist-id'] == grouped[grouped['artist-name'] > 2].iloc[0].name]['artist-name'].unique()

array(['Руки вверх!', 'Руки Вверх', 'ryki vverh', 'Руки в верх'],
      dtype=object)

Looks like its due to different spelling/punctuation, we should use the id to distinguish between artists

In [11]:
user_plays.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 409174 entries, 4594 to 17534267
Data columns (total 4 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   user-mboxsha1          409174 non-null  object
 1   musicbrainz-artist-id  403745 non-null  object
 2   artist-name            409173 non-null  object
 3   plays                  409174 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 15.6+ MB


In [12]:
user_plays.isna().sum()

user-mboxsha1               0
musicbrainz-artist-id    5429
artist-name                 1
plays                       0
dtype: int64

In [13]:
# simply drop NaNs as the data volumne is quite large (4 million rows compare to ~50,000 rows of NaNs)
user_plays.dropna(inplace=True)

In [14]:
user_plays.shape

(403744, 4)

### Data preprocessing

In [15]:
users = list(np.sort(user_plays['user-mboxsha1'].unique()))
artists = list(user_plays['musicbrainz-artist-id'].unique())
plays = list(user_plays['plays'])

In [16]:
rows = user_plays['user-mboxsha1'].astype(CategoricalDtype(categories=users)).cat.codes
cols = user_plays['musicbrainz-artist-id'].astype(CategoricalDtype(categories=artists)).cat.codes
plays_sparse = csr_matrix((plays, (rows, cols)), shape=(len(users), len(artists)))

Dimensions are very high so here we use csr matrix

In [17]:
users_that_never_listened_to_any_songs = list(set(user_profiles['user-mboxsha1'].unique()) - set(user_plays['user-mboxsha1'].unique()))

In [18]:
# vstack with users that never listened to any songs (we put the average plays vector for these people)
plays_sparse = vstack((plays_sparse , csr_matrix(np.tile(plays_sparse.mean(axis=0), (len(users_that_never_listened_to_any_songs), 1)))))

some users have never listened to any songs, we fill in the average plays vector for these users (Global baseline approach)

In [19]:
user_profiles.drop(['gender', 'signup'], axis=1, inplace=True)

dropping gender and signup date as we are only dealing with female users only and signup date should not be a factor that can help distinguish user preference.

In [20]:
user_profiles.loc[(user_profiles.age>100)|(user_profiles.age<0), 'age'] = user_profiles.age.mean()

There are very large and negative values for age, replace them with the mean, here we assume these are inaccurate values

In [21]:
user_profiles['age'].fillna(user_profiles.age.mean(), inplace=True)

some user has not specified their age, here we fill nan with the mean

In [22]:
processed_user_profiles = pd.merge(user_profiles.drop(['country', 'user-mboxsha1'], axis=1), pd.get_dummies(user_profiles['country']), left_index=True, right_index=True)

Use one-hot encoding for countries. Do not use label encoding as that will give countries order, and also some kind of distance in the feature space.

In [23]:
mms = MinMaxScaler()
processed_user_profiles['age'] = mms.fit_transform(processed_user_profiles['age'].to_numpy().reshape(-1, 1))

In [24]:
# normalise the plays of each user, as some user are frequent users and some aren't.
normalized_plays_sparse = normalize(plays_sparse, norm='l1', axis=1)
X = hstack((normalized_plays_sparse, csr_matrix(processed_user_profiles)))

here we normalize the plays to balance frequent users and non-frequent users, as frequent users usually have much more plays. Normalizating each row will make sure the preference is fair to be compared. The user profile is stacked with the csr matrix to form the input data for our model.

### Nearest Neighbours Model

In [25]:
# n_neighbors can be tuned to consider more neighbors
nbrs = NearestNeighbors(n_neighbors=100, n_jobs=-1).fit(X)
distances, indices = nbrs.kneighbors(X)

In [26]:
indices

array([[   0, 3027, 7111, ..., 1457, 8251, 7100],
       [   1, 5763, 5832, ..., 4840, 4707, 8478],
       [   2, 5896, 4080, ..., 4799, 4221, 8030],
       ...,
       [8490, 1938, 3607, ..., 4049, 8377, 3310],
       [8491, 8486, 3172, ..., 3874, 1365, 4057],
       [8492, 8484, 2962, ..., 5196, 7069, 2247]], dtype=int64)

In [27]:
distances

array([[0.        , 0.2388725 , 0.24023269, ..., 0.28518627, 0.28580433,
        0.28606168],
       [0.        , 0.22131211, 0.22502863, ..., 0.24894273, 0.24894664,
        0.24895929],
       [0.        , 0.22789906, 0.23215735, ..., 0.31854941, 0.32137542,
        0.32252755],
       ...,
       [0.        , 0.12933415, 0.13349742, ..., 0.18911138, 0.18997919,
        0.19022093],
       [0.        , 0.03030303, 0.14351687, ..., 0.17194213, 0.17199341,
        0.17229742],
       [0.        , 0.01010101, 0.13500174, ..., 0.19485232, 0.19523769,
        0.19557231]])

indices is an 2D array where each row is a "neighbourhood". i.e. the nearest neighbors of the item in the 0 index for each row.

distances is another 2D array where each row stores the distance (distance in feature space) from the 0 indexed item in indices to other items

### Predict the normalised plays for each user

In [28]:
artists = np.array(artists)
all_users = users + users_that_never_listened_to_any_songs
max_distance = distances.max()

recommendations = {}
number_of_recommendations = 20
for user, distance in tqdm(zip(indices, distances), total=len(distances)):
    
    # use distances as weights to make closer neighbours to contribute more to the prediction
    weights = 1 - (distance[1:] / max_distance)
    
    # predict the user's plays on all artists
    predicted_user_preference = normalized_plays_sparse[user[1:]].multiply(weights[:, np.newaxis]).mean(axis=0).A[0]
    
    # get artists that the user already listened to
    true_user_preference = plays_sparse[user[0]].A[0]
    artists_already_listened_to = np.where(true_user_preference!=0)

    # change the value of artists already listened to to 0 to prevent recommendation
    predicted_user_preference[artists_already_listened_to] = 0
    
    # get top artists for this user
    recommended_artists = artists[predicted_user_preference.argsort()[-number_of_recommendations:][::-1]]
    
    # save artists for this user
    recommendations[all_users[user[0]]] = list(recommended_artists)

  0%|          | 0/8493 [00:00<?, ?it/s]

### How to recommend artists to users
The generated dictionary contains recommendations for each user. It can then be written to a database that support fast reads (e.g. DynamoDB).
The keys are the user ids and the value are the lists of artist ids.

This can be used for example:
- when a user renders the view of the app/webpage, the app/webserver fetch the user's recommendations from the database and display the artists

In [29]:
print('example values in recommendations')
{k:recommendations[k] for k in list(recommendations.keys())[:5]}

example values in recommendations


{'000fff7e107172b2fdee439636daccf8287b34a7': ['b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d',
  'cc197bad-dc9c-440d-a5b5-d52ba2e14234',
  '9c9f1380-2516-4fc9-a3e6-f9f61941d090',
  '95e1ead9-4d31-4808-a7ac-32c3614c116b',
  '0039c7ae-e1a7-4a7d-9b49-0cbc716821a6',
  'c0b2500e-0cef-4130-869d-732b23ed9df5',
  '6ffb8ea9-2370-44d8-b678-e9237bbd347b',
  'f82f3a3e-29c2-42ca-b589-bc5dc210fa9e',
  '8bfac288-ccc5-448d-9573-c33ea2aa5c30',
  '8c538f11-c141-4588-8ecb-931083524186',
  '83d91898-7763-47d7-b03b-b92132375c47',
  'ff6e677f-91dd-4986-a174-8db0474b1799',
  '73e5e69d-3554-40d8-8516-00cb38737a1c',
  '45a663b5-b1cb-4a91-bff6-2bef7bbfdd76',
  'aa7a2827-f74b-473c-bd79-03d065835cf7',
  '87c5dedd-371d-4a53-9f7f-80522fb7f3cb',
  '39ab1aed-75e0-4140-bd47-540276886b60',
  'f1b525b4-ddd0-4d39-85b2-d8fa26a7f279',
  'a7bdc71f-697a-45d9-92b2-a01fbbe50272',
  'dfe9a7c4-8cf2-47f4-9dcb-d233c2b86ec3'],
 '00210ff162f0b1f2197b2add462ac17b07bf91f6': ['b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d',
  '69ee3720-a7cb-4402-b48d-a0

### Further improvements:
- Currently this method is user to user. There are other ways such as item to item and even hybrid.
- The are better models, such as using factorization machines
- Can try deep models such as DeepFM/VAEs/GNNs, VAEs and GNNs seem to be current state-of-the-art models for recommendation systems