In [2]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit

# Load the data
# NOTE(review): read_table treats the first line of the file as a header by
# default. If this TSV ships without a header row, that first record is lost
# here (header=None would keep it) - confirm against the file.
raw_data = pd.read_table('data/usersha1-artmbid-artname-plays.tsv')
# Discard the second column by position, then give the three remaining
# columns readable names.
raw_data = raw_data.drop(columns=raw_data.columns[1])
raw_data.columns = ['user', 'artist', 'plays']

In [3]:
# Preview the first rows to check the renamed user / artist / plays columns.
raw_data.head()

Unnamed: 0,user,artist,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
1,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
2,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
3,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706
4,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691


In [5]:
# Restrict the analysis to the first 2,000,000 rows (positional slice).
raw_data1 = raw_data.iloc[:2000000]

In [10]:
# Report how many distinct users and artists the subset contains.
print(f'Total number of users in the data: {raw_data1["user"].nunique()}')
print(f'Total number of artists in the data: {raw_data1["artist"].nunique()}')

Total number of users in the data: 40913
Total number of artists in the data: 110820


In [11]:
# Drop rows with any missing field and keep an independent copy, so the
# column assignments made on `data` later never touch raw_data1.
data = raw_data1.dropna().copy()

In [15]:
# Encode users and artists as categoricals; their integer category codes
# become the user_id / artist_id columns.
data = data.astype({'user': 'category', 'artist': 'category'})
data['user_id'] = data['user'].cat.codes
data['artist_id'] = data['artist'].cat.codes

In [17]:
# Check that the integer user_id / artist_id columns sit next to the original labels.
data.head(2)

Unnamed: 0,user,artist,plays,user_id,artist_id
0,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,0,30264
1,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897,0,66512


In [18]:
# Play counts as floats, stored twice: once with artists as rows and users as
# columns, once with users as rows and artists as columns.
play_counts = data['plays'].astype(float).values
artist_codes = data['artist_id'].values
user_codes = data['user_id'].values
sparse_item_user = sparse.csr_matrix((play_counts, (artist_codes, user_codes)))
sparse_user_item = sparse.csr_matrix((play_counts, (user_codes, artist_codes)))

In [19]:
# Show the artist-by-user matrix summary (shape, dtype, stored elements).
sparse_item_user

<110820x40913 sparse matrix of type '<class 'numpy.float64'>'
	with 1999967 stored elements in Compressed Sparse Row format>

In [20]:
# Show the user-by-artist matrix summary (shape, dtype, stored elements).
sparse_user_item

<40913x110820 sparse matrix of type '<class 'numpy.float64'>'
	with 1999967 stored elements in Compressed Sparse Row format>

In [22]:
# Sparsity: percentage of user/artist cells that hold no interaction.
num_users, num_artists = sparse_user_item.shape
matrix_size = num_users * num_artists  # Number of possible interactions
num_purchases = sparse_user_item.nonzero()[0].size  # Number of non-zero interactions
sparsity = 100 * (1 - num_purchases / matrix_size)
sparsity

99.95588936009682

In [23]:
import random

def make_train(ratings, pct_test = 0.2, seed = 0):
    '''
    Split an interaction matrix into a masked training set and a binarized test set.

    A random fraction of the non-zero entries of `ratings` is set to zero in the
    training copy. The test copy keeps every original interaction, stored as 1.

    parameters:

    - ratings: scipy sparse matrix of interactions (CSR in this notebook); it is
      copied, never modified

    - pct_test: fraction (between 0 and 1) of the non-zero entries to hide from the
      training set. The count is rounded UP, so any positive fraction hides at
      least one entry

    - seed: seed for a private random generator; the same seed gives the same split.
      The global `random` state is left untouched

    returns:

    - training_set: copy of `ratings` with the sampled entries removed

    - test_set: copy of `ratings` with every non-zero entry replaced by 1

    - sorted list of the distinct column indices that had at least one entry hidden

    raises:

    - ValueError: if pct_test is outside [0, 1]
    '''
    if not 0 <= pct_test <= 1:
        raise ValueError(f'pct_test must be between 0 and 1, got {pct_test!r}')

    # Binary preference matrix: drop explicitly stored zeros, then overwrite the
    # stored values directly (cheaper than a boolean-mask assignment).
    test_set = ratings.copy()
    test_set.eliminate_zeros()
    test_set.data[:] = 1

    training_set = ratings.copy() # Copy of the original data we can alter as our training set

    # Row and column index of every existing interaction, position-aligned
    row_inds, col_inds = training_set.nonzero()

    # Private generator: reproducible without reseeding the global `random` module
    rng = random.Random(seed)

    num_samples = int(np.ceil(pct_test * len(row_inds))) # Round the sample count up
    # Sample positions (without replacement) instead of building a list of index pairs
    picked = rng.sample(range(len(row_inds)), num_samples)

    item_inds = row_inds[picked] # Row indices of the hidden entries
    user_inds = col_inds[picked] # Column indices of the hidden entries

    if picked: # Skip the assignment when there is nothing to hide
        training_set[item_inds, user_inds] = 0 # Assign all of the randomly chosen pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space

    return training_set, test_set, np.unique(user_inds).tolist()

In [24]:
# Hide 5% of the non-zero entries of the artist-by-user matrix for evaluation.
# product_users_altered lists the user columns that lost at least one entry.
product_train, product_test, product_users_altered = make_train(sparse_item_user, pct_test = 0.05)

In [26]:
# Confidence weighting: scale the training play counts by alpha before fitting
# (alpha corresponds to the confidence metric discussed earlier).
# An alpha_val of 15 was kept after trying several different alpha values.
alpha_val = 15
data_conf = (product_train * alpha_val).astype('double')

# ALS model with 20 latent factors, regularization 0.1 and 40 iterations
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=40)

# Fit the model on the confidence-weighted training matrix
model.fit(data_conf)



  0%|          | 0/40 [00:00<?, ?it/s]

In [27]:
# Pull the learned latent-factor matrices out of the fitted model.
item_vecs, user_vecs = model.item_factors, model.user_factors

In [31]:
# Sanity check: each factor matrix should have one row per artist / user and
# one column per latent factor requested from the model.
print(f'Shape of the artist vector matrix: {item_vecs.shape}')
print(f'Shape of the user vector matrix: {user_vecs.shape}')

Shape of the artist vector matrix: (110820, 20)
Shape of the user vector matrix: (40913, 20)


In [32]:
from sklearn import metrics
import matplotlib.pylab as plt
def auc_score(predictions, test):
    '''
    Compute the area under the ROC curve with sklearn's metrics module.

    parameters:

    - predictions: the predicted scores

    - test: the actual target values the scores are compared against

    returns:

    - AUC (area under the Receiver Operating Characteristic curve)
    '''
    # roc_curve also returns the thresholds; only the two rates are needed here
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test, predictions)
    roc_area = metrics.auc(false_pos_rate, true_pos_rate)
    return roc_area

In [33]:
def calc_mean_auc(training_set, altered_users, predictions, test_set):
    '''
    Mean per-user AUC of the factor model, alongside a popularity baseline.

    For every user in `altered_users`, only the items with no interaction in the
    training set are scored, and the scores are compared with the binarized
    interactions in the test set.

    parameters:

    - training_set: item-by-user sparse matrix with some interactions hidden

    - altered_users: column indices of the users to evaluate

    - predictions: pair [user factors, item factors]; row `user` of the first is
      multiplied by the second, and the product must support .toarray()
      (sparse matrices, as passed by this notebook)

    - test_set: item-by-user sparse matrix of the original binarized interactions

    returns:

    - (mean model AUC, mean popularity AUC), each rounded to three decimals
    '''
    user_factors, item_factors = predictions[0], predictions[1]

    # Row sums of the test set give the number of interactions per item (popularity)
    item_popularity = np.array(test_set.sum(axis = 1)).reshape(-1)

    model_aucs = [] # AUC of the factor model, one entry per evaluated user
    baseline_aucs = [] # AUC of the popularity baseline, one entry per evaluated user

    for user in altered_users:
        # Items this user has no interaction with in the training set
        seen = training_set[:, user].toarray().reshape(-1)
        unseen = np.where(seen == 0)[0]

        # Predicted scores for those items from the user/item vectors
        scores = user_factors[user, :].dot(item_factors).toarray().reshape(-1)[unseen]

        # Binarized yes/no interactions from the full data for the same items
        truth = test_set[:, user].toarray().reshape(-1)[unseen]

        # Popularity of the same items, used as the baseline score
        popularity = item_popularity[unseen]

        model_aucs.append(auc_score(scores, truth))
        baseline_aucs.append(auc_score(popularity, truth))

    mean_model_auc = float('%.3f' % np.mean(model_aucs))
    mean_baseline_auc = float('%.3f' % np.mean(baseline_aucs))
    return mean_model_auc, mean_baseline_auc

In [34]:
# Evaluate on the users who had interactions hidden. The factor matrices are
# wrapped as sparse matrices (item factors transposed) because calc_mean_auc
# calls .toarray() on their product. The result is
# (mean model AUC, mean popularity-baseline AUC).
calc_mean_auc(product_train, product_users_altered,
              [sparse.csr_matrix(user_vecs), sparse.csr_matrix(item_vecs.T)], product_test)

(0.962, 0.934)

In [35]:
# List every row for this artist; the artist_id column shows its integer code.
data[data['artist'] == 'red hot chili peppers']

Unnamed: 0,user,artist,plays,user_id,artist_id
4,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691,0,80876
1422,000429493d9716b66b02180d208d09b5b89fbe64,red hot chili peppers,234,29,80876
2139,0007e26aafcfc0b6dcb87d7041583fbb7cced88a,red hot chili peppers,159,44,80876
3284,000b0bb32f149504e1df3cce85b6bfd20cef3dd0,red hot chili peppers,46,68,80876
3322,000b2ee840cbda56e0f41c8f248c4fb7ee275db3,red hot chili peppers,87,69,80876
...,...,...,...,...,...
1998983,1d2c0053f7e585caa02ccbb3a96b708c078e9300,red hot chili peppers,1024,40892,80876
1999418,1d2d697dbe47814d1de4064508b32ec00a1ec767,red hot chili peppers,384,40901,80876
1999642,1d2db9f945283c3f48ab685a84a78429a63a3f19,red hot chili peppers,66,40905,80876
1999823,1d2e1fa030318413f194f741c674b6032094a0ce,red hot chili peppers,389,40909,80876


In [36]:
artist_name = 'red hot chili peppers'
n_similar = 10 # getting the top ten similar items

# artist_id values are the category codes of data['artist'], so the position
# of a name in the category index is its id. Deriving the id from the name
# (instead of hard-coding it) keeps this cell correct if the data subset
# changes; an unknown name raises KeyError.
artist_names = data['artist'].cat.categories
artist_id = artist_names.get_loc(artist_name)

# Use implicit to get similar items.
similar = model.similar_items(artist_id, n_similar)
# Print the names of our most similar artists
# (each result is unpacked as an (artist id, score) pair)
for idx, score in similar:
    print (artist_names[idx])

red hot chili peppers
muse
nirvana
coldplay
queen
the killers
foo fighters
placebo
pink floyd
the beatles


In [37]:
# Show a few rows for this artist; the artist_id column shows its integer code.
data[data['artist'] == 'die Ärzte'].head(5)

Unnamed: 0,user,artist,plays,user_id,artist_id
0,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,0,30264
2943,000a1585c5f65532a9c9187a882892982d345a5c,die Ärzte,148,61,30264
3787,000cb6427411006fe9a6193d3c4f59efed53fbef,die Ärzte,7,78,30264
6295,0014ffc91d3a5b59cce9bceaf22ef0d72e5711b8,die Ärzte,88,128,30264
13513,003059a886782e4d7936da913d3f064f637d0b2b,die Ärzte,5,274,30264


In [38]:
# Find the 10 most similar to die Ärzte
artist_name = 'die Ärzte'
n_similar = 10 # getting the top ten similar items

# artist_id values are the category codes of data['artist'], so the position
# of a name in the category index is its id. Deriving the id from the name
# (instead of hard-coding it) keeps this cell correct if the data subset
# changes; an unknown name raises KeyError.
artist_names = data['artist'].cat.categories
artist_id = artist_names.get_loc(artist_name)

# Use implicit to get similar items.
similar = model.similar_items(artist_id, n_similar)
# Print the names of our most similar artists
# (each result is unpacked as an (artist id, score) pair)
for idx, score in similar:
    print (artist_names[idx])

die Ärzte
guano apes
mando diao
soundtrack
apocalyptica
him
limp bizkit
[unknown]
bloodhound gang
nightwish


In [39]:
# Rank each user's artists by play count (1 = most played).
# method='first' breaks ties by row order, so every user has exactly one
# rank-1 row. The default 'average' gives tied top artists a fractional rank
# (e.g. 1.5), which would silently drop those users from data_1 below.
data['rank'] = data.groupby(['user_id'])['plays'].rank(method = 'first', ascending = False)

# filtering for their first choice
data_1  = data[data['rank'] == 1]

In [40]:
# Users whose rank-1 artist has artist_id 30264 (the code the lookup earlier
# in the notebook shows for die Ärzte).
data_1[data_1['artist_id'] == 30264].head()

Unnamed: 0,user,artist,plays,user_id,artist_id,rank
0,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099,0,30264,1.0
25686,005d521c9f8b1acfc13b7a4cc4b39085edfc786a,die Ärzte,933,523,30264,1.0
29112,006aaaaf386fdbb0aea3f2bf9019e346a7294b6a,die Ärzte,946,595,30264,1.0
30440,006f93b4213be13020a3819e0d0a86dcf97b58de,die Ärzte,1375,622,30264,1.0
53937,00c465e5b33365ab91cc5cf161590a38044954af,die Ärzte,2924,1094,30264,1.0


In [41]:
# Inspect the first ten rows for user_id 1094, the user the recommendation
# cell below is run for.
data[data['user_id'] == 1094].head(10)

Unnamed: 0,user,artist,plays,user_id,artist_id,rank
53937,00c465e5b33365ab91cc5cf161590a38044954af,die Ärzte,2924,1094,30264,1.0
53938,00c465e5b33365ab91cc5cf161590a38044954af,equilibrium,1936,1094,36491,2.0
53939,00c465e5b33365ab91cc5cf161590a38044954af,ensiferum,1782,1094,36343,3.0
53940,00c465e5b33365ab91cc5cf161590a38044954af,system of a down,1167,1094,92524,4.0
53941,00c465e5b33365ab91cc5cf161590a38044954af,sdp,1042,1094,85406,5.0
53942,00c465e5b33365ab91cc5cf161590a38044954af,deichkind,1013,1094,29016,6.0
53943,00c465e5b33365ab91cc5cf161590a38044954af,knorkator,920,1094,57157,7.0
53944,00c465e5b33365ab91cc5cf161590a38044954af,typ:t.u.r.b.o.,911,1094,103029,8.0
53945,00c465e5b33365ab91cc5cf161590a38044954af,serj tankian,813,1094,85997,9.0
53946,00c465e5b33365ab91cc5cf161590a38044954af,rise against,779,1094,82040,10.0


In [42]:
user_id = 1094

# Use the implicit recommender.
recommended = model.recommend(user_id, sparse_user_item,N = 20,filter_already_liked_items = False)

# artist_id values are the category codes of data['artist'], so the category
# index maps an id straight back to the artist name.
artist_names = data['artist'].cat.categories

artists = []
scores = []

# Get artist names from ids (each result is unpacked as an (artist id, score) pair)
for idx, score in recommended:
    artists.append(artist_names[idx])
    scores.append(score)

# Create a dataframe of artist names and scores
recommendations = pd.DataFrame({'artist': artists, 'score': scores})

print (recommendations)

                     artist     score
0               tyler bates  1.337788
1             jan hegenberg  1.193714
2   five finger death punch  1.190098
3          slagsmålsklubben  1.178774
4              daniel licht  1.163203
5         kaizers orchestra  1.141305
6                       eav  1.137476
7                 fightstar  1.131872
8                   flyleaf  1.128522
9                   volbeat  1.127203
10                   saliva  1.124229
11             freedom call  1.123379
12         jennifer rostock  1.118862
13                  flobots  1.117408
14                  skillet  1.115119
15       machinae supremacy  1.112002
16               eisbrecher  1.111799
17                    aiden  1.107183
18                      asp  1.102318
19            drowning pool  1.102212
