In [1]:
# !pip install -U scikit-learn scikit-surprise numpy pandas

In [2]:
from surprise import Dataset, Reader
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import MinMaxScaler
from surprise import SVD
from surprise.model_selection import KFold
from surprise import accuracy

In [3]:
df = pd.read_csv('./data/spotify_dataset.csv', on_bad_lines="skip")
df.head()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


In [4]:
df.columns = df.columns.str.replace('"', '')
df.columns = df.columns.str.replace('name', '')
df.columns = df.columns.str.replace(' ', '')
df.columns

Index(['user_id', 'artist', 'track', 'playlist'], dtype='object')

In [5]:
df.to_csv('./data/unprocessed.csv', index=False)

In [5]:
size = lambda x: len(x)
df_freq = df.groupby(['user_id', 'artist']).agg('size').reset_index().rename(columns={0:'freq'})[['user_id', 'artist', 'freq']]
df_freq.head()artist

Unnamed: 0,user_id,artist,freq
0,00055176fea33f6e027cd3302289378b,5 Seconds Of Summer,10
1,00055176fea33f6e027cd3302289378b,Abigail Breslin,1
2,00055176fea33f6e027cd3302289378b,Against The Current,3
3,00055176fea33f6e027cd3302289378b,All Time Low,8
4,00055176fea33f6e027cd3302289378b,Auryn,1


In [6]:
artist = pd.unique(df['artist'].values.ravel())
artist = pd.Series(np.arange(len(artist)), artist)
df_freq["artist_id"] = df_freq[['artist']].applymap(artist.get)

In [10]:
artist_df = pd.DataFrame(df_freq["artist_id"].unique(), columns = ["artist_id"])
artist_df["artist"] = df_freq["artist"].unique()
df_freq.drop('artist', axis = 1, inplace = True)
df_freq.head()

Unnamed: 0,user_id,freq,artist_id
0,00055176fea33f6e027cd3302289378b,10,713
1,00055176fea33f6e027cd3302289378b,1,132909
2,00055176fea33f6e027cd3302289378b,3,7994
3,00055176fea33f6e027cd3302289378b,8,698
4,00055176fea33f6e027cd3302289378b,1,8100


In [11]:
scaler = MinMaxScaler(feature_range=(1,5))

In [12]:
df_freq[["freq"]] = scaler.fit_transform(df_freq[["freq"]])
df_freq

Unnamed: 0,user_id,freq,artist_id
0,00055176fea33f6e027cd3302289378b,1.010762,713
1,00055176fea33f6e027cd3302289378b,1.000000,132909
2,00055176fea33f6e027cd3302289378b,1.002392,7994
3,00055176fea33f6e027cd3302289378b,1.008371,698
4,00055176fea33f6e027cd3302289378b,1.000000,8100
...,...,...,...
3285626,fff77dadf8528083c920b9c018847e8b,1.000000,1142
3285627,fff77dadf8528083c920b9c018847e8b,1.000000,1580
3285628,fff77dadf8528083c920b9c018847e8b,1.000000,1030
3285629,fff77dadf8528083c920b9c018847e8b,1.001196,699


In [39]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_freq[['user_id', 'artist_id', 'freq']], reader)

In [60]:
def get_testset_accuracy(testset):
    total_matches = 0
    for item in testset:
        uid = item[0]
        preds = []
        for id in df_freq.artist_id.values:
            preds.append(svd.predict(uid=uid, iid=id))
        iid=[]
        for pred in preds:
            iid.append(pred.iid)
        iid=list(dict.fromkeys(iid))
        predicted_artists = set()
        for i in iid[:30]:
            artist = artist_df.loc[artist_df.artist_id == i].artist.values[0]
            predicted_artists.add(artist)
        known_artists = list(set(df.loc[df["user_id"] == uid].artist.values))
        #print(len(known_artists))
        total_matches += len(predicted_artists.intersection(known_artists))
    print(total_matches / (30 * len(testset)))      

In [None]:
# define a cross-validation iterator
kf = KFold(n_splits=2)
svd = SVD(n_epochs=100)
for trainset, testset in kf.split(data):
    # train and test algorithm.
    svd.fit(trainset)
    get_testset_accuracy(testset[:100])
    predictions = svd.test(testset)
    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

In [17]:
preds = []
for id in df_freq.artist_id.values:
    preds.append(svd.predict(uid="00055176fea2346e027cd3302289378b", iid=id))

In [18]:
iid=[]
for pred in preds:
    iid.append(pred.iid)
iid=list(dict.fromkeys(iid))

In [28]:
def get_top_songs(artist, num_songs = 5):
    return list(df[df['artist'] == artist]['track'].value_counts()[0:num_songs].index)

In [29]:
res = {}
for i in iid[:5]:
    artist = artist_df.loc[artist_df.artist_id == i].artist.values[0]
    res[artist] = get_top5_songs(artist)

In [30]:
print(res)

{'5 Seconds Of Summer': ['She Looks So Perfect', 'Amnesia', "Don't Stop", 'Good Girls', 'Voodoo Doll'], 'Abigail Breslin': ['You Suck', 'Fight for Me', 'Christmas In New York (feat. No)', 'House of the Rising Sun', 'Hurricane'], 'Against The Current': ['Closer, Faster', 'Something You Need', 'Gravity', 'Infinity', 'Paralyzed'], 'All Time Low': ['Dear Maria, Count Me In', 'Weightless', 'Remembering Sunday', 'Lost In Stereo', 'Break Your Little Heart'], 'Auryn': ["Don't give up my game", 'Heartbreaker', 'Make my day', "Saturday I'm in love", 'Cuando Te Volveré A Ver']}
