In [1]:
# Get Imports
import pandas as pd

In [2]:
import json

filepath = "./spotify_million_playlist_dataset/data"
filename = filepath + "/mpd.slice.0-999.json"

MAXPLAYLISTSIZE = 999999
MAXREADFILES = 10000
PLAYLISTPERFILE = 1000

# Input: filesToRead(int) number of files to read
# Add error handling later
def readFile(filesToRead):
    playlistNames = []
    playlistTracksIds = []
    playlistArtistIds = []
    playlistAlbumIds = []
    for slice_number in range(filesToRead):
        filename = filepath + "/mpd.slice." + str(slice_number*PLAYLISTPERFILE) + "-" + str((slice_number*PLAYLISTPERFILE)+PLAYLISTPERFILE-1) + ".json"
        print(filename)
        
        with open(filename, 'r') as file:
            playlist_data = json.load(file)

        listOfPlayLists = playlist_data['playlists']

        for playlist in listOfPlayLists:
            playlistNames.append(playlist['name'])
            trackIds = [track['track_uri'] for track in playlist['tracks']]
            playlistTracksIds.append(trackIds)
            artistIds = [artist['artist_uri'] for artist in playlist['tracks']]
            playlistArtistIds.append(artistIds)
            albumIds = [album['album_uri'] for album in playlist['tracks']]
            playlistAlbumIds.append(albumIds)
    return pd.DataFrame({'name': playlistNames, 'trackIds': playlistTracksIds, 'artistIds': playlistArtistIds})

In [3]:
# Get data 
playListDF = readFile(10)

./spotify_million_playlist_dataset/data/mpd.slice.0-999.json
./spotify_million_playlist_dataset/data/mpd.slice.1000-1999.json
./spotify_million_playlist_dataset/data/mpd.slice.2000-2999.json
./spotify_million_playlist_dataset/data/mpd.slice.3000-3999.json
./spotify_million_playlist_dataset/data/mpd.slice.4000-4999.json
./spotify_million_playlist_dataset/data/mpd.slice.5000-5999.json
./spotify_million_playlist_dataset/data/mpd.slice.6000-6999.json
./spotify_million_playlist_dataset/data/mpd.slice.7000-7999.json
./spotify_million_playlist_dataset/data/mpd.slice.8000-8999.json
./spotify_million_playlist_dataset/data/mpd.slice.9000-9999.json


In [10]:
# Get all the songs 
explodedTracks = playListDF.explode('trackIds')
uniqueTracks = explodedTracks['trackIds'].unique()
# Construct a binary matrix with rows=playlists and columns=songs
tracksMatrix = pd.DataFrame(0, index=range(len(playListDF)), columns=uniqueTracks)
for i, playlist in playListDF.iterrows():
    for tracks in playlist['trackIds']:
        tracksMatrix.loc[i, tracks] = 1



In [22]:
# Reduce the size of the matrix
from scipy.sparse import csr_matrix, coo_matrix
tracksMatrixSparse = csr_matrix(tracksMatrix.values)
tracksMatrixCOO = tracksMatrixSparse.tocoo()
tracksMatrixCOO = pd.DataFrame({'playlistId': tracksMatrixCOO.row, 'trackId':tracksMatrixCOO.col, 'inPlaylist':tracksMatrixCOO.data})

In [60]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(tracksMatrixCOO, reader)

trainset, testset = train_test_split(data, test_size=0.2)

model = SVD(biased=True, n_epochs=30, lr_all=0.01, reg_all=0.1, verbose=True)
model.fit(trainset)

predictions = model.test(testset)

accuracy.rmse(predictions)



Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
RMSE: 0.0108


0.010778256983621741

In [73]:
def getTopPredictions(model, playlist_id, numPredictions):
    # Get all item IDs in the dataset
    item_ids = set(trainset.all_items())

    # Predict ratings for all items for the given user
    predictions = [(iid, model.predict(playlist_id, iid).est) for iid in item_ids]

    # Sort the predictions by predicted rating in descending order
    predictions.sort(key=lambda x: x[1], reverse=True)

    # Extract the top-N item IDs from the sorted predictions
    top_n = [iid for (iid, _) in predictions[:numPredictions]]
    return top_n

In [51]:
def calculate_precision(recommended_items, ground_truth):
    # Calculate the number of overlapping items between recommended items and ground truth
    overlapping_items = set(recommended_items) & set(ground_truth)
    
    # Calculate precision: Number of overlapping items / Total number of recommended items
    if len(recommended_items) > 0:
        precision = len(overlapping_items) / len(recommended_items)
    else:
        precision = 0.0  # Handle the case when there are no recommended items

    return precision

In [95]:
# Lets test using R-precision as used in the Spotify Million Dollar Playset Challenge
# Get the list of unique user IDs in the test set
playlistIds = set([uid for (uid, _, _) in testset])

# Initialize variables to store precision values
total_precision = 0
total_users = 0

index = 0
# Calculate R-Precision for each user in the test set
for playlistId in playlistIds:
    # Get the list of items the user has interacted with (ground truth)
    ground_truth = [iid for (uid, iid, _) in testset if uid == playlistId]

    # Generate recommendations for the user
    recommendations = getTopPredictions(model, playlistId, 10)

    # Calculate precision by comparing the recommended items with the ground truth
    precision = calculate_precision(recommendations, ground_truth)

    print("Index", index, ":", playlistId)
    index += 1
    
    # Update total precision and total users
    total_precision += precision
    total_users += 1

# Calculate average precision (R-Precision)
r_precision = total_precision / total_users

Index 0 : 0
Index 1 : 1
Index 2 : 2
Index 3 : 3
Index 4 : 4
Index 5 : 5
Index 6 : 6
Index 7 : 7
Index 8 : 8
Index 9 : 9
Index 10 : 10
Index 11 : 11
Index 12 : 12
Index 13 : 13
Index 14 : 14
Index 15 : 15
Index 16 : 16
Index 17 : 17
Index 18 : 18
Index 19 : 19
Index 20 : 20
Index 21 : 21
Index 22 : 22
Index 23 : 23
Index 24 : 24
Index 25 : 25
Index 26 : 26
Index 27 : 28
Index 28 : 29
Index 29 : 30
Index 30 : 31
Index 31 : 32
Index 32 : 33
Index 33 : 34
Index 34 : 35
Index 35 : 36
Index 36 : 37
Index 37 : 38
Index 38 : 39
Index 39 : 40
Index 40 : 41
Index 41 : 42
Index 42 : 43
Index 43 : 44
Index 44 : 45
Index 45 : 46
Index 46 : 47
Index 47 : 48
Index 48 : 49
Index 49 : 50
Index 50 : 51
Index 51 : 52
Index 52 : 53
Index 53 : 55
Index 54 : 56
Index 55 : 57
Index 56 : 58
Index 57 : 59
Index 58 : 60
Index 59 : 61
Index 60 : 62
Index 61 : 63
Index 62 : 64
Index 63 : 65
Index 64 : 66
Index 65 : 67
Index 66 : 68
Index 67 : 69
Index 68 : 70
Index 69 : 71
Index 70 : 72
Index 71 : 73
Index 72 : 7

In [97]:
print(r_precision) 

0.002016823755954191
