In [63]:
import os
import json
import pandas as pd
import numpy as np
import scipy as sp
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Metrics obtained from an organizer of the Challenge
# https://github.com/plamere/RecsysChallengeTools/blob/master/metrics.py
from metrics import r_precision

In [3]:
# Module-level accumulators filled in while the MPD slice files are parsed.
playlists = []  # one flattened dict per playlist (see process_playlist)
tracks = {}     # track_uri -> dict of track metadata (see process_track)
map_pl = []     # [playlist_pid, track_uri] pairs, one per playlist entry

# Threshold on the number of slice files read when `quick` mode is enabled
# (see process_mpd for the exact stopping rule).
max_files_for_quick_processing = 1


def process_track(track):
    """Register a track in the global ``tracks`` dict and return its URI.

    The first time a URI is seen, a metadata dict (artist, album, name and
    duration fields, keys prefixed with ``track_``) is stored under that URI.
    Later calls with the same URI leave the stored entry untouched.

    Args:
        track: dict with the keys 'track_uri', 'artist_name', 'artist_uri',
            'track_name', 'album_uri', 'duration_ms' and 'album_name'.

    Returns:
        The value of ``track['track_uri']``.
    """
    uri = track['track_uri']
    if uri in tracks:
        # Already registered: keep the first-seen metadata.
        return uri
    tracks[uri] = {
        'track_artist_name': track['artist_name'],
        'track_artist_uri': track['artist_uri'],
        'track_name': track['track_name'],
        'track_album_uri': track['album_uri'],
        'track_duration_ms': track['duration_ms'],
        'track_album_name': track['album_name'],
    }
    return uri


def process_playlist(playlist):
    """Flatten one raw playlist record and register its tracks.

    Side effects: for every entry of ``playlist['tracks']`` a
    ``[pid, track_uri]`` pair is appended to the global ``map_pl`` list and
    the track is passed to ``process_track``.

    Args:
        playlist: dict with the keys 'name', 'collaborative', 'pid',
            'modified_at', 'num_albums', 'num_tracks', 'num_followers',
            'num_edits', 'duration_ms', 'num_artists' and 'tracks'.
            'description' is optional.

    Returns:
        dict of playlist-level fields with keys prefixed by ``playlist_``;
        ``playlist_description`` is None when the record has no description.

    Raises:
        KeyError: if a required key is missing from the playlist or a track.
    """
    pl = {
        'playlist_name': playlist['name'],
        'playlist_collaborative': playlist['collaborative'],
        'playlist_pid': playlist['pid'],
        'playlist_modified_at': playlist['modified_at'],
        'playlist_num_albums': playlist['num_albums'],
        'playlist_num_tracks': playlist['num_tracks'],
        'playlist_num_followers': playlist['num_followers'],
        'playlist_num_edits': playlist['num_edits'],
        'playlist_duration_ms': playlist['duration_ms'],
        'playlist_num_artists': playlist['num_artists'],
        # Optional field: absent descriptions are stored as None.
        'playlist_description': playlist.get('description'),
    }
    for track in playlist['tracks']:
        # Record the membership pair first, then make sure the track's
        # metadata is registered in the global track table.
        map_pl.append([playlist['pid'], track['track_uri']])
        process_track(track)
    return pl

def process_mpd(path):
    """Load MPD slice files from ``path`` into the module-level collections.

    Every directory entry is printed in sorted name order. Entries named
    ``mpd.slice.*.json`` are parsed as JSON, and each item of their
    ``'playlists'`` list is passed to ``process_playlist``; the result is
    appended to the global ``playlists`` list.

    When the global ``quick`` flag is truthy, loading stops as soon as the
    number of processed slice files exceeds ``max_files_for_quick_processing``.
    NOTE(review): because the check is ``count > max`` after the increment, a
    limit of N loads N + 1 files. Confirm whether that is intended before
    changing it; the amount of loaded data affects every downstream cell.

    Args:
        path: directory that contains the slice files.
    """
    count = 0
    for filename in sorted(os.listdir(path)):
        print(filename)
        if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
            continue
        fullpath = os.path.join(path, filename)
        # The context manager closes the handle even if reading or parsing
        # raises. JSON is UTF-8 by specification, so decode explicitly rather
        # than relying on the platform's default encoding.
        with open(fullpath, encoding="utf-8") as f:
            mpd_slice = json.load(f)
        for playlist in mpd_slice['playlists']:
            playlists.append(process_playlist(playlist))
        count += 1
        if quick and count > max_files_for_quick_processing:
            break

# Enable quick mode so process_mpd stops early instead of reading every slice.
quick = True
# Fill playlists / tracks / map_pl from the slice files under mpd.v1/data.
process_mpd('mpd.v1/data')

mpd.slice.0-999.json
mpd.slice.1000-1999.json


In [4]:
# Sizes of the three collections, one per line:
# playlists loaded, distinct tracks, playlist/track pairs.
print(len(playlists), len(tracks), len(map_pl), sep='\n')

2000
57884
134125


In [5]:
# Playlist-level table: one row per dict collected in `playlists`.
playlist_df = pd.DataFrame(playlists)
playlist_df.head()
# print(playlist_df.describe())

Unnamed: 0,playlist_collaborative,playlist_description,playlist_duration_ms,playlist_modified_at,playlist_name,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks,playlist_pid
0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0
1,False,,11656470,1506556800,Awesome Playlist,23,21,5,1,39,1
2,False,,14039958,1505692800,korean,51,31,18,1,64,2
3,False,,28926058,1501027200,mat,107,86,4,1,126,3
4,False,,4335282,1401667200,90s,16,16,7,2,17,4


In [6]:
# Track metadata table: the keys of `tracks` (track URIs) become the index.
tracks_df = pd.DataFrame.from_dict(tracks, orient='index')
tracks_df.head()
# print(tracks_df.describe())

Unnamed: 0,track_artist_name,track_artist_uri,track_name,track_album_uri,track_duration_ms,track_album_name
spotify:track:000mA0etY38nKdvf1N04af,The Coronas,spotify:artist:2tppd6KkhK4ULAd217Ecq1,If I Gave Myself To Someone Else,spotify:album:662PiU3dRsilN0Gp87IiSF,214506,The Long Way
spotify:track:000xQL6tZNLJzIrtIgxqSl,ZAYN,spotify:artist:5ZsFI1h6hIdQRw2ti0hz81,Still Got Time,spotify:album:2kGUeTGnkLOYlinKRJe47G,188490,Still Got Time
spotify:track:002PgfoyfrOGiKch4EW8Wm,Alan Menken,spotify:artist:5sy77gt4bfsLcSQ8GIe4ZZ,Jasmine Runs Away,spotify:album:29EiOQJnxWlX5nVOWQpu3u,46866,Aladdin
spotify:track:004skCQeDn1iLntSom0rRr,Kevin Jonas,spotify:artist:11bdWrCwHE2gCdVARdASuu,Heart and Soul,spotify:album:5iK842b9xnZblgZkRxWCFe,177773,Camp Rock 2: The Final Jam
spotify:track:005CGalYNgMNZcvWMIFeK8,Nelson Y Sus Estrellas,spotify:artist:5SRwYMyavJCNT2AdVMpPgi,LLorándote,spotify:album:2TfoLoZHhsL0oAYBxDi4t2,321480,Orquídea de Plata


In [7]:
# Long-format membership table: one row per [playlist_pid, track_uri] pair
# recorded in `map_pl`.
playlist_map_df = pd.DataFrame(map_pl, columns=['playlist_pid', 'track_uri'])
playlist_map_df.head()
#print(playlist_map_df.describe())

Unnamed: 0,playlist_pid,track_uri
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H


In [8]:
# Flatten everything into one table: attach track metadata to each
# playlist/track pair (tracks_df index == track_uri), then attach the
# playlist-level columns via playlist_pid.
merged = (
    tracks_df
    .merge(playlist_map_df, left_index=True, right_on='track_uri')
    .merge(playlist_df, on='playlist_pid')
)

In [9]:
# Preview the joined track/playlist table.
merged.head()

Unnamed: 0,track_artist_name,track_artist_uri,track_name,track_album_uri,track_duration_ms,track_album_name,playlist_pid,track_uri,playlist_collaborative,playlist_description,playlist_duration_ms,playlist_modified_at,playlist_name,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks
0,The Coronas,spotify:artist:2tppd6KkhK4ULAd217Ecq1,If I Gave Myself To Someone Else,spotify:album:662PiU3dRsilN0Gp87IiSF,214506,The Long Way,371,spotify:track:000mA0etY38nKdvf1N04af,False,,3675883,1496793600,quiet,15,14,4,1,15
1,Lewis Watson,spotify:artist:40ELTAg7Kg6vbWnlyx2n9R,stones around the sun,spotify:album:4cKBAg2zgjrVF2XefrW4WC,224440,the morning,371,spotify:track:0JGbwcwPV0VfuR4zDcZ9ce,False,,3675883,1496793600,quiet,15,14,4,1,15
2,Jaymes Young,spotify:artist:6QrQ7OrISRYIfS5mtacaw2,We Won't,spotify:album:6MuWCR3WPjwyKhqsTKLZ3z,240586,Feel Something,371,spotify:track:0Zge2Kfo3Yd9JOGnAmVPbb,False,,3675883,1496793600,quiet,15,14,4,1,15
3,John Lucas,spotify:artist:7iEy8zKFtlYIINaxxLIyBk,This Will Be Our Home,spotify:album:2N7sEVVS3jKMJJuJi4v0UF,277160,Promised Land,371,spotify:track:1Sw7fhf7YJCD6GcWW0wETD,False,,3675883,1496793600,quiet,15,14,4,1,15
4,Hozier,spotify:artist:2FXC3k01G6Gw61bmprjgqS,Cherry Wine - Live,spotify:album:36k5aXpxffjVGcNce12GLZ,240147,Hozier,371,spotify:track:1ivHxaGL5ld9VS1zsYc4YN,False,,3675883,1496793600,quiet,15,14,4,1,15


In [121]:
# Create Negative Samples
# Create Negative Samples
# For every playlist, draw as many tracks that are NOT in the playlist as it
# has rows in `merged`, and pair them with that playlist's columns.
track_columns = list(tracks_df.columns) + ['track_uri']
# One seeded generator shared by all draws keeps the negatives reproducible
# across kernel restarts.
sampler = np.random.RandomState(42)
negative_frames = []
for _pid, df in tqdm(merged.groupby("playlist_pid")):
    negative_tracks = tracks_df.drop(df.track_uri).sample(
        df.shape[0], random_state=sampler)
    # Keep only the playlist-level columns. A `match` column is dropped too if
    # present, so re-running this cell after the labelling cell cannot leak
    # positive labels into the negatives.
    playlist_part = (
        df.drop(track_columns, axis=1)
        .drop(['match'], axis=1, errors='ignore')
        .reset_index(drop=True)
    )
    # The index of tracks_df holds the track URI; expose it as a column.
    track_part = negative_tracks.rename_axis('track_uri').reset_index()
    negative_frames.append(pd.concat([playlist_part, track_part], axis=1))

# Concatenate once at the end: appending inside the loop copies the growing
# frame on every iteration, and DataFrame.append no longer exists in pandas 2.
negative_samples = (
    pd.concat(negative_frames) if negative_frames else pd.DataFrame([])
)


100%|██████████| 2000/2000 [03:39<00:00,  9.10it/s]


In [108]:
# Provide labels
negative_samples['match'] = 0
merged['match'] = 1

In [110]:
# Preview the sampled negatives together with their `match` label column.
negative_samples.head()

Unnamed: 0,playlist_pid,playlist_collaborative,playlist_description,playlist_duration_ms,playlist_modified_at,playlist_name,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks,match,track_uri,track_artist_name,track_artist_uri,track_name,track_album_uri,track_duration_ms,track_album_name
0,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,spotify:track:0WnGvxW15RZyOvZxDYiWF4,Josh Abbott Band,spotify:artist:2EJ5MRZCzpHSSNNEpTx9Kb,Wasn't That Drunk (feat. Carly Pearce),spotify:album:7yxhlLyOZAyFyjE3PyTaKX,223451,Front Row Seat
1,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,spotify:track:7o8hpxUFzvswawT21GQBXd,Soulsavers,spotify:artist:762Lq0yk2VhjIfkTVDzlhv,Point Sur Pt 1,spotify:album:6eV9rhs3qFonmspoqWF2vG,102626,The Light The Dead See
2,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,spotify:track:1t88m8JUlqn9kf0FLmVta5,SISTAR,spotify:artist:2wTLheTmMcFCA4hdY8hZJP,Give it to me,spotify:album:1nErNFyYvHnuCYvfwrMyoz,201351,Give It To Me
3,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,spotify:track:4OXbg79dU25xStMmSKQvun,Bonobo,spotify:artist:0cmWgDlu9CwTgxPhf403hb,Recurring,spotify:album:3JN1kCFyxsQfMpD3TKMyde,303013,Recurring (The Live Sessions)
4,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,spotify:track:45udgBYzuYbE9bC4uKCm14,Shakira,spotify:artist:0EmeFodog0BfCgMzAIvKQp,Gordita,spotify:album:2cxYK3rgZ4w8oZftOSoMuS,204986,Sale El Sol


In [111]:
# Preview `merged` again; it now carries the `match` label column.
merged.head()

Unnamed: 0,track_artist_name,track_artist_uri,track_name,track_album_uri,track_duration_ms,track_album_name,playlist_pid,track_uri,playlist_collaborative,playlist_description,playlist_duration_ms,playlist_modified_at,playlist_name,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks,match
0,The Coronas,spotify:artist:2tppd6KkhK4ULAd217Ecq1,If I Gave Myself To Someone Else,spotify:album:662PiU3dRsilN0Gp87IiSF,214506,The Long Way,371,spotify:track:000mA0etY38nKdvf1N04af,False,,3675883,1496793600,quiet,15,14,4,1,15,1
1,Lewis Watson,spotify:artist:40ELTAg7Kg6vbWnlyx2n9R,stones around the sun,spotify:album:4cKBAg2zgjrVF2XefrW4WC,224440,the morning,371,spotify:track:0JGbwcwPV0VfuR4zDcZ9ce,False,,3675883,1496793600,quiet,15,14,4,1,15,1
2,Jaymes Young,spotify:artist:6QrQ7OrISRYIfS5mtacaw2,We Won't,spotify:album:6MuWCR3WPjwyKhqsTKLZ3z,240586,Feel Something,371,spotify:track:0Zge2Kfo3Yd9JOGnAmVPbb,False,,3675883,1496793600,quiet,15,14,4,1,15,1
3,John Lucas,spotify:artist:7iEy8zKFtlYIINaxxLIyBk,This Will Be Our Home,spotify:album:2N7sEVVS3jKMJJuJi4v0UF,277160,Promised Land,371,spotify:track:1Sw7fhf7YJCD6GcWW0wETD,False,,3675883,1496793600,quiet,15,14,4,1,15,1
4,Hozier,spotify:artist:2FXC3k01G6Gw61bmprjgqS,Cherry Wine - Live,spotify:album:36k5aXpxffjVGcNce12GLZ,240147,Hozier,371,spotify:track:1ivHxaGL5ld9VS1zsYc4YN,False,,3675883,1496793600,quiet,15,14,4,1,15,1


In [112]:
# Preview negative_samples again, next to the `merged` preview above.
negative_samples.head()

Unnamed: 0,playlist_pid,playlist_collaborative,playlist_description,playlist_duration_ms,playlist_modified_at,playlist_name,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks,match,track_uri,track_artist_name,track_artist_uri,track_name,track_album_uri,track_duration_ms,track_album_name
0,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,spotify:track:0WnGvxW15RZyOvZxDYiWF4,Josh Abbott Band,spotify:artist:2EJ5MRZCzpHSSNNEpTx9Kb,Wasn't That Drunk (feat. Carly Pearce),spotify:album:7yxhlLyOZAyFyjE3PyTaKX,223451,Front Row Seat
1,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,spotify:track:7o8hpxUFzvswawT21GQBXd,Soulsavers,spotify:artist:762Lq0yk2VhjIfkTVDzlhv,Point Sur Pt 1,spotify:album:6eV9rhs3qFonmspoqWF2vG,102626,The Light The Dead See
2,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,spotify:track:1t88m8JUlqn9kf0FLmVta5,SISTAR,spotify:artist:2wTLheTmMcFCA4hdY8hZJP,Give it to me,spotify:album:1nErNFyYvHnuCYvfwrMyoz,201351,Give It To Me
3,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,spotify:track:4OXbg79dU25xStMmSKQvun,Bonobo,spotify:artist:0cmWgDlu9CwTgxPhf403hb,Recurring,spotify:album:3JN1kCFyxsQfMpD3TKMyde,303013,Recurring (The Live Sessions)
4,0,False,,11532414,1493424000,Throwbacks,47,37,6,1,52,0,spotify:track:45udgBYzuYbE9bC4uKCm14,Shakira,spotify:artist:0EmeFodog0BfCgMzAIvKQp,Gordita,spotify:album:2cxYK3rgZ4w8oZftOSoMuS,204986,Sale El Sol


In [113]:
# Column order of the positive table; negative_samples is re-ordered to this
# layout when the two are combined into `dataset`.
merged.columns

Index(['track_artist_name', 'track_artist_uri', 'track_name',
       'track_album_uri', 'track_duration_ms', 'track_album_name',
       'playlist_pid', 'track_uri', 'playlist_collaborative',
       'playlist_description', 'playlist_duration_ms', 'playlist_modified_at',
       'playlist_name', 'playlist_num_albums', 'playlist_num_artists',
       'playlist_num_edits', 'playlist_num_followers', 'playlist_num_tracks',
       'match'],
      dtype='object')

In [115]:
# Columns of negative_samples, for comparison with merged.columns.
negative_samples.columns

Index(['playlist_pid', 'playlist_collaborative', 'playlist_description',
       'playlist_duration_ms', 'playlist_modified_at', 'playlist_name',
       'playlist_num_albums', 'playlist_num_artists', 'playlist_num_edits',
       'playlist_num_followers', 'playlist_num_tracks', 'match', 'track_uri',
       'track_artist_name', 'track_artist_uri', 'track_name',
       'track_album_uri', 'track_duration_ms', 'track_album_name'],
      dtype='object')

In [119]:
# Stack positives and negatives (negatives re-ordered to merged's column
# layout), group rows by playlist and renumber the index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
dataset = (
    pd.concat([merged, negative_samples[merged.columns]])
    .sort_values(by=['playlist_pid'])
    .reset_index(drop=True)
)

In [122]:
# Inspect the last 32 rows of the combined dataset.
dataset.tail(32)

Unnamed: 0,track_artist_name,track_artist_uri,track_name,track_album_uri,track_duration_ms,track_album_name,playlist_pid,track_uri,playlist_collaborative,playlist_description,playlist_duration_ms,playlist_modified_at,playlist_name,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks,match
268218,Dispatch,spotify:artist:6v4jPZO3UIDNJIgdxRxtr9,Elias,spotify:album:6uEgPlkfgRwWwEa4w62ZC2,404386,Silent Steeples,1999,spotify:track:4TxHeNgKUd5erPefZOtGhA,False,,3701742,1464307200,Sad Music,14,12,3,1,16,0
268219,Hammock,spotify:artist:0VOR7Ie9xUSb45fzIIVJQ1,I Can Almost See You,spotify:album:1xGZ70dEsN8UyiXvcXWwHI,253146,Raising Your Voice...Trying to Stop an Echo,1999,spotify:track:0jVy0Y3QEtUKsrIHAZuR45,False,,3701742,1464307200,Sad Music,14,12,3,1,16,0
268220,Woods Of Ypres,spotify:artist:24dCDrr94SNSAREMJhBHCs,Shards Of Love,spotify:album:6QCd6IRkbpty4LPPajcYvN,318640,Woods 4: The Green Album,1999,spotify:track:6vkQscRGFLRSLJ3g9uDbKW,False,,3701742,1464307200,Sad Music,14,12,3,1,16,0
268221,Smokepurpp,spotify:artist:21dooacK2WGBB5amYvKyfM,Different Color Molly,spotify:album:2TWKabrKN6YOZ1jgqo97uz,185298,Deadstar,1999,spotify:track:6gO7SsL92jgKRP8lsDGxYo,False,,3701742,1464307200,Sad Music,14,12,3,1,16,0
268222,Natalie Grant,spotify:artist:6KVnMm856M8CHHBCw53Ihh,Your Great Name,spotify:album:7lxvGTdwoLfkQ48E1wT2HG,361413,Love Revolution,1999,spotify:track:5pdYFUzu94cFVCqOFCd3lk,False,,3701742,1464307200,Sad Music,14,12,3,1,16,0
268223,Bronco,spotify:artist:0VKh7CQDi9MkUvaBMoK1V0,Un Fin de Semana,spotify:album:7kYO9wfvxwhrxQRINgBDj8,201026,A Todo Galope,1999,spotify:track:4YopYXMzcifG1jm94vnhbW,False,,3701742,1464307200,Sad Music,14,12,3,1,16,0
268224,Message To Bears,spotify:artist:6MmQrV24zUeieNf027zkh3,I Know You Love to Fall,spotify:album:13fSrslfho3Hi0ZGzEdYoI,236106,Maps,1999,spotify:track:1uaT7uj0uLOFfq3Uvx6hJt,False,,3701742,1464307200,Sad Music,14,12,3,1,16,0
268225,2Pac,spotify:artist:1ZwdS5xdxEREPySFridCfh,Thugz Mansion - Nas Acoustic,spotify:album:7rBSWFxL81HFhjaubI1y7P,252506,Better Dayz,1999,spotify:track:6uSl1oTb01LwPBaruCA5wm,False,,3701742,1464307200,Sad Music,14,12,3,1,16,0
268226,Wiley,spotify:artist:7k9T7lZlHjRAM1bb0r9Rm3,Laptop (feat. Manga),spotify:album:1F4VaFhSDTVK5uDPXbm42l,178680,Godfather,1999,spotify:track:6vhqZEPSGeAC9OhHP5PjOa,False,,3701742,1464307200,Sad Music,14,12,3,1,16,0
268227,Sinan Akçıl,spotify:artist:31czVneIywpUwJTNOVviJx,Rüya,spotify:album:7ASyEf6DbN52gEf7f0Kr3k,211638,Best of Aşk,1999,spotify:track:16Mlt0FtuqF71yvDoOIBTT,False,,3701742,1464307200,Sad Music,14,12,3,1,16,0


In [102]:
# Include the features in the list below
features = [
    'playlist_duration_ms', 'playlist_num_albums', 'playlist_num_artists',
    'playlist_num_edits', 'playlist_num_followers', 'playlist_num_tracks',
    'playlist_pid', 'track_duration_ms'
]
data_x = dataset[features]
data_y = dataset.match
X_train, X_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=0.1, random_state=42, shuffle=True)

In [123]:
# Preview the held-out feature rows.
X_test.head()

Unnamed: 0,playlist_duration_ms,playlist_num_albums,playlist_num_artists,playlist_num_edits,playlist_num_followers,playlist_num_tracks,playlist_pid,track_duration_ms
20901,49012499,157,103,57,2,188,156,197373
219809,10821965,44,39,3,1,48,1632,198360
119281,14042586,54,48,5,1,59,885,234684
10627,18184816,54,37,9,3,68,91,240320
118291,17181746,82,58,24,1,85,877,155224


In [105]:
# Preview the held-out labels (they share their index with X_test).
y_test.head()
# y_test[y_test == 1].head()

20901     0
219809    1
119281    0
10627     1
118291    1
Name: match, dtype: int64

In [106]:
# First held-out rows labelled as positives (match == 1).
y_test[y_test == 1].head()

219809    1
10627     1
118291    1
159640    1
191792    1
Name: match, dtype: int64

In [65]:
# 1. Calculate metric only on test set using the positive samples
#     a. Obtain unique playlists in the test set
#     b. For each playlist, obtain the positive songs.
#     c. Set both `target` and `prediction` as lists of the positive songs
#     d. For each playlist, calculate the r_precision

# 2. Train the classifier on X_train. Obtain predictions for X_test.

# 3. Repeat step 1 above, now setting `prediction` to the list of songs ranked by predicted probability.

# 4. Calculate the r_precision metric based on your predictions!