In [1]:
import os
import json
import pandas as pd
import numpy as np
import scipy as sp

In [2]:
playlists = list()
tracks = dict()
map_pl = list()

max_files_for_quick_processing = 5


def process_track(track):
    key = track['track_uri']
    if not key in tracks:
        tk = dict()
        tk['artist_name'] = track['artist_name']
        tk['artist_uri'] = track['artist_uri']
        tk['track_name'] = track['track_name']
        tk['album_uri'] = track['album_uri']
        tk['duration_ms'] = track['duration_ms']
        tk['album_name'] = track['album_name']
        tracks[track['track_uri']] = tk
    return key


def process_playlist(playlist):
    pl = dict()
    pl['name'] = playlist['name']
    pl['collaborative'] = playlist['collaborative']
    pl['pid'] = playlist['pid']
    pl['modified_at'] = playlist['modified_at']
    pl['num_albums'] = playlist['num_albums']
    pl['num_tracks'] = playlist['num_tracks']
    pl['num_followers'] = playlist['num_followers']
    pl['num_edits'] = playlist['num_edits']
    pl['duration_ms'] = playlist['duration_ms']
    pl['num_artists'] = playlist['num_artists']
    if 'description' in playlist:
        pl['description'] = playlist['description']
    else:
        pl['description'] = None
    trks = list()
    for track in playlist['tracks']:
        map_pl.append([playlist['pid'], track['track_uri']])
        trks.append(track['track_uri'])
        process_track(track)
    return pl

def process_mpd(path):
    count = 0
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        print(filename)
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            slice = json.loads(js)
            for playlist in slice['playlists']:
                playlists.append(process_playlist(playlist))
            count += 1
            if quick and count > max_files_for_quick_processing:
                break

quick = True
process_mpd('mpd.v1/data')

mpd.slice.0-999.json
mpd.slice.1000-1999.json
mpd.slice.10000-10999.json
mpd.slice.100000-100999.json
mpd.slice.101000-101999.json
mpd.slice.102000-102999.json


In [3]:
print(len(playlists))
print(len(tracks))
print(len(map_pl))

6000
121855
402808


In [4]:
playlist_df = pd.DataFrame(playlists)
print(playlist_df.head())
print(playlist_df.describe())

  collaborative description  duration_ms  modified_at              name  \
0         false        None     11532414   1493424000        Throwbacks   
1         false        None     11656470   1506556800  Awesome Playlist   
2         false        None     14039958   1505692800           korean    
3         false        None     28926058   1501027200               mat   
4         false        None      4335282   1401667200               90s   

   num_albums  num_artists  num_edits  num_followers  num_tracks  pid  
0          47           37          6              1          52    0  
1          23           21          5              1          39    1  
2          51           31         18              1          64    2  
3         107           86          4              1         126    3  
4          16           16          7              2          17    4  
        duration_ms   modified_at   num_albums  num_artists    num_edits  \
count  6.000000e+03  6.000000e+03  6000.0

In [5]:
tracks_df = pd.DataFrame.from_dict(tracks, orient='index')
print(tracks_df.head())
print(tracks_df.describe())

                                                      artist_name  \
spotify:track:000mA0etY38nKdvf1N04af                  The Coronas   
spotify:track:000xQL6tZNLJzIrtIgxqSl                         ZAYN   
spotify:track:0010mZpCCwlPwoBiBsjoac          Bombay Bicycle Club   
spotify:track:0018QzCxmMrpa0FubbNdak  Los Invasores De Nuevo León   
spotify:track:001BVhvaZTf2icV88rU3DA              Amindi K. Fro$t   

                                                                 artist_uri  \
spotify:track:000mA0etY38nKdvf1N04af  spotify:artist:2tppd6KkhK4ULAd217Ecq1   
spotify:track:000xQL6tZNLJzIrtIgxqSl  spotify:artist:5ZsFI1h6hIdQRw2ti0hz81   
spotify:track:0010mZpCCwlPwoBiBsjoac  spotify:artist:3pTE9iaJTkWns3mxpNQlJV   
spotify:track:0018QzCxmMrpa0FubbNdak  spotify:artist:5CGtBYmVPeLhI1kM2Fn9Gv   
spotify:track:001BVhvaZTf2icV88rU3DA  spotify:artist:3zQH24IzdAqloEl72e7hPD   

                                                            track_name  \
spotify:track:000mA0etY38nKdvf1N04af

In [6]:
playlist_map_df = pd.DataFrame(map_pl, columns=['pid', 'track_uri'])
print(playlist_map_df.head())
print(playlist_map_df.describe())

   pid                             track_uri
0    0  spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1    0  spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2    0  spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3    0  spotify:track:1AWQoqb9bSvzTjaLralEkT
4    0  spotify:track:1lzr43nnXAijIGYnCT8M8H
                 pid
count  402808.000000
mean    52976.128404
std     48773.897501
min         0.000000
25%      1498.000000
50%    100010.000000
75%    101494.000000
max    102999.000000


In [7]:
merged = pd.merge(pd.merge(tracks_df, playlist_map_df, left_index=True, right_on='track_uri'), playlist_df, on='pid')

In [8]:
merged

Unnamed: 0,artist_name,artist_uri,track_name,album_uri,duration_ms_x,album_name,pid,track_uri,collaborative,description,duration_ms_y,modified_at,name,num_albums,num_artists,num_edits,num_followers,num_tracks
0,The Coronas,spotify:artist:2tppd6KkhK4ULAd217Ecq1,If I Gave Myself To Someone Else,spotify:album:662PiU3dRsilN0Gp87IiSF,214506,The Long Way,371,spotify:track:000mA0etY38nKdvf1N04af,false,,3675883,1496793600,quiet,15,14,4,1,15
1,Lewis Watson,spotify:artist:40ELTAg7Kg6vbWnlyx2n9R,stones around the sun,spotify:album:4cKBAg2zgjrVF2XefrW4WC,224440,the morning,371,spotify:track:0JGbwcwPV0VfuR4zDcZ9ce,false,,3675883,1496793600,quiet,15,14,4,1,15
2,Jaymes Young,spotify:artist:6QrQ7OrISRYIfS5mtacaw2,We Won't,spotify:album:6MuWCR3WPjwyKhqsTKLZ3z,240586,Feel Something,371,spotify:track:0Zge2Kfo3Yd9JOGnAmVPbb,false,,3675883,1496793600,quiet,15,14,4,1,15
3,John Lucas,spotify:artist:7iEy8zKFtlYIINaxxLIyBk,This Will Be Our Home,spotify:album:2N7sEVVS3jKMJJuJi4v0UF,277160,Promised Land,371,spotify:track:1Sw7fhf7YJCD6GcWW0wETD,false,,3675883,1496793600,quiet,15,14,4,1,15
4,Hozier,spotify:artist:2FXC3k01G6Gw61bmprjgqS,Cherry Wine - Live,spotify:album:36k5aXpxffjVGcNce12GLZ,240147,Hozier,371,spotify:track:1ivHxaGL5ld9VS1zsYc4YN,false,,3675883,1496793600,quiet,15,14,4,1,15
5,The Head and the Heart,spotify:artist:0n94vC3S9c3mb2HyNAOcjg,Rivers And Roads,spotify:album:4JKVVz1tD8PYKoXIZ7ZEJy,284000,The Head And The Heart,371,spotify:track:1t2tKmSYA61IA7scT1yoIn,false,,3675883,1496793600,quiet,15,14,4,1,15
6,Lewis Watson,spotify:artist:40ELTAg7Kg6vbWnlyx2n9R,forever (acoustic version),spotify:album:2T5RZubmhl8kwYxENFQkan,188040,midnight (acoustic),371,spotify:track:2AywFPR5CzTM5TNwEkzzxL,false,,3675883,1496793600,quiet,15,14,4,1,15
7,Ben Howard,spotify:artist:5schNIzWdI9gJ1QRK8SBnc,Only Love,spotify:album:2MxcbOGFi99D9JACvj74AH,249173,Every Kingdom,371,spotify:track:3MdYFBIzPf7lSJnI8wi3Ka,false,,3675883,1496793600,quiet,15,14,4,1,15
8,Ben Rector,spotify:artist:4AapPt7H6bGH4i7chTulpI,Hide Away,spotify:album:4ASegWfBk0kc5psPGAEcfE,199836,Something Like This,371,spotify:track:441XbO0qkxAEoPe1umyzkg,false,,3675883,1496793600,quiet,15,14,4,1,15
9,Thirdstory,spotify:artist:7GJbWH8vhhuW22707B8HsW,Searching For A Feeling,spotify:album:7pjWbCevZabqDKwm26ggKd,230466,Searching,371,spotify:track:4VqJipc8kIFseGlVNKlv0W,false,,3675883,1496793600,quiet,15,14,4,1,15
