In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import json
import re
import collections
import os
import datetime

In [2]:
# Module-level accumulator: load_all() appends every playlist it reads to this list.
mpd = []

In [89]:
def load_all(path):
    """Load every MPD slice file found in ``path`` into the global ``mpd`` list.

    A file counts as a slice when its name starts with ``mpd.slice.`` and ends
    with ``.json``; every other directory entry is ignored.  Slices are read
    in sorted (lexicographic) filename order.

    Args:
        path: Directory containing the ``mpd.slice.*.json`` files.

    Returns:
        None.  Playlists are appended to the module-level ``mpd`` list, so
        calling this twice on the same directory stores every playlist twice.

    Raises:
        OSError / IOError: if ``path`` cannot be listed or a slice cannot be
            read.
        ValueError: if a slice file does not contain valid JSON.
        KeyError: if a slice has no ``'playlists'`` entry.
    """
    # Local import: ``io.open`` accepts ``encoding`` on both Python 2 and 3.
    import io

    for filename in sorted(os.listdir(path)):
        if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
            continue
        fullpath = os.path.join(path, filename)
        # ``with`` closes the handle even when reading or parsing fails, and
        # decoding explicitly as UTF-8 avoids relying on the platform default.
        with io.open(fullpath, 'r', encoding='utf-8') as f:
            mpd_slice = json.load(f)
        mpd.extend(mpd_slice['playlists'])

In [3]:
import sys
import json
import codecs
import datetime


# Flags that are not read by any code visible in this part of the notebook.
# NOTE(review): presumably leftovers from the script this cell was adapted from - confirm before removing.
pretty = True
compact = False
# Maps a slice file path to its parsed JSON; get_playlist() consults it before opening a file.
cache = {}

def get_playlist(pid, data_dir="./mpd.v1/data"):
    """Return the playlist with id ``pid`` from the sliced MPD data files.

    Playlists are looked up in files named
    ``mpd.slice.<low>-<low + 999>.json`` where ``low`` is ``pid`` rounded down
    to a multiple of 1000.  A parsed slice is stored in the module-level
    ``cache`` dict (keyed by file path) so each file is read at most once.

    Args:
        pid: Integer playlist id.  Only ids in ``[0, 1000000)`` are looked up.
        data_dir: Directory holding the slice files.  Defaults to the
            location that used to be hard-coded in the path.

    Returns:
        The entry at index ``pid - low`` of the slice's ``'playlists'`` list,
        or ``None`` when ``pid`` is outside ``[0, 1000000)``.

    Raises:
        IOError / OSError: if the slice file cannot be opened.
        ValueError: if the slice file is not valid JSON.
        KeyError, IndexError: if the slice lacks ``'playlists'`` or has fewer
            entries than the offset.
    """
    if not 0 <= pid < 1000000:
        return None

    # Floor division keeps the slice arithmetic integral on Python 2 and 3.
    low = 1000 * int(pid // 1000)
    high = low + 999
    offset = pid - low
    path = "%s/mpd.slice.%d-%d.json" % (data_dir, low, high)

    if path not in cache:
        # ``with`` guarantees the handle is closed even if parsing fails.
        with codecs.open(path, 'r', 'utf-8') as f:
            cache[path] = json.load(f)

    return cache[path]['playlists'][offset]

def get_playlists_in_range(start, end):
    """Return the playlists with ids in the half-open range ``[start, end)``.

    Args:
        start: First playlist id (inclusive); anything ``int()`` accepts.
        end: Playlist id to stop before (exclusive); anything ``int()``
            accepts.

    Returns:
        A list holding ``get_playlist(pid)`` for each id in ascending order.
        The list is empty when the bounds do not satisfy
        ``0 <= start <= end <= 1000000``.

    Raises:
        ValueError, TypeError: if ``start`` or ``end`` cannot be converted by
            ``int()``.  Errors raised by ``get_playlist`` propagate as well.
    """
    # The previous bare ``except`` only re-raised (its "bad pid" print was
    # unreachable), so letting exceptions propagate directly is equivalent.
    istart = int(start)
    iend = int(end)
    if not (0 <= istart <= iend <= 1000000):
        return []
    return [get_playlist(pid) for pid in range(istart, iend)]

In [4]:
import sys
import json
import re
import collections
import os
import datetime

# Running totals; process_playlist() updates them through ``global``.
total_playlists = 0
total_tracks = 0
total_descriptions = 0

# Distinct values seen so far: track / artist / album URIs, raw playlist
# names, and normalized playlist names.
tracks, artists, albums = set(), set(), set()
titles, ntitles = set(), set()

# Occurrence counts keyed by playlist length, follower count, artist name and
# "<track name> by <artist name>".
playlist_length_histogram = collections.Counter()
num_followers_histogram = collections.Counter()
artist_histogram = collections.Counter()
track_histogram = collections.Counter()


def process_playlist(playlist):
    """Fold one playlist into the module-level statistics.

    Updates the running totals, the sets of distinct titles, normalized
    titles and track / album / artist URIs, and the four histograms.

    Args:
        playlist: Mapping with ``'name'``, ``'num_tracks'``,
            ``'num_followers'`` and ``'tracks'`` entries, plus an optional
            ``'description'``.  Every track mapping must provide
            ``'album_uri'``, ``'track_uri'``, ``'artist_uri'``,
            ``'track_name'`` and ``'artist_name'``.
    """
    global total_playlists, total_tracks, total_descriptions

    total_playlists += 1
    if 'description' in playlist:
        total_descriptions += 1

    # Record the title both verbatim and in normalized form.
    titles.add(playlist['name'])
    ntitles.add(normalize_name(playlist['name']))

    playlist_length_histogram[playlist['num_tracks']] += 1
    num_followers_histogram[playlist['num_followers']] += 1

    uri_sets = ((albums, 'album_uri'), (tracks, 'track_uri'), (artists, 'artist_uri'))
    for entry in playlist['tracks']:
        total_tracks += 1
        for seen, key in uri_sets:
            seen.add(entry[key])

        # Tracks are counted under "<track name> by <artist name>".
        track_name = entry['track_name']
        artist_name = entry['artist_name']
        label = track_name + " by " + artist_name
        artist_histogram[artist_name] += 1
        track_histogram[label] += 1

def normalize_name(name):
    """Return a canonical form of ``name`` for loose comparison.

    The text is lower-cased, every punctuation character listed in the
    pattern is replaced by a space, runs of whitespace are collapsed to a
    single space, and leading/trailing whitespace is removed.
    """
    without_punct = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', name.lower())
    return re.sub(r'\s+', ' ', without_punct).strip()

In [5]:
def show_summary():
    """Print the statistics gathered so far by ``process_playlist``.

    Output consists of one line per headline total, followed by the 20 most
    common entries of the track, artist, playlist-length and follower-count
    histograms.  All values are read from module-level state.

    The average playlist length is reported as ``0.0`` when no playlist has
    been processed yet (this used to raise ``ZeroDivisionError``).

    Every ``print`` call receives a single pre-formatted string so the
    function produces the same text under Python 2 and Python 3.
    """
    if total_playlists:
        avg_length = float(total_tracks) / total_playlists
    else:
        avg_length = 0.0

    headline = (
        ("number of playlists", total_playlists),
        ("number of tracks", total_tracks),
        ("number of unique tracks", len(tracks)),
        ("number of unique albums", len(albums)),
        ("number of unique artists", len(artists)),
        ("number of unique titles", len(titles)),
        ("number of playlists with descriptions", total_descriptions),
        ("number of unique normalized titles", len(ntitles)),
        ("avg playlist length", avg_length),
    )

    print("")
    for label, value in headline:
        print("%s %s" % (label, value))

    _print_top("top tracks", track_histogram, "%7d %s")
    _print_top("top artists", artist_histogram, "%7d %s")
    _print_top("playlist length histogram", playlist_length_histogram, "%7d %d")
    _print_top("num followers histogram", num_followers_histogram, "%7d %d")


def _print_top(title, histogram, row_format, limit=20):
    """Print a blank line, ``title``, then the ``limit`` most common entries as ``row_format % (count, key)``."""
    print("")
    print(title)
    for key, count in histogram.most_common(limit):
        print(row_format % (count, key))


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


# Fetch playlists 0-999 as a NumPy array, then fold each one into the
# module-level statistics.
l = np.array(get_playlists_in_range(0, 1000))
for plist in l:
    process_playlist(plist)

In [7]:
# Print the totals and top-20 histograms accumulated by process_playlist().
show_summary()


number of playlists 1000
number of tracks 67503
number of unique tracks 34443
number of unique albums 19261
number of unique artists 9754
number of unique titles 869
number of playlists with descriptions 20
number of unique normalized titles 754
avg playlist length 67.503

top tracks
     55 One Dance by Drake
     52 HUMBLE. by Kendrick Lamar
     50 Broccoli (feat. Lil Yachty) by DRAM
     46 Closer by The Chainsmokers
     44 Congratulations by Post Malone
     42 Don't Let Me Down by The Chainsmokers
     39 Roses by The Chainsmokers
     39 Bounce Back by Big Sean
     39 iSpy (feat. Lil Yachty) by KYLE
     39 Jumpman by Drake
     38 Bad and Boujee (feat. Lil Uzi Vert) by Migos
     38 Mask Off by Future
     37 XO TOUR Llif3 by Lil Uzi Vert
     36 White Iverson by Post Malone
     36 Panda by Desiigner
     35 Caroline by Aminé
     35 Sorry by Justin Bieber
     35 Never Be Like You by Flume
     35 Gold Digger by Kanye West
     35 goosebumps by Travis Scott

top artists
  

Collaborative Filtering

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
# Fix the shuffle seed so the split, and every score and recommendation
# derived from it, is the same on each Restart & Run All.
l_train, l_test = train_test_split(l, random_state=42)

In [19]:
# Size of the training portion produced by train_test_split.
len(l_train)

750

In [16]:
import copy

#recommend for one list
#go through training set, calculate score, stored in list
#return top n ranked list sorted by score
def calc_score(playlist, train_playlists=None):
    """Score every training playlist by track-name overlap with ``playlist``.

    Track names are compared after ``normalize_name``.  For each training
    playlist the score is the number of its track entries whose normalized
    name occurs in ``playlist``, divided by the number of tracks in
    ``playlist``.

    Args:
        playlist: Mapping with a ``'tracks'`` list; each track needs a
            ``'track_name'``.
        train_playlists: Iterable of playlists to score, each with ``'pid'``
            and ``'tracks'``.  Defaults to the module-level ``l_train``.

    Returns:
        ``(scores, names)`` where ``scores`` maps each training ``pid`` to its
        float score and ``names`` is the list of normalized track names of
        ``playlist`` in their original order.  When ``playlist`` has no
        tracks every score is ``0.0`` (this used to raise
        ``ZeroDivisionError``).
    """
    if train_playlists is None:
        train_playlists = l_train

    nname_test = [normalize_name(track['track_name'])
                  for track in playlist['tracks']]
    total = len(nname_test)
    # Set membership keeps each lookup O(1) instead of scanning the list.
    wanted = set(nname_test)

    scores = {}
    for pl in train_playlists:
        if total:
            count = sum(1 for track in pl['tracks']
                        if normalize_name(track['track_name']) in wanted)
            scores[pl['pid']] = float(count) / total
        else:
            scores[pl['pid']] = 0.0

    return scores, nname_test

In [41]:
import operator
# Score the training playlists against the first held-out playlist, then
# order the (pid, score) pairs from highest to lowest score by reversing an
# ascending sort.
dic, names = calc_score(l_test[0])
result = sorted(dic.items(), key=lambda pid_score: pid_score[1])[::-1]

In [51]:
# Highest-scoring training playlist as a (pid, score) pair.
result[0]

(360, 0.3)

In [52]:
def get_pid_tracks():
    """Build a lookup from playlist id to track list for the training set.

    Returns:
        A dict mapping each ``'pid'`` in the module-level ``l_train`` to that
        playlist's ``'tracks'`` value; a later playlist with the same pid
        replaces an earlier one.
    """
    return dict((entry['pid'], entry['tracks']) for entry in l_train)

In [63]:
# pid -> tracks lookup for the training playlists; passed to recommend().
pt = get_pid_tracks()

In [77]:
def recommend(pt, scores, names, count):
    """Pick up to ``count`` tracks from the ranked playlists.

    Playlists are visited in the order given by ``scores`` and their tracks in
    playlist order.  A track is skipped when its normalized name is already
    in ``names`` or has already been recommended, so the result contains no
    repeated track names.

    Args:
        pt: Mapping from playlist id to that playlist's list of tracks.
        scores: Iterable of ``(pid, score)`` pairs, best candidates first.
            Only the pid is used here.
        names: Normalized track names to exclude (the tracks already in the
            playlist being extended).
        count: Maximum number of tracks to return; values ``<= 0`` yield an
            empty list.

    Returns:
        A list of track mappings, at most ``count`` long.

    Raises:
        KeyError: if a pid in ``scores`` is missing from ``pt``.
    """
    result = []
    if count <= 0:
        return result

    # One set serves both exclusions: names the caller already has and names
    # recommended earlier in this call.
    seen = set(names)
    for score in scores:
        pid = score[0]
        for track in pt[pid]:
            nname = normalize_name(track['track_name'])
            if nname in seen:
                continue
            seen.add(nname)
            result.append(track)
            if len(result) >= count:
                return result
    return result

In [87]:
# Ask for up to 50 tracks, drawn from the training playlists in ranked order.
z = recommend(pt, result, names, 50)

In [88]:
# Display the recommended track dicts.
z

[{u'album_name': u'Ice On The Dune',
  u'album_uri': u'spotify:album:1olQuvonXXUEourYrj6daN',
  u'artist_name': u'Empire of the Sun',
  u'artist_uri': u'spotify:artist:67hb7towEyKvt5Z8Bx306c',
  u'duration_ms': 204591,
  u'pos': 0,
  u'track_name': u'Alive',
  u'track_uri': u'spotify:track:5WBnKCEsPwsvWHUZmLjS3s'},
 {u'album_name': u'Surf',
  u'album_uri': u'spotify:album:3eM1KTKmpqrQOvuvYY42cr',
  u'artist_name': u'Donnie Trumpet & The Social Experiment',
  u'artist_uri': u'spotify:artist:0ojcq9LJQWMawQdFDw3M0L',
  u'duration_ms': 226013,
  u'pos': 1,
  u'track_name': u'Sunday Candy',
  u'track_uri': u'spotify:track:6fTdcGsjxlAD9PSkoPaLMX'},
 {u'album_name': u'All My Friends (The Remixes)',
  u'album_uri': u'spotify:album:3SStFwKQqnQMtbMtpdO9Oc',
  u'artist_name': u'Snakehips',
  u'artist_uri': u'spotify:artist:2FwJwEswyIUAljqgjNSHgP',
  u'duration_ms': 355373,
  u'pos': 2,
  u'track_name': u'All My Friends - 99 Souls Remix',
  u'track_uri': u'spotify:track:1szfOuGK4RWHrsHdiMimym'},
 