In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import json
import re
import collections
import os
import datetime

# Preprocessing

In [2]:
mpd = []

In [3]:
import sys
import json
import codecs
import datetime


pretty = True
compact = False
cache = {}

def get_playlist(pid):
    if pid >=0 and pid < 1000000:
        low = 1000 * int(pid / 1000)
        high = low + 999
        offset = pid - low
        path = "./mpd.v1/data/mpd.slice." + str(low) + '-' + str(high) + ".json"
        if not path in cache:
            f = codecs.open(path, 'r', 'utf-8')
            js = f.read()
            f.close()
            playlist = json.loads(js)
            cache[path] = playlist

        playlist = cache[path]['playlists'][offset]
        return playlist

def get_playlists_in_range(start, end):
    pls = []
    try:
        istart = int(start)
        iend = int(end)
        if istart <= iend and istart >= 0 and iend <= 1000000:
            for pid in xrange(istart, iend):
                pls.append(get_playlist(pid))
    except:
        raise
        print "bad pid"
    return pls

In [4]:
total_playlists = 0
total_tracks = 0
tracks = set()
artists = set()
albums = set()
titles = set()
total_descriptions = 0
ntitles = set()

playlist_length_histogram = collections.Counter()
num_followers_histogram = collections.Counter()
artist_histogram = collections.Counter()
track_histogram = collections.Counter()


def process_playlist(playlist):
    global total_playlists, total_tracks, total_descriptions

    total_playlists += 1
    # print playlist['playlist_id'], playlist['name']

    if 'description' in playlist:
        total_descriptions += 1

    titles.add(playlist['name'])
    nname = normalize_name(playlist['name'])
    ntitles.add(nname)

    playlist_length_histogram[playlist['num_tracks']] += 1
    num_followers_histogram[playlist['num_followers']] += 1

    for track in playlist['tracks']:
        total_tracks += 1
        albums.add(track['album_uri'])
        tracks.add(track['track_uri'])
        artists.add(track['artist_uri'])

        full_name = track['track_name'] + " by " + track['artist_name']
        artist_histogram[track['artist_name']] += 1
        track_histogram[full_name] += 1

def normalize_name(name):
    name = name.lower()
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [5]:
def show_summary():
    print
    print "number of playlists", total_playlists
    print "number of tracks", total_tracks
    print "number of unique tracks", len(tracks)
    print "number of unique albums", len(albums)
    print "number of unique artists", len(artists)
    print "number of unique titles", len(titles)
    print "number of playlists with descriptions", total_descriptions
    print "number of unique normalized titles", len(ntitles)
    print "avg playlist length", float(total_tracks) / total_playlists
    
    print
    print "top tracks"
    for track, count in track_histogram.most_common(20):
        print "%7d %s" % (count, track)

    print
    print "top artists"
    for artist, count in artist_histogram.most_common(20):
        print "%7d %s" % (count, artist)

    print
    print "playlist length histogram"
    for length, count in playlist_length_histogram.most_common(20):
        print "%7d %d" % (count, length)

    print
    print "num followers histogram"
    for followers, count in num_followers_histogram.most_common(20):
        print "%7d %d" % (count, followers)


In [6]:
uri_track = {}
def process_tracks(playlist):
    global uri_track
    document = []
    
    for track in playlist['tracks']:
        uri = track['track_uri'].split(':')[2]
        document.append(uri)
        uri_track[uri] = track
    
    return document

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

l = get_playlists_in_range(0, 10000)
l = np.array(l)

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
l_train, l_test = train_test_split(l)

In [9]:
len(l_train)

7500

# Collaborative Filtering

In [None]:
import copy
#recommend for one list
#go through training set, calculate score, stored in list
#return top n ranked list sorted by score
def calc_score(playlist):
    dic = {}
    tracks = playlist['tracks']
    nname_test = []
    for track in tracks:
        nname = normalize_name(track['track_name'])
        nname_test.append(nname)
    total = len(nname_test)
    
    #for each playlist in training set, calculate score and add in result
    for pl in l_train:
        count = 0
        tracks = pl['tracks']
        for track in tracks:
            nname = normalize_name(track['track_name'])
            if nname in nname_test:
                count+=1
        #after finishing counting
        dic[pl['pid']] = float(count)/total
        
    return dic, nname_test

In [None]:
import operator
dic, names = calc_score(l_test[0])
result = sorted(dic.items(), key=operator.itemgetter(1))
result.reverse()

In [None]:
result[0]

In [None]:
pt = get_pid_tracks()

In [None]:
def recommend(pt, scores, names, count):
    result = []
    for score in scores:
        pid = score[0]
        tracks = pt[pid]
        for track in tracks:
            nname = normalize_name(track['track_name'])
            if nname not in names:
                result.append(track)
                if len(result) >= count:
                    return result
    return result

In [None]:
z = recommend(pt, result, names, 10)

In [None]:
for pl in l_test:
    dic, names = calc_score(pl)
    result = sorted(dic.items(), key=operator.itemgetter(1))
    result.reverse()
    print (recommend(pt, result, names, 5))
    print 

# Word2Vec
- uri_tracks

In [123]:
documents = []
for plist in l_train:
    documents.append(process_tracks(plist))

In [124]:
len(documents[0])

49

In [125]:
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [156]:
model = gensim.models.Word2Vec (documents, size=150, window=10, min_count=2, workers=10, sg=1)
model.train(documents,total_examples=len(documents),epochs=10)

2018-04-25 23:32:28,209 : INFO : collecting all words and their counts
2018-04-25 23:32:28,210 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-04-25 23:32:28,426 : INFO : collected 142168 word types from a corpus of 497890 raw words and 7500 sentences
2018-04-25 23:32:28,427 : INFO : Loading a fresh vocabulary
2018-04-25 23:32:28,593 : INFO : min_count=2 retains 49961 unique words (35% of original 142168, drops 92207)
2018-04-25 23:32:28,594 : INFO : min_count=2 leaves 405683 word corpus (81% of original 497890, drops 92207)
2018-04-25 23:32:28,726 : INFO : deleting the raw counts dictionary of 142168 items
2018-04-25 23:32:28,730 : INFO : sample=0.001 downsamples 0 most-common words
2018-04-25 23:32:28,731 : INFO : downsampling leaves estimated 405683 word corpus (100.0% of prior 405683)
2018-04-25 23:32:28,889 : INFO : estimated required memory for 49961 words and 150 dimensions: 84933700 bytes
2018-04-25 23:32:28,890 : INFO : resetting layer weights
2

2018-04-25 23:32:40,134 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-04-25 23:32:40,136 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-25 23:32:40,137 : INFO : EPOCH - 1 : training on 497890 raw words (405683 effective words) took 2.0s, 205796 effective words/s
2018-04-25 23:32:41,191 : INFO : EPOCH 2 - PROGRESS: at 48.03% examples, 190310 words/s, in_qsize 19, out_qsize 0
2018-04-25 23:32:41,733 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-04-25 23:32:41,771 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-04-25 23:32:41,786 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-04-25 23:32:41,816 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-04-25 23:32:41,842 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-04-25 23:32:41,876 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-04-25 23:32:41,907

2018-04-25 23:32:52,423 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-25 23:32:52,424 : INFO : EPOCH - 8 : training on 497890 raw words (405683 effective words) took 1.6s, 249198 effective words/s
2018-04-25 23:32:53,468 : INFO : EPOCH 9 - PROGRESS: at 46.33% examples, 178630 words/s, in_qsize 20, out_qsize 0
2018-04-25 23:32:54,029 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-04-25 23:32:54,058 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-04-25 23:32:54,064 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-04-25 23:32:54,076 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-04-25 23:32:54,106 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-04-25 23:32:54,124 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-04-25 23:32:54,165 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-04-25 23:32:54,172

(4056830, 4978900)

In [157]:
l_train[0]['tracks'][0]['track_uri'].split(":")[2]

u'6xZjmlwgIk0tMwo8buLuzH'

In [158]:
l_train[0]['tracks'][0]

{u'album_name': u'Back II Eden',
 u'album_uri': u'spotify:album:1fY5CnJPKGWE17nFwv5jTv',
 u'artist_name': u'Donald Lawrence & Company',
 u'artist_uri': u'spotify:artist:1haevnXTeIhKJSc04ly4as',
 u'duration_ms': 370480,
 u'pos': 0,
 u'track_name': u'Back II Eden',
 u'track_uri': u'spotify:track:6xZjmlwgIk0tMwo8buLuzH'}

In [159]:
w1 = l_train[0]['tracks'][0]['track_uri'].split(":")[2]
rec_list = model.wv.most_similar(positive=w1, topn = 5)

2018-04-25 23:33:00,428 : INFO : precomputing L2-norms of word weight vectors


In [160]:
len(rec_list)

5

In [161]:
print "Test: " + uri_track[w1]['track_name'] + " By " + uri_track[w1]['artist_name']
print
for l in rec_list:
    print uri_track[l[0]]['track_name'] + " By " + uri_track[l[0]]['artist_name']
    print "similarity: " + str(l[1])
    print
    

Test: Back II Eden By Donald Lawrence & Company

And I By Mary Mary
similarity: 0.998215258121

I Speak Life By Donald Lawrence
similarity: 0.997954785824

Fill Me Up By Casey J
similarity: 0.996827244759

Encourage Yourself By Sheri Jones-Moffett
similarity: 0.996734678745

War - Live By Charles Jenkins & Fellowship Chicago
similarity: 0.996490776539



# Evaluation
- Jaccard Similarity measures intersection over union

In [162]:
def eval(pl):
    
    predicts = []
    tracks = pl['tracks']
    #withheld tracks 50/50
    leng = len(tracks)
    t_train, t_test = train_test_split(tracks,test_size=0.5)
    #for the train 50%, predict x5
    for t in t_train:
        uri = t['track_uri'].split(":")[-1]
        if uri not in model.wv.vocab.keys():
            continue
        res = model.wv.most_similar(positive=uri, topn=5)
        for l in res:
            predicts.append(l[0])
    
    len_res = len(predicts)
    
    compare = []
    for t in t_test:
        compare.append(t['track_uri'].split(":")[-1])
    
    return predicts, compare

In [163]:
def jaccard_distance(l1, l2):
    intersection = 0
    for i in l1:
        if i in l2:
            intersection +=1
    union = (len(l1) + len(l2)) - intersection
    return float(intersection) / union

In [164]:
total = []
for plist in l_test:
    pred, comp = eval(plist)
    dist = jaccard_distance(pred, comp)
    print dist
    total.append(dist)

print "Average Jaccard Distance"
print sum(total) / float(len(total))

0.0617760617761
0.0151515151515
0.0425531914894
0.0
0.0
0.0
0.0720338983051
0.0
0.0796915167095
0.0
0.0
0.0
0.141592920354
0.0430379746835
0.0
0.0806451612903
0.0
0.201612903226
0.0102040816327
0.108786610879
0.0540540540541
0.0363636363636
0.00952380952381
0.0425531914894
0.0762711864407
0.0580046403712
0.0102040816327
0.00704225352113
0.061433447099
0.0298507462687
0.0157480314961
0.0
0.0530035335689
0.0
0.0848214285714
0.0740740740741
0.0239130434783
0.0
0.0
0.0526315789474
0.0316455696203
0.0505050505051
0.0
0.0393835616438
0.091649694501
0.0119760479042
0.0217391304348
0.0104166666667
0.0224215246637
0.21247113164
0.0
0.0522388059701
0.0188679245283
0.0177514792899
0.0205479452055
0.0
0.0141843971631
0.0503597122302
0.0
0.186991869919
0.0121951219512
0.0677506775068
0.0162162162162
0.0
0.0
0.0289855072464
0.0277777777778
0.0502092050209
0.0
0.0988023952096
0.0584415584416
0.0
0.043795620438
0.0353982300885
0.0
0.06
0.0153846153846
0.0227272727273
0.0
0.345609065156
0.0303797468354

0.0171428571429
0.0
0.535714285714
0.0543130990415
0.0214285714286
0.018691588785
0.0438596491228
0.0
0.05
0.0175438596491
0.0062893081761
0.064
0.0549450549451
0.0267857142857
0.0208333333333
0.0232558139535
0.0240963855422
0.0
0.0718562874251
0.0444444444444
0.0111111111111
0.0176470588235
0.0
0.162790697674
0.0947867298578
0.0289855072464
0.046875
0.0196078431373
0.0
0.0
0.0446428571429
0.0
0.0
0.0960698689956
0.0149253731343
0.0880503144654
0.0792079207921
0.0
0.025641025641
0.0
0.0114942528736
0.0
0.037037037037
0.0
0.0
0.0307692307692
0.0493827160494
0.0493827160494
0.0227790432802
0.0281690140845
0.0509554140127
0.0
0.0
0.030303030303
0.00865800865801
0.0583941605839
0.0
0.0281690140845
0.0285714285714
0.0
0.0354609929078
0.0
0.13353115727
0.148409893993
0.0
0.0
0.00740740740741
0.0
0.0434782608696
0.03125
0.0
0.0
0.0
0.0281690140845
0.032967032967
0.0116279069767
0.0173410404624
0.0537634408602
0.0277777777778
0.0
0.167785234899
0.0201342281879
0.167721518987
0.0838709677419
0.

0.00666666666667
0.0072202166065
0.029702970297
0.0392156862745
0.0714285714286
0.0526315789474
0.0102564102564
0.0
0.00925925925926
0.0
0.319444444444
0.0
0.164383561644
0.0829694323144
0.0760456273764
0.0418410041841
0.041564792176
0.0
0.0273037542662
0.195945945946
0.0
0.0
0.0
0.101063829787
0.127962085308
0.0438596491228
0.0368098159509
0.0863309352518
0.0703703703704
0.0
0.0196078431373
0.0503144654088
0.0
0.0384615384615
0.0425531914894
0.0
0.0536912751678
0.0277777777778
0.0689655172414
0.0059880239521
0.0140845070423
0.0
0.00813008130081
0.0254237288136
0.02
0.0
0.00662251655629
0.0
0.352941176471
0.0
0.00518134715026
0.0225563909774
0.0379746835443
0.0454545454545
0.0
0.00754716981132
0.0386178861789
0.0144927536232
0.0
0.0117647058824
0.0140845070423
0.0
0.0
0.015503875969
0.0551181102362
0.0
0.0347826086957
0.0705521472393
0.0
0.117870722433
0.0338983050847
0.0
0.0
0.0350318471338
0.0
0.142857142857
0.00877192982456
0.0
0.0
0.0
0.0
0.0
0.0357142857143
0.0514705882353
0.01818

0.0
0.0348837209302
0.0
0.0377358490566
0.0477707006369
0.0
0.101123595506
0.0555555555556
0.008
0.0409090909091
0.0215517241379
0.0298507462687
0.0975609756098
0.0
0.100217864924
0.0367892976589
0.0
0.198198198198
0.0333333333333
0.0393442622951
0.015873015873
0.0408719346049
0.0165289256198
0.015873015873
0.0118343195266
0.039603960396
0.0481927710843
0.0
0.0204081632653
0.0740740740741
0.128571428571
0.0120481927711
0.0
0.046875
0.025974025974
0.0243902439024
0.00657894736842
0.0191082802548
0.211206896552
0.00578034682081
0.0208333333333
0.0482954545455
0.027027027027
0.0226244343891
0.0
0.0
0.0909090909091
0.0054347826087
0.0
0.0711974110032
0.0
0.0
0.107843137255
0.0556962025316
0.0
0.0681003584229
0.0823529411765
0.0
0.0
0.0161290322581
0.0263653483992
0.0
0.0294117647059
0.0
0.10119047619
0.03125
0.0403225806452
0.0116279069767
0.047619047619
0.0448717948718
0.0386473429952
0.0213523131673
0.215384615385
0.1125
0.0208333333333
0.1
0.0
0.0188679245283
0.0337837837838
0.010416666