In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import json
import re
import collections
import os
import datetime

# Preprocessing

In [2]:
mpd = []

In [3]:
import sys
import json
import codecs
import datetime


pretty = True
compact = False
cache = {}

def get_playlist(pid):
    if pid >=0 and pid < 1000000:
        low = 1000 * int(pid / 1000)
        high = low + 999
        offset = pid - low
        path = "./mpd.v1/data/mpd.slice." + str(low) + '-' + str(high) + ".json"
        if not path in cache:
            f = codecs.open(path, 'r', 'utf-8')
            js = f.read()
            f.close()
            playlist = json.loads(js)
            cache[path] = playlist

        playlist = cache[path]['playlists'][offset]
        return playlist

def get_playlists_in_range(start, end):
    pls = []
    try:
        istart = int(start)
        iend = int(end)
        if istart <= iend and istart >= 0 and iend <= 1000000:
            for pid in xrange(istart, iend):
                pls.append(get_playlist(pid))
    except:
        raise
        print "bad pid"
    return pls

In [4]:
total_playlists = 0
total_tracks = 0
tracks = set()
artists = set()
albums = set()
titles = set()
total_descriptions = 0
ntitles = set()

playlist_length_histogram = collections.Counter()
num_followers_histogram = collections.Counter()
artist_histogram = collections.Counter()
track_histogram = collections.Counter()


def process_playlist(playlist):
    global total_playlists, total_tracks, total_descriptions

    total_playlists += 1
    # print playlist['playlist_id'], playlist['name']

    if 'description' in playlist:
        total_descriptions += 1

    titles.add(playlist['name'])
    nname = normalize_name(playlist['name'])
    ntitles.add(nname)

    playlist_length_histogram[playlist['num_tracks']] += 1
    num_followers_histogram[playlist['num_followers']] += 1

    for track in playlist['tracks']:
        total_tracks += 1
        albums.add(track['album_uri'])
        tracks.add(track['track_uri'])
        artists.add(track['artist_uri'])

        full_name = track['track_name'] + " by " + track['artist_name']
        artist_histogram[track['artist_name']] += 1
        track_histogram[full_name] += 1

def normalize_name(name):
    name = name.lower()
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [5]:
def show_summary():
    print
    print "number of playlists", total_playlists
    print "number of tracks", total_tracks
    print "number of unique tracks", len(tracks)
    print "number of unique albums", len(albums)
    print "number of unique artists", len(artists)
    print "number of unique titles", len(titles)
    print "number of playlists with descriptions", total_descriptions
    print "number of unique normalized titles", len(ntitles)
    print "avg playlist length", float(total_tracks) / total_playlists
    
    print
    print "top tracks"
    for track, count in track_histogram.most_common(20):
        print "%7d %s" % (count, track)

    print
    print "top artists"
    for artist, count in artist_histogram.most_common(20):
        print "%7d %s" % (count, artist)

    print
    print "playlist length histogram"
    for length, count in playlist_length_histogram.most_common(20):
        print "%7d %d" % (count, length)

    print
    print "num followers histogram"
    for followers, count in num_followers_histogram.most_common(20):
        print "%7d %d" % (count, followers)


In [6]:
uri_track = {}
def process_tracks(playlist):
    global uri_track
    document = []
    
    for track in playlist['tracks']:
        uri = track['track_uri'].split(':')[2]
        document.append(uri)
        uri_track[uri] = track
    
    return document

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

l = get_playlists_in_range(0, 50000)
l = np.array(l)

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
l_train, l_test = train_test_split(l)

In [12]:
len(l_train)

37500

# Collaborative Filtering

In [14]:
import copy
#recommend for one list
#go through training set, calculate score, stored in list
#return top n ranked list sorted by score
def calc_score(playlist):
    dic = {}
    tracks = playlist['tracks']
    nname_test = []
    for track in tracks:
        nname = normalize_name(track['track_name'])
        nname_test.append(nname)
    total = len(nname_test)
    
    #for each playlist in training set, calculate score and add in result
    for pl in l_train:
        count = 0
        tracks = pl['tracks']
        for track in tracks:
            nname = normalize_name(track['track_name'])
            if nname in nname_test:
                count+=1
        #after finishing counting
        dic[pl['pid']] = float(count)/total
        
    return dic, nname_test

In [15]:
import operator
dic, names = calc_score(l_test[0])
result = sorted(dic.items(), key=operator.itemgetter(1))
result.reverse()

In [16]:
result[0]

(3382, 0.4074074074074074)

In [None]:
def recommend(pt, scores, names, count):
    result = []
    for score in scores:
        pid = score[0]
        tracks = pt[pid]
        for track in tracks:
            nname = normalize_name(track['track_name'])
            if nname not in names:
                result.append(track)
                if len(result) >= count:
                    return result
    return result

In [None]:
z = recommend(pt, result, names, 10)

In [None]:
for pl in l_test:
    dic, names = calc_score(pl)
    result = sorted(dic.items(), key=operator.itemgetter(1))
    result.reverse()
    print (recommend(pt, result, names, 5))
    print 

# Word2Vec
- uri_tracks

In [16]:
documents = []
for plist in l_train:
    documents.append(process_tracks(plist))

In [17]:
len(documents[0])

21

In [18]:
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [20]:
model = gensim.models.Word2Vec (documents, size=40, window=8, min_count=4, sg=1)
model.train(documents,total_examples=len(documents),epochs=10)

2018-05-06 14:55:47,952 : INFO : collecting all words and their counts
2018-05-06 14:55:47,954 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-05-06 14:55:48,212 : INFO : PROGRESS: at sentence #10000, processed 658827 words, keeping 171254 word types
2018-05-06 14:55:48,497 : INFO : PROGRESS: at sentence #20000, processed 1327470 words, keeping 264797 word types
2018-05-06 14:55:48,770 : INFO : PROGRESS: at sentence #30000, processed 2001779 words, keeping 339603 word types
2018-05-06 14:55:48,986 : INFO : collected 387543 word types from a corpus of 2507932 raw words and 37500 sentences
2018-05-06 14:55:48,987 : INFO : Loading a fresh vocabulary
2018-05-06 14:55:49,896 : INFO : min_count=4 retains 80237 unique words (20% of original 387543, drops 307306)
2018-05-06 14:55:49,897 : INFO : min_count=4 leaves 2093859 word corpus (83% of original 2507932, drops 414073)
2018-05-06 14:55:50,158 : INFO : deleting the raw counts dictionary of 387543 items
2018-0

2018-05-06 14:56:32,360 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-06 14:56:32,360 : INFO : EPOCH - 1 : training on 2507932 raw words (2093859 effective words) took 8.0s, 262278 effective words/s
2018-05-06 14:56:33,384 : INFO : EPOCH 2 - PROGRESS: at 11.89% examples, 235022 words/s, in_qsize 6, out_qsize 0
2018-05-06 14:56:34,400 : INFO : EPOCH 2 - PROGRESS: at 23.77% examples, 239894 words/s, in_qsize 5, out_qsize 0
2018-05-06 14:56:35,434 : INFO : EPOCH 2 - PROGRESS: at 36.54% examples, 245943 words/s, in_qsize 5, out_qsize 0
2018-05-06 14:56:36,450 : INFO : EPOCH 2 - PROGRESS: at 48.42% examples, 245746 words/s, in_qsize 5, out_qsize 0
2018-05-06 14:56:37,462 : INFO : EPOCH 2 - PROGRESS: at 58.73% examples, 239768 words/s, in_qsize 6, out_qsize 0
2018-05-06 14:56:38,506 : INFO : EPOCH 2 - PROGRESS: at 67.66% examples, 229634 words/s, in_qsize 6, out_qsize 0
2018-05-06 14:56:39,592 : INFO : EPOCH 2 - PROGRESS: at 75.87% examples, 219029 words/s, in_qs

2018-05-06 14:57:27,423 : INFO : EPOCH 8 - PROGRESS: at 11.89% examples, 237125 words/s, in_qsize 6, out_qsize 1
2018-05-06 14:57:28,438 : INFO : EPOCH 8 - PROGRESS: at 23.37% examples, 237112 words/s, in_qsize 5, out_qsize 0
2018-05-06 14:57:29,488 : INFO : EPOCH 8 - PROGRESS: at 35.74% examples, 240593 words/s, in_qsize 6, out_qsize 1
2018-05-06 14:57:30,525 : INFO : EPOCH 8 - PROGRESS: at 47.61% examples, 239938 words/s, in_qsize 6, out_qsize 0
2018-05-06 14:57:31,545 : INFO : EPOCH 8 - PROGRESS: at 59.57% examples, 240796 words/s, in_qsize 6, out_qsize 0
2018-05-06 14:57:32,616 : INFO : EPOCH 8 - PROGRESS: at 72.32% examples, 243159 words/s, in_qsize 6, out_qsize 0
2018-05-06 14:57:33,617 : INFO : EPOCH 8 - PROGRESS: at 85.05% examples, 246301 words/s, in_qsize 6, out_qsize 0
2018-05-06 14:57:34,678 : INFO : EPOCH 8 - PROGRESS: at 97.05% examples, 245864 words/s, in_qsize 6, out_qsize 0
2018-05-06 14:57:34,859 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-

(20938590, 25079320)

In [21]:
l_train[0]['tracks'][0]['track_uri'].split(":")[2]

u'17VegeBoHvMlIByrdu64KR'

In [22]:
l_train[0]['tracks'][0]

{u'album_name': u'Elton John',
 u'album_uri': u'spotify:album:7dtLYwLOdYQa2S8Vjeuxci',
 u'artist_name': u'Elton John',
 u'artist_uri': u'spotify:artist:3PhoLpVuITZKcymswpck5b',
 u'duration_ms': 244226,
 u'pos': 0,
 u'track_name': u'Your Song',
 u'track_uri': u'spotify:track:17VegeBoHvMlIByrdu64KR'}

In [23]:
w1 = l_train[0]['tracks'][0]['track_uri'].split(":")[2]
rec_list = model.wv.most_similar(positive=w1, topn = 5)

2018-05-06 14:59:03,546 : INFO : precomputing L2-norms of word weight vectors


In [24]:
len(rec_list)

5

In [25]:
print "Test: " + uri_track[w1]['track_name'] + " By " + uri_track[w1]['artist_name']
print
for l in rec_list:
    print uri_track[l[0]]['track_name'] + " By " + uri_track[l[0]]['artist_name']
    print "similarity: " + str(l[1])
    print
    

Test: Your Song By Elton John

Tiny Dancer By Elton John
similarity: 0.911042332649

Goodbye Yellow Brick Road - Remastered 2014 By Elton John
similarity: 0.904203534126

Rocket Man (I Think It's Going To Be A Long Long Time) By Elton John
similarity: 0.892361581326

Candle In The Wind - Remastered 2014 By Elton John
similarity: 0.880346298218

Piano Man By Billy Joel
similarity: 0.869542479515



# Evaluation
- Jaccard Similarity measures intersection over union

In [26]:
def eval(pl):
    
    predicts = []
    tracks = pl['tracks']
    #withheld tracks 50/50
    leng = len(tracks)
    t_train, t_test = train_test_split(tracks,test_size=0.5)
    #for the train 50%, predict x5
    for t in t_train:
        uri = t['track_uri'].split(":")[-1]
        if uri not in model.wv.vocab.keys():
            continue
        res = model.wv.most_similar(positive=uri, topn=5)
        for l in res:
            predicts.append(l[0])
    
    len_res = len(predicts)
    
    compare = []
    for t in t_test:
        compare.append(t['track_uri'].split(":")[-1])
    
    return predicts, compare

In [27]:
def jaccard_distance(l1, l2):
    intersection = 0
    for i in l1:
        if i in l2:
            intersection +=1
    union = (len(l1) + len(l2)) - intersection
    return float(intersection) / union

In [29]:
total = []
for plist in l_test:
    pred, comp = eval(plist)
    dist = jaccard_distance(pred, comp)
    total.append(dist)

print "Average Jaccard Distance"
print sum(total) / float(len(total))

Average Jaccard Distance
0.0578745372613


# Try it out for yourself!

In [88]:
track=[]
target = ["Moon River", "Molly", "Fly me to the moon", "Perfect"]
for i in uri_track:
    if uri_track[i]['track_name'] in target:
        track.append(uri_track[i])

In [89]:
track

[]

In [84]:
damn = []
for i in track:
    w1 = i['track_uri'].split(":")[-1]
    if w1 not in model.wv.vocab.keys():
            continue
    res = model.wv.most_similar(positive=w1)
    print "input: " + i['track_name'] + " By " + i["artist_name"]
    print
    for l in res:
        print uri_track[l[0]]['track_name'] + " By " + uri_track[l[0]]['artist_name']
        print 
    print "----------------"

input: Perfect By Simple Plan

7 Minutes In Heaven (Atavan Halen) By Fall Out Boy

Addicted By Simple Plan

One Day By Simple Plan

Hum Hallelujah By Fall Out Boy

Thank You By Simple Plan

Grand Theft Autumn/Where Is Your Boy By Fall Out Boy

Kelsey By Metro Station

That's What You Get By Paramore

Kids In Love By Mayday Parade

Nice Guys Finish Last By Cobra Starship

----------------
input: Perfect By Ed Sheeran

Happier By Ed Sheeran

Dive By Ed Sheeran

Hearts Don't Break Around Here By Ed Sheeran

Supermarket Flowers By Ed Sheeran

Save Myself By Ed Sheeran

What Do I Know? By Ed Sheeran

Idea of Her By Whitney Woerz

Roses By MacKenzie Bourg

Strip That Down - Acoustic By Liam Payne

Ex By James TW

----------------
input: Perfect By One Direction

Cough Syrup - The Voice Performance By Matthew Schuler

Olivia By One Direction

End of the Day By One Direction

Drag Me Down By One Direction

Wolves By One Direction

Infinity By One Direction

Walking in the Wind By One Direction