In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import json
import re
import collections
import os
import datetime

# Preprocessing

In [2]:
mpd = []

In [3]:
import sys
import json
import codecs
import datetime


pretty = True
compact = False
cache = {}

def get_playlist(pid):
    if pid >=0 and pid < 1000000:
        low = 1000 * int(pid / 1000)
        high = low + 999
        offset = pid - low
        path = "./mpd.v1/data/mpd.slice." + str(low) + '-' + str(high) + ".json"
        if not path in cache:
            f = codecs.open(path, 'r', 'utf-8')
            js = f.read()
            f.close()
            playlist = json.loads(js)
            cache[path] = playlist

        playlist = cache[path]['playlists'][offset]
        return playlist

def get_playlists_in_range(start, end):
    pls = []
    try:
        istart = int(start)
        iend = int(end)
        if istart <= iend and istart >= 0 and iend <= 1000000:
            for pid in xrange(istart, iend):
                pls.append(get_playlist(pid))
    except:
        raise
        print "bad pid"
    return pls

In [4]:
total_playlists = 0
total_tracks = 0
tracks = set()
artists = set()
albums = set()
titles = set()
total_descriptions = 0
ntitles = set()

playlist_length_histogram = collections.Counter()
num_followers_histogram = collections.Counter()
artist_histogram = collections.Counter()
track_histogram = collections.Counter()


def process_playlist(playlist):
    global total_playlists, total_tracks, total_descriptions

    total_playlists += 1
    # print playlist['playlist_id'], playlist['name']

    if 'description' in playlist:
        total_descriptions += 1

    titles.add(playlist['name'])
    nname = normalize_name(playlist['name'])
    ntitles.add(nname)

    playlist_length_histogram[playlist['num_tracks']] += 1
    num_followers_histogram[playlist['num_followers']] += 1

    for track in playlist['tracks']:
        total_tracks += 1
        albums.add(track['album_uri'])
        tracks.add(track['track_uri'])
        artists.add(track['artist_uri'])

        full_name = track['track_name'] + " by " + track['artist_name']
        artist_histogram[track['artist_name']] += 1
        track_histogram[full_name] += 1

def normalize_name(name):
    name = name.lower()
    name = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name

In [5]:
def show_summary():
    print
    print "number of playlists", total_playlists
    print "number of tracks", total_tracks
    print "number of unique tracks", len(tracks)
    print "number of unique albums", len(albums)
    print "number of unique artists", len(artists)
    print "number of unique titles", len(titles)
    print "number of playlists with descriptions", total_descriptions
    print "number of unique normalized titles", len(ntitles)
    print "avg playlist length", float(total_tracks) / total_playlists
    
    print
    print "top tracks"
    for track, count in track_histogram.most_common(20):
        print "%7d %s" % (count, track)

    print
    print "top artists"
    for artist, count in artist_histogram.most_common(20):
        print "%7d %s" % (count, artist)

    print
    print "playlist length histogram"
    for length, count in playlist_length_histogram.most_common(20):
        print "%7d %d" % (count, length)

    print
    print "num followers histogram"
    for followers, count in num_followers_histogram.most_common(20):
        print "%7d %d" % (count, followers)


In [6]:
uri_track = {}
def process_tracks(playlist):
    global uri_track
    document = []
    
    for track in playlist['tracks']:
        uri = track['track_uri'].split(':')[2]
        document.append(uri)
        uri_track[uri] = track
    
    return document

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

l = get_playlists_in_range(0, 10000)
l = np.array(l)

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split
l_train, l_test = train_test_split(l)

In [9]:
len(l_train)

7500

# Collaborative Filtering

In [None]:
import copy
#recommend for one list
#go through training set, calculate score, stored in list
#return top n ranked list sorted by score
def calc_score(playlist):
    dic = {}
    tracks = playlist['tracks']
    nname_test = []
    for track in tracks:
        nname = normalize_name(track['track_name'])
        nname_test.append(nname)
    total = len(nname_test)
    
    #for each playlist in training set, calculate score and add in result
    for pl in l_train:
        count = 0
        tracks = pl['tracks']
        for track in tracks:
            nname = normalize_name(track['track_name'])
            if nname in nname_test:
                count+=1
        #after finishing counting
        dic[pl['pid']] = float(count)/total
        
    return dic, nname_test

In [None]:
import operator
dic, names = calc_score(l_test[0])
result = sorted(dic.items(), key=operator.itemgetter(1))
result.reverse()

In [None]:
result[0]

In [None]:
pt = get_pid_tracks()

In [None]:
def recommend(pt, scores, names, count):
    result = []
    for score in scores:
        pid = score[0]
        tracks = pt[pid]
        for track in tracks:
            nname = normalize_name(track['track_name'])
            if nname not in names:
                result.append(track)
                if len(result) >= count:
                    return result
    return result

In [None]:
z = recommend(pt, result, names, 10)

In [None]:
for pl in l_test:
    dic, names = calc_score(pl)
    result = sorted(dic.items(), key=operator.itemgetter(1))
    result.reverse()
    print (recommend(pt, result, names, 5))
    print 

# Word2Vec
- uri_tracks

In [12]:
documents = []
for plist in l_train:
    documents.append(process_tracks(plist))

In [13]:
len(documents[0])

134

In [14]:
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [15]:
model = gensim.models.Word2Vec (documents, size=150, window=10, min_count=2, workers=10)
model.train(documents,total_examples=len(documents),epochs=10)

2018-04-25 14:19:24,827 : INFO : collecting all words and their counts
2018-04-25 14:19:24,828 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-04-25 14:19:25,035 : INFO : collected 140941 word types from a corpus of 500151 raw words and 7500 sentences
2018-04-25 14:19:25,036 : INFO : Loading a fresh vocabulary
2018-04-25 14:19:25,354 : INFO : min_count=2 retains 50077 unique words (35% of original 140941, drops 90864)
2018-04-25 14:19:25,355 : INFO : min_count=2 leaves 409287 word corpus (81% of original 500151, drops 90864)
2018-04-25 14:19:25,508 : INFO : deleting the raw counts dictionary of 140941 items
2018-04-25 14:19:25,512 : INFO : sample=0.001 downsamples 0 most-common words
2018-04-25 14:19:25,513 : INFO : downsampling leaves estimated 409287 word corpus (100.0% of prior 409287)
2018-04-25 14:19:25,664 : INFO : estimated required memory for 50077 words and 150 dimensions: 85130900 bytes
2018-04-25 14:19:25,665 : INFO : resetting layer weights
2

2018-04-25 14:19:29,395 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-04-25 14:19:29,397 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-04-25 14:19:29,398 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-04-25 14:19:29,400 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-04-25 14:19:29,409 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-04-25 14:19:29,413 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-25 14:19:29,415 : INFO : EPOCH - 2 : training on 500151 raw words (409287 effective words) took 0.4s, 960544 effective words/s
2018-04-25 14:19:29,782 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-04-25 14:19:29,802 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-04-25 14:19:29,822 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-04-25 14:19:29,824 : INFO : worker thread

2018-04-25 14:19:32,950 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-04-25 14:19:32,951 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-04-25 14:19:32,953 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-04-25 14:19:32,954 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-04-25 14:19:32,956 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-04-25 14:19:32,957 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-04-25 14:19:32,972 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-04-25 14:19:32,973 : INFO : EPOCH - 10 : training on 500151 raw words (409287 effective words) took 0.4s, 942163 effective words/s
2018-04-25 14:19:32,976 : INFO : training on a 5001510 raw words (4092870 effective words) took 4.4s, 924176 effective words/s


(4092870, 5001510)

In [16]:
l_train[0]['tracks'][0]['track_uri'].split(":")[2]

u'26rdOwwjC2UnweK3xeS58u'

In [17]:
l_train[0]['tracks'][0]

{u'album_name': u'MY HOUSE',
 u'album_uri': u'spotify:album:5lkNnHVlnCCCV304t89wOH',
 u'artist_name': u'Flo Rida',
 u'artist_uri': u'spotify:artist:0jnsk9HBra6NMjO2oANoPY',
 u'duration_ms': 190185,
 u'pos': 0,
 u'track_name': u'GDFR (feat. Sage The Gemini & Lookas)',
 u'track_uri': u'spotify:track:26rdOwwjC2UnweK3xeS58u'}

In [42]:
w1 = l_train[0]['tracks'][0]['track_uri'].split(":")[2]
rec_list = model.wv.most_similar(positive=w1, topn = 5)

In [43]:
len(rec_list)

5

In [44]:
print "Test: " + uri_track[w1]['track_name'] + " By " + uri_track[w1]['artist_name']
print
for l in rec_list:
    print uri_track[l[0]]['track_name'] + " By " + uri_track[l[0]]['artist_name']
    print "similarity: " + str(l[1])
    print
    

Test: GDFR (feat. Sage The Gemini & Lookas) By Flo Rida

Shutterbugg By Big Boi
similarity: 0.984748542309

Watch Me (Whip / Nae Nae) By Silentó
similarity: 0.979164481163

Tribe (feat. Jesse Boykins III) By Theophilus London
similarity: 0.976357161999

Drunk in Love By Beyoncé
similarity: 0.974434196949

I Don't Mind By Usher
similarity: 0.974072992802



# Evaluation
- R-precision is the number of retrieved relevant tracks divided by the number of known relevant tracks (i.e., the number of withheld tracks):
- Jaccard Similarity measures intersection over union

In [76]:
len(l_test)
scores=[]
count = 0
for plist in l_test:
    tracks = plist['tracks']
    #withheld tracks
    t_train, t_test = train_test_split(tracks)
    #predict
    predicts = []
    for t in t_train:
        uri = t['track_uri'].split(":")[-1]
        if uri not in model.wv.vocab.keys():
            continue
        res = model.wv.most_similar(positive=uri)
        count+=1
    #calculate Jaccard Sim & R-precision
print count

89787
