###Imports, File Paths, & Constants

In [260]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import urllib as urllib
import json
import math
import time
import csv
from datetime import datetime
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline
pd.set_option.econding = 'utf-8'

In [4]:
#FILE PATH
fp = #file path where you are storing this repo. User data is the /data folder
#FILES
#fl = "\users-full.csv"
#fl = "\users-test.csv"
fl = "\users-35k.csv"
cache = #file path to cache file after requesting all data through API

In [291]:
users_file = pd.read_csv(fp+fl)['User']

**Note: Request an API key from Last.fm at http://www.last.fm/api

In [5]:
#CONSTANTS
API_KEY  = #REQUEST API KEY FROM LAST.FM
API_SECRET = #REQUEST API SECRET FROM LAST.FM
ARTIST_LIMIT = 50
REPORT_SIZE = 50

###Last.fm API Functions. (To do: create a global function using *awargs)

In [6]:
def usergetTopArtists(user, 
                      method="user.getTopArtists", 
                      period="overall", 
                      limit=ARTIST_LIMIT, 
                      page=1, 
                      key=API_KEY, 
                      format_="json"):
    base = "http://ws.audioscrobbler.com/2.0/?"
    mthd = "method={:s}".format(method)
    prd = "period={:s}".format(period)
    lim = "limit={0}".format(limit)
    pg = "page={0}".format(page)
    usr = "user={:s}".format(user)
    apikey = "api_key={:s}".format(key)
    frmt = "format={:s}".format(format_)
    return base+mthd+"&"+usr+"&"+prd+"&"+lim+"&"+pg+"&"+apikey+"&"+frmt

def usergetInfo(user,
                method='user.getInfo',
                key=API_KEY,
                format_="json"):
#create API call url for the a specified API method
    base = "http://ws.audioscrobbler.com/2.0/?"
    mthd = "method={:s}".format(method)
    usr = "user={:s}".format(user)
    apikey = "api_key={:s}".format(key)
    frmt = "format={:s}".format(format_)
    return base+mthd+"&"+usr+"&"+apikey+"&"+frmt

def usergetLovedTracks(user, 
                      method='user.getLovedTracks', 
                      key=API_KEY, 
                      format_="json",
                      page=1,
                      fest=1, 
                      limit=ARTIST_LIMIT):
    base = "http://ws.audioscrobbler.com/2.0/?"
    mthd = "method={:s}".format(method)
    usr = "user={:s}".format(user)
    pg = "page={0}".format(page)
    lim = "limit={0}".format(limit)
    apikey = "api_key={:s}".format(key)
    frmt = "format={:s}".format(format_)
    return base+mthd+"&"+usr+"&"+lim+"&"+apikey+"&"+frmt

def usergetPastEvents(user, 
                      method='user.getPastEvents', 
                      key=API_KEY, 
                      format_="json", 
                      fest=1, 
                      limit=ARTIST_LIMIT):
    base = "http://ws.audioscrobbler.com/2.0/?"
    mthd = "method={:s}".format(method)
    usr = "user={:s}".format(user)
    lim = "limit={0}".format(limit)
    apikey = "api_key={:s}".format(key)
    frmt = "format={:s}".format(format_)
    return base+mthd+"&"+usr+"&"+lim+"&"+apikey+"&"+frmt

def eventgetInfo(event, 
                method='event.getinfo', 
                key=API_KEY, 
                format_="json"):
    base = "http://ws.audioscrobbler.com/2.0/?"
    mthd = "method={:s}".format(method)
    evnt = "event={0}".format(event)
    apikey = "api_key={:s}".format(key)
    frmt = "format={:s}".format(format_)
    return base+mthd+"&"+evnt+"&"+apikey+"&"+frmt

def eventgetAttendees(event, 
                      method='event.getattendees', 
                      key=API_KEY, 
                      format_="json", 
                      fest=1, 
                      limit=ARTIST_LIMIT):
    base = "http://ws.audioscrobbler.com/2.0/?"
    mthd = "method={:s}".format(method)
    evnt = "event={0}".format(event)
    lim = "limit={0}".format(limit)
    apikey = "api_key={:s}".format(key)
    frmt = "format={:s}".format(format_)
    return base+mthd+"&"+evnt+"&"+lim+"&"+apikey+"&"+frmt

def geogetEvents(location='Kansas City', 
                method='geo.getEvents',
                distance=3000,
                key=API_KEY,
                format_="json", 
                fest=1,
                page=1,
                limit=ARTIST_LIMIT):
    base = "http://ws.audioscrobbler.com/2.0/?"
    mthd = "method={:s}".format(method)
    loc = "location={0}".format(location)
    dis = "distance={0}".format(distance)
    festonly = "festivalsonly={0}".format(fest)
    pg = "page={0}".format(page)
    lim = "limit={0}".format(limit)
    apikey = "api_key={:s}".format(key)
    frmt = "format={:s}".format(format_)
    return base+mthd+"&"+loc+"&"+dis+"&"+festonly+"&"+pg+"&"+lim+"&"+apikey+"&"+frmt

def geogetEventsLatLong(latitude,
                        longitude,
                        method='geo.getEvents',
                        distance=50,
                        key=API_KEY,
                        format_="json", 
                        fest=1,
                        page=1,
                        limit=ARTIST_LIMIT):
    base = "http://ws.audioscrobbler.com/2.0/?"
    mthd = "method={:s}".format(method)
    longi = "long={0}".format(longitude)
    lat = "lat={0}".format(latitude)
    dis = "distance={0}".format(distance)
    festonly = "festivalsonly={0}".format(fest)
    pg = "page={0}".format(page)
    lim = "limit={0}".format(limit)
    apikey = "api_key={:s}".format(key)
    frmt = "format={:s}".format(format_)
    return base+mthd+"&"+longi+"&"+lat+"&"+dis+"&"+festonly+"&"+pg+"&"+lim+"&"+apikey+"&"+frmt

def artistgetInfo (artist,
                   method='artist.getInfo',
                   autocorrect=1,
                   key=API_KEY,
                   format_="json"):
    base = "http://ws.audioscrobbler.com/2.0/?"
    mthd = "method={:s}".format(method)
    artist = "artist={:s}".format(artist)
    ac = "autocorrect={0}".format(autocorrect)
    apikey = "api_key={:s}".format(key)
    frmt = "format={:s}".format(format_)
    return base+mthd+"&"+artist+"&"+ac+"&"+apikey+"&"+frmt

def apiurl_tuples(method, list_of_users, period="overall", limit=1000, key=API_KEY, format_="json"):
#create tuples in the form (user, api_url)
    lookup = []
    for i, user in enumerate(list_of_users):
        qry_str = APIURL_singleuser(method, user, period, limit, key, format_)
        lookup.append((user, qry_str))
    return lookup

def api_call(url):
    sock = urllib.urlopen(url)
    content = sock.read() 
    sock.close()
    return content

In [7]:
#create Timer class to use for benchmarking 
class Timer(object):
    def __init__(self, verbose=False):
        self.verbose = verbose

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *args):
        self.end = time.time()
        self.secs = self.end - self.start
        self.msecs = self.secs * 1000  # millisecs
        if self.verbose:
            print 'elapsed time: %f ms' % self.msecs

##Load user listening data via API results.

**WARNING: Very slow! This takes approximately 1 hour per 5k users**

In [7]:
#true_users is a list of only users with listening history
true_users = []
#bad_users is a list of users with no listen history
bad_users = []

#benchmark time
with Timer() as t:

    for i, user in enumerate(users):

        #indiv_user_data is a dictionary of each user's top 50 artist play history in rank: artist pairs
        indiv_user_data = {}
        
        try:
            #load JSON API results for each user
            wrk = json.loads(api_call(usergetTopArtists(user)))
            if int(wrk['topartists']['@attr']['total']) != 0:
                true_users.append(wrk)
            else:
                bad_users.append(user)
                continue

        except:
            continue
        
        #give some user feedback on how much data is loaded
        if i % REPORT_SIZE == 0:
            print '{0} of {1} USERS LOADED'.format(i, len(users))
            time.sleep(1)

print "==>elapsed time: %s s" % t.secs

#cache data locally & create dataframe
f = pd.DataFrame.from_dict(true_users)
f.to_json('users-35k-with-listen-history.json')
true_users

0 of 34999 USERS LOADED
100 of 34999 USERS LOADED
150 of 34999 USERS LOADED
200 of 34999 USERS LOADED
250 of 34999 USERS LOADED
300 of 34999 USERS LOADED
400 of 34999 USERS LOADED
450 of 34999 USERS LOADED
500 of 34999 USERS LOADED
600 of 34999 USERS LOADED
650 of 34999 USERS LOADED
750 of 34999 USERS LOADED
800 of 34999 USERS LOADED
850 of 34999 USERS LOADED
900 of 34999 USERS LOADED
1100 of 34999 USERS LOADED
1150 of 34999 USERS LOADED
1200 of 34999 USERS LOADED
1250 of 34999 USERS LOADED
1400 of 34999 USERS LOADED
1450 of 34999 USERS LOADED
1500 of 34999 USERS LOADED
1600 of 34999 USERS LOADED
1700 of 34999 USERS LOADED
1750 of 34999 USERS LOADED
1800 of 34999 USERS LOADED
1850 of 34999 USERS LOADED
1900 of 34999 USERS LOADED
2000 of 34999 USERS LOADED
2050 of 34999 USERS LOADED
2100 of 34999 USERS LOADED
2200 of 34999 USERS LOADED
2250 of 34999 USERS LOADED
2350 of 34999 USERS LOADED
2500 of 34999 USERS LOADED
2550 of 34999 USERS LOADED
2600 of 34999 USERS LOADED
2800 of 34999 USER

AttributeError: type object 'DataFrame' has no attribute 'from_list'

###Load Cache instead of pulling data via api

In [8]:
f = pd.read_json(cache)

###Create a dict of artists with the mbid used as a unique key (some artists may have similar names)
MBID's stand for MusicBrainz Identifier, a unique ID for music identification

See: 
https://musicbrainz.org/doc/MusicBrainz_Identifier

In [173]:
#mbidartist is a dict with {'mbid': 'artist'} pairs
mbidartist = {}
dupes = []

#loop through each user's top artist plays and save each artist. Duplicates are recorded in 
for i in f['topartists'].index:
    
    try:       
        #first access each user's json file
        wrk_dict = dict(f.ix[i]['topartists'])
        
        min_artists = min(ARTIST_LIMIT, int(wrk_dict['@attr']['total']))
        
        #JSON is not uniform, users with 1 artist have different structure
        if min_artists == 1:
            
            key = str(wrk_dict['artist']['mbid'])
            val = wrk_dict['artist']['name']

            #encode the strings in unicode as some artist names have non-ascii characters
            val = str(val.encode('utf-8'))
            
            if key == '':
                #create a mbid = the artist's name if mbid is missing
                key = val

            if mbidartist.has_key(key) and mbidartist[key] == val:
                dupes.append(val)
            else:
                mbidartist.__setitem__(key, val)
        else:
            
            #loop over each possible artist and add to the mbidartist dict
            for x in xrange(min_artists):
                key = str(wrk_dict['artist'][x]['mbid'])
                val = wrk_dict['artist'][x]['name']
                
                #encode the strings in unicode as some artist names have non-ascii characters
                val = str(val.encode('utf-8'))

                if key == '':
                    #create a mbid = the artist's name if mbid is missing
                    key = val

                if mbidartist.has_key(key) and mbidartist[key] == val:
                    dupes.append(val)
                    #continue
                else:
                    mbidartist.__setitem__(key, val)
                
    except:
        continue

    if i % REPORT_SIZE == 0:
        #give encouragement to the user that something is happening
        print '.',
        time.sleep(1)

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .


In [257]:
dupes

['Norah Jones',
 'Coldplay',
 'The Fray',
 'Black Eyed Peas',
 'Fall Out Boy',
 'Taylor Swift',
 'Avril Lavigne',
 'Lady Gaga',
 'Exile',
 'My Chemical Romance',
 'Christina Aguilera',
 'Bee Gees',
 'C\xc3\xa9line Dion',
 'Chris Brown',
 'Beyonc\xc3\xa9',
 'Justin Bieber',
 'Billie Holiday',
 'Black Eyed Peas',
 'The Lumineers',
 'Jay-Z',
 'Arctic Monkeys',
 'Radiohead',
 'Kings of Leon',
 'Coldplay',
 'The National',
 'Vampire Weekend',
 'Fleet Foxes',
 'Feist',
 'The Black Keys',
 'Sigur R\xc3\xb3s',
 'The Beatles',
 'Drake',
 'Chris Brown',
 'Elvis Presley',
 'Britney Spears',
 'Mariah Carey',
 'Bon Jovi',
 'Muse',
 'Kings of Leon',
 'Crystal Castles',
 'The Killers',
 'Eels',
 'Red Hot Chili Peppers',
 'Linkin Park',
 'Regina Spektor',
 'Elvis Presley',
 'Norah Jones',
 'The Cure',
 'The Beatles',
 'Jack Johnson',
 'Wilco',
 'Coldplay',
 'Kings of Leon',
 'The Kooks',
 'Arctic Monkeys',
 'Paramore',
 'The Vaccines',
 'The Wombats',
 'Daughter',
 'Kings of Leon',
 'Muse',
 'The Blac

In [265]:
#print everything out for a quick visual scan
count = 0
for key in mbidartist:
    print count, key, mbidartist[key]
    count += 1

0 0b8b2812-e5b2-4ca8-a808-65f96390c414 OZMA
1 d7f97ecf-668e-479b-99cb-dd8dc1680381 Black Yaya
2 Swami Sukhabodhananda Swami Sukhabodhananda
3 7bcbc02c-c76f-4491-9fc3-f3d419f19954 BNJMN
4 6ddea7c2-6b5d-4e69-9fb8-8d9bb8bb5087 The Tyrell Corporation
5 5451792e-a144-452a-b82e-a8a33cc4b57d Theodore Bikel
6 2e2cd5a6-63d0-4c43-8625-cf18f975e3c6 Pacou
7 24568d6a-34bb-44a0-8379-423709aa8ef0 Maurice André
8 Emcee Nova f/ DJ Webstar Emcee Nova f/ DJ Webstar
9 Рахманинов (Ашкенази) Рахманинов (Ашкенази)
10 0a3497d4-91d6-45c4-95a1-e2067ccd48e8 Weapon
11 15ba83b7-d89a-4dd6-9695-1290fb567493 Yuridia
12 6caa6105-e734-4017-95c8-16c93cb1b071 Molejo
13 Monica Molina Monica Molina
14 a0f9e744-325d-457b-a5a5-06e3fcbd477b 白浜坂高校合唱部
15 29260a06-d049-4794-81bc-851382777749 Those Chosen
16 Иван Дорн feat Кравц Иван Дорн feat Кравц
17 fd5983d1-1548-4fbb-bd5d-5f5daffdd8fb Parov Stelar Trio
18 Queens Of The Stone Age [Qotsa] Queens Of The Stone Age [Qotsa]
19 Dash Berlin feat. Emma Hewitt Dash Berlin feat. Emma He

####First clean up any bad/blank mbid's

In [266]:
mbidartist.__contains__('Marco Cardoza')

True

In [267]:
mbidartist['Marco Cardoza']

'Marco Cardoza'

In [272]:
df_artists['artist'][df_artists.index == 'ф.шуберт']

ф.шуберт    ф.шуберт
Name: artist, dtype: object

In [241]:
#any blank keys?
mbidartist.keys().count('')

#looks like there are NONE, we can live with that!

0

####Then use the {mbid: artist} dict to create a dataframe for later use

In [270]:
df_artists = pd.DataFrame.from_dict(data=mbidartist, orient='index')
df_artists.rename(columns={0:'artist'}, inplace=True)
df_artists.head()

Unnamed: 0,artist
0b8b2812-e5b2-4ca8-a808-65f96390c414,OZMA
d7f97ecf-668e-479b-99cb-dd8dc1680381,Black Yaya
Swami Sukhabodhananda,Swami Sukhabodhananda
7bcbc02c-c76f-4491-9fc3-f3d419f19954,BNJMN
6ddea7c2-6b5d-4e69-9fb8-8d9bb8bb5087,The Tyrell Corporation


####Create a dataframe of users for later use

In [175]:
users = {}
for i in f['topartists'].index:
    wrk_dict = dict(f.ix[i]['topartists'])
    #ignores users with only 1 artist in their listen history
    if int(wrk_dict['@attr']['total']) == 1:
        continue
    else:
        users[i] = str(wrk_dict['@attr']['user'])

In [269]:
df_users = pd.DataFrame.from_dict(data=users, orient='index')
df_users.rename(columns={0:'user'}, inplace=True)
df_users.head()

Unnamed: 0,user
0,itsCHO2
1,sarahdanita
2,J_guitar
3,anonym_b
4,alias_j


###Create Artist Ratings List for each User

**First let's initialize the User dict which will list each user's playcount and modified_rating based on their playcount (described below)**

####1> Initialize the outside user data structure

In [203]:
#DATA STRUCTURE: {index: {'user': {}}}
raw_users = {}
for i in f['topartists'].index:
    wrk_dict = dict(f.ix[i]['topartists'])
    if int(wrk_dict['@attr']['total']) == 1:
        continue
    else:
        raw_users[i] = {}

100.0
1000.0
10007.0
10009.0
10015.0
10016.0
10041.0
10068.0
10074.0
10088.0
10090.0
10098.0
10102.0
10104.0
10106.0
10133.0
10134.0
10157.0
10172.0
10197.0
10209.0
10210.0
10215.0
10275.0
10278.0
10293.0
10299.0
10305.0
10309.0
10318.0
10331.0
10337.0
10351.0
10363.0
1037.0
1038.0
10386.0
1039.0
10401.0
10405.0
10407.0
10414.0
10436.0
10450.0
10470.0
10524.0
10551.0
10573.0
10580.0
10594.0
10596.0
1061.0
1063.0
10640.0
10650.0
10657.0
10679.0
10682.0
10689.0
10711.0
10712.0
10738.0
10740.0
10751.0
10787.0
10814.0
10817.0
10825.0
10847.0
10852.0
10853.0
10864.0
10890.0
10907.0
10909.0
10928.0
1093.0
11007.0
11013.0
11025.0
11034.0
11040.0
11044.0
11071.0
11072.0
11084.0
11085.0
11092.0
11093.0
11136.0
11163.0
11165.0
11166.0
11172.0
11178.0
11197.0
11217.0
11226.0
11243.0
11258.0
11260.0
11268.0
11321.0
11361.0
11362.0
11384.0
11385.0
11390.0
114.0
11432.0
11434.0
11439.0
11446.0
11452.0
11465.0
11487.0
1150.0
1151.0
11524.0
11527.0
11536.0
11538.0
1154.0
11551.0
11552.0
11574.0
11579.

####2> Populate each user in the active_users list with their playcount data

In [268]:
artist = df_artists['artist'][df_artists.index == '611700cf-27f0-4dc9-ae80-c513a767853e']
artist

Series([], name: artist, dtype: object)

In [273]:
#loop through each user and create a dictionary of that user's playcount history 

active_users = raw_users

for i in raw_users.keys():

    #ratings is a dict for each user in the active_users dict with:
    #DATA STRUCTURE: {'artist': {'playcount': playcount, 'mod_rating}
    ratings = {}

#     try:
    #first access each user's json file
    wrk_dict = dict(f.ix[i]['topartists'])

    #loop over each possible artist and add to the ratings dict
    min_artists = min(ARTIST_LIMIT, int(wrk_dict['@attr']['total']))

    for x in xrange(min_artists):

        #load playcount data
        playcount = int(wrk_dict['artist'][x]['playcount'])
        mbid = str(wrk_dict['artist'][x]['mbid'])

        #create a mbid = the artist's name if mbid is missing
        if mbid == '':
            artist = wrk_dict['artist'][x]['name']

            #encode the strings in unicode as some artist names have non-ascii characters
            artist = str(artist.encode('utf-8'))

            #add to dictionary
            ratings.__setitem__(artist, playcount)

        else:
            print mbid
            artist = df_artists['artist'][df_artists['artist'].index == mbid]

            #add to dictionary
            ratings.__setitem__(artist[0], playcount)

    #now add the ratings dict for each user
    active_users[i] = ratings

#     except:
#         print 'ERROR @ USER {:}'.format(i)
#         continue        

    if i % REPORT_SIZE == 0:
        #give encouragement to the user that something is happening
        print '.',
        time.sleep(1)

494e8d09-f85b-4543-892f-a5096aed1cd4
611700cf-27f0-4dc9-ae80-c513a767853e
1de93a63-3a9f-443a-ba8a-a43b5fe0121e
20244d07-534f-4eff-b4d4-930878889970
d24fb461-dee8-41fc-bb15-2f13bb2644a6
e0140a67-e4d1-4f13-8a01-364355bee46e
847e8a0c-cc20-4213-9e16-975515c2a926
70e5098b-c4ae-4cd7-9799-3c1b9b57c753
82eb8936-7bf6-4577-8320-a2639465206d
6e0ae159-8449-4262-bba5-18ec87fa529f
183105b5-3e68-4748-9086-2c1c11bf7a3d
0eedfc95-e79a-4fd9-832a-cd816a0b3fda
1ab04b5d-b0c4-4254-afc1-386f02337136
650e7db6-b795-4eb5-a702-5ea2fc46c848
afb680f2-b6eb-4cd7-a70b-a63b25c763d5


TypeError: list indices must be integers, not str

In [228]:
#loop through each user and create a dictionary of that user's playcount history 

active_users = raw_users

for i in raw_users.keys():

    #ratings is a dict for each user in the active_users dict with:
    #DATA STRUCTURE: {'artist': {'playcount': playcount, 'mod_rating}
    ratings = {}

    try:
        #first access each user's json file
        wrk_dict = dict(f.ix[i]['topartists'])

        #loop over each possible artist and add to the ratings dict
        min_artists = min(ARTIST_LIMIT, int(wrk_dict['@attr']['total']))
        
        for x in xrange(min_artists):

            #load playcount data
            playcount = int(wrk_dict['artist'][x]['playcount'])
            mbid = str(wrk_dict['artist'][x]['mbid'])
            
            #create a mbid = the artist's name if mbid is missing
            if mbid == '':
                artist = wrk_dict['artist']['name']
                
                #encode the strings in unicode as some artist names have non-ascii characters
                artist = str(artist.encode('utf-8'))
                
                #add to dictionary
                ratings.__setitem__(artist, playcount)
                
            else:
                artist = df_artists['artists'][df_artists['artists'].index == mbid]
                
                #add to dictionary
                ratings.__setitem__(artist[0], playcount)

        #now add the ratings dict for each user
        active_users[i] = ratings

    except:
        print 'ERROR @ USER {:}'.format(i)
        continue        

    if i % REPORT_SIZE == 0:
        #give encouragement to the user that something is happening
        print '.',
        time.sleep(1)

ERROR @ USER 0.0
ERROR @ USER 1.0
ERROR @ USER 2.0
ERROR @ USER 3.0
ERROR @ USER 4.0
ERROR @ USER 5.0
ERROR @ USER 6.0
ERROR @ USER 7.0
ERROR @ USER 8.0
ERROR @ USER 9.0
ERROR @ USER 10.0
ERROR @ USER 11.0
ERROR @ USER 12.0
ERROR @ USER 13.0
ERROR @ USER 14.0
ERROR @ USER 15.0
ERROR @ USER 16.0
ERROR @ USER 17.0
ERROR @ USER 18.0
ERROR @ USER 19.0
ERROR @ USER 21.0
ERROR @ USER 22.0
ERROR @ USER 23.0
ERROR @ USER 24.0
ERROR @ USER 25.0
ERROR @ USER 26.0
ERROR @ USER 27.0
ERROR @ USER 28.0
ERROR @ USER 29.0
ERROR @ USER 30.0
ERROR @ USER 31.0
ERROR @ USER 32.0
ERROR @ USER 33.0
ERROR @ USER 34.0
ERROR @ USER 35.0
ERROR @ USER 36.0
ERROR @ USER 38.0
ERROR @ USER 39.0
ERROR @ USER 40.0
ERROR @ USER 41.0
ERROR @ USER 42.0
ERROR @ USER 43.0
ERROR @ USER 44.0
ERROR @ USER 45.0
ERROR @ USER 46.0
ERROR @ USER 47.0
ERROR @ USER 49.0
ERROR @ USER 50.0
ERROR @ USER 51.0
ERROR @ USER 52.0
ERROR @ USER 53.0
ERROR @ USER 54.0
ERROR @ USER 55.0
ERROR @ USER 56.0
ERROR @ USER 57.0
ERROR @ USER 59.0
ER

In [213]:
active_users[0]

{'': 40,
 '0103c1cc-4a09-4a5d-a344-56ad99a77193': 85,
 '050307fc-c4ab-43e4-b7e0-96a455c9ca3d': 46,
 '0eedfc95-e79a-4fd9-832a-cd816a0b3fda': 182,
 '122d63fc-8671-43e4-9752-34e846d62a9c': 65,
 '126759e7-ca12-4bb4-884e-eba160e9e406': 38,
 '12d75989-f4c5-49c0-9785-91df3d160648': 88,
 '183105b5-3e68-4748-9086-2c1c11bf7a3d': 182,
 '1ab04b5d-b0c4-4254-afc1-386f02337136': 181,
 '1cf5e0db-97c1-4d66-91e6-0b6ba883c3cd': 55,
 '1de93a63-3a9f-443a-ba8a-a43b5fe0121e': 779,
 '20244d07-534f-4eff-b4d4-930878889970': 382,
 '21e3292e-dac8-4609-b57d-402f926aa41f': 55,
 '2f4f5d16-7102-4110-97fd-f5c365d6bb26': 69,
 '35afd5db-67c0-47db-a0f8-2da130a54987': 43,
 '406ff146-022c-4206-b753-bf480d3259b0': 58,
 '494e8d09-f85b-4543-892f-a5096aed1cd4': 1575,
 '5523ed49-38bc-44fa-8670-2a9c354de0b1': 63,
 '60d41417-feda-4734-bbbf-7dcc30e08a83': 39,
 '611700cf-27f0-4dc9-ae80-c513a767853e': 970,
 '64a6d8d7-ffff-42c2-8afb-1f672481b15d': 115,
 '650e7db6-b795-4eb5-a702-5ea2fc46c848': 168,
 '69989475-2971-49aa-8c53-5d74af88b8

####3> Create modified rating to uniformally weight songs based on playcount

There is a bug in the API data for ranking so I decided to create a modified playcount column which is calculated as:

**Modified_Rating = (Artist playcount / Sum of All artists playcounts) * 100**

In [240]:
#loop through each user and create a dictionary of that user's modified_rating
#using playcount history and the above formula

#threshold for which play history will be discarded to 
throw_out_threshold = 5

active_users_mr = raw_users

for i in raw_users.keys():

    #ratings is a dict for each user in the active_users dict with:
    #DATA STRUCTURE: {'mbid': 'mod_rating}
    
    mod_ratings = {}

    try:
        #first access each user's json file
        wrk_dict = dict(f.ix[i]['topartists'])
        
        #playcount total for that user across all artists
        pc_total = sum(active_users[i].values())

        #loop over each possible artist and add to the ratings dict
        min_artists = min(ARTIST_LIMIT, int(wrk_dict['@attr']['total']))
        
        for x in xrange(min_artists):

            #load playcount data
            pc = float(wrk_dict['artist'][x]['playcount'])
            mbid = str(wrk_dict['artist'][x]['mbid'])

            #calculate modified_rating
            mr_wkr = (pc / pc_total) * 100
            
            #throw out ratings less then the throw_out_threshold
            if mr_wkr < throw_out_threshold:
                mr = 0
            else:
                mr = mr_wkr
            
            #add to dictionary
            mod_ratings.__setitem__(mbid, mr)

        #now add the ratings dict for each user
        active_users_mr[i] = mod_ratings

    except:
        print 'ERROR @ USER {:}'.format(i)  
        continue        

    if i % REPORT_SIZE == 0:
        #give encouragement to the user that something is happening
        print '.',
        time.sleep(1)

. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .


In [337]:
active_users_mr[0]

{'': 0.4935225169648365,
 '0103c1cc-4a09-4a5d-a344-56ad99a77193': 1.0487353485502777,
 '050307fc-c4ab-43e4-b7e0-96a455c9ca3d': 0.567550894509562,
 '0eedfc95-e79a-4fd9-832a-cd816a0b3fda': 2.2455274521900064,
 '122d63fc-8671-43e4-9752-34e846d62a9c': 0.8019740900678594,
 '126759e7-ca12-4bb4-884e-eba160e9e406': 0.4688463911165947,
 '12d75989-f4c5-49c0-9785-91df3d160648': 1.0857495373226402,
 '183105b5-3e68-4748-9086-2c1c11bf7a3d': 2.2455274521900064,
 '1ab04b5d-b0c4-4254-afc1-386f02337136': 2.233189389265885,
 '1cf5e0db-97c1-4d66-91e6-0b6ba883c3cd': 0.6785934608266502,
 '1de93a63-3a9f-443a-ba8a-a43b5fe0121e': 9.61135101789019,
 '20244d07-534f-4eff-b4d4-930878889970': 4.713140037014188,
 '21e3292e-dac8-4609-b57d-402f926aa41f': 0.6785934608266502,
 '2f4f5d16-7102-4110-97fd-f5c365d6bb26': 0.8513263417643431,
 '35afd5db-67c0-47db-a0f8-2da130a54987': 0.5305367057371992,
 '406ff146-022c-4206-b753-bf480d3259b0': 0.715607649599013,
 '494e8d09-f85b-4543-892f-a5096aed1cd4': 19.432449105490438,
 '552

###Create MBID list

In [134]:
mbid = df_artists.index
print len(mbid)

37830


###Functions to calculate similarity

**Cosine Similarity**

In [363]:
def calc_cs(rating1, rating2, in_common):
    '''This function implements cosine similarity, between the 2 lists of rated artists from the 2 users'''
    
    common_ratings = False
    
    a_list = []
    b_list = []
    
    #for every key in the first users list, check to see if the artist is in the second user's list
    for key in rating1:
        if key in rating2:
            
            #create the vectors of ratings
            a_list.append(rating1[key])
            b_list.append(rating2[key])
            
            common_ratings = True


    #This time we insist that the 2 users must have a least in_common number of artists in common
    #if the other routines are used then this should also be included
    if len(a_list) < in_common:
        common_ratings = False
        
    if common_ratings:
        
        #if there are enough artists in common then calculate the cosine similarity and return it
        cs = cosine_similarity(a_list, b_list)
        return(cs)
    else:
        return 0

**Pearsons Correlation Coefficient**

In [362]:
def calc_pcc(rating1, rating2, in_common):
    '''This routine calculates a Pearsons correlation coefficient between the 2 lists of artits from the 2 users'''
    
    common_ratings = False
    
    a_list = []
    b_list = []
    
    #for every key in the first users list, check to see if the artist is in the second user's list
    for key in rating1:
        if key in rating2:
            
            #create the vectors of ratings
            a_list.append(rating1[key])
            b_list.append(rating2[key])
            common_ratings = True
    
    #This time we insist that the 2 users must have a least in_common number of artists in common
    #if the other routines are used then this should also be included
    if len(a_list) < in_common:
        common_ratings = False
            
    #Only if there were common ratings should we bother to complete the calculation        
    if common_ratings:
        
        #r is the correlation coefficient, and p is the p-value (which we ignore)
        r, p = pearsonr(a_list, b_list)
        return(r)
    else:
        return 0

**User Ratings**

In [396]:
def user_ratings(userid, users, N):
    '''This function will return the ratings from the database in order, with the highest rating first'''
    
    #get the ratings
    ratings = users[userid]
    
    #convert to a list
    ratings = list(ratings.items())
    
    #sort
    ratings.sort(key = lambda x: x[1], reverse = True)
    
    #return the top N ratings
    ratings = ratings[:N]
    
    #print
    print "{:15s}  USER RATINGS FOR:  {:s}  {:15s}".format("-"*15,df_users['user'][userid],"-"*15)
    for r in ratings:
        #val = df_artists['artist']r[0][0:99]
        print "{:100s}\t{:f}".format(df_artists['artist'][r[0][0:99]], float(r[1]))

In [33]:
#quick error checker to see if certain mbid's are missing from df_artist
def error_checkers(mbid):
    found = []
    for i in f['topartists'].index:
        finder = mbid
        wrk_dict = dict(f.ix[i]['topartists'])
        min_artists = min(ARTIST_LIMIT, int(wrk_dict['@attr']['total']))
        for x in xrange(min_artists):
            try:
                if wrk_dict['artist'][x]['mbid'] == finder:
                    found.append({wrk_dict['@attr']['user']:wrk_dict['artist'][x]['name']})
            #KeyError exceptions for a single user
            except KeyError:
                if wrk_dict['artist']['mbid'] == finder:
                    found.append({wrk_dict['@attr']['user']:wrk_dict['artist']['name']})

    return found

In [38]:
error_checkers('611700cf-27f0-4dc9-ae80-c513a767853e')

[{u'itsCHO2': u'Michael Bubl\xe9'},
 {u'k-a-r-i-n': u'Michael Bubl\xe9'},
 {u'-Manzanito_': u'Michael Bubl\xe9'},
 {u'magnus_k': u'Michael Bubl\xe9'},
 {u'j_wentzel': u'Michael Bubl\xe9'},
 {u'b_witt': u'Michael Bubl\xe9'},
 {u'a_marsha': u'Michael Bubl\xe9'},
 {u'e_jetzt': u'Michael Bubl\xe9'},
 {u'anjelica78': u'Michael Bubl\xe9'},
 {u'Ainola_B': u'Michael Bubl\xe9'},
 {u'j_austere': u'Michael Bubl\xe9'},
 {u'dalia_a': u'Michael Bubl\xe9'},
 {u'k-b-t': u'Michael Bubl\xe9'},
 {u'j_romano': u'Michael Bubl\xe9'},
 {u'kiko_f': u'Michael Bubl\xe9'},
 {u'j_asalbuat': u'Michael Bubl\xe9'},
 {u'd_anillo': u'Michael Bubl\xe9'},
 {u'ccrvvanda': u'Michael Bubl\xe9'},
 {u'D-mo': u'Michael Bubl\xe9'},
 {u'nubis_e': u'Michael Bubl\xe9'},
 {u'arual_b': u'Michael Bubl\xe9'},
 {u'D-Kobraz': u'Michael Bubl\xe9'},
 {u'e__jones': u'Michael Bubl\xe9'},
 {u'd_agustina': u'Michael Bubl\xe9'},
 {u'lorena_b': u'Michael Bubl\xe9'},
 {u'whiskey-foxtrot': u'Michael Bubl\xe9'},
 {u'E-Fin': u'Michael Bubl\xe9'},


In [190]:
user_ratings(0, active_users_mr, 10)

NameError: name 'user_ratings' is not defined

###Recommendation Engine Functions

In [372]:
def compute_closest_person(userid, users, in_common = 1):
    '''This routine takes in a user ID and returns users who are close, in terms of the artists they have listened to'''
    
    #this list holds the distance measures from other users
    mdist_list = []
    found = False
    
    #traverse the main dictionary of users
    for user in users:
        
        #obviously don't look at the userid and compare it with itself
        if user != userid:
            
            #Choose a measure of similarity and use the in_common argument to specify
            #the minimum number of artists they must have rated
            #mdist = calc_minkowski_dist(users[user], users[userid], 1, in_common)
            mdist = calc_pcc(users[user], users[userid], in_common)
            #mdist = calc_cs(users[user], users[userid], in_common)
            
            #record the distance and the user as long as the distance is greater than 0
            if np.abs(mdist) > 0:
                mdist_list.append((mdist, user))
                found = True
                
    #sort by distance, closest first
    if found:
        mdist_list.sort(reverse = True)
    else:
        print "no matches found with those search criteria"
    
    return mdist_list

In [373]:
df_users.head(10)

Unnamed: 0,user
0,itsCHO2
1,sarahdanita
2,J_guitar
3,anonym_b
4,alias_j
5,fladdi_e
6,Shift-e
7,e_pepo
8,C_T_K
9,onkel_a


In [376]:
compute_closest_person(userid=0, users=active_users_mr)

[(1.0, 22085.0),
 (1.0, 22062.0),
 (1.0, 22059.0),
 (1.0, 22057.0),
 (1.0, 22029.0),
 (1.0, 22024.0),
 (1.0, 22002.0),
 (1.0, 21980.0),
 (1.0, 21967.0),
 (1.0, 21904.0),
 (1.0, 21883.0),
 (1.0, 21840.0),
 (1.0, 21803.0),
 (1.0, 21795.0),
 (1.0, 21785.0),
 (1.0, 21775.0),
 (1.0, 21760.0),
 (1.0, 21746.0),
 (1.0, 21719.0),
 (1.0, 21707.0),
 (1.0, 21702.0),
 (1.0, 21672.0),
 (1.0, 21668.0),
 (1.0, 21665.0),
 (1.0, 21646.0),
 (1.0, 21582.0),
 (1.0, 21580.0),
 (1.0, 21569.0),
 (1.0, 21521.0),
 (1.0, 21511.0),
 (1.0, 21446.0),
 (1.0, 21440.0),
 (1.0, 21431.0),
 (1.0, 21416.0),
 (1.0, 21402.0),
 (1.0, 21345.0),
 (1.0, 21339.0),
 (1.0, 21310.0),
 (1.0, 21303.0),
 (1.0, 21291.0),
 (1.0, 21270.0),
 (1.0, 21262.0),
 (1.0, 21233.0),
 (1.0, 21226.0),
 (1.0, 21220.0),
 (1.0, 21216.0),
 (1.0, 21204.0),
 (1.0, 21163.0),
 (1.0, 21146.0),
 (1.0, 21137.0),
 (1.0, 21133.0),
 (1.0, 21116.0),
 (1.0, 21098.0),
 (1.0, 21082.0),
 (1.0, 21079.0),
 (1.0, 21075.0),
 (1.0, 21029.0),
 (1.0, 20978.0),
 (1.0, 20920.0

In [387]:
def recommend(user_id, users, in_common = 2, number_of_recommendations = 10):
    
    #get the ID of the nearest person with in_common ratings
    nearest_list = compute_closest_person(user_id, users, in_common)
    
    if len(nearest_list) == 0:
        return []
    
    nearest_id = nearest_list[0][1]
    
    print nearest_id
    
    #initialize the recommendations list
    recommendations = []
    
    #Now get the list of artists for each - the requested user-ID and their nearest neighbor
    neighbor_ratings = users[nearest_id]
    user_ratings = users[user_id]
    
    #Look through the artists from the neighbor
    for artist in neighbor_ratings:
        
        #Check that the artists from the neighbor hasn't already been reviewed
        if not artist in user_ratings:
            
            #append the artists to the recommendations list
            recommendations.append((artist, neighbor_ratings[artist]))
    
    
    if len(recommendations) == 0:
        print "No recommedations found"
        return []
    
    #sort the list based on the rating, returning the highest rated artists first
    #reverse direction needs to be in line with similarity measure
    if number_of_recommendations > len(recommendations):
        number_of_recommendations = len(recommendations)
        
    if number_of_recommendations < 0:
        number_of_recommendations = 1
        
    if number_of_recommendations == 0:
        number_of_recommendations = len(recommendations)
    
    sr = sorted(recommendations, key = lambda x: x[1], reverse = True)[:number_of_recommendations]
        
    return (nearest_list, sr)

In [393]:
recommend(21623, active_users_mr)

22077.0


([(1.0, 22077.0),
  (1.0, 22074.0),
  (1.0, 22066.0),
  (1.0, 22045.0),
  (1.0, 22040.0),
  (1.0, 22006.0),
  (1.0, 21994.0),
  (1.0, 21980.0),
  (1.0, 21969.0),
  (1.0, 21952.0),
  (1.0, 21943.0),
  (1.0, 21933.0),
  (1.0, 21905.0),
  (1.0, 21883.0),
  (1.0, 21840.0),
  (1.0, 21834.0),
  (1.0, 21820.0),
  (1.0, 21809.0),
  (1.0, 21803.0),
  (1.0, 21798.0),
  (1.0, 21797.0),
  (1.0, 21770.0),
  (1.0, 21726.0),
  (1.0, 21702.0),
  (1.0, 21701.0),
  (1.0, 21698.0),
  (1.0, 21669.0),
  (1.0, 21605.0),
  (1.0, 21582.0),
  (1.0, 21557.0),
  (1.0, 21502.0),
  (1.0, 21488.0),
  (1.0, 21474.0),
  (1.0, 21473.0),
  (1.0, 21471.0),
  (1.0, 21466.0),
  (1.0, 21465.0),
  (1.0, 21427.0),
  (1.0, 21407.0),
  (1.0, 21394.0),
  (1.0, 21296.0),
  (1.0, 21290.0),
  (1.0, 21274.0),
  (1.0, 21263.0),
  (1.0, 21233.0),
  (1.0, 21220.0),
  (1.0, 21219.0),
  (1.0, 21204.0),
  (1.0, 21184.0),
  (1.0, 21158.0),
  (1.0, 21152.0),
  (1.0, 21112.0),
  (1.0, 21083.0),
  (1.0, 21082.0),
  (1.0, 21068.0),
  (1.0, 21

In [377]:
def recommend_for_specific_match(useridA, useridB, users, number_of_recommendations = 10):
    '''This function takes 2 user-IDs and makes recommendations for the first based on artists from the second'''
    
    recommendations = []
    
    neighbor_ratings = users[useridB]
    if len(neighbor_ratings) == 0:
        return []
    
    user_ratings = users[useridA]
    if len(user_ratings) == 0:
        return []
    
    for book_title in neighbor_ratings:
        if not book_title in user_ratings:
            recommendations.append((book_title, neighbor_ratings[book_title]))
            
    if len(recommendations) == 0:
        print "No recommedations found"
        return []
    
    if number_of_recommendations > len(recommendations):
        number_of_recommendations = len(recommendations)
        
    if number_of_recommendations < 0:
        number_of_recommendations = 1
        
    if number_of_recommendations == 0:
        number_of_recommendations = len(recommendations)
        
    sr = sorted(recommendations, key = lambda x: x[1], reverse = True)[:number_of_recommendations]
        
    return sr

In [382]:
recommend_for_specific_match(useridA=0,useridB=1, users=active_users_mr)

[('b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d', 16.457001443596617),
 ('fbb375f9-48bb-4635-824e-4120273b3ba7', 6.240461950917715),
 ('c5c2ea1c-4bde-4f4d-bd0b-47b200bf99d6', 3.5265003093421323),
 ('437a0e49-c6ae-42f6-a6c1-84f25ed366bc', 3.497628376984945),
 ('a670e05a-cea8-4b37-bce9-d82daf1a0fa4', 3.1326046607547946),
 ('227b39fc-23f6-40f5-b8a1-75100d5baad4', 2.9964941224994845),
 ('ba99a190-6065-4930-be3d-55ecc48e365d', 2.701587956279645),
 ('1ba1d493-7114-45e2-b163-a36d49a0c065', 2.672716023922458),
 ('03e52a44-b264-453c-a0e0-fbfd89d72765', 2.3097545885749637),
 ('3d6a7fdc-2357-4d7c-bbe5-773c8245d5a0', 2.0127861414724686)]

In [378]:
def evaluate_comparison(user_id1, user_id2, users, N = 0):
    '''This function compares the artist ratings for 2 users'''
    
    ul1 = users[user_id1]
    ul2 = users[user_id2]
    
    count = 0
    
    for b in ul1:
        if b in ul2:
            print "{:100s}".format(b)
            print "Users {:s} & {:s} ratings = {:5d} {:5d}".format(user_id1, user_id2, users[user_id1][b],\
                                                                                   users[user_id2][b])
            print "\n"
            count += 1
            if (count >= N) & (N != 0):
                return