### imports

In [1]:
import os
import os.path

from datetime import datetime

from reader import read_data, read_artist_id_map, read_artist_alias_map
from checks import ids_requirements_satisfied
from plots import plot_roc_curves

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

# Spark configuration for this notebook: a single local worker, with an
# application name that makes the job easy to spot in the Spark UI.
spark_conf = SparkConf()\
        .setMaster("local")\
        .setAppName("music_recommender")

# Every later cell uses `sc`, so make sure it exists on a fresh kernel.
# getOrCreate() returns the already-active context when the kernel was
# started through pyspark (in which case `spark_conf` is not re-applied),
# and otherwise creates one from `spark_conf`.
sc = SparkContext.getOrCreate(conf=spark_conf)

        
%load_ext autoreload
%autoreload 2
%matplotlib inline

### read data

In [22]:
user_artist_rdd = read_data(sc, 0.00001)
# increase partitions so to avoid the Java heap space issue (good 3 / cpu)
user_artist_rdd = user_artist_rdd.repartition(24)
user_artist_rdd.cache()

print "We have N data rows: ",  user_artist_rdd.count()
        
# Read the artist data
artist_id_map = read_artist_id_map(sc)

# we convert all artist IDs to canonical form using aliases
artist_alias_map = read_artist_alias_map(sc)
    
# and then we broadcast the map variables
artist_id_map_br = sc.broadcast(artist_id_map)
artist_alias_map_br = sc.broadcast(artist_alias_map)

We have N data rows:  242


In [17]:
print "Basic requirements satisfied: ", ids_requirements_satisfied(user_artist_rdd)

Basic requirements satisfied:  True


### create and train model

In [35]:
# The ALS algorithm works with Rating objects (user, product, value),
# so we first parse every "user artist value" line into a Rating.
# NOTE(review): artist ids are passed to ALS as read from the file; the alias
# map loaded earlier is not applied here, so aliased ids are trained as
# distinct artists -- confirm whether that is intended.
ratings = user_artist_rdd\
            .map(lambda line: line.split(' '))\
            .map(lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2])))

ratings.cache()

# Fixed seed so that re-running the notebook gives the same factorization
# (and therefore the same recommendations).
ALS_SEED = 42

# With the implicit version, ALS does not factor the raw matrix A of values.
# It factors a preference matrix B that holds 1 where the value is > 0 and
# 0 elsewhere, using the values as confidence weights.
model = ALS.trainImplicit(ratings, rank=10, iterations=5, lambda_=0.01, seed=ALS_SEED)
 

### call model recommendations

In [36]:
# Lets take a test user id
test_user_id = user_artist_rdd.first().split()[0]

# Lets see what artist ids this user likes 
print "=> The preferences of the test user id ", test_user_id, " are: "

# 1) using python dictionaries
t1 = datetime.now()
print user_artist_rdd\
    .filter(lambda x: x.split(' ')[0]==test_user_id)\
    .map(lambda x: x.split(' ')[1])\
    .map(lambda x: int(x))\
    .map(lambda x: artist_alias_map.get(x, x) )\
    .map(lambda x: artist_id_map.get(x, x) )\
    .collect()
t2 = datetime.now()
print "without broadcasting took: ",(t2 - t1).seconds


# 2) using broadcast variables
t1 = datetime.now()
print user_artist_rdd\
    .filter(lambda x: x.split(' ')[0]==test_user_id)\
    .map(lambda x: x.split(' ')[1])\
    .map(lambda x: int(x))\
    .map(lambda x: artist_alias_map_br.value.get(x, x))\
    .map(lambda x: artist_id_map_br.value.get(x, x))\
    .collect()
t2 = datetime.now()
print "using broadcast variables took: ", (t2 - t1).seconds

# and now lets see what the model can recommend
# we get the model recommendations
rec_info = model.recommendProducts(int(test_user_id), 5)

# then we sort accrodingly
rec_info = sorted(list(rec_info), key=lambda x: x[1])

# then we keep only the product=artist
print "=> The ALS model recommendations are: ",  [artist_id_map.get(x[1]) for x in rec_info]

=> The preferences of the test user id  1000002  are: 
[u'Carl Douglas', u'Furslide', u'Annie Lennox', u'Monk', u'MC Hammer', u'Aqua', u'Samuel Barber', u'Richard Blackwood', u'Heavy Stereo', u'Tal Bachman', u'ATB', u'Radiator', u'Ronan Keating', u'Ian Brown', u'Headswim', u'Cyclefly', u'Freakpower', u'The Bluetones', u'Crazy Town', u'Mallrats', u'[unknown]', u'Chris Cornell', u'Apollo 440', u'Powerman 5000', u'The Scorpions', u'The Cars', u'Belle and Sebastian', u'The Bangles', u'The Troggs', u'The Smashing Pumpkins', u'Portishead', u'Phil Collins Big Band', u'The Phil Collins Big Band', u'A Perfect Circle', u'Aerosmith', u'MC Hawking', u'Pantera', u'Judas Priest', u'Metallica', u'Terrorvision', u'Lynyrd Skynyrd', u'3 Doors Down', u'Gorillaz', u'Monty Python', u'Lit', u'John Lennon', u'Cream', u'The Mighty Mighty Bosstones', u'Helloween', u'Foo Fighters', u'Counting Crows', u'Creed', u'Iron Maiden', u'The Beatles', u'Incubus', u'Audioslave', u'Muse', u'(hed) Planet Earth', u'AC/DC', u

### See my own recommendations

In [40]:
my_ratings = sc.parallelize(['99999999 10445322 5', 
                             '99999999 10147472 5', 
                             '99999999 10254710 5'], 2)
user_artist_rdd = sc.union([user_artist_rdd, my_ratings])

ratings = user_artist_rdd\
            .map(lambda x: x.split(' '))\
            .map(lambda x: Rating(int(x[0]), int(x[1]), float(x[2])))
        
model = ALS.trainImplicit(ratings, 10, 5, 0.01)
 
rec_info = model.recommendProducts(int('99999999'), 20)
print "=> The ALS model recommendations are: ",  [artist_id_map.get(x[1]) for x in rec_info]

=> The ALS model recommendations are:  [u'Scorpions & the Berlin Philharmonic Orchestra', u'Ongsfromthewood-Jethrotull', u'P\xefnk floid', u'Judas Priest', u'John Mayer', u'Fugees', u'Counting Crows', u'Bruce Springsteen', u'Jools Holland', u'Muse', u'Joe Satriani', u'The Smashing Pumpkins', u'Elastica', u'The Phil Collins Big Band', u'Chuck Berry', u'Fourplay', u'MC Hawking', u'Aerosmith', u'Toploader', u'Portishead']
