Inspired by the book

Title: Advanced Analytics with Spark

By: Sandy Ryza, Uri Laserson, Sean Owen, Josh Wills

Publisher: O'Reilly Media

Data downloaded from: http://www-etud.iro.umontreal.ca/~bergstrj/audioscrobbler_data.html

### imports

In [52]:
import os
import os.path

from datetime import datetime

from reader import read_user_artist_data, read_artist_id_map, read_artist_alias_map
from checks import ids_requirements_satisfied
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

spark_conf = SparkConf()\
        .setMaster("local")\
        .setAppName("music_recommender")

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### read data

In [31]:
user_artist_rdd = read_user_artist_data(sc)
user_artist_rdd.cache()

print "We have N data rows: ",  user_artist_rdd.count()
        
# Read the artist data
artist_id_map = read_artist_id_map(sc)

# we convert all artist IDs to canonical form using aliases
artist_alias_map = read_artist_alias_map(sc)
    


We have N data rows:  2443


In [45]:
# Wrap both lookups in Spark broadcast variables (id map first, alias map second)
# so tasks can read them through `.value`.
artist_id_map_br, artist_alias_map_br = (
    sc.broadcast(lookup) for lookup in (artist_id_map, artist_alias_map)
)

In [5]:
print "basic requirements satisfied: ", ids_requirements_satisfied(user_artist_rdd)

basic requirements satisfied:  True


### create and train model

In [11]:
# The ALS algorithm works with Rating objects (user, product, value),
# so every raw row is first converted into one.
def _row_to_rating(row):
    """Turn a space-separated text row into Rating(int user, int product, float value)."""
    fields = row.split(' ')
    return Rating(int(fields[0]), int(fields[1]), float(fields[2]))


# NOTE(review): the artist ids are used exactly as they appear in the rows; they
# are not passed through artist_alias_map here - confirm that is intended.
ratings = user_artist_rdd.map(_row_to_rating)
ratings.cache()

# With the implicit variant, ALS does not factor the matrix A of raw values but
# another matrix B that holds 1 where the value is > 0 and 0 elsewhere.
model = ALS.trainImplicit(ratings, rank=10, iterations=5, lambda_=0.01)
 

### call model recommendations

In [54]:
# Lets take a random user id
random_user_id = '1000557'

# Lets see what artist ids this user likes 
print "= = = > This is what user likes .... "

# 1) using python dictionaries
t1 = datetime.now()
print user_artist_rdd\
    .filter(lambda x: x.split(' ')[0]==random_user_id)\
    .map(lambda x: x.split(' ')[1])\
    .map(lambda x: int(x))\
    .map(lambda x: artist_alias_map.get(x, x) )\
    .map(lambda x: artist_id_map.get(x, x) )\
    .collect()
t2 = datetime.now()
print (t2 - t1).seconds


# 2) using broadcast variables
t1 = datetime.now()
print user_artist_rdd\
    .filter(lambda x: x.split(' ')[0]==random_user_id)\
    .map(lambda x: x.split(' ')[1])\
    .map(lambda x: int(x))\
    .map(lambda x: artist_alias_map_br.value.get(x, x))\
    .map(lambda x: artist_id_map_br.value.get(x, x))\
    .collect()
t2 = datetime.now()
print (t2 - t1).seconds

# and now lets see what the model can recommend
print "= = = > This is what we recommend with ALS .... "


# we get the model recommendations
rec_info = model.recommendProducts(1000557, 5)

# then we sort accrodingly
rec_info = sorted(list(rec_info), key=lambda x: x[1])

# then we keep only the product=artist
print [artist_id_map.get(x[1]) for x in rec_info]

[u'orange pekoe', u'Cary Brothers']
25
[u'orange pekoe', u'Cary Brothers']
2


[u'Beck', u'Nirvana', u'Radiohead', u'P.O.D.', u'Frou Frou']