In [3]:
#Instantiate SparkSession
from pyspark.sql import SparkSession
# Build the session via the builder's getOrCreate(), naming the application.
spark = (
    SparkSession.builder
    .appName('Use Implicit Colloborative Filtering for Band  Recommendations')
    .getOrCreate()
)


In [4]:
# wget http://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip
# gsutil cp user_artists.dat gs://dexdebra-123/datasets
# Use sparkSession to read csv file

# Tab-delimited file with a header row. No schema is supplied here;
# the columns are cast to integers in a later cell.
rawdata = (
    spark.read
    .format('csv')
    .options(header='true', delimiter='\t')
    .load('gs://dexdebra-123/datasets/user_artists.dat')
)

In [5]:
# Preview five rows; limit() on the Spark side avoids collecting the whole dataset to the driver.
rawdata.limit(5).toPandas()

# userID: id of the user who listened to the artist
# artistID: id of the artist that userID listened to
# weight: number of times userID listened to artistID


Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983


In [15]:
from pyspark.sql.functions import col
# Keep the three columns of interest, each cast to an integer.
dataset = rawdata.select(
    *[col(name).cast('int') for name in ('userId', 'artistID', 'weight')]
)

dataset

DataFrame[userId: int, artistID: int, weight: int]

In [16]:
# Summary statistics of the weight column, brought back as a small pandas frame

dataset.describe('weight').toPandas()

Unnamed: 0,summary,weight
0,count,92834.0
1,mean,745.2439300256372
2,stddev,3751.32208038768
3,min,1.0
4,max,352698.0


In [18]:
# ML algorithms generally perform better with small numeric values,
# so it is better if the values are standardized;
# this allows us to mitigate extreme variance in the input values.
# Standardize the values in the 'weight' field as below:
# z = (x - mu) / sigma, where mu is the mean and sigma the standard deviation of 'weight'
# This will give us the z-score for every value in the weight column

# First find the mean and SD of the values in the weight column
# and store these in 2 new columns, mean_weight and stddev_weight

# Then perform a cross join with the original dataset, which contains userId, artistID and weight
# To this result add a new column, weight_scaled, which contains the standardized weights

from pyspark.sql.functions import stddev, mean, col

# Compute the global mean and standard deviation of 'weight' as a one-row frame,
# cross-join it onto every rating row, then derive the z-score as 'weight_scaled'.
# NOTE(review): z-scores are negative for every weight below the mean. This column is
# later used as ratingCol of an ALS estimator with implicitPrefs=True; Spark's implicit
# ALS appears to treat non-positive ratings as "no preference" -- confirm against the
# Spark ALS documentation and consider a non-negative scaling if so.
df = (
    dataset
    .select(mean('weight').alias('mean_weight'), stddev('weight').alias('stddev_weight'))
    .crossJoin(dataset)
    .withColumn(
        'weight_scaled',
        (col('weight') - col('mean_weight')) / col('stddev_weight'),
    )
)
# Preview five rows; limit() avoids collecting the full joined frame to the driver.
df.limit(5).toPandas()
            


Unnamed: 0,mean_weight,stddev_weight,userId,artistID,weight,weight_scaled
0,745.24393,3751.32208,2,51,13883,3.502167
1,745.24393,3751.32208,2,52,11690,2.917573
2,745.24393,3751.32208,2,53,11351,2.827205
3,745.24393,3751.32208,2,54,10300,2.547037
4,745.24393,3751.32208,2,55,8983,2.195961


In [19]:
# Split the data 80/20 into training and testing sets.
# A fixed seed makes the split repeatable from run to run.
(trainingData, testingData) = df.randomSplit([0.8, 0.2], seed=42)

In [20]:
# Use ALS library to instantiate estimator to train the model
# weighted regularization - prevent overfitting on the training data
from pyspark.ml.recommendation  import ALS
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol='userId',
    itemCol='artistID',
    implicitPrefs=True,  # tell Spark it is dealing with implicit, not explicit, feedback
    ratingCol='weight_scaled',  # the column with standardized weights
    coldStartStrategy='drop',  # drop rows for users/items unseen during training when predicting
    seed=42,  # explicit seed so repeated fits start from the same random factors
)

# Fit the estimator on the training split to obtain the recommendation model.
model = als.fit(trainingData)

In [21]:
# Display the fitted model object returned by als.fit
model

ALS_4757805a7e9920589306

In [22]:
# Generate predictions for the testing data; transform() adds a 'prediction' column.

predictions = model.transform(testingData)

# Preview five rows; limit() avoids collecting every prediction to the driver.
predictions.limit(5).toPandas()

Unnamed: 0,mean_weight,stddev_weight,userId,artistID,weight,weight_scaled,prediction
0,745.24393,3751.32208,27,463,159,-0.156277,0.0
1,745.24393,3751.32208,1692,463,1018,0.072709,1.827138e-07
2,745.24393,3751.32208,2055,463,863,0.031391,0.004478354
3,745.24393,3751.32208,1959,471,96,-0.173071,-0.00200805
4,745.24393,3751.32208,1771,471,96,-0.173071,-0.0003558034


In [24]:
# Bring only the actual and predicted values to pandas, then summarise both columns.
predictionsPandas = (
    predictions
    .select('weight_scaled', 'prediction')
    .toPandas()
)
predictionsPandas.describe()

Unnamed: 0,weight_scaled,prediction
count,16279.0,16279.0
mean,0.004596,0.042407
std,1.024688,0.10029
min,-0.198395,-0.323277
25%,-0.167473,0.0
50%,-0.124288,0.002484
75%,-0.024057,0.034656
max,93.820991,1.061591


In [26]:
# Load the artist metadata (tab-delimited, with a header row).
artistsData = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('delimiter', '\t')
    .load('gs://dexdebra-123/datasets/artists.dat')
)
# Preview five rows; limit() avoids collecting the whole artist table to the driver.
artistsData.limit(5).toPandas()


Unnamed: 0,id,name,url,pictureURL
0,1,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
1,2,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
2,3,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...
3,4,Moi dix Mois,http://www.last.fm/music/Moi+dix+Mois,http://userserve-ak.last.fm/serve/252/54697835...
4,5,Bella Morte,http://www.last.fm/music/Bella+Morte,http://userserve-ak.last.fm/serve/252/14789013...


In [30]:
# To Get Artists Recommendations for a Particular User

from pyspark.sql.types import IntegerType

def getRecommendationsForUser(userId,numRecs):
    """Return the top recommended artists for a single user.

    Args:
        userId: integer id of the user to get recommendations for.
        numRecs: maximum number of artists to recommend.

    Returns:
        A Spark DataFrame with the columns name, url and rating, ordered by
        rating from highest to lowest. The frame is empty when the model
        returns no recommendations for userId (presumably the case for an id
        that was not in the training data) instead of raising IndexError.
    """
    # Imported locally so the function does not rely on another cell's imports.
    from pyspark.sql.functions import explode

    # Single-row DataFrame holding the user we want recommendations for.
    usersDF = spark.createDataFrame([userId], IntegerType()).toDF('userId')

    # One row per user, with an array of (artistID, rating) structs.
    userRecs = model.recommendForUserSubset(usersDF, numRecs)

    # Flatten the array on the Spark side rather than collecting it to the
    # driver and rebuilding a DataFrame; an empty result stays an empty frame.
    artistsDF = (
        userRecs
        .select(explode('recommendations').alias('rec'))
        .selectExpr('rec.artistID AS artistID', 'CAST(rec.rating AS DOUBLE) AS rating')
    )

    # Join with the artist metadata to attach the name and URL of each
    # recommended artist, best-rated first.
    recommendedArtists = (
        artistsData
        .join(artistsDF, artistsData.id == artistsDF.artistID)
        .orderBy('rating', ascending=False)
        .select('name', 'url', 'rating')
    )
    return recommendedArtists

In [31]:
# Top 10 artist recommendations for user 939, shown as a pandas frame.
getRecommendationsForUser(userId=939, numRecs=10).toPandas()


Unnamed: 0,name,url,rating
0,Avenged Sevenfold,http://www.last.fm/music/Avenged+Sevenfold,0.292893
1,My Chemical Romance,http://www.last.fm/music/My+Chemical+Romance,0.232433
2,30 Seconds to Mars,http://www.last.fm/music/30+Seconds+to+Mars,0.216827
3,In Flames,http://www.last.fm/music/In+Flames,0.212518
4,Linkin Park,http://www.last.fm/music/Linkin+Park,0.210885
5,A Day to Remember,http://www.last.fm/music/A+Day+to+Remember,0.209767
6,All Time Low,http://www.last.fm/music/All+Time+Low,0.184773
7,Paramore,http://www.last.fm/music/Paramore,0.181947
8,Metallica,http://www.last.fm/music/Metallica,0.178795
9,Rise Against,http://www.last.fm/music/Rise+Against,0.168443
