In [1]:
import os
exec(open(os.path.join(os.environ["SPARK_HOME"], "python/pyspark/shell.py")).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.0
      /_/

Using Python version 3.6.8 (default, Dec 29 2018 19:04:46)
SparkSession available as 'spark'.


In [1]:
import os
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import Bucketizer, MinMaxScaler, VectorAssembler
from pyspark.sql.types import DoubleType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [2]:
# this file contains 'uid', 'song_id' and freqency features for implicity rating

df = spark.read.csv("/Users/fanyang/Documents/musicbox/data/recommender_model01_0116.csv",
                   header=True, inferSchema=True).cache()

In [3]:
pd.DataFrame(df.take(5), columns=df.columns)

Unnamed: 0,_c0,uid,song_id,comp_play_last_7,comp_play_last_14,comp_play_last_21,comp_play_last_30,comp_play_last_44,freq_P_last_7,freq_P_last_14,freq_P_last_21,freq_P_last_30,freq_P_last_44,freq_D_last_7,freq_D_last_14,freq_D_last_21,freq_D_last_30,freq_D_last_44
0,0,103103073,572912,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,1,104737814,0,0,0,0,0,0,0,0,0,0,127,0,0,0,0,0
2,2,10919480,277650,2,2,2,2,3,2,2,2,2,3,0,0,0,0,1
3,3,10919480,389413,0,0,0,1,1,0,0,0,1,1,0,0,0,0,0
4,4,10919480,461313,1,2,2,2,2,1,2,2,3,6,0,0,0,0,1


### 1. data cleaning

In [4]:
df = df.drop('_c0')
df = df[df.song_id != 0]

In [None]:
#df.columns()

In [5]:
print("Original size of dataframe is: ")
print(df.count(), len(df.columns))

Original size of dataframe is: 
1992574 17


In [6]:
# select 'uid' that have activities > 10

df_user_select = df.groupBy('uid').count().where(F.col('count')>10)
df = df_user_select.join(df, on=['uid'], how='left')
print("After selecting active users, the size of dataframe is: ")
print(df.count(), len(df.columns))

After selecting active users, the size of dataframe is: 
1897205 18


In [7]:
print("the number of distinct 'uid' is", (df.select('uid').distinct().count()))
print("the number of distinct 'song_id' is", (df.select('song_id').distinct().count()))

the number of distinct 'uid' is 30274
the number of distinct 'song_id' is 303507


In [8]:
# method 1: select features from longest time frame 44 days for implicit rating
# remove rows 
df_feature_select = df.select('uid', 'song_id','comp_play_last_44', 'freq_P_last_44', 'freq_D_last_44') \
.where(((F.col('comp_play_last_44')!=0)|(F.col('freq_P_last_44')!=0)) & (F.col('freq_D_last_44')!=0))

In [9]:
df_feature_select.count()

278483

In [None]:
#df_reco = df.select('uid', 'song_id', 'freq_P_last_44', 'freq_D_last_44', 'comp_play_last_44') \
#            .where((F.col('freq_D_last_44')!=0) & ((F.col('freq_P_last_44')!=0)|(F.col('comp_play_last_44')!=0)))

In [10]:
# check NULL before transformation
#df_play_select_2.select(*[F.sum(F.col(c).isNull().cast('int'))\
#                          .alias(c) for c in df_play_select_2.columns]).show()

df_feature_select.select(*[F.sum(F.col(c).isNull().cast('int')) \
                           .alias(c) for c in df_feature_select.columns]).show()


+---+-------+-----------------+--------------+--------------+
|uid|song_id|comp_play_last_44|freq_P_last_44|freq_D_last_44|
+---+-------+-----------------+--------------+--------------+
|  0|      0|                0|             0|             0|
+---+-------+-----------------+--------------+--------------+



### 2. feature transformation to construct implicit rating

In [11]:
df_feature_transform = df_feature_select.withColumn('comp_play_last_44_transf', F.log10(F.col('comp_play_last_44')+1)) \
                        .withColumn('freq_P_last_44_transf', F.log10(F.col('freq_P_last_44')+1)) \
                        .withColumn('freq_D_last_44_transf', F.log10(F.col('freq_D_last_44')+1)) \
                        .withColumn('rating', 0.33*F.col('comp_play_last_44_transf')+0.33*F.col('freq_P_last_44_transf')\
                            +0.33*F.col('freq_D_last_44_transf')) 


In [12]:
df_feature_transform.columns

['uid',
 'song_id',
 'comp_play_last_44',
 'freq_P_last_44',
 'freq_D_last_44',
 'comp_play_last_44_transf',
 'freq_P_last_44_transf',
 'freq_D_last_44_transf',
 'rating']

In [13]:
assembler = VectorAssembler(inputCols=['rating'], outputCol='rating_assembled')
scaler = MinMaxScaler(inputCol='rating_assembled', outputCol='rating_scaled')
pp = Pipeline(stages=[assembler, scaler])
pp_model = pp.fit(df_feature_transform)
df_reco= pp_model.transform(df_feature_transform)


In [14]:
df_reco

DataFrame[uid: int, song_id: decimal(20,0), comp_play_last_44: int, freq_P_last_44: int, freq_D_last_44: int, comp_play_last_44_transf: double, freq_P_last_44_transf: double, freq_D_last_44_transf: double, rating: double, rating_assembled: vector, rating_scaled: vector]

In [15]:
Vec2num_udf = F.udf(lambda x: float(x[0]), DoubleType())
df_reco = df_reco.withColumn('rating_scaled_num', Vec2num_udf('rating_scaled'))

In [17]:
df_reco

DataFrame[uid: int, song_id: decimal(20,0), comp_play_last_44: int, freq_P_last_44: int, freq_D_last_44: int, comp_play_last_44_transf: double, freq_P_last_44_transf: double, freq_D_last_44_transf: double, rating: double, rating_assembled: vector, rating_scaled: vector, rating_scaled_num: double]

In [18]:
splits = [.0, 1/30, 1/15, 5/15, 7/15, 1]
bucketizer = Bucketizer(splits=splits, inputCol='rating_scaled_num', outputCol='rating_final')
df_reco= bucketizer.transform(df_reco)
df_reco = df_reco.withColumn('rating_final', F.col('rating_final')+1)

In [19]:
df_reco

DataFrame[uid: int, song_id: decimal(20,0), comp_play_last_44: int, freq_P_last_44: int, freq_D_last_44: int, comp_play_last_44_transf: double, freq_P_last_44_transf: double, freq_D_last_44_transf: double, rating: double, rating_assembled: vector, rating_scaled: vector, rating_scaled_num: double, rating_final: double]

In [20]:
df_reco.toPandas().to_csv('/Users/fanyang/Documents/musicbox/data/rating_0120.csv',index=False)

In [21]:
df_reco_spark = df_reco.select('uid', 'song_id', 'rating_scaled_num', 'rating_final')


In [22]:
df_reco_spark

DataFrame[uid: int, song_id: decimal(20,0), rating_scaled_num: double, rating_final: double]

In [23]:
df_reco_spark.printSchema()

root
 |-- uid: integer (nullable = true)
 |-- song_id: decimal(20,0) (nullable = true)
 |-- rating_scaled_num: double (nullable = true)
 |-- rating_final: double (nullable = true)



In [24]:
df_reco_spark.groupBy('rating_final').count().show()

+------------+------+
|rating_final| count|
+------------+------+
|         1.0| 34872|
|         4.0| 22451|
|         3.0|176574|
|         2.0| 39329|
|         5.0|  5257|
+------------+------+



In [25]:
split_seed = 301
train, test = df_reco_spark.randomSplit(weights=[.7,.3],seed=split_seed)

### 3. recommend model1: non-implicit method

In [26]:
als_1 = ALS(maxIter=10, regParam=0.2, userCol="uid", itemCol="song_id", ratingCol="rating_final",
          coldStartStrategy="drop")
model_1 = als_1.fit(train)

In [28]:
# Evaluate the model by computing the RMSE on the test data
predictions_1 = model_1.transform(test)
evaluator_1 = RegressionEvaluator(metricName="rmse", labelCol="rating_final",
                                predictionCol="prediction")
rmse_1 = evaluator_1.evaluate(predictions_1)
print("Root-mean-square error = " + str(rmse_1))

Root-mean-square error = 0.8928251915212452


In [29]:
# Generate top 10 movie recommendations for each user
userRecs = model_1.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model_1.recommendForAllItems(10)

In [30]:
pd.DataFrame(userRecs.take(10), columns=['uid', 'recommendations'])

Unnamed: 0,uid,recommendations
0,167674030,"[(508488, 4.8512444496154785), (14375725, 4.65..."
1,167979490,"[(6945228, 2.3284921646118164), (4546777, 2.06..."
2,168040590,"[(6945228, 3.45124888420105), (508488, 3.28019..."
3,168736710,"[(6945228, 4.409367084503174), (508488, 4.3784..."
4,168914470,"[(6945228, 4.446792125701904), (89971, 4.17810..."
5,169006110,"[(14375725, 4.618165493011475), (6945228, 4.60..."
6,168045751,"[(23492088, 3.730064630508423), (4661109, 3.68..."
7,168294401,"[(23657920, 3.7841367721557617), (6945228, 3.7..."
8,168603631,"[(6945228, 2.90990948677063), (7044821, 2.8002..."
9,168870111,"[(508488, 3.3889827728271484), (6945228, 3.302..."


In [31]:
userRecs.show()

+---------+--------------------+
|      uid|     recommendations|
+---------+--------------------+
|167674030|[[508488, 4.85124...|
|167979490|[[6945228, 2.3284...|
|168040590|[[6945228, 3.4512...|
|168736710|[[6945228, 4.4093...|
|168914470|[[6945228, 4.4467...|
|169006110|[[14375725, 4.618...|
|168045751|[[23492088, 3.730...|
|168294401|[[23657920, 3.784...|
|168603631|[[6945228, 2.9099...|
|168870111|[[508488, 3.38898...|
|168930551|[[6945228, 4.9578...|
|169018731|[[6945228, 1.8339...|
|167670762|[[6945228, 3.0333...|
|167735352|[[6945228, 8.3415...|
|167760432|[[6945228, 4.6451...|
|168236162|[[6945228, 4.8850...|
|168366812|[[7148282, 5.3620...|
|168838372|[[508488, 4.42948...|
|168854802|[[6945228, 4.9964...|
|167894223|[[15195497, 3.702...|
+---------+--------------------+
only showing top 20 rows



In [32]:
pd.DataFrame(movieRecs.take(10), columns=['song_id', 'recommendations'])

Unnamed: 0,song_id,recommendations
0,69042,"[(168215501, 3.0692005157470703), (168586329, ..."
1,94695,"[(168404412, 1.5893669128417969), (167582094, ..."
2,101775,"[(167639478, 3.7814977169036865), (167582094, ..."
3,104656,"[(168134566, 3.793409824371338), (168894504, 3..."
4,107032,"[(168842861, 3.8268675804138184), (168754482, ..."
5,108221,"[(168842861, 1.4583182334899902), (167735352, ..."
6,109172,"[(168635054, 4.09170389175415), (168508071, 4...."
7,118989,"[(168586329, 4.126330852508545), (168215501, 4..."
8,121749,"[(168215501, 4.756392478942871), (168182801, 4..."
9,124743,"[(168445833, 4.167388916015625), (168757503, 4..."


In [33]:
movieRecs.show()

+-------+--------------------+
|song_id|     recommendations|
+-------+--------------------+
|  69042|[[168215501, 3.06...|
|  94695|[[168404412, 1.58...|
| 101775|[[167639478, 3.78...|
| 104656|[[168134566, 3.79...|
| 107032|[[168842861, 3.82...|
| 108221|[[168842861, 1.45...|
| 109172|[[168635054, 4.09...|
| 118989|[[168586329, 4.12...|
| 121749|[[168215501, 4.75...|
| 124743|[[168445833, 4.16...|
| 125502|[[168202866, 2.99...|
| 133948|[[168445833, 4.63...|
| 135976|[[168445833, 4.38...|
| 144907|[[168138422, 3.05...|
| 156365|[[168586329, 5.48...|
| 158257|[[168034451, 3.76...|
| 197258|[[168260599, 4.41...|
| 200878|[[168445833, 4.81...|
| 202641|[[168445833, 3.10...|
| 204529|[[168700215, 3.19...|
+-------+--------------------+
only showing top 20 rows



In [34]:
# make recommendation for a particular user
userRecs.where(F.col('uid')==10919480).show(truncate=False)

+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|uid     |recommendations                                                                                                                                                                                                          |
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|10919480|[[6945228, 4.8684473], [6897211, 4.355576], [7044821, 4.3359804], [1006365, 4.332773], [5655571, 4.332773], [21767145, 4.3055477], [4546777, 4.278472], [508488, 4.2591677], [4780222, 4.2423553], [14375725, 4.2402225]]|
+--------+--------------------------------------------------------------------------

In [35]:
# Generate top 10 movie recommendations for a specified set of users
users = df_reco_spark.select(als_1.getUserCol()).distinct().limit(3)
userSubsetRecs = model_1.recommendForUserSubset(users, 10)
# Generate top 10 user recommendations for a specified set of movies
movies = df_reco_spark.select(als_1.getItemCol()).distinct().limit(3)
movieSubSetRecs = model_1.recommendForItemSubset(movies, 10)

In [36]:
pd.DataFrame(userSubsetRecs.take(100), columns=['uid', 'recommendations '])

Unnamed: 0,uid,recommendations
0,167582087,"[(4148762, 3.68467378616333), (6644254, 3.6464..."
1,167649848,"[(6521928, 3.772763729095459), (14400895, 3.10..."


In [37]:
movieSubSetRecs.show()

+--------+--------------------+
| song_id|     recommendations|
+--------+--------------------+
|22847594|[[168445833, 4.75...|
| 8357238|[[167955372, 4.64...|
+--------+--------------------+



In [38]:
movieSubSetRecs.show(truncate=False)

+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|song_id |recommendations                                                                                                                                                                                                                             |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|22847594|[[168445833, 4.7534766], [168251083, 4.620491], [167735352, 4.534008], [168700215, 4.533351], [167955372, 4.5329924], [168444828, 4.531059], [168260599, 4.526379], [168877046, 4.5218973], [168635054, 4.521038], [169023131, 4.5112896]]  |
|8357238

### 4. recommend model2: implicit method

In [27]:
# based on below data, it suggest implicitPrefs=True increase RMSE


#als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True,
#          userCol="userId", itemCol="movieId", ratingCol="rating")

als_3 = ALS(maxIter=10, regParam=0.2, implicitPrefs=True, userCol="uid", itemCol="song_id", ratingCol="rating_final",
          coldStartStrategy="drop")

model_3 = als_3.fit(train)

predictions_3 = model_3.transform(test)
evaluator_3 = RegressionEvaluator(metricName="rmse", labelCol="rating_final",
                                predictionCol="prediction")
rmse_3 = evaluator_3.evaluate(predictions_3)
print("Root-mean-square error = " + str(rmse_3))


Root-mean-square error = 2.828045627477652


#### When implicitPrefs=True, RMSE increased a lot when it compares to model1. 
#### It suggests that RMSE is not appropriat for implicit rating evaluation