In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, lit, col, sum
from pyspark.sql.types import IntegerType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

spark = SparkSession\
        .builder\
        .appName("ALS").config("spark.driver.host","localhost")\
        .getOrCreate()
# read in the dataset into pyspark DataFrame
attendance = spark.read.csv('./attend.csv', header='true', inferSchema = 'true')
attendance = attendance.drop('дата занятия') \
                        .drop('время начала занятия') \
                        .drop('время окончания занятия') \
                        .drop('направление 2') \
                        .drop('направление 3') \
                        .drop('уникальный номер занятия') \
                        .withColumnRenamed('уникальный номер группы', 'groupId') \
                        .withColumnRenamed('уникальный номер участника', 'userId') \
                        .withColumn("rank", when((attendance['онлайн/офлайн'] == "Да"), 1) \
                                                .when((attendance['онлайн/офлайн'] == "Нет"), 1) \
                                                .otherwise(lit("0"))) \
                        .drop('онлайн/офлайн')
attendance = attendance.withColumn("rank", col('rank').cast(IntegerType()))
categories = spark.read.csv('./dict.csv', sep =';', header='true', inferSchema = 'true')
categories = categories.drop('id_level1') \
                        .drop('level1') \
                        .drop('id_level2') \
                        .drop('level2') \
                        .drop('d_level1') \
                        .drop('d_level2') \
                        .drop('d_level3') \
                        .withColumn("id_level0", when((categories['Разметка: Для ума/ Для души / Для тела'] == "Для ума"), 10001) \
                                                .when((categories['Разметка: Для ума/ Для души / Для тела'] == "Для души"), 10002) \
                                                .when((categories['Разметка: Для ума/ Для души / Для тела'] == "Для тела"), 10003) \
                                                .otherwise(-1))
                
groups = spark.read.csv('./groups.csv', header='true', inferSchema = 'true')
groups = groups.drop('направление 1') \
                        .drop('направление 2') \
                        .drop('адрес площадки') \
                        .drop('округ площадки') \
                        .drop('район площадки') \
                        .drop('расписание в активных периодах') \
                        .drop('расписание в закрытых периодах') \
                        .drop('расписание в плановом периоде') \
                        .withColumnRenamed('уникальный номер', 'groupId') \
                        .withColumnRenamed('направление 3', 'leve3')
groupsWithLevelIds = categories.join(groups, 'leve3')
attendanceWithLevelIds=attendance.join(groupsWithLevelIds, 'groupId')
interestsLvl3 = attendanceWithLevelIds.groupBy("userId", "id_level3").agg(sum("rank").alias("rank"))


(trainingInterestsLvl3, testInterestsLvl3) = interestsLvl3.randomSplit([0.8, 0.2])

alsInterestsLvl3 = ALS(maxIter=5,rank=100, regParam=0.01, userCol="userId", itemCol="id_level3", ratingCol="rank",
          coldStartStrategy="drop")


modelInterestsLvl3 = alsInterestsLvl3.fit(trainingInterestsLvl3)


predictionsInterestsLvl3 = modelInterestsLvl3.transform(testInterestsLvl3)
evaluatorInterestsLvl3 = RegressionEvaluator(metricName="rmse", labelCol="rank",
                                predictionCol="prediction")
rmseInterestsLvl3 = evaluatorInterestsLvl3.evaluate(predictionsInterestsLvl3)
print("Root-mean-square error = " + str(rmseInterestsLvl3))


Root-mean-square error = 28.461277254248056


In [13]:
interestsLvl0 = attendanceWithLevelIds.groupBy("userId", "id_level0").agg(sum("rank").alias("rank"))

(trainingInterestsLvl0, testInterestsLvl0) = interestsLvl0.randomSplit([0.8, 0.2])

alsInterestsLvl0 = ALS(maxIter=5,rank=100, regParam=0.01, userCol="userId", itemCol="id_level0", ratingCol="rank",
          coldStartStrategy="drop")


modelInterestsLvl0 = alsInterestsLvl0.fit(trainingInterestsLvl0)


predictionsInterestsLvl0 = modelInterestsLvl0.transform(testInterestsLvl0)
evaluatorInterestsLvl0 = RegressionEvaluator(metricName="rmse", labelCol="rank",
                                predictionCol="prediction")
rmseInterestsLvl0 = evaluatorInterestsLvl0.evaluate(predictionsInterestsLvl0)
print("Root-mean-square error = " + str(rmseInterestsLvl0))

Root-mean-square error = 89.29878444147431


### Cross Validation to Find the Optimal Model

Let's now find the optimal values for the parameters of the ALS model. Use the built-in Cross Validator in pyspark with a suitable param grid and determine the optimal model. Try with the parameters:

* regularization = [0.01,0.001,0.1])
* rank = [4,10,50]



In [36]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


als_modelInterests =  ALS(userCol="userId", itemCol="id_level3", ratingCol="sum(rating)", coldStartStrategy="drop")

                 
paramsInterests = ParamGridBuilder().addGrid(als_modelInterests.regParam, [0.01,0.001,0.1]).addGrid(als_modelInterests.rank, [4,10,50]).build()


## instantiating crossvalidator estimator
cvInterests = CrossValidator(estimator=als_modelInterests, estimatorParamMaps=paramsInterests,evaluator=evaluatorInterests,parallelism=4)
best_modelInterests = cvInterests.fit(interests)    

# We see the best model has a rank of 50, so we will use that in our future models with this dataset
best_modelInterests.bestModel

ALSModel: uid=ALS_62dd1dfeb1b4, rank=50

In [14]:
recommendationsInterestsLvl3 = modelInterestsLvl3.recommendForAllUsers(10)
recommendationsInterestsLvl3.where(recommendationsInterestsLvl3["userId"] == 101419598).collect()



[Row(userId=101419598, recommendations=[Row(id_level3=1274, rating=79.53845977783203), Row(id_level3=1185, rating=75.10376739501953), Row(id_level3=1180, rating=56.39841079711914), Row(id_level3=1720, rating=53.994232177734375), Row(id_level3=1753, rating=47.49144744873047), Row(id_level3=1421, rating=45.98579025268555), Row(id_level3=1113, rating=44.996299743652344), Row(id_level3=1394, rating=44.623226165771484), Row(id_level3=1192, rating=44.60888671875), Row(id_level3=1187, rating=42.43898010253906)])]

In [15]:
recommendationsInterestsLvl0 = modelInterestsLvl0.recommendForAllUsers(3)
recommendationsInterestsLvl0.where(recommendationsInterestsLvl0["userId"] == 101419598).collect()

[Row(userId=101419598, recommendations=[Row(id_level0=10001, rating=253.99533081054688), Row(id_level0=10002, rating=82.99931335449219), Row(id_level0=10003, rating=6.002247333526611)])]

In [30]:
spark.stop()