In [39]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, lit, col
from pyspark.sql.types import IntegerType

# Create the Spark session used by every cell below (or pick up an already
# running one); the driver host is set to localhost.
spark = (
    SparkSession.builder
    .appName("ALS")
    .config("spark.driver.host", "localhost")
    .getOrCreate()
)
# Load the raw attendance log. The raw frame keeps its own name so that
# re-running this cell always starts from the unmodified CSV contents.
attendance_raw = spark.read.csv('./attend.csv', header=True, inferSchema=True)

# Reduce the log to (groupId, userId, rating):
#   * drop the date/time, secondary-direction and lesson-id columns,
#   * rename the group and participant id columns to the names ALS is given later,
#   * derive an integer `rating` from the 'онлайн/офлайн' column.
attendance = (
    attendance_raw
    .drop(
        'дата занятия',
        'время начала занятия',
        'время окончания занятия',
        'направление 2',
        'направление 3',
        'уникальный номер занятия',
    )
    .withColumnRenamed('уникальный номер группы', 'groupId')
    .withColumnRenamed('уникальный номер участника', 'userId')
    # Both "Да" and "Нет" score 1; any other value (including null) scores 0.
    # Both branches are integer literals, so the column is an integer column
    # without a string round-trip and a follow-up cast.
    # NOTE(review): "Да" and "Нет" receive the same rating - confirm that
    # treating both values identically is intended.
    .withColumn(
        'rating',
        when(col('онлайн/офлайн').isin('Да', 'Нет'), lit(1)).otherwise(lit(0)),
    )
    .drop('онлайн/офлайн')
)
# Category dictionary: a ';'-separated CSV with a header row. Everything
# listed below is discarded; whatever columns remain are used by the join
# in the following statements.
categories = spark.read.csv(
    './dict.csv', sep=';', header='true', inferSchema='true'
).drop(
    'Разметка: Для ума/ Для души / Для тела',
    'id_level1',
    'level1',
    'id_level2',
    'level2',
    'd_level1',
    'd_level2',
    'd_level3',
)
# Group catalogue: discard the location, schedule and first/second direction
# columns, then rename the group id to 'groupId' and the third direction to
# 'leve3' (the key used to join against `categories`).
groups = (
    spark.read.csv('./groups.csv', header='true', inferSchema='true')
    .drop(
        'направление 1',
        'направление 2',
        'адрес площадки',
        'округ площадки',
        'район площадки',
        'расписание в активных периодах',
        'расписание в закрытых периодах',
        'расписание в плановом периоде',
    )
    .withColumnRenamed('уникальный номер', 'groupId')
    .withColumnRenamed('направление 3', 'leve3')
)
# Attach the category columns to each group via the shared 'leve3' key
# (inner join), then attach the result to each attendance row via 'groupId'.
groupsWithLevelIds = categories.join(groups, on='leve3', how='inner')
attendanceWithLevelIds = attendance.join(groupsWithLevelIds, on='groupId', how='inner')

# One row per (userId, id_level3) pair holding the summed rating; the
# aggregate column is named "sum(rating)", which the ALS cells refer to.
interests = attendanceWithLevelIds.groupBy("userId", "id_level3").sum("rating")

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
# Fixed seed so the train/test split and the ALS factor initialisation are
# repeatable between runs instead of changing the reported RMSE every time.
SEED = 42

# Split the (userId, id_level3, sum(rating)) rows into 80% training / 20% test.
(trainingInterests, testInterests) = interests.randomSplit([0.8, 0.2], seed=SEED)

# Build the recommendation model using ALS on the training data.
# coldStartStrategy='drop' removes predictions for users/items unseen during
# training so the evaluation metric is not NaN.
alsInterests = ALS(
    maxIter=5,
    rank=100,
    regParam=0.01,
    userCol="userId",
    itemCol="id_level3",
    ratingCol="sum(rating)",
    coldStartStrategy="drop",
    seed=SEED,
)

# Fit the ALS model to the training set.
modelInterests = alsInterests.fit(trainingInterests)

# Evaluate the model by computing the RMSE of its predictions on the test set.
predictionsInterests = modelInterests.transform(testInterests)
evaluatorInterests = RegressionEvaluator(
    metricName="rmse",
    labelCol="sum(rating)",
    predictionCol="prediction",
)
rmseInterests = evaluatorInterests.evaluate(predictionsInterests)
print("Root-mean-square error = " + str(rmseInterests))


Root-mean-square error = 29.325029438240207


### Cross Validation to Find the Optimal Model

Let's now find the optimal values for the parameters of the ALS model. Use the built-in Cross Validator in pyspark with a suitable param grid and determine the optimal model. Try with the parameters:

* regularization = [0.01, 0.001, 0.1]
* rank = [4,10,50]



In [36]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


# Fixed seed so the fold assignment and the ALS initialisation are repeatable;
# without it the selected model can differ from run to run.
SEED = 42

# Base estimator for the search; regParam and rank are supplied by the grid.
als_modelInterests = ALS(
    userCol="userId",
    itemCol="id_level3",
    ratingCol="sum(rating)",
    coldStartStrategy="drop",
    seed=SEED,
)

# 3 x 3 grid: regParam in {0.01, 0.001, 0.1} crossed with rank in {4, 10, 50}.
paramsInterests = (
    ParamGridBuilder()
    .addGrid(als_modelInterests.regParam, [0.01, 0.001, 0.1])
    .addGrid(als_modelInterests.rank, [4, 10, 50])
    .build()
)

# Cross-validate every grid point, scoring with the RMSE evaluator defined in
# the first cell and fitting up to 4 candidate models in parallel.
cvInterests = CrossValidator(
    estimator=als_modelInterests,
    estimatorParamMaps=paramsInterests,
    evaluator=evaluatorInterests,
    parallelism=4,
    seed=SEED,
)
# NOTE(review): the search is fitted on the full `interests` frame, which
# includes the rows held out as `testInterests` in the first cell - confirm
# that this is intended before comparing scores against that test set.
best_modelInterests = cvInterests.fit(interests)

# Display the winning model; its repr includes the selected rank. In the run
# recorded in this notebook the winner had rank 50.
best_modelInterests.bestModel

ALSModel: uid=ALS_62dd1dfeb1b4, rank=50

In [37]:
# Top-10 id_level3 recommendations for every user.
# NOTE(review): this uses `modelInterests` (the rank=100 model fitted in the
# first cell), not the cross-validated `best_modelInterests.bestModel` -
# confirm which model is meant to produce the final recommendations.
recommendationsInterests = modelInterests.recommendForAllUsers(10)

# Bring the recommendations for a single user back to the driver for display.
recommendationsInterests.filter(recommendationsInterests.userId == 101419598).collect()

[Row(userId=101419598, recommendations=[Row(id_level3=1720, rating=53.99146270751953), Row(id_level3=1118, rating=51.732566833496094), Row(id_level3=1421, rating=45.983795166015625), Row(id_level3=1113, rating=44.997684478759766), Row(id_level3=1180, rating=41.25835037231445), Row(id_level3=1281, rating=39.22730255126953), Row(id_level3=1171, rating=38.77926254272461), Row(id_level3=1278, rating=38.70088577270508), Row(id_level3=1287, rating=38.42405700683594), Row(id_level3=1283, rating=35.283958435058594)])]

In [30]:
# Shut down the Spark session created in the first cell; cells that use
# `spark` cannot be run after this until the session is created again.
spark.stop()