In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, lit, col
from pyspark.sql.types import IntegerType

# Start (or reuse) the Spark session behind every DataFrame in this notebook.
# The application is named "ALS" and spark.driver.host is pinned to localhost.
spark = (
    SparkSession.builder
    .appName("ALS")
    .config("spark.driver.host", "localhost")
    .getOrCreate()
)

In [79]:
# Load the raw attendance log and reduce it to the (groupId, userId, rating)
# triples that the ALS cells below consume.
attendance_raw = spark.read.csv('./attend.csv', header=True, inferSchema=True)

# A visit counts as one interaction whether the online/offline flag is
# "Да" (yes) or "Нет" (no); a missing or unexpected flag yields 0.
# Every branch uses an integer literal, so the column is already an int and
# no string-to-int cast is needed afterwards.
# NOTE(review): with this encoding almost every rating is 1 - confirm that an
# explicit-feedback ALS model is really what is wanted here.
attended = col('онлайн/офлайн').isin("Да", "Нет")
attendance = attendance_raw.select(
    col('уникальный номер группы').alias('groupId'),
    col('уникальный номер участника').alias('userId'),
    when(attended, lit(1)).otherwise(lit(0)).alias('rating'),
)
print(attendance.dtypes)
attendance.head(5)

[Stage 9458:>                                                       (0 + 8) / 8]

[('groupId', 'int'), ('userId', 'int'), ('rating', 'int')]


                                                                                

[Row(groupId=801346550, userId=101352023, rating=1),
 Row(groupId=801346550, userId=101385462, rating=1),
 Row(groupId=801346550, userId=101421897, rating=1),
 Row(groupId=801346550, userId=101354499, rating=1),
 Row(groupId=801346550, userId=101421312, rating=1)]

### Fitting the Alternating Least Squares Model

Because this dataset is already preprocessed for us, we can go ahead and fit the Alternating Least Squares model.

* Import the ALS module from pyspark.ml.recommendation.
* Use the randomSplit method on the pyspark DataFrame to separate the dataset into a training and test set
* Instantiate the Alternating Least Squares model, setting `userCol`, `itemCol`, and `ratingCol` to the appropriate column names for this dataset. Then fit it to the training set and assign the result to a variable named `model`.

In [21]:
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.recommendation import ALS

# Split into an 80% training set and a 20% held-out test set.
# The explicit seed keeps the sampling repeatable across reruns.
(training, test) = attendance.randomSplit([0.8, 0.2], seed=42)

# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop" removes rows whose user or group was never seen in
# training, so the evaluation metric does not become NaN.
# The seed fixes the random initialisation of the factors.
als = ALS(maxIter=5, rank=4, regParam=0.01,
          userCol="userId", itemCol="groupId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)

# Fit the ALS model to the training set.
model = als.fit(training)


23/05/25 23:36:40 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/25 23:36:40 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/05/25 23:36:40 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [33]:
# Peek at one row of the learned user factors (user id plus feature vector).
# NOTE(review): the saved output lists far more than 4 features, so it was
# presumably produced after `model` was refit with a larger rank - rerun in order.
model.userFactors.first()

Row(id=101346610, features=[-0.09497742354869843, 0.057440996170043945, -0.00277847982943058, 0.15691901743412018, -0.08705680072307587, 0.06693026423454285, 0.14026832580566406, 0.12499383836984634, 0.045402467250823975, 0.025674080476164818, -0.013423117808997631, 0.06911619752645493, -0.20548807084560394, -0.04170578718185425, -0.12842290103435516, 0.01386646181344986, -0.036058805882930756, 0.06587745994329453, 0.008388432674109936, -0.05730828642845154, -0.10669739544391632, 0.0077640600502491, -0.05352290719747543, 0.09031370282173157, -0.12390410900115967, -0.008904114365577698, 0.005484557244926691, 0.02336527593433857, 0.016564112156629562, -0.04919910430908203, -0.0017062033293768764, 0.04292133077979088, -0.08697185665369034, -0.10933773964643478, 0.0794006884098053, 0.025934170931577682, -0.056310731917619705, 0.15212248265743256, -0.12254568934440613, -0.06573570519685745, -0.04081561788916588, 0.11978831142187119, -0.01604308746755123, 0.001383399241603911, 0.000390144065

In [32]:
# Peek at one row of the learned item (group) factors (group id plus feature vector).
# NOTE(review): the saved output lists far more than 4 features, so it was
# presumably produced after `model` was refit with a larger rank - rerun in order.
model.itemFactors.first()



Row(id=801346550, features=[-0.5429868698120117, -0.5055123567581177, 0.026591012254357338, -0.23246563971042633, 0.2155381590127945, 0.29472947120666504, -0.06582456827163696, -0.27172520756721497, 0.17953981459140778, 0.15475395321846008, -0.17308685183525085, 0.4240971803665161, -0.020962323993444443, 0.07267415523529053, -0.5862930417060852, 0.278379887342453, 0.12836910784244537, -0.09696116298437119, 0.15661607682704926, 0.3113566040992737, 0.13550250232219696, -0.06602007150650024, 0.0677306205034256, 0.40290719270706177, 0.011147268116474152, 0.03184487298130989, -0.06647253781557083, 0.195112407207489, 0.4735880196094513, -0.41877150535583496, 0.05611487105488777, 0.5212815999984741, 0.18407407402992249, -0.0915948897600174, -0.07038358598947525, 0.37637531757354736, -0.18032342195510864, 0.1895064264535904, -0.11977829784154892, -0.07220704853534698, -0.3280166983604431, -0.026818908751010895, 0.10839580744504929, -0.029526978731155396, 0.04546525329351425, -0.207727804780006

Now you've fit the model, and it's time to evaluate it to determine just how well it performed.

* import the `RegressionEvaluator` from `pyspark.ml.evaluation`
* generate predictions with your model for the test set by using the `transform` method on your ALS model
* evaluate your model and print out the RMSE from your test set

In [23]:
# RegressionEvaluator computes the error metric for the predictions.
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the true `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

# Score the held-out test rows and report the error.
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

[Stage 86:>                                                         (0 + 8) / 8]

Root-mean-square error = 0.3900732774893743


                                                                                

### Cross Validation to Find the Optimal Model

Let's now find the optimal values for the parameters of the ALS model. Use the built-in Cross Validator in pyspark with a suitable param grid and determine the optimal model. Try with the parameters:

* regularization = [0.01, 0.001, 0.1]
* rank = [4,10,50]



In [26]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


# Base estimator; regParam and rank are supplied by the grid below.
# The seed fixes the random factor initialisation for every candidate.
als_model = ALS(userCol="userId", itemCol="groupId", ratingCol="rating",
                coldStartStrategy="drop", seed=42)

# 3 x 3 grid: every regParam value is combined with every rank value.
params = (
    ParamGridBuilder()
    .addGrid(als_model.regParam, [0.01, 0.001, 0.1])
    .addGrid(als_model.rank, [4, 10, 50])
    .build()
)

# Cross-validator scored with the RMSE `evaluator` defined in the evaluation
# cell above. The seed makes the fold assignment repeatable.
cv = CrossValidator(estimator=als_model, estimatorParamMaps=params,
                    evaluator=evaluator, parallelism=4, seed=42)

# Tune on the training split only, so the held-out `test` rows play no part
# in model selection and later test RMSE stays an honest estimate.
best_model = cv.fit(training)

# In the saved run the winning model had rank=50, which the next section reuses.
# NOTE(review): only the rank is shown here; the winning regParam is not
# inspected before 0.01 is hard-coded in the next fit.
best_model.bestModel

                                                                                

ALSModel: uid=ALS_0764f29c0fac, rank=50

### Fitting the Alternating Least Squares Model with rank 50

In [27]:
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.recommendation import ALS

# Split into an 80% training set and a 20% held-out test set.
# The explicit seed keeps the sampling repeatable across reruns.
(training, test) = attendance.randomSplit([0.8, 0.2], seed=42)

# Rebuild the ALS model with rank=50, the rank selected by cross validation.
# coldStartStrategy="drop" removes rows whose user or group was never seen in
# training, so the evaluation metric does not become NaN.
# The seed fixes the random initialisation of the factors.
als = ALS(maxIter=5, rank=50, regParam=0.01,
          userCol="userId", itemCol="groupId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)

# Fit the ALS model to the training set.
model = als.fit(training)

                                                                                

In [28]:
# RegressionEvaluator computes the error metric for the predictions.
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the true `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

# Score the held-out test rows with the refit model and report the error.
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")



Root-mean-square error = 0.06775719589157905




## Getting Recommendations

Now it's time to actually get some recommendations! The ALS model has built in methods called `recommendForUserSubset` and `recommendForAllUsers`.

In the next cell, we create a DataFrame with the top 10 recommendations for every user and then select a single user to inspect their recommendations:

In [29]:
# Top-10 group recommendations for every user known to the model ...
recommendations = model.recommendForAllUsers(10)
# ... then pull out the row for a single user.
recommendations.filter(recommendations.userId == 101352023).collect()

                                                                                

[Row(userId=101352023, recommendations=[Row(groupId=801369579, rating=2.3925416469573975), Row(groupId=801350039, rating=2.3474323749542236), Row(groupId=801348610, rating=2.3401365280151367), Row(groupId=801365340, rating=2.3213629722595215), Row(groupId=801349631, rating=2.2758753299713135), Row(groupId=801371096, rating=2.2596638202667236), Row(groupId=801349185, rating=2.209477424621582), Row(groupId=801359965, rating=2.20817494392395), Row(groupId=801366418, rating=2.1873857975006104), Row(groupId=801368424, rating=2.1793465614318848)])]

## User interests (categories)

In [80]:
# Category dictionary: semicolon-separated CSV with a header row; column types are inferred.
categories = (
    spark.read
    .options(sep=';', header='true', inferSchema='true')
    .csv('./dict.csv')
)

In [81]:
# Discard every dictionary column listed here in one call; the cell below
# shows what remains (the level-3 id and name).
unused_category_columns = [
    'Разметка: Для ума/ Для души / Для тела',
    'id_level1',
    'level1',
    'id_level2',
    'level2',
    'd_level1',
    'd_level2',
    'd_level3',
]
categories = categories.drop(*unused_category_columns)

In [82]:
# Sanity check: show the first five rows of the pruned category dictionary.
categories.head(5)

[Row(id_level3=1042, leve3='Иные интеллектуальные игры'),
 Row(id_level3=1040, leve3='Викторины'),
 Row(id_level3=1041, leve3='Квест'),
 Row(id_level3=1043, leve3='Брейн-ринг'),
 Row(id_level3=323, leve3='Современные настольные игры')]

In [83]:
# Group catalogue: CSV with a header row; column types are inferred.
groups = (
    spark.read
    .options(header='true', inferSchema='true')
    .csv('./groups.csv')
)

In [84]:
# Inspect the inferred column names and types before pruning.
groups.dtypes

[('уникальный номер', 'int'),
 ('направление 1', 'string'),
 ('направление 2', 'string'),
 ('направление 3', 'string'),
 ('адрес площадки', 'string'),
 ('округ площадки', 'string'),
 ('район площадки', 'string'),
 ('расписание в активных периодах', 'string'),
 ('расписание в закрытых периодах', 'string'),
 ('расписание в плановом периоде', 'string')]

In [85]:
# Keep only the group id and its level-3 direction name.
unused_group_columns = [
    'направление 1',
    'направление 2',
    'адрес площадки',
    'округ площадки',
    'район площадки',
    'расписание в активных периодах',
    'расписание в закрытых периодах',
    'расписание в плановом периоде',
]
# The direction column is renamed to 'leve3' (sic) so that it matches the
# column name used by the category dictionary and can serve as the join key.
groups = (
    groups.drop(*unused_group_columns)
    .withColumnRenamed('уникальный номер', 'groupId')
    .withColumnRenamed('направление 3', 'leve3')
)

In [86]:
# Sanity check: show the first five (groupId, leve3) rows.
groups.head(5)

[Row(groupId=801357270, leve3='ОФП'),
 Row(groupId=801356857, leve3='ОФП'),
 Row(groupId=801351684, leve3='ОФП'),
 Row(groupId=801353683, leve3='ОФП'),
 Row(groupId=801352164, leve3='ОФП')]

In [87]:
# Attach the numeric level-3 id to every group by matching on the level-3 name.
# This is an inner join, so groups whose name is absent from the dictionary are dropped.
groupsWithLevelIds = categories.join(groups, on='leve3', how='inner')
groupsWithLevelIds.head(5)

[Row(leve3='ОФП', id_level3=104, groupId=801357270),
 Row(leve3='ОФП', id_level3=104, groupId=801356857),
 Row(leve3='ОФП', id_level3=104, groupId=801351684),
 Row(leve3='ОФП', id_level3=104, groupId=801353683),
 Row(leve3='ОФП', id_level3=104, groupId=801352164)]

In [88]:
# Enrich every attendance row with the level-3 category of the attended group.
# This is an inner join on groupId.
attendanceWithLevelIds = attendance.join(groupsWithLevelIds, on='groupId', how='inner')
attendanceWithLevelIds.head(5)

                                                                                

[Row(groupId=801346710, userId=101386726, rating=1, leve3='Дыхательная гимнастика', id_level3=171),
 Row(groupId=801346710, userId=101430794, rating=1, leve3='Дыхательная гимнастика', id_level3=171),
 Row(groupId=801346810, userId=101366986, rating=1, leve3='ОНЛАЙН Ментальная арифметика', id_level3=1173),
 Row(groupId=801346810, userId=101374816, rating=1, leve3='ОНЛАЙН Ментальная арифметика', id_level3=1173),
 Row(groupId=801346810, userId=101381146, rating=1, leve3='ОНЛАЙН Ментальная арифметика', id_level3=1173)]

In [90]:
# Number of attendance rows that survived the join with the group categories.
# NOTE(review): compare with attendance.count() to check the join neither
# dropped nor duplicated rows.
attendanceWithLevelIds.count()

                                                                                

6537789

In [91]:
# Total rating per (user, level-3 category) pair. The aggregate column is
# named "sum(rating)", which the ALS cells below use as their rating column.
per_user_category = attendanceWithLevelIds.groupBy("userId", "id_level3")
interests = per_user_category.sum("rating")
interests.head(5)

                                                                                

[Row(userId=101419598, id_level3=1788, sum(rating)=20),
 Row(userId=101357168, id_level3=1113, sum(rating)=26),
 Row(userId=101364367, id_level3=1421, sum(rating)=71),
 Row(userId=101390957, id_level3=1165, sum(rating)=6),
 Row(userId=101365143, id_level3=1421, sum(rating)=35)]

In [92]:
# Number of distinct (userId, id_level3) pairs produced by the aggregation.
interests.count()

                                                                                

316550

In [94]:
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.recommendation import ALS

# Split into an 80% training set and a 20% held-out test set.
# The explicit seed keeps the sampling repeatable across reruns.
(trainingInterests, testInterests) = interests.randomSplit([0.8, 0.2], seed=42)

# ALS over (user, level-3 category) pairs, using the summed rating as the target.
# coldStartStrategy="drop" removes rows whose user or category was never seen
# in training, so the evaluation metric does not become NaN.
# The seed fixes the random initialisation of the factors.
alsInterests = ALS(maxIter=5, rank=4, regParam=0.01,
                   userCol="userId", itemCol="id_level3", ratingCol="sum(rating)",
                   coldStartStrategy="drop", seed=42)

# Fit the ALS model to the training set.
modelInterests = alsInterests.fit(trainingInterests)

                                                                                

In [95]:
# Peek at one row of the learned user factors for the interests model.
modelInterests.userFactors.first()

Row(id=101346610, features=[25.293867111206055, -39.14100646972656, 18.81976318359375, 64.5489273071289])

In [96]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the true "sum(rating)" column and the model's `prediction` column.
evaluatorInterests = RegressionEvaluator(
    labelCol="sum(rating)",
    predictionCol="prediction",
    metricName="rmse",
)

# Score the held-out interest rows and report the error.
predictionsInterests = modelInterests.transform(testInterests)
rmseInterests = evaluatorInterests.evaluate(predictionsInterests)
print(f"Root-mean-square error = {rmseInterests}")



Root-mean-square error = 57.550114179685835


                                                                                

In [69]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


# Base estimator; regParam and rank are supplied by the grid below.
# The seed fixes the random factor initialisation for every candidate.
als_modelInterests = ALS(userCol="userId", itemCol="id_level3", ratingCol="sum(rating)",
                         coldStartStrategy="drop", seed=42)

# 3 x 3 grid: every regParam value is combined with every rank value.
paramsInterests = (
    ParamGridBuilder()
    .addGrid(als_modelInterests.regParam, [0.01, 0.001, 0.1])
    .addGrid(als_modelInterests.rank, [4, 10, 50])
    .build()
)

# Cross-validator scored with the RMSE `evaluatorInterests` defined in the
# evaluation cell above. The seed makes the fold assignment repeatable.
cvInterests = CrossValidator(estimator=als_modelInterests, estimatorParamMaps=paramsInterests,
                             evaluator=evaluatorInterests, parallelism=4, seed=42)

# Tune on the training split only, so the held-out `testInterests` rows play
# no part in model selection.
best_modelInterests = cvInterests.fit(trainingInterests)

# In the saved run the winning model had rank=50.
# NOTE(review): 50 is the largest rank in the grid, so the optimum may lie
# beyond it; only the rank is shown here, not the winning regParam.
best_modelInterests.bestModel

23/05/26 00:59:28 WARN CacheManager: Asked to cache already cached data.
23/05/26 00:59:28 WARN CacheManager: Asked to cache already cached data.
                                                                                ]

ALSModel: uid=ALS_507da242d782, rank=50

In [111]:
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.recommendation import ALS

# Split into an 80% training set and a 20% held-out test set.
# The explicit seed keeps the sampling repeatable across reruns.
(trainingInterests, testInterests) = interests.randomSplit([0.8, 0.2], seed=42)

# Refit the interests model with a larger rank.
# NOTE(review): rank=100 is outside the cross-validated grid [4, 10, 50] and
# differs from the rank-50 winner reported above - confirm it is intentional.
# coldStartStrategy="drop" removes rows whose user or category was never seen
# in training, so the evaluation metric does not become NaN.
# The seed fixes the random initialisation of the factors.
alsInterests = ALS(maxIter=5, rank=100, regParam=0.01,
                   userCol="userId", itemCol="id_level3", ratingCol="sum(rating)",
                   coldStartStrategy="drop", seed=42)

# Fit the ALS model to the training set.
modelInterests = alsInterests.fit(trainingInterests)

                                                                                

In [112]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE between the true "sum(rating)" column and the model's `prediction` column.
evaluatorInterests = RegressionEvaluator(
    labelCol="sum(rating)",
    predictionCol="prediction",
    metricName="rmse",
)

# Score the held-out interest rows with the refit model and report the error.
predictionsInterests = modelInterests.transform(testInterests)
rmseInterests = evaluatorInterests.evaluate(predictionsInterests)
print(f"Root-mean-square error = {rmseInterests}")



Root-mean-square error = 28.143804777455088


                                                                                

In [102]:
# Top-10 level-3 categories for every user known to the interests model ...
recommendationsInterests = modelInterests.recommendForAllUsers(10)
# ... then pull out the row for a single user.
recommendationsInterests.filter(recommendationsInterests.userId == 101419598).collect()

                                                                                

[Row(userId=101419598, recommendations=[Row(id_level3=448, rating=58.27103042602539), Row(id_level3=149, rating=54.70125198364258), Row(id_level3=1720, rating=53.996788024902344), Row(id_level3=1118, rating=47.52177047729492), Row(id_level3=152, rating=45.99745178222656), Row(id_level3=1421, rating=45.98931121826172), Row(id_level3=1416, rating=45.182411193847656), Row(id_level3=656, rating=41.04288101196289), Row(id_level3=1280, rating=37.36673355102539), Row(id_level3=1287, rating=37.02436065673828)])]

In [38]:
# Shut down the Spark session; no DataFrame or model above can be used after this.
spark.stop()