# Using ALS with Spark, evaluating CV results 

### Read the data, split into tokens and create a structured DataFrame. For low level tasks like splitting strings, we need to use an RDD, where we can apply a `map` function.

In [34]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
# the imports are used creating the data frame

spark = SparkSession.builder.getOrCreate() # create a SparkSession 

# this gets us an RDD. (could also be done with RDD.textFile in this case)
lines = spark.read.text("hdfs://saltdean/data/movielens/sample_movielens_ratings.txt").rdd 
# now split the lines at the '::'
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                    rating=float(p[2]), timestamp=int(p[3])))
print(ratingsRDD.take(1))
ratings = spark.createDataFrame(ratingsRDD)
print(ratings.take(1))
print(ratings)

ratings.createOrReplaceTempView('ratings') # register the DataFrame so that we can use it with Spark SQL.
(training, test) = ratings.randomSplit([0.8, 0.2]) # split into test and training set
print(training) # just for testing, should show the four columns
print(training.describe()) # just for testing, should show the four columns
print(training.count()) # just for testing, should be around 1200

[Row(movieId=2, rating=3.0, timestamp=1424380312, userId=0)]
[Row(movieId=2, rating=3.0, timestamp=1424380312, userId=0)]
DataFrame[movieId: bigint, rating: double, timestamp: bigint, userId: bigint]
DataFrame[movieId: bigint, rating: double, timestamp: bigint, userId: bigint]
DataFrame[summary: string, movieId: string, rating: string, timestamp: string, userId: string]
1194


### Take a very simple estimate as the baseline: calculate the mean of all ratings.    

In [35]:
SQL1 = 'SELECT AVG(rating) FROM ratings'
row = spark.sql(SQL1).collect()[0] # get the single row with the result

meanRating = row['avg(rating)'] # access Row as a map 
print('meanRating',meanRating)

se_rdd = test.rdd.map(lambda row: Row(se = pow(row['rating']-meanRating,2)) ) 
se_df = spark.createDataFrame(se_rdd) 
se_df.createOrReplaceTempView('se')
print('se_df',se_df)
SQL2 = 'SELECT AVG(se) FROM se'
row = spark.sql(SQL2).collect()[0]
meanSE = row['avg(se)'] # access Row as a map 
print('meanSE',meanSE)

meanRating 1.7741505662891406
se_df DataFrame[se: double]
meanSE 1.2883502813858931


### Create an ALS estimator and a parameter grid to explore different values for the `rank` and `regParam` parameter of the ALS. Then build a cross-validator to train the model.

In [36]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5, rank=5, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating")

paramGrid = ParamGridBuilder() \
    .addGrid(als.regParam, [0.03,0.1,0.3,1,3]) \
    .addGrid(als.rank, [1,3,10,30,100]).build()

regEval = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
nFolds = 6 # reh

crossVal = CrossValidator(estimator=als, estimatorParamMaps=paramGrid, evaluator=regEval, numFolds=nFolds)
print('starting cross-validation')
cvModel = crossVal.fit(training)
print('finished cross-validation')

starting cross-validation
finished cross-validation


### Hyper-parameter tuning and CV

Take the trained cvModel and extract the best parameter values by inspecting the estimatorParameterMap. Compare the RMSE value to that of the mean for different parameter settings.

The parameter maps and metrics lists we get from the `cvModel` are local Python list, so we need to use local methods, not RDD methods. There are however similar functions available, in particular `map` and `zip`, which work like for RDDs and `list` which is similar to RDD.collect in creating a mapped list. See here for documentation:  [https://docs.python.org/3/library/functions.html](https://docs.python.org/3/library/functions.html)

In [37]:
print(cvModel.avgMetrics) # the metrics form the CrossValidation
print(cvModel.getEstimatorParamMaps())
# use Python zip and list (not RDD functions, these are local Python object) to create a joint paramter and result list
paramMap = list(zip(cvModel.getEstimatorParamMaps(),cvModel.avgMetrics)) 
# use Python map to create a joint paramter and result list
paramMap = list(map(lambda epm_am: (epm_am[0],epm_am[1]/nFolds), paramMap)) 
#print(paramMap)
# use Python min to get the best params (i.e. those producing minimal RMSE) 
paramMax = min(paramMap, key=lambda x: x[1])
print(paramMax)

# Evaluate the cvModel by computing the RMSE on the test data
predictions = cvModel.transform(test)
rmse = regEval.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

[7.541827322676414, 7.504204833346607, 7.543234042274206, 9.151977064097533, 12.854056304461999, 8.160033474392774, 7.375578896757023, 7.043424389032287, 9.152147888376694, 12.854052608775902, 9.671700080379987, 6.889642010167341, 6.7899610659035075, 9.152071560984172, 12.854057648205941, 8.237358092183237, 6.551492911565812, 6.793163339438315, 9.15208245573481, 12.854055597319896, 8.286559416484007, 6.609700573830148, 6.787986162609502, 9.152061545631911, 12.854051124520904]
[{Param(parent='ALS_417d92275eb0e2db2b0c', name='rank', doc='rank of the factorization'): 1, Param(parent='ALS_417d92275eb0e2db2b0c', name='regParam', doc='regularization parameter (>= 0).'): 0.03}, {Param(parent='ALS_417d92275eb0e2db2b0c', name='rank', doc='rank of the factorization'): 1, Param(parent='ALS_417d92275eb0e2db2b0c', name='regParam', doc='regularization parameter (>= 0).'): 0.1}, {Param(parent='ALS_417d92275eb0e2db2b0c', name='rank', doc='rank of the factorization'): 1, Param(parent='ALS_417d92275eb0e

In [26]:
ratings = spark.read.csv('hdfs://saltdean/data/movielens/ml-latest-small/ratings.csv',header=True,inferSchema=True) 
ratings.dtypes

[('userId', 'int'),
 ('movieId', 'int'),
 ('rating', 'double'),
 ('timestamp', 'int')]