In [32]:
#import findspark
#findspark.init()
import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql import Row
import pyspark.sql.functions as func
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from sklearn.model_selection import KFold

# Start (or reuse) a Spark session that runs locally on two threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("COM6012 Assignment 1 Task2")
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext


In [33]:
# Read the ratings file as plain text lines and split each line on commas.
lines = spark.read.text("ml-25m/ratings.csv").rdd
parts = lines.map(lambda record: record.value.split(","))

# The first record is treated as the CSV header; drop every row equal to it.
header = parts.first()
parts = parts.filter(lambda fields: fields != header)

In [34]:
# Peek at the first five parsed records to sanity-check the split.
for fields in parts.take(5):
    print(fields)

['1', '296', '5.0', '1147880044']
['1', '306', '3.5', '1147868817']
['1', '307', '5.0', '1147868828']
['1', '665', '5.0', '1147878820']
['1', '899', '3.5', '1147868510']


In [35]:
# Convert each list of string fields into a typed Row, then build a DataFrame.
ratingsRDD = parts.map(
    lambda fields: Row(
        userId=int(fields[0]),
        movieId=int(fields[1]),
        rating=float(fields[2]),
        timestamp=int(fields[3]),
    )
)
ratings = spark.createDataFrame(ratingsRDD)

In [36]:
# Fixed seed so the same three folds (and therefore the same metrics) are
# produced on every Restart & Run All; without it randomSplit picks a new
# random seed on each call.
SPLIT_SEED = 6012

# Equal weights request three folds of roughly equal size.
(fold_0, fold_1, fold_2) = ratings.randomSplit([1.0, 1.0, 1.0], seed=SPLIT_SEED)

# Round i trains on data_list[i] plus the next fold (wrapping around);
# test_list[i] is the one fold left out of that training pair.
data_list = [fold_0,fold_1,fold_2]
test_list = [fold_2,fold_0,fold_1]

In [43]:
# Keyword arguments shared by every candidate model.
# coldStartStrategy="drop" discards predictions for users/items unseen during
# fitting so they do not turn the evaluation metrics into NaN.
als_common = dict(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
)

# Three candidates that differ in rank, iteration count and regularisation.
# NOTE(review): maxIter=100/250 builds long lineages; ALS checkpointing only
# applies when a checkpoint directory is set on the SparkContext - confirm
# whether one is needed for these settings.
als_1 = ALS(rank=5, maxIter=10, regParam=0.1, **als_common)
als_2 = ALS(rank=40, maxIter=100, regParam=0.05, **als_common)
als_3 = ALS(rank=80, maxIter=250, regParam=1.5, **als_common)

In [44]:
# Both evaluators compare the "rating" label column with the "prediction"
# column; they differ only in the metric reported.
rmse_evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
mae_evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="mae",
)

In [39]:
def cross_validate(als_version, folds=None, rmse_eval=None, mae_eval=None):
    """Run k-fold cross-validation for one estimator and collect per-fold errors.

    In round ``i`` the fold just before position ``i`` (wrapping around) is
    held out for testing and the estimator is fitted on the union of all the
    other folds, starting with fold ``i``. Every fold is therefore held out
    exactly once. The held-out fold is derived from its position, so no
    separately maintained list of test folds is needed.

    Args:
        als_version: Estimator exposing ``fit(train)``; the fitted model must
            expose ``transform(test)``.
        folds: Sequence of at least two DataFrames (they should be disjoint).
            Defaults to the notebook-level ``data_list``.
        rmse_eval: Object whose ``evaluate(predictions)`` returns the RMSE.
            Defaults to the notebook-level ``rmse_evaluator``.
        mae_eval: Object whose ``evaluate(predictions)`` returns the MAE.
            Defaults to the notebook-level ``mae_evaluator``.

    Returns:
        tuple: ``(rmse_list, mae_list)``, each holding one value per round in
        round order.

    Raises:
        ValueError: If fewer than two folds are available.
    """
    # Fall back to the notebook globals only when the caller supplied nothing,
    # which keeps the original one-argument call working unchanged.
    if folds is None:
        folds = data_list
    if rmse_eval is None:
        rmse_eval = rmse_evaluator
    if mae_eval is None:
        mae_eval = mae_evaluator

    folds = list(folds)
    fold_count = len(folds)
    if fold_count < 2:
        raise ValueError("cross_validate needs at least two folds")

    rmse_list = []
    mae_list = []
    for fold_i in range(fold_count):
        # Python's modulo maps -1 to the last index, giving the wrap-around.
        held_out = folds[(fold_i - 1) % fold_count]

        # Union the remaining k-1 folds in order, starting at fold_i.
        train = folds[fold_i]
        for offset in range(1, fold_count - 1):
            train = train.union(folds[(fold_i + offset) % fold_count])

        model = als_version.fit(train)
        predictions = model.transform(held_out)

        rmse_list.append(rmse_eval.evaluate(predictions))
        mae_list.append(mae_eval.evaluate(predictions))

    return rmse_list, mae_list

In [40]:
# Cross-validate each candidate in turn. The calls are kept as separate
# statements so results already assigned survive if a later fit fails.
rmse_als_1, mae_als_1 = cross_validate(als_1)
rmse_als_2, mae_als_2 = cross_validate(als_2)
rmse_als_3, mae_als_3 = cross_validate(als_3)

In [41]:
# Per-fold RMSE values returned by cross_validate for als_1.
rmse_als_1

[0.8038265509801306, 0.8047327577509259, 0.8042296943178026]

In [42]:
# Per-fold MAE values returned by cross_validate for als_1.
mae_als_1

[0.6192979239873788, 0.6199056936852945, 0.6195750478130315]