In [79]:
#import findspark
#findspark.init()
import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql import Row
import pyspark.sql.functions as func
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

# Create (or reuse) the Spark session for this notebook, using the
# `local[2]` master and the assignment's application name.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("COM6012 Assignment 1 Task2")
    .getOrCreate()
)

# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext


In [61]:
# Read the ratings file as plain text; each element of the RDD is a Row
# whose `value` field holds one line of the file.
lines = spark.read.text("ml-25m/ratings.csv").rdd

# Split every line on commas, giving one list of string fields per line.
all_parts = lines.map(lambda text_row: text_row.value.split(","))

# The first element is taken to be the header; drop every element equal to it.
header = all_parts.first()
parts = all_parts.filter(lambda fields: fields != header)

In [62]:
# Print the first five elements of `parts` as a quick sanity check.
first_parts = parts.take(5)
for line in first_parts:
    print(line)

['1', '296', '5.0', '1147880044']
['1', '306', '3.5', '1147868817']
['1', '307', '5.0', '1147868828']
['1', '665', '5.0', '1147878820']
['1', '899', '3.5', '1147868510']


In [63]:
# Turn each list of string fields into a typed Row
# (ids and timestamp as int, rating as float) ...
ratingsRDD = parts.map(
    lambda fields: Row(
        userId=int(fields[0]),
        movieId=int(fields[1]),
        rating=float(fields[2]),
        timestamp=int(fields[3]),
    )
)

# ... and build a DataFrame from the resulting RDD of Rows.
ratings = spark.createDataFrame(ratingsRDD)

In [64]:
# Print the first ten rows of the ratings DataFrame as a quick sanity check.
first_ratings = ratings.take(10)
for line in first_ratings:
    print(line)

Row(movieId=296, rating=5.0, timestamp=1147880044, userId=1)
Row(movieId=306, rating=3.5, timestamp=1147868817, userId=1)
Row(movieId=307, rating=5.0, timestamp=1147868828, userId=1)
Row(movieId=665, rating=5.0, timestamp=1147878820, userId=1)
Row(movieId=899, rating=3.5, timestamp=1147868510, userId=1)
Row(movieId=1088, rating=4.0, timestamp=1147868495, userId=1)
Row(movieId=1175, rating=3.5, timestamp=1147868826, userId=1)
Row(movieId=1217, rating=3.5, timestamp=1147878326, userId=1)
Row(movieId=1237, rating=5.0, timestamp=1147868839, userId=1)
Row(movieId=1250, rating=4.0, timestamp=1147868414, userId=1)


In [65]:
# 80/20 train/test split. A fixed seed makes the partition (and therefore the
# reported RMSE) repeatable across kernel restarts; without it every run
# evaluates on a different random test set.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [80]:
# ALS collaborative-filtering estimator over (userId, movieId, rating).
# coldStartStrategy="drop" is kept from the original configuration; per the
# Spark ML docs it drops prediction rows that would otherwise be NaN, so the
# RMSE evaluation is not poisoned by users/items unseen during training.
# A fixed seed makes the random factor initialisation reproducible.
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

In [67]:
# RMSE evaluator comparing the `rating` column against `prediction`.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

# Fit ALS on the training split and score the held-out test split.
model = als.fit(training)
predictions = model.transform(test)

# Compute and report the test-set RMSE.
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))



Root-mean-square error = 0.8044110798771688


In [78]:
# Hyper-parameter grid for ALS: three values of maxIter crossed with
# three values of regParam.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.maxIter, [10, 15, 20])
    .addGrid(als.regParam, [0.01, 0.02, 0.005])
    .build()
)

In [81]:
# 5-fold cross-validation of the ALS estimator over the parameter grid,
# scored with the RMSE evaluator. The seed fixes the random assignment of
# rows to folds so the selected hyper-parameters are reproducible.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

In [None]:
# A bare `CrossValidator()` here would rebind `cv` to an instance with no
# estimator, parameter grid or evaluator, so a later `cv.fit(...)` would fail.
# Keep `cv` fully configured (5 folds, fixed seed for reproducible folds).
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)