In [12]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
import pyspark.sql.functions as F

In [2]:
import os

# Set both PySpark interpreter variables to the same Python 2.7 binary.
# NOTE(review): this is an absolute, machine-specific path; it will need
# adjusting on any other host.
for _var_name in ("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
    os.environ[_var_name] = "/home/emil/.conda/envs/python2/bin/python2.7"

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession running on the "local" master.
# NOTE(review): the app name and the "spark.some.config.option" entry look
# like template placeholders -- confirm whether they are intentional.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Read the raw input as an RDD of lines; use_unicode=False asks Spark not to
# decode the lines to unicode.
sc = spark.sparkContext
data = sc.textFile("train.txt", use_unicode=False)

In [4]:
def _to_floats(line):
    """Split one tab-separated line and convert every field to float."""
    return [float(field) for field in line.split("\t")]


# Turn each text line into a list of floats.
data = data.map(_to_floats)

In [5]:
# Name the three parsed fields and lift the RDD into a DataFrame.
column_names = ["uid", "mid", "rat"]
df = spark.createDataFrame(data, schema=column_names)

In [19]:
# Attach a constant column "pred" holding 1.0 on every row.
# NOTE(review): nothing else in the visible notebook reads "pred" -- it may be
# a leftover baseline/debug column; confirm before relying on it.
constant_pred = F.lit(1.0)
df = df.withColumn("pred", constant_pred)

In [20]:
# Preview the first 20 rows (the defaults of show(), spelled out).
df.show(n=20, truncate=True)

+---+----+---+----+
|uid| mid|rat|pred|
+---+----+---+----+
|1.0| 1.0|5.0| 1.0|
|1.0| 2.0|3.0| 1.0|
|1.0| 3.0|4.0| 1.0|
|1.0| 4.0|3.0| 1.0|
|1.0| 5.0|3.0| 1.0|
|1.0| 6.0|5.0| 1.0|
|1.0| 7.0|4.0| 1.0|
|1.0| 8.0|1.0| 1.0|
|1.0| 9.0|5.0| 1.0|
|1.0|10.0|3.0| 1.0|
|1.0|11.0|2.0| 1.0|
|1.0|12.0|5.0| 1.0|
|1.0|13.0|5.0| 1.0|
|1.0|14.0|5.0| 1.0|
|1.0|15.0|5.0| 1.0|
|1.0|16.0|5.0| 1.0|
|1.0|17.0|3.0| 1.0|
|1.0|18.0|4.0| 1.0|
|1.0|19.0|5.0| 1.0|
|1.0|21.0|1.0| 1.0|
+---+----+---+----+
only showing top 20 rows



In [21]:
# Hold out roughly 20% of the rows for testing. The split is seeded so that
# re-running the notebook produces the same training/test partitions; without
# a seed every run evaluates the model on a different sample.
(training, test) = df.randomSplit([0.8, 0.2], seed=42)

In [22]:
# ALS estimator wired to the uid / mid / rat columns, with the "drop"
# cold-start strategy and the non-negativity constraint switched on.
als = ALS(
    userCol="uid",
    itemCol="mid",
    ratingCol="rat",
    nonnegative=True,
    coldStartStrategy="drop",
)

In [23]:
# Search space: 3 ranks x 3 iteration counts x 3 regularisation values
# (27 parameter maps in total).
grid_builder = ParamGridBuilder()
grid_builder.addGrid(als.rank, [12, 13, 14])
grid_builder.addGrid(als.maxIter, [18, 19, 20])
grid_builder.addGrid(als.regParam, [.17, .18, .19])
param_grid = grid_builder.build()

# Score candidates by RMSE between the "rat" label and the "prediction" column.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rat',
    metricName='rmse',
)

# Train/validation search over the grid, using the ALS estimator above.
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)

In [25]:
# Run the train/validation search on the training split only; the test split
# stays untouched until the final evaluation.
model = tvs.fit(dataset=training)

In [28]:
# Display the best model picked by the search (bare expression -> cell output).
model.bestModel

ALS_0769bc6f454a

In [31]:
# Score the held-out split with the tuned model directly in this cell.
# The name `predictions` is only assigned in a later cell, so referencing it
# here raises NameError when the notebook is run top to bottom on a fresh
# kernel.
model.bestModel.transform(test).show()

+-----+-----+---+----+----------+
|  uid|  mid|rat|pred|prediction|
+-----+-----+---+----+----------+
|224.0|148.0|3.0| 1.0| 3.2040348|
|479.0|148.0|2.0| 1.0| 2.7969046|
|120.0|148.0|3.0| 1.0| 2.8998196|
|430.0|148.0|2.0| 1.0| 2.8928638|
|455.0|148.0|3.0| 1.0| 3.0570571|
|891.0|148.0|5.0| 1.0| 3.9947226|
|552.0|148.0|3.0| 1.0| 2.9523656|
|880.0|148.0|2.0| 1.0| 3.1273022|
| 49.0|148.0|1.0| 1.0| 1.7567825|
|293.0|148.0|1.0| 1.0| 2.3282893|
|320.0|148.0|4.0| 1.0| 3.3790927|
| 21.0|148.0|1.0| 1.0| 2.3851006|
|825.0|148.0|4.0| 1.0|  3.758829|
|198.0|148.0|3.0| 1.0|  2.516036|
|158.0|148.0|4.0| 1.0| 3.1702442|
|506.0|148.0|3.0| 1.0| 3.2699554|
|313.0|148.0|2.0| 1.0| 2.9741333|
|399.0|148.0|4.0| 1.0| 2.7649565|
|705.0|148.0|5.0| 1.0| 3.5423028|
|932.0|148.0|2.0| 1.0| 2.9119356|
+-----+-----+---+----+----------+
only showing top 20 rows



In [29]:
# Score the held-out split with the best model and report its RMSE.
# NOTE(review): the RMSE recorded under this cell (~2.78) looks inconsistent
# with the per-row errors in the predictions table shown earlier (mostly
# below 1). It is about what scoring the constant "pred" column would give,
# so the saved output may be stale -- re-run on a fresh kernel to confirm.
best_model = model.bestModel
predictions = best_model.transform(test)
rmse = evaluator.evaluate(predictions)

rmse_report = "RMSE: " + str(rmse)
print(rmse_report)

RMSE: 2.77514110243
