In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan
from pyspark.sql.types import *

In [2]:
# Obtain (or reuse) the Spark session for this notebook; the app is named 'rec'.
spark = (
    SparkSession.builder
    .appName('rec')
    .getOrCreate()
)

# Explicit schemas for the two input files, built from (column name, type) pairs.
ratings_df_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ('userId', IntegerType()),
        ('movieId', IntegerType()),
        ('rating', DoubleType()),
    )
])
movies_df_schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ('ID', IntegerType()),
        ('title', StringType()),
    )
])

In [3]:
# Load the tab-delimited ratings file with the built-in CSV data source.
# The schema is supplied explicitly, so no schema inference is requested.
rating_df = (
    spark.read
    .format('csv')
    .options(delimiter='\t')
    .schema(ratings_df_schema)
    .load("ml-100k/u.data")
)

In [4]:
# Preview the first 20 ratings rows to sanity-check the parsed columns.
rating_df.show(n=20)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|   196|    242|   3.0|
|   186|    302|   3.0|
|    22|    377|   1.0|
|   244|     51|   2.0|
|   166|    346|   1.0|
|   298|    474|   4.0|
|   115|    265|   2.0|
|   253|    465|   5.0|
|   305|    451|   3.0|
|     6|     86|   3.0|
|    62|    257|   2.0|
|   286|   1014|   5.0|
|   200|    222|   5.0|
|   210|     40|   3.0|
|   224|     29|   3.0|
|   303|    785|   3.0|
|   122|    387|   5.0|
|   194|    274|   2.0|
|   291|   1042|   4.0|
|   234|   1184|   2.0|
+------+-------+------+
only showing top 20 rows



In [5]:
# Load the pipe-delimited movie file with the built-in CSV data source.
# The schema is supplied explicitly, so no schema inference is requested.
# NOTE(review): the file's text encoding is not specified here; confirm the
# reader's default matches the file if any titles come out garbled.
movie_df = (
    spark.read
    .format('csv')
    .options(delimiter="|")
    .schema(movies_df_schema)
    .load("ml-100k/u.item")
)

movie_df.show()

+---+--------------------+
| ID|               title|
+---+--------------------+
|  1|    Toy Story (1995)|
|  2|    GoldenEye (1995)|
|  3|   Four Rooms (1995)|
|  4|   Get Shorty (1995)|
|  5|      Copycat (1995)|
|  6|Shanghai Triad (Y...|
|  7|Twelve Monkeys (1...|
|  8|         Babe (1995)|
|  9|Dead Man Walking ...|
| 10|  Richard III (1995)|
| 11|Seven (Se7en) (1995)|
| 12|Usual Suspects, T...|
| 13|Mighty Aphrodite ...|
| 14|  Postino, Il (1994)|
| 15|Mr. Holland's Opu...|
| 16|French Twist (Gaz...|
| 17|From Dusk Till Da...|
| 18|White Balloon, Th...|
| 19|Antonia's Line (1...|
| 20|Angels and Insect...|
+---+--------------------+
only showing top 20 rows



In [6]:
# Split the ratings 60% / 20% / 20% into training, validation and test sets.
# The fixed seed keeps the split repeatable across runs.
training_df, validation_df, test_df = rating_df.randomSplit(
    weights=[0.6, 0.2, 0.2],
    seed=42,
)

In [7]:
# Configure the ALS estimator; the column names match ratings_df_schema.
als = ALS(
    maxIter=10,
    regParam=0.01,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
)

In [8]:
# Initial fit on the training split. No rank has been set on `als` at this
# point; `model` is reassigned inside the rank-search loop further down.
model = als.fit(dataset=training_df)

In [9]:
# Create an RMSE evaluator using the label and predicted columns.
# RegressionEvaluator is already imported in the notebook's first cell, so the
# import is not repeated here.
reg_eval = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)

In [11]:
# Rank search: fit one ALS model per candidate rank on the training split,
# score it with RMSE on the validation split, and keep the best one.
#
# After this cell:
#   errors[i] / models[i]  - RMSE and fitted model for ranks[i]
#   min_error              - lowest validation RMSE seen
#   best_rank              - INDEX into `ranks` of the winner (not the rank value)
#   als                    - left configured with the winning rank
ranks = [2, 3]
# Result slots are sized from `ranks`, so editing the candidate list above
# cannot cause an IndexError inside the loop.
errors = [0] * len(ranks)
models = [0] * len(ranks)
count = 0
min_error = float('inf')
best_rank = -1
for rank in ranks:

    # Build the model
    als.setRank(rank)
    model = als.fit(training_df)

    # Make predictions on validation dataset
    predict_df = model.transform(validation_df)

    # Drop rows whose prediction is NaN so they cannot turn the RMSE into NaN.
    # isnan() states the intent directly instead of relying on Spark's special
    # NaN-equality semantics in a `!= float('nan')` comparison.
    predicted_ratings_df = predict_df.filter(~isnan(predict_df.prediction))

    # Run the previously created RMSE evaluator, reg_eval, on the predicted_ratings_df DataFrame
    error = reg_eval.evaluate(predicted_ratings_df)
    errors[count] = error
    models[count] = model
    print( 'For rank %s the RMSE is %s' % (rank, error) )

    if error < min_error:
        min_error = error
        best_rank = count
    count += 1

# best_rank is still -1 when `ranks` is empty or every RMSE failed the `<`
# comparison (e.g. NaN). Fail loudly rather than letting ranks[-1] silently
# pick the last candidate.
if best_rank < 0:
    raise ValueError('No rank produced a usable RMSE; cannot select a best model')

als.setRank(ranks[best_rank])
print( 'The best model was trained with rank %s' % ranks[best_rank] )

For rank 2 the RMSE is 0.956949034787736
For rank 3 the RMSE is 0.9902136016869953
The best model was trained with rank 2
