In [4]:
from pyspark.sql import SparkSession 
from pyspark.sql import SQLContext
from pyspark.sql.functions import udf, col, when 
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS 
from pyspark.ml.tuning import ParamGridBuilder 
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder 
from pyspark.ml.tuning import CrossValidator 
from IPython.display import Image
from IPython.display import display
import pandas as pd 
import numpy as np  

In [3]:
ratings_df = spark.read.option("header","true").option("inferSchema","true").format("csv").load("ratings.csv")

In [4]:
movies_df = spark.read.option("header","true").option("inferSchema","true").format("csv").load("movies.csv")

In [5]:
users_df = spark.read.option("header","true").option("inferSchema","true").format("csv").load("users.csv")

In [6]:
ratings_df.show(5)

+-------+--------+------+
|user_id|movie_id|rating|
+-------+--------+------+
|      1|    1193|     5|
|      1|     661|     3|
|      1|     914|     3|
|      1|    3408|     4|
|      1|    2355|     5|
+-------+--------+------+
only showing top 5 rows



In [7]:
movies_df.show(5)

+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Animation|Childre...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|        Comedy|Drama|
|       5|Father of the Bri...|              Comedy|
+--------+--------------------+--------------------+
only showing top 5 rows



In [8]:
users_df.show(5)

+--------------------------------------------------------+
|	user_id	gender	age	occupation	zipcode	age_desc	occ_desc|
+--------------------------------------------------------+
|                                    0	1	F	1	10	48067	...|
|                                    1	2	M	56	16	70072...|
|                                    2	3	M	25	15	55117...|
|                                    3	4	M	45	7	02460	...|
|                                    4	5	M	25	20	55455...|
+--------------------------------------------------------+
only showing top 5 rows



In [9]:
# Splitting the dataset into training and testing at a ratio of 0.8 to 0.2
training_df, validation_df = ratings_df.randomSplit([0.8, 0.2])

In [10]:
# Setting the parameters for ALS
iterations = 10 
regularization_parameter = 0.1 
rank = 4,7
errors = []
err = 0

In [11]:
# Applying the parameters to the Alternating Least Squares Model
als = ALS(maxIter = iterations,
         regParam = regularization_parameter,
         rank = 4,
         userCol = "user_id",
         itemCol ="movie_id",
         ratingCol = "rating", coldStartStrategy="drop")

In [16]:
# Training the model on the train set
model = als.fit(training_df)

In [17]:
# Test the model on the test-set
predictions = model.transform(validation_df)

In [18]:
# Using Mean Squared Error as evaluation metric.
evaluator = RegressionEvaluator(metricName = "rmse", labelCol = "rating", predictionCol = "prediction")

In [20]:
rmse = evaluator.evaluate(predictions)

In [17]:
# Print out the evaluattion metric for the baseline model (RMSE) 
print ("Root Mean Square Error = " + str(rmse))

Root Mean Square Error = 0.879900371075126


In [60]:
# Now I will initiate a grid search to conduct hyperparametric tuning.
als = ALS(maxIter = iterations, regParam = regularization_parameter, userCol = "user_id",
          itemCol ="movie_id",
          ratingCol = "rating", 
          coldStartStrategy="drop")


paramGrid = ParamGridBuilder() \
.addGrid(als.regParam, [ 0.1, 0.2, 0.7 ] ) \
.addGrid(als.maxIter, [15, 17, 19])\
.addGrid(als.rank, range(2,8)) \
.build()

predictions = predictions.filter(col('prediction') != np.nan)
evaluator = RegressionEvaluator(metricName = ("mae"), labelCol = "rating", predictionCol = "prediction")
crossval = CrossValidator(estimator = als,
                                  estimatorParamMaps = paramGrid,
                                  evaluator = evaluator,
                                  numFolds=2)

cvModel = crossval.fit(training_df)
print ("Mean absolute Error = " + str(rmse))

mae = 0.8695889446072668


In [41]:
predictions = predictions.filter(col('prediction') != np.nan)

In [42]:
print ("Root Mean Square Error = " + str(rmse))

Root Mean Square Error = 0.8695889446072668


In [43]:
# Make predictions and extract 10 recommendations
predictions.show(n = 10)

+-------+--------+------+----------+
|user_id|movie_id|rating|prediction|
+-------+--------+------+----------+
|   3053|     148|     3|  3.069607|
|   4040|     463|     1| 1.9760653|
|   4277|     463|     4| 3.4980903|
|   3032|     463|     4|  4.316007|
|   2210|     463|     3|  2.551093|
|   3709|     463|     3| 2.3377364|
|   1146|     463|     2| 2.1019523|
|   2777|     463|     3| 3.0896657|
|   5511|     463|     2| 3.3512058|
|    524|     463|     3| 2.5116155|
+-------+--------+------+----------+
only showing top 10 rows



In [34]:
# Lets map these results back to their respective movie genres and title
predictions.join(movies_df, "movie_id").select("user_id","title","genres","prediction").show(10)

+-------+--------------------+--------------------+----------+
|user_id|               title|              genres|prediction|
+-------+--------------------+--------------------+----------+
|   3053|Awfully Big Adven...|               Drama|  2.792067|
|   4040|Guilty as Sin (1993)|Crime|Drama|Thriller| 1.9937147|
|   4277|Guilty as Sin (1993)|Crime|Drama|Thriller| 3.3355322|
|   3032|Guilty as Sin (1993)|Crime|Drama|Thriller|  4.264699|
|   2210|Guilty as Sin (1993)|Crime|Drama|Thriller|  2.297533|
+-------+--------------------+--------------------+----------+
only showing top 5 rows



In [31]:
# Lets randomly choose a user to see how the actual movie recommendations for that user.
random_user_predictions = predictions.filter(col("user_id")== 345).join(movies_df, "movie_id")
random_user_predictions.show(5)

+--------+-------+------+----------+--------------------+-------------+
|movie_id|user_id|rating|prediction|               title|       genres|
+--------+-------+------+----------+--------------------+-------------+
|    1296|    345|     5| 4.2018843|Room with a View,...|Drama|Romance|
|    1704|    345|     4|  4.342558|Good Will Hunting...|        Drama|
|    2165|    345|     4| 3.6678488|Your Friends and ...|        Drama|
|    2908|    345|     5|  4.438612|Boys Don't Cry (1...|        Drama|
+--------+-------+------+----------+--------------------+-------------+



In [45]:
# Produce top 5 recommendations for every user
userRecommend = model.recommendForAllUsers(5)

# Produce top 5 user recommendations for each movie
movieRecommends = model.recommendForAllItems(5)

In [46]:
userRecommend.select("user_id", "recommendations.movie_id").show(10, False)

+-------+----------------------------+
|user_id|movie_id                    |
+-------+----------------------------+
|1580   |[557, 989, 787, 1420, 53]   |
|4900   |[3233, 318, 527, 2309, 858] |
|5300   |[557, 787, 2309, 1149, 53]  |
|471    |[3245, 682, 3338, 1423, 53] |
|1591   |[3233, 989, 557, 787, 2503] |
|4101   |[1780, 3003, 37, 2964, 3314]|
|1342   |[3172, 318, 1780, 1198, 989]|
|2122   |[3233, 2760, 858, 3172, 527]|
|2142   |[318, 3233, 527, 1851, 50]  |
|463    |[2309, 2760, 682, 858, 106] |
+-------+----------------------------+
only showing top 10 rows



In [50]:
users = ratings_df.select("user_id"). distinct().limit(5);
users.show()

+-------+
|user_id|
+-------+
|    148|
|    463|
|    471|
|    496|
|    833|
+-------+



In [52]:
userSubsetRecs = model.recommendForUserSubset(users,10)
userSubsetRecs.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|    471|[[3245, 4.783424]...|
|    463|[[2309, 4.2573185...|
|    833|[[2129, 5.225012]...|
|    496|[[108, 5.7070894]...|
|    148|[[2825, 4.7420435...|
+-------+--------------------+



In [53]:
userSubsetRecs.select("user_id", "recommendations.movie_id").show(10, False)

+-------+----------------------------------------------------------+
|user_id|movie_id                                                  |
+-------+----------------------------------------------------------+
|471    |[3245, 682, 3338, 1423, 53, 2905, 2309, 2019, 1851, 750]  |
|463    |[2309, 2760, 682, 858, 106, 3233, 3338, 1193, 787, 318]   |
|833    |[2129, 3314, 3003, 811, 2332, 2197, 3233, 37, 985, 3853]  |
|496    |[108, 1360, 1780, 2810, 2127, 989, 2571, 449, 3636, 598]  |
|148    |[2825, 1471, 1360, 3314, 989, 985, 3172, 1519, 3607, 1198]|
+-------+----------------------------------------------------------+

