In [47]:
import findspark
findspark.init()
import pyspark
import pyspark.sql.functions as F

In [2]:
from pyspark.ml.recommendation import ALS

In [3]:
spark = pyspark.sql.SparkSession.builder.appName('ALSRecommender').getOrCreate()

In [10]:
movies = spark.read.csv('ml100/movies.csv', header=True, inferSchema=True)
ratings = spark.read.csv('ml100/ratings.csv', header=True, inferSchema=True)
ratings = ratings.drop('timestamp')

In [25]:
als = ALS(rank=10, maxIter=10, regParam=0.01,
         userCol='userId', itemCol='movieId', ratingCol='rating')

In [27]:
model=als.fit(ratings)

In [88]:
targetUser = 400
print('Target user\'s top rated movies:')
(ratings.where(ratings.userId == targetUser)
        .orderBy('rating', ascending=False)
        .join(movies, 'movieId')
        .select('title', 'rating')
        .limit(10)
        .show(truncate=False))

Target user's top rated movies:
+-----------------------------------------+------+
|title                                    |rating|
+-----------------------------------------+------+
|Usual Suspects, The (1995)               |5.0   |
|Braveheart (1995)                        |5.0   |
|Pulp Fiction (1994)                      |5.0   |
|Shawshank Redemption, The (1994)         |5.0   |
|Jurassic Park (1993)                     |5.0   |
|Silence of the Lambs, The (1991)         |5.0   |
|Snow White and the Seven Dwarfs (1937)   |5.0   |
|Get Shorty (1995)                        |4.0   |
|Twelve Monkeys (a.k.a. 12 Monkeys) (1995)|4.0   |
|Babe (1995)                              |4.0   |
+-----------------------------------------+------+



In [89]:
#create a new dataframe for the target user with all movieIds
targetUserMovies = movies.withColumn('userId', F.lit(targetUser)).select('userId', 'movieId')

### Make predictions for each movie in the list
There are some movies that don't have any common ratings
* They will be predicted as NaNs, need to drop them

In [90]:
predictions = model.transform(targetUserMovies).dropna()

In [91]:
(predictions.join(movies, 'movieId')
            .select('title', 'prediction')
            .orderBy('prediction', ascending=False)
            .show(10, truncate=False))

+----------------------------------------------------------------------------+----------+
|title                                                                       |prediction|
+----------------------------------------------------------------------------+----------+
|Guess Who's Coming to Dinner (1967)                                         |6.0110803 |
|We Were Soldiers (2002)                                                     |5.9780645 |
|Storytelling (2001)                                                         |5.8024335 |
|Farewell My Concubine (Ba wang bie ji) (1993)                               |5.700131  |
|Cops (1922)                                                                 |5.632817  |
|Play House, The (1921)                                                      |5.632817  |
|Land of Silence and Darkness (Land des Schweigens und der Dunkelheit) (1971)|5.632817  |
|Goat, The (1921)                                                            |5.632817  |
|The Fault

In [None]:
spark.stop()