In [2]:
from pyspark.sql import SparkSession

In [3]:
MAX_MEMORY = "5g"
spark = SparkSession.builder.appName("movie-recommendation")\
    .config("spark.executor.memory", MAX_MEMORY)\
    .config("spark.driver.memory", MAX_MEMORY)\
    .getOrCreate()

In [5]:
ratings_file = "/Users/woals/data-engineering/01-spark/data/ml-25m/ratings.csv"
ratings_df = spark.read.csv(f"file:///{ratings_file}", inferSchema=True, header=True)

In [6]:
ratings_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



In [7]:
ratings_df = ratings_df.select(["userId", "movieId", "rating"])

In [8]:
ratings_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [9]:
ratings_df.select("rating").describe().show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|          25000095|
|   mean| 3.533854451353085|
| stddev|1.0607439611423535|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



In [10]:
train_df, test_df = ratings_df.randomSplit([0.8, 0.2])

In [11]:
from pyspark.ml.recommendation import ALS

In [12]:
als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop"
)

In [13]:
model = als.fit(train_df)

In [14]:
predictions = model.transform(test_df)

In [15]:
predictions.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   137|   1645|   3.0| 3.1439996|
|   321|   6620|   3.5| 3.7453382|
|   375|   1580|   2.5| 3.4874847|
|   588|   1645|   2.5| 2.7212934|
|   613|   1580|   3.0| 3.3409734|
|   613|   1645|   4.0| 3.6792169|
|   633|   1591|   5.0|  3.428872|
|   772|    471|   4.0|  3.433312|
|   772|   2122|   2.0|  2.035382|
|   804|   3175|   5.0|  4.481285|
|   847|   1959|   2.0| 3.2072742|
|   847|   4818|   0.5| 1.0782983|
|   847|   6620|   4.0| 3.7067366|
|   847| 180981|   3.5| 3.3271582|
|  1139|   1580|   4.0| 3.6355038|
|  1265|   3175|   3.5| 3.2298932|
|  1352|   1088|   4.5| 3.3385966|
|  1561|   3175|   4.0| 3.3129842|
|  1924|   2366|   4.0|  3.391782|
|  1977|    833|   1.0|  1.520834|
+------+-------+------+----------+
only showing top 20 rows



In [16]:
predictions.select('rating', 'prediction').describe().show()

+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|           5001644|           5001644|
|   mean| 3.534603122493324|3.4419646015852527|
| stddev|1.0602847574395575|0.6499855123810246|
|    min|               0.5|         -2.159045|
|    max|               5.0|           6.71361|
+-------+------------------+------------------+



In [17]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName="rmse", labelCol='rating', predictionCol='prediction')

In [18]:
rmse = evaluator.evaluate(predictions)

In [19]:
print(rmse)

0.8064087612267029


In [20]:
model.recommendForAllUsers(3).show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    28|[{194434, 7.42172...|
|    31|[{203882, 3.81943...|
|    34|[{194434, 5.62247...|
|    53|[{194334, 6.35528...|
|    65|[{144202, 6.24030...|
|    78|[{200930, 6.75907...|
|    81|[{200930, 5.15285...|
|    85|[{203882, 5.80957...|
|   101|[{203086, 5.22314...|
|   108|[{203086, 5.34636...|
|   115|[{203882, 6.09876...|
|   126|[{203882, 6.40953...|
|   133|[{203882, 5.56322...|
|   137|[{205453, 5.53613...|
|   148|[{183947, 5.57466...|
|   155|[{194434, 5.81972...|
|   183|[{203882, 6.09008...|
|   193|[{194434, 5.20776...|
|   210|[{199187, 8.08775...|
|   211|[{203086, 6.34462...|
+------+--------------------+
only showing top 20 rows



In [21]:
model.recommendForAllItems(3).show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     28|[{105801, 5.56028...|
|     31|[{87426, 5.349241...|
|     34|[{58248, 5.607516...|
|     53|[{106621, 5.26489...|
|     65|[{87426, 5.278626...|
|     78|[{149507, 4.78111...|
|     81|[{87816, 4.819454...|
|     85|[{105801, 4.89818...|
|    101|[{26829, 4.953245...|
|    108|[{86854, 5.144724...|
|    115|[{18885, 5.85764}...|
|    126|[{87426, 4.914584...|
|    133|[{86854, 6.031361...|
|    137|[{86854, 5.724830...|
|    148|[{87816, 4.095135...|
|    155|[{149507, 5.04689...|
|    183|[{149507, 5.12489...|
|    193|[{87426, 4.944092...|
|    210|[{87426, 4.792523...|
|    211|[{105801, 5.19140...|
+-------+--------------------+
only showing top 20 rows



In [22]:
from pyspark.sql.types import IntegerType

user_list = [65, 78, 81]
users_df = spark.createDataFrame(user_list, IntegerType()).toDF('userId')

users_df.show()

+------+
|userId|
+------+
|    65|
|    78|
|    81|
+------+



In [23]:
user_recs = model.recommendForUserSubset(users_df, 5)

In [24]:
movies_list = user_recs.collect()[0].recommendations

In [25]:
recs_df = spark.createDataFrame(movies_list)
recs_df.show()

+-------+------------------+
|movieId|            rating|
+-------+------------------+
| 144202| 6.240303993225098|
| 169606| 6.034413814544678|
| 185645| 5.671205520629883|
| 194434|5.6255292892456055|
| 203086| 5.592863082885742|
+-------+------------------+



In [26]:
movies_file = "/Users/woals/data-engineering/01-spark/data/ml-25m/movies.csv"
movies_df = spark.read.csv(f"file:///{movies_file}", inferSchema=True, header=True)

In [27]:
movies_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [28]:
recs_df.createOrReplaceTempView("recommendations")
movies_df.createOrReplaceTempView("movies")

In [29]:
query = """
SELECT *
FROM
    movies JOIN recommendations
    ON movies.movieId = recommendations.movieId
ORDER BY
    rating desc
"""
recommended_movies = spark.sql(query)
recommended_movies.show()

+-------+--------------------+------------------+-------+------------------+
|movieId|               title|            genres|movieId|            rating|
+-------+--------------------+------------------+-------+------------------+
| 144202|Catch That Girl (...|   Action|Children| 144202| 6.240303993225098|
| 169606|Dara O'Briain Cro...|            Comedy| 169606| 6.034413814544678|
| 185645|Stone Cold Steve ...|       Documentary| 185645| 5.671205520629883|
| 194434|   Adrenaline (1990)|(no genres listed)| 194434|5.6255292892456055|
| 203086|Truth and Justice...|             Drama| 203086| 5.592863082885742|
+-------+--------------------+------------------+-------+------------------+



In [30]:
def get_recommendations(user_id, num_recs):
    users_df = spark.createDataFrame([user_id], IntegerType()).toDF('userId')
    user_recs_df = model.recommendForUserSubset(users_df, num_recs)
    
    recs_list = user_recs_df.collect()[0].recommendations
    recs_df = spark.createDataFrame(recs_list)
    recommended_movies = spark.sql(query)
    return recommended_movies

In [31]:
recs = get_recommendations(456, 10)

In [32]:
recs.toPandas()

Unnamed: 0,movieId,title,genres,movieId.1,rating
0,144202,Catch That Girl (2002),Action|Children,144202,6.240304
1,169606,Dara O'Briain Crowd Tickler (2015),Comedy,169606,6.034414
2,185645,Stone Cold Steve Austin: The Bottom Line on th...,Documentary,185645,5.671206
3,194434,Adrenaline (1990),(no genres listed),194434,5.625529
4,203086,Truth and Justice (2019),Drama,203086,5.592863


In [33]:
spark.stop()