In [1]:
from pyspark.sql import SparkSession

MAX_MEMORY = "5g"
spark = SparkSession.builder.appName("movie-recommendation")\
    .config("spark.executor.memory", MAX_MEMORY)\
    .config("spark.driver.memory", MAX_MEMORY)\
    .getOrCreate()

22/01/04 23:52:02 WARN Utils: Your hostname, iyejun-ui-MacBookAir.local resolves to a loopback address: 127.0.0.1; using 192.168.35.206 instead (on interface en0)
22/01/04 23:52:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/04 23:52:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/01/04 23:52:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
ratings_file = "/Users/yeznable/Documents/GitHub/Data_Processing/Spark/data/ml-25m/ratings.csv"
ratings_df = spark.read.csv(f"file:///{ratings_file}", inferSchema=True, header=True)

                                                                                

In [3]:
ratings_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [4]:
ratings_df = ratings_df.select(["userId", "movieId", "rating"])

In [5]:
ratings_df.select("rating").describe().show()



+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|          25000095|
|   mean| 3.533854451353085|
| stddev|1.0607439611423535|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



                                                                                

In [6]:
train_df, test_df = ratings_df.randomSplit([0.8, 0.2])

In [7]:
from pyspark.ml.recommendation import ALS

als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop"
)

In [8]:
model = als.fit(train_df)

22/01/04 23:56:48 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/01/04 23:56:48 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/01/04 23:56:50 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [9]:
predictions = model.transform(test_df)
predictions.select('rating', 'prediction').describe().show()

                                                                                

+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|           4998017|           4998017|
|   mean|3.5342159700537232|3.4364396529128003|
| stddev|1.0606504718997283|0.6467038693658687|
|    min|               0.5|        -2.1877646|
|    max|               5.0|         6.7436566|
+-------+------------------+------------------+



In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName="rmse", labelCol='rating', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print(rmse)

[Stage 147:>                                                        (0 + 4) / 4]

0.8085926796737654




In [11]:
model.recommendForAllUsers(3).show()
model.recommendForAllItems(3).show()

                                                                                

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{194434, 5.48642...|
|     6|[{202231, 6.42259...|
|    12|[{194434, 5.30531...|
|    13|[{194434, 6.22034...|
|    16|[{194434, 6.66611...|
|    22|[{176597, 6.11011...|
|    26|[{194434, 5.54179...|
|    27|[{203086, 5.78416...|
|    28|[{194434, 7.44716...|
|    31|[{203882, 3.80279...|
|    34|[{194434, 5.42645...|
|    44|[{203086, 6.73830...|
|    47|[{194434, 5.31373...|
|    52|[{203086, 6.00124...|
|    53|[{192089, 6.52217...|
|    65|[{169606, 6.81147...|
|    76|[{194434, 6.13274...|
|    78|[{194434, 6.61670...|
|    81|[{194434, 5.46445...|
|    85|[{182793, 5.23826...|
+------+--------------------+
only showing top 20 rows





+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|      1|[{105801, 5.60719...|
|     12|[{87426, 5.662331...|
|     13|[{87426, 5.710892...|
|     22|[{87426, 5.315193...|
|     26|[{105801, 5.34046...|
|     27|[{143282, 5.64795...|
|     28|[{949, 5.8351316}...|
|     31|[{87426, 5.454971...|
|     34|[{71227, 5.521664...|
|     44|[{87426, 5.675578...|
|     47|[{121393, 5.66073...|
|     52|[{105801, 5.06476...|
|     53|[{33115, 5.537861...|
|     65|[{87426, 5.192933...|
|     76|[{87426, 5.598177...|
|     78|[{75043, 4.888745...|
|     81|[{75043, 4.918815...|
|     85|[{75043, 5.272462...|
|     93|[{87426, 5.416026...|
|    101|[{33633, 5.033475...|
+-------+--------------------+
only showing top 20 rows



                                                                                

In [12]:
from pyspark.sql.types import IntegerType

user_list = [65, 78, 81]
users_df = spark.createDataFrame(user_list, IntegerType()).toDF('userId')
user_recs = model.recommendForUserSubset(users_df, 5)
movies_list = user_recs.collect()[0].recommendations

                                                                                

In [13]:
recs_df = spark.createDataFrame(movies_list)
recs_df.show()

+-------+-----------------+
|movieId|           rating|
+-------+-----------------+
| 169606|6.811473846435547|
| 205277|6.142005443572998|
| 189555|5.958441734313965|
| 196717|5.944956302642822|
| 130219|5.895541191101074|
+-------+-----------------+



In [14]:
movies_file = "/Users/yeznable/Documents/GitHub/Data_Processing/Spark/data/ml-25m/movies.csv"
movies_df = spark.read.csv(f"file:///{movies_file}", inferSchema=True, header=True)

In [15]:
recs_df.createOrReplaceTempView("recommendations")
movies_df.createOrReplaceTempView("movies")

In [18]:
query = """
SELECT *
FROM
    movies JOIN recommendations
    ON movies.movieId = recommendations.movieId
ORDER BY
    rating desc
"""
recommended_movies = spark.sql(query)
recommended_movies.show()

+-------+--------------------+--------------------+-------+-----------------+
|movieId|               title|              genres|movieId|           rating|
+-------+--------------------+--------------------+-------+-----------------+
| 169606|Dara O'Briain Cro...|              Comedy| 169606|6.811473846435547|
| 205277|   Inside Out (1991)|Comedy|Drama|Romance| 205277|6.142005443572998|
| 189555|Dragon Ball Z - T...|    Action|Adventure| 189555|5.958441734313965|
| 196717|Bernard and the G...|Comedy|Drama|Fantasy| 196717|5.944956302642822|
| 130219|The Dark Knight (...|Action|Crime|Dram...| 130219|5.895541191101074|
+-------+--------------------+--------------------+-------+-----------------+



In [16]:
def get_recommendations(user_id, num_recs):
    users_df = spark.createDataFrame([user_id], IntegerType()).toDF('userId')
    user_recs_df = model.recommendForUserSubset(users_df, num_recs)
    
    recs_list = user_recs_df.collect()[0].recommendations
    recs_df = spark.createDataFrame(recs_list)
    recommended_movies = spark.sql(query)
    return recommended_movies

In [19]:
recs = get_recommendations(456, 10)
recs.toPandas()

                                                                                

Unnamed: 0,movieId,title,genres,movieId.1,rating
0,169606,Dara O'Briain Crowd Tickler (2015),Comedy,169606,6.811474
1,205277,Inside Out (1991),Comedy|Drama|Romance,205277,6.142005
2,189555,Dragon Ball Z - The Fall of Men (2015),Action|Adventure,189555,5.958442
3,196717,Bernard and the Genie (1991),Comedy|Drama|Fantasy,196717,5.944956
4,130219,The Dark Knight (2011),Action|Crime|Drama|Thriller,130219,5.895541
