In [1]:
import os

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz
!pip install -q Findspark
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.7-bin-hadoop2.7"

update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java to provide /usr/bin/java (java) in manual mode


In [3]:
os.environ["JAVA_HOME"]

'/usr/lib/jvm/java-8-openjdk-amd64'

In [4]:
os.environ["SPARK_HOME"]

'/content/spark-2.4.7-bin-hadoop2.7'

In [5]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BigDataProjectRecommendation').getOrCreate()

In [6]:
from google.colab import drive
drive.mount('/content/drive');

Mounted at /content/drive


In [7]:
!ls "drive/My Drive/Big_Data_Movie_Recommender"
DATA_PATH = "drive/My Drive/Big_Data_Movie_Recommender/Data"
RESULTS_PATH = "drive/My Drive/Big_Data_Movie_Recommender/Results"

Data  Results


In [8]:
ratings = spark.read.option("header", "true").csv(DATA_PATH+"/ratings.csv")
movies = spark.read.option("header", "true").csv(DATA_PATH+"/movies.csv")

In [9]:
from pyspark.ml.recommendation import ALS

In [11]:
df_train, df_test = (ratings.randomSplit([0.7, 0.3], seed = 1))

In [12]:
df_train.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
|     1|  27193|   3.0|1147879774|
|     1|  27721|   3.0|1147869115|
|     1|    296|   5.0|1147880044|
|     1|    307|   5.0|1147868828|
|     1|  31956|   3.5|1147877610|
|     1|  32591|   5.0|1147879538|
|     1|   3569|   5.0|1147879603|
+------+-------+------+----------+
only showing top 20 rows



In [22]:
from pyspark.sql.types import IntegerType

df_train = df_train.withColumn("userId",df_train['userId'].cast(IntegerType())) \
                   .withColumn("movieId",df_train['movieId'].cast(IntegerType())) \
                   .withColumn("rating",df_train['rating'].cast(IntegerType()))


In [61]:
als = ALS(rank=10, maxIter=5, seed=0, userCol= "userId", itemCol= "movieId", ratingCol="rating")
als.setColdStartStrategy('drop')
als_model = als.fit(df_train)

In [24]:
from pyspark.ml.evaluation import RegressionEvaluator

In [25]:
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="rmse")

In [35]:
df_train.count()

17502939

In [36]:
movies.count()

62423

In [33]:
import pandas as pd
pd.set_option('display.max_colwidth',None)

In [34]:
als_model.itemFactors.toPandas()

Unnamed: 0,id,features
0,10,"[0.025030001997947693, -0.15706470608711243, -0.7125052213668823, 0.5534553527832031, -0.4547816216945648, -0.6850728392601013, -0.02034790813922882, 0.7225993871688843, 0.3338954746723175, 0.5894351601600647]"
1,20,"[-0.016555173322558403, 0.20084144175052643, -0.7855822443962097, 0.32502689957618713, -0.43446433544158936, -0.8430556654930115, -0.2627929449081421, 0.32083219289779663, 0.00761453527957201, 0.4773748815059662]"
2,30,"[-0.5490902066230774, -0.4576345980167389, -0.45613765716552734, 0.8040887713432312, -0.35709503293037415, -0.8311018347740173, -0.18651041388511658, 0.6749392151832581, 0.06978724151849747, -0.2113349586725235]"
3,40,"[-0.26931577920913696, -0.7781033515930176, -0.6253662705421448, 0.6695196628570557, -0.627173900604248, -0.8440384864807129, -0.5844224095344543, 0.2798468768596649, 0.17067402601242065, -0.21850799024105072]"
4,50,"[-0.1771986037492752, -0.4208698868751526, -0.7888234257698059, 0.9405423998832703, -0.1371450275182724, -0.9908616542816162, -0.017619265243411064, 0.9093695282936096, 0.28146111965179443, 0.08991490304470062]"
...,...,...
55120,209119,"[0.05715878680348396, -0.029551083222031593, -0.26570358872413635, 0.7993663549423218, -0.05546528473496437, -0.4180419147014618, -0.34509679675102234, 0.5956115126609802, 0.2083600014448166, 0.165347158908844]"
55121,209129,"[-0.14975421130657196, 0.09460978209972382, 0.15954715013504028, 1.10838782787323, -0.035872142761945724, -0.8518288135528564, -0.1743745654821396, 1.0619900226593018, 0.30995625257492065, 0.2502687871456146]"
55122,209139,"[-0.03968313708901405, 0.024427734315395355, -0.14824500679969788, 0.09329432994127274, 0.021550556644797325, -0.24797090888023376, -0.06280148029327393, 0.2760414183139801, 0.05714036896824837, 0.035864830017089844]"
55123,209159,"[-0.11231565475463867, 0.07095733284950256, 0.11966035515069962, 0.8312908411026001, -0.02690410614013672, -0.6388716101646423, -0.13078093528747559, 0.7964924573898315, 0.2324671894311905, 0.18770159780979156]"


In [62]:
training_predictions_df = als_model.transform(df_train)

In [39]:
type(training_predictions_df)

pyspark.sql.dataframe.DataFrame

In [40]:
training_predictions_df.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
| 32855|    148|     4|1029309135| 2.3990471|
| 26480|    148|     2| 915406133| 1.9864883|
| 38199|    148|     2| 835601960| 2.4336302|
|159730|    148|     3| 842162037|  2.716928|
| 33354|    148|     3| 938886119| 2.6935844|
| 47989|    148|     2| 833173771| 2.9645228|
| 72337|    148|     2| 944246202|  2.777789|
|151614|    148|     1| 878170956|  2.731505|
|  5055|    148|     3| 842463284| 2.8496578|
|108767|    148|     3|1276969740| 2.5595648|
| 21531|    148|     3| 834035555| 3.0218282|
| 38679|    148|     3| 853421750| 2.5657732|
| 99684|    148|     3|1027645782| 2.9732146|
| 35969|    148|     2| 835094487|  2.794639|
| 54331|    148|     2| 954702916| 2.9416816|
| 77130|    148|     1| 831284829| 1.1798544|
| 29943|    148|     3|1049216998| 2.8596456|
|117168|    148|     4| 835820190| 3.2516797|
| 28229|    148|     1| 833850593|

In [63]:
from pyspark.ml.recommendation import ALSModel
als_model.save(RESULTS_PATH+"ALS_MovieLens_1")
als_model = ALSModel.load(RESULTS_PATH+"ALS_MovieLens_1")

In [64]:
reg_eval.evaluate(training_predictions_df)

0.8193309774106281

In [49]:
df_test = df_test.withColumn("userId",df_test['userId'].cast(IntegerType())) \
                   .withColumn("movieId",df_test['movieId'].cast(IntegerType())) \
                   .withColumn("rating",df_test['rating'].cast(IntegerType()))

In [65]:
validation_predictions_df = als_model.transform(df_test)

In [66]:
validation_predictions_df.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in validation_predictions_df.columns]).show()

+------+-------+------+---------+----------+
|userId|movieId|rating|timestamp|prediction|
+------+-------+------+---------+----------+
|     0|      0|     0|        0|         0|
+------+-------+------+---------+----------+



In [67]:
validation_predictions_df.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|     1|   1250|     4|1147868414| 3.3892672|
|     1|   2161|     3|1147868609| 2.9195266|
|     1|  27266|     4|1147879365| 3.3538353|
|     1|   2843|     4|1147868891|   3.80054|
|     1|    306|     3|1147868817| 4.1096406|
|     1|   3448|     4|1147868480|  3.211211|
|     1|   4308|     3|1147868534| 3.3079157|
|     1|   4973|     4|1147869080|  3.985975|
|     1|   5767|     5|1147878729| 3.6437132|
|     1|   5912|     3|1147878698| 3.2279365|
|     1|   5952|     4|1147868053| 3.3478596|
|     1|   6016|     5|1147869090| 3.8530746|
|     1|   6377|     4|1147868469|  3.213149|
|     1|   6539|     3|1147868461| 2.9053173|
|     1|   7234|     4|1147868869|  3.627199|
|     1|   7361|     5|1147880055|  3.841207|
|     1|   7938|     2|1147878063|  3.478118|
|     1|   8154|     5|1147868865| 3.4568613|
|    10|   1962|     3|1227570828|

In [68]:
reg_eval.evaluate(validation_predictions_df)

0.8576732625272104

In [50]:
spark.sparkContext._conf.getAll()

[('spark.driver.port', '38387'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.host', '9b3d0aaf19eb'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.id', 'local-1607178844913'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.app.name', 'BigDataProjectRecommendation'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

In [51]:
spark.sparkContext.version

'2.4.7'

In [52]:
als=ALS()

In [53]:
als.setColdStartStrategy('drop')

ALS_e1c9a006fcb0

In [56]:
type(als_model)

pyspark.ml.recommendation.ALSModel

In [57]:
type(als)

pyspark.ml.recommendation.ALS

In [37]:
import pyspark.sql.functions as F
ratings.filter("userId = 100").join(movies, on = "movieId").orderBy(F.desc("rating")).show()

+-------+------+------+---------+--------------------+--------------------+
|movieId|userId|rating|timestamp|               title|              genres|
+-------+------+------+---------+--------------------+--------------------+
|   1358|   100|   5.0|862169132|  Sling Blade (1996)|               Drama|
|    781|   100|   5.0|862168912|Stealing Beauty (...|               Drama|
|   1466|   100|   5.0|862169186|Donnie Brasco (1997)|         Crime|Drama|
|   1233|   100|   5.0|862168955|Boot, Das (Boat, ...|    Action|Drama|War|
|   1193|   100|   5.0|862169731|One Flew Over the...|               Drama|
|    714|   100|   5.0|862169065|     Dead Man (1995)|Drama|Mystery|Wes...|
|   1354|   100|   5.0|862169101|Breaking the Wave...|       Drama|Mystery|
|    608|   100|   4.0|862168434|        Fargo (1996)|Comedy|Crime|Dram...|
|   1210|   100|   4.0|862168580|Star Wars: Episod...|Action|Adventure|...|
|     82|   100|   4.0|862168835|Antonia's Line (A...|        Comedy|Drama|
|    290|   

In [69]:
from pyspark.sql.types import DoubleType
import numpy as np
import scipy
import scipy.spatial

def distance(v1, v2):
    v1 = np.array(v1)
    v2 = np.array(v2)
    return float(scipy.spatial.distance.cosine(v1, v2))

spark.udf.register("distance", distance, DoubleType())

def recommendation_by_i2i(movie_id):
    return (als_model
     .itemFactors
     .filter(F.col("id") == movie_id)
     .alias("t1")
     .crossJoin(als_model.itemFactors.alias("t2"))
     .withColumn("similarity", F.expr("distance(t1.features, t2.features)")) 
     .join(movies, F.col("t2.id") == F.col("movieId"))
     .orderBy(F.asc("similarity"))
     .select("movieId", "title", "similarity")
    )

In [73]:
recommendation_by_i2i(541).show(20, False)

+-------+----------------------------------------------+--------------------+
|movieId|title                                         |similarity          |
+-------+----------------------------------------------+--------------------+
|541    |Blade Runner (1982)                           |0.0                 |
|1080   |Monty Python's Life of Brian (1979)           |0.012101173928981468|
|73914  |Sometimes a Great Notion (1970)               |0.01240904214166283 |
|6104   |Monty Python Live at the Hollywood Bowl (1982)|0.013113274085467697|
|5965   |Duellists, The (1977)                         |0.013625690509695643|
|1218   |Killer, The (Die xue shuang xiong) (1989)     |0.014535267911132355|
|148288 |Who Am I This Time? (1982)                    |0.015166192408404777|
|1136   |Monty Python and the Holy Grail (1975)        |0.015209555968390687|
|145755 |The Dark Glow of the Mountain (1985)          |0.015227092839303014|
|152292 |Mojin: The Lost Legend (2015)                 |0.015743

In [89]:
ratings.filter("rating = 4.0 or rating = 5.0 or rating = 4.5").show(100)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|   1088|   4.0|1147868495|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1653|   4.0|1147868097|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
|     1|   2843|   4.5|1147868891|
|     1|   3448|   4.0|1147868480|
|     1|   3569|   5.0|1147879603|
|     1|   3949|   5.0|1147868678|
|     1|   4144|   5.0|1147868898|
|     1|   4325|   5.0|1147878122|
|     1|   4703|   4.0|1147869223|
|     1|   4973|   4.5|1147869080|
|     1|   5147|   4.0|1147877654|
|     1|   5767|   5.0|1147878729|
|     1|   5878|   4.0|1147868807|
|     1|   5952|   4.0|1147868053|
|     1|   6016|   5.0|1147869090|
|     1|   6370|   4.5|1147869191|
|     1|   6377|   4

In [91]:
ratings.filter("rating = 4.0 or rating = 5.0 or rating = 4.5").groupBy('userId').count().show(10)

+------+-----+
|userId|count|
+------+-----+
|   296|   53|
|   467|    5|
|   675|   78|
|   691|   46|
|   829|   21|
|  1090|   19|
|  1159|   54|
|  1436|  236|
|  1512|   33|
|  1572|   56|
+------+-----+
only showing top 10 rows



In [72]:
top_rated_movies_by_user = (ratings
                            .filter("rating = 4 or rating = 5")
                            .groupBy("userId")
                            .agg(F.collect_set("movieId").alias("top_movies")))
top_rated_movies_by_user.show()

+------+--------------------+
|userId|          top_movies|
+------+--------------------+
|100010|[647, 1047, 1, 60...|
|100140|[2348, 2313, 1189...|
|100227|[6, 3, 662, 62, 7...|
|100263|[2105, 5995, 3300...|
|100320|[6873, 1719, 2160...|
|100553|[5995, 2160, 5679...|
|100704|[74458, 4886, 412...|
|100735|[110, 2000, 74458...|
|100768|[41, 306, 1450, 1...|
| 10096|[784, 832, 1, 839...|
|100964|[62081, 48744, 13...|
|101021|[2859, 858, 1950,...|
|101122|[60069, 64614, 40...|
|101205|[1, 352, 141, 307...|
|101261|[33794, 37380, 43...|
|101272|[189333, 140956, ...|
|102113|[74458, 81845, 33...|
|102521|[110, 350, 356, 2...|
|102536|[3, 783, 1, 141, ...|
|102539|[110, 185, 2231, ...|
+------+--------------------+
only showing top 20 rows



In [94]:
top_rated_movies_by_user.sort(F.col('userId')).show()

+------+--------------------+
|userId|          top_movies|
+------+--------------------+
|     1|[4973, 5767, 6711...|
|    10|[110, 356, 6502, ...|
|   100|[781, 858, 213, 1...|
|  1000|[4973, 1923, 1259...|
| 10000|[110, 4973, 5995,...|
|100000|[6870, 2840, 5995...|
|100001|[101142, 317, 784...|
|100002|[6867, 1225, 6947...|
|100003|[1997, 1193, 3499...|
|100004|[236, 141, 471, 1...|
|100005|[110, 4886, 1, 19...|
|100006|[110, 150, 296, 2...|
|100007|[2105, 78218, 112...|
|100008|[110, 551, 356, 5...|
|100009|[2859, 2348, 429,...|
| 10001|[55442, 90866, 42...|
|100010|[647, 1047, 1, 60...|
|100011|[44555, 1221, 487...|
|100012|[110, 33794, 3578...|
|100013|[367, 368, 589, 1...|
+------+--------------------+
only showing top 20 rows



In [103]:
def recommendation_by_u2u(user_id):
    return (als_model
     .userFactors
     .filter(F.col("id") == user_id)
     .alias("t1")
     .crossJoin(als_model.itemFactors.alias("t2"))
     .withColumn("similarity", F.expr("distance(t1.features, t2.features)"))
     .filter("similarity < 0.03 and similarity > 0.0") # 0.02 Similarity threshold - a hyper parameter. We can perform tuning to find the suitable value
     .join(top_rated_movies_by_user.alias("t3"), F.col("t2.id") == F.col("t3.userId"))
     .select("t1.id", F.explode("top_movies").alias("movieId"))
     .join(movies, on = "movieId")
     .select("movieId", "title")
    )

In [106]:
ratings.join(movies,['movieId']).show(50,False)

+-------+------+------+----------+---------------------------------------------------------------------------------+-------------------------------------+
|movieId|userId|rating|timestamp |title                                                                            |genres                               |
+-------+------+------+----------+---------------------------------------------------------------------------------+-------------------------------------+
|296    |1     |5.0   |1147880044|Pulp Fiction (1994)                                                              |Comedy|Crime|Drama|Thriller          |
|306    |1     |3.5   |1147868817|Three Colors: Red (Trois couleurs: Rouge) (1994)                                 |Drama                                |
|307    |1     |5.0   |1147868828|Three Colors: Blue (Trois couleurs: Bleu) (1993)                                 |Drama                                |
|665    |1     |5.0   |1147878820|Underground (1995)                  

In [112]:
recommendation_by_u2u(1).show(20, False)

+-------+---------------------------------------------------------+
|movieId|title                                                    |
+-------+---------------------------------------------------------+
|2160   |Rosemary's Baby (1968)                                   |
|1207   |To Kill a Mockingbird (1962)                             |
|1215   |Army of Darkness (1993)                                  |
|79132  |Inception (2010)                                         |
|589    |Terminator 2: Judgment Day (1991)                        |
|7445   |Man on Fire (2004)                                       |
|87192  |Attack the Block (2011)                                  |
|6385   |Whale Rider (2002)                                       |
|179135 |Blue Planet II (2017)                                    |
|2011   |Back to the Future Part II (1989)                        |
|57274  |[REC] (2007)                                             |
|64839  |Wrestler, The (2008)                   

In [109]:
ratings = ratings.withColumn("userId",ratings['userId'].cast(IntegerType())) \
                   .withColumn("movieId",ratings['movieId'].cast(IntegerType())) \
                   .withColumn("rating",ratings['rating'].cast(IntegerType()))
predictions=als_model.transform(ratings)
predictions.createOrReplaceTempView("predictions_sql")
movies.createOrReplaceTempView("movies_sql")

In [111]:
spark.sql(
"""select p.userId, p.movieId, p.rating, p.prediction, m.title, m.genres
from predictions_sql p
join movies_sql m on p.movieId=m.movieId
where p.userId == 1
order by p.prediction desc
""").show(20,False)

+------+-------+------+----------+----------------------------------------------------------------------------------------+-------------------------------------+
|userId|movieId|rating|prediction|title                                                                                   |genres                               |
+------+-------+------+----------+----------------------------------------------------------------------------------------+-------------------------------------+
|1     |306    |3     |4.1096406 |Three Colors: Red (Trois couleurs: Rouge) (1994)                                        |Drama                                |
|1     |307    |5     |4.0995245 |Three Colors: Blue (Trois couleurs: Bleu) (1993)                                        |Drama                                |
|1     |296    |5     |4.0422907 |Pulp Fiction (1994)                                                                     |Comedy|Crime|Drama|Thriller          |
|1     |4973   |4     |3.985

In [48]:
%%time
predictionsPerso = resultsALS.filter(F.col("userId")==0) \
                             .select(F.explode("predictions") \
                             .alias("movieId")) \
                             .join(dfMovies.select(["movieId", "title"]),
                                   "movieId") \
                             .join(dfRatings.filter(F.col("userId")==0),
                                   ['movieId'], how='left')

predictionsPerso.select(["title", "rating"]).show(10, truncate=False)

+-------------------------------------+------+
|title                                |rating|
+-------------------------------------+------+
|Foster (2018)                        |null  |
|The Thorn (1971)                     |null  |
|Hoaxed (2019)                        |null  |
|My Best Enemy (2006)                 |null  |
|The Challengers (1990)               |null  |
|Fracchia la belva umana (1981)       |null  |
|La leggenda di Al, John e Jack (2002)|null  |
|Heroes Above All (2017)              |null  |
|Of Two Minds (2012)                  |null  |
|King for a Day (1983)                |null  |
+-------------------------------------+------+
only showing top 10 rows

CPU times: user 149 ms, sys: 42.2 ms, total: 191 ms
Wall time: 12min 52s


In [None]:
%%time
predictionsPerso = resultsALS.filter(F.col("userId")==0) \
                             .select(F.explode("predictions") \
                             .alias("movieId")) \
                             .join(dfMovies.select(["movieId", "title"]),
                                   "movieId") \
                             .join(dfRatings.filter(F.col("userId")==0),
                                   ['movieId'], how='left')

predictionsPerso.select(["title", "rating"]).show(10, truncate=False)