In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder.appName('SPARK_APP').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
rating_schema = StructType([
    StructField("userId", IntegerType(), True),
    StructField("movieId", IntegerType(), True),
    StructField("rating", DoubleType(), True),
    StructField("timestamp", StringType(), True),])

In [4]:
movies_schema = StructType([
    StructField("movieId", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("genre", StringType(), True),])

In [5]:
movies = spark.read.csv("datasets/ml-100k/movies.csv",header=True, schema = movies_schema)
ratings = spark.read.csv("Datasets/ml-100k/ratings.csv",header=True, schema = rating_schema)
ratings.show()

[Stage 0:>                                                          (0 + 1) / 1]

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



                                                                                

In [6]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)



In [7]:
movie_ratings = ratings.join(movies, ['movieId'], 'left')
movie_ratings.show()

                                                                                

+-------+------+------+---------+--------------------+--------------------+
|movieId|userId|rating|timestamp|               title|               genre|
+-------+------+------+---------+--------------------+--------------------+
|      1|     1|   4.0|964982703|    Toy Story (1995)|Adventure|Animati...|
|      3|     1|   4.0|964981247|Grumpier Old Men ...|      Comedy|Romance|
|      6|     1|   4.0|964982224|         Heat (1995)|Action|Crime|Thri...|
|     47|     1|   5.0|964983815|Seven (a.k.a. Se7...|    Mystery|Thriller|
|     50|     1|   5.0|964982931|Usual Suspects, T...|Crime|Mystery|Thr...|
|     70|     1|   3.0|964982400|From Dusk Till Da...|Action|Comedy|Hor...|
|    101|     1|   5.0|964980868|Bottle Rocket (1996)|Adventure|Comedy|...|
|    110|     1|   4.0|964982176|   Braveheart (1995)|    Action|Drama|War|
|    151|     1|   5.0|964984041|      Rob Roy (1995)|Action|Drama|Roma...|
|    157|     1|   5.0|964984100|Canadian Bacon (1...|          Comedy|War|
|    163|   

In [8]:
movie_ratings.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- title: string (nullable = true)
 |-- genre: string (nullable = true)



In [9]:
def get_mat_sparsity(ratings):
    # Count the total number of ratings in the dataset
    count_nonzero = ratings.select("rating").count()

    # Count the number of distinct userIds and distinct movieIds
    total_elements = ratings.select("userId").distinct().count() * ratings.select("movieId").distinct().count()

    # Divide the numerator by the denominator
    sparsity = (1.0 - (count_nonzero *1.0)/total_elements)*100
    print("The ratings dataframe is ", "%.2f" % sparsity + "% sparse.")
    
get_mat_sparsity(ratings)

[Stage 12:>                                                         (0 + 1) / 1]

The ratings dataframe is  98.30% sparse.


                                                                                

In [10]:
get_mat_sparsity(movie_ratings)

                                                                                

The ratings dataframe is  98.30% sparse.


In [11]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)



In [12]:
(train, test) = ratings.randomSplit([0.8, 0.2], seed = 2020)

In [13]:
def get_binary_data(ratings):
    ratings = ratings.withColumn('binary', lit(1))
    userIds = ratings.select("userId").distinct()
    movieIds = ratings.select("movieId").distinct()

    user_movie = userIds.crossJoin(movieIds).join(ratings, ['userId', 'movieId'], "left")
    user_movie = user_movie.select(['userId', 'movieId', 'binary']).fillna(0)
    return user_movie

user_movie = get_binary_data(ratings)

In [14]:
user_movie.show()

+------+-------+------+
|userId|movieId|binary|
+------+-------+------+
|   148|   1580|     0|
|   463|   1580|     0|
|   471|   1580|     0|
|   496|   1580|     0|
|   243|   1580|     0|
|   392|   1580|     0|
|   540|   1580|     0|
|    31|   1580|     0|
|   516|   1580|     0|
|    85|   1580|     0|
|   137|   1580|     1|
|   251|   1580|     0|
|   451|   1580|     0|
|   580|   1580|     1|
|    65|   1580|     0|
|   458|   1580|     0|
|    53|   1580|     0|
|   255|   1580|     0|
|   481|   1580|     0|
|   588|   1580|     0|
+------+-------+------+
only showing top 20 rows



In [15]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# Create ALS model
als = ALS(
         userCol="userId", 
         itemCol="movieId",
         ratingCol="rating", 
         nonnegative = True, 
         implicitPrefs = False,
         coldStartStrategy="drop"
)

In [16]:
# Import the requisite packages
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

In [17]:
# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 50, 100, 150]) \
            .addGrid(als.regParam, [.01, .05, .1, .15]) \
            .build()

In [18]:
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(
           metricName="rmse", 
           labelCol="rating", 
           predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  16


In [19]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

In [20]:
#Fit cross validator to the 'train' dataset
model = cv.fit(train)
#Extract best model from the cv model above
best_model = model.bestModel
# View the predictions
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

                                                                                

0.8711442854997997


In [21]:
print("**Best Model**")
# Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())
# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())
# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())

**Best Model**
  Rank: 50
  MaxIter: 10
  RegParam: 0.15


In [22]:
# Generate n Recommendations for all users
recommendations = best_model.recommendForAllUsers(5)
recommendations.show()



+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[{3379, 4.7252}, ...|
|    31|[{33649, 5.146888...|
|   251|[{3379, 5.7864122...|
|   451|[{3379, 5.306609}...|
|   481|[{3379, 4.0172496...|
|   321|[{33649, 4.831716...|
|   211|[{3379, 4.994942}...|
|   101|[{3379, 4.9838457...|
|    81|[{3379, 3.8757098...|
|   501|[{67618, 4.413088...|
|   271|[{3379, 4.5265336...|
|   463|[{3379, 4.8362446...|
|   243|[{33649, 5.426117...|
|    53|[{33649, 6.556576...|
|   133|[{3379, 3.8539922...|
|   513|[{3379, 4.9885983...|
|   593|[{3379, 4.311468}...|
|   193|[{7982, 4.753437}...|
|   183|[{3379, 4.695856}...|
|   253|[{7096, 5.072973}...|
+------+--------------------+
only showing top 20 rows



                                                                                

In [23]:
nrecommendations = recommendations\
    .withColumn("rec_exp", explode("recommendations"))\
    .select('userId', col("rec_exp.movieId"), col("rec_exp.rating"))
nrecommendations.limit(10).show()



+------+-------+---------+
|userId|movieId|   rating|
+------+-------+---------+
|   540|   3379| 5.301655|
|   540| 171495|  5.12863|
|   540|  33649|  5.04867|
|   540|  72171|5.0400224|
|   540|   7071|5.0400224|
|   580|   3379| 4.678757|
|   580|  33649| 4.651721|
|   580|  53123| 4.646753|
|   580| 171495|4.6348557|
|   580| 117531|4.6216216|
+------+-------+---------+



                                                                                

In [24]:
nrecommendations.join(movies, on='movieId').filter('userId = 100').show()



+-------+------+---------+--------------------+--------------------+
|movieId|userId|   rating|               title|               genre|
+-------+------+---------+--------------------+--------------------+
|  33649|   100|5.1647325|  Saving Face (2004)|Comedy|Drama|Romance|
|  67618|   100|4.9814887|Strictly Sexual (...|Comedy|Drama|Romance|
|   3379|   100| 4.956847| On the Beach (1959)|               Drama|
|   7121|   100| 4.921274|   Adam's Rib (1949)|      Comedy|Romance|
| 171495|   100|4.9207797|              Cosmos|  (no genres listed)|
+-------+------+---------+--------------------+--------------------+



                                                                                

In [25]:
ratings.join(movies, on='movieId').filter('userId = 100').sort('rating', ascending=False).limit(10).show()

+-------+------+------+----------+--------------------+--------------------+
|movieId|userId|rating| timestamp|               title|               genre|
+-------+------+------+----------+--------------------+--------------------+
|   1101|   100|   5.0|1100184137|      Top Gun (1986)|      Action|Romance|
|   1958|   100|   5.0|1100186258|Terms of Endearme...|        Comedy|Drama|
|   2423|   100|   5.0|1100186118|Christmas Vacatio...|              Comedy|
|   4041|   100|   5.0|1100184235|Officer and a Gen...|       Drama|Romance|
|   5620|   100|   5.0|1100186982|Sweet Home Alabam...|      Comedy|Romance|
|    368|   100|   4.5|1100183774|     Maverick (1994)|Adventure|Comedy|...|
|    934|   100|   4.5|1100186407|Father of the Bri...|              Comedy|
|    539|   100|   4.5|1100184295|Sleepless in Seat...|Comedy|Drama|Romance|
|     16|   100|   4.5|1100185959|       Casino (1995)|         Crime|Drama|
|    553|   100|   4.5|1100183810|    Tombstone (1993)|Action|Drama|Western|