# 1. Libraries

In [1]:
# Importing Libraries
import pandas as pd #For Data Manipulation and Analysis
import numpy as np  # For Working with Arrays and Linear Algebra
from surprise import Reader
from surprise import SVD
from surprise import Dataset
from surprise import NMF
from surprise import reader
from surprise.model_selection import cross_validate as cv
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error
from surprise.model_selection import PredefinedKFold


# Spark Libraries
from pyspark.sql import SparkSession # Import an SQL spark-session so that we can use dataframes
from pyspark.sql import SQLContext # Initiate the SQL spark session
from pyspark.sql.functions import udf, col, when # For user defined columns
from pyspark.ml.evaluation import RegressionEvaluator # To evaluate the Performance of the ALS model (RMSE & MAE)
from pyspark.ml.recommendation import ALS # The Alternating Least Squares Model
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator #  and hyperparametric tuning
from IPython.display import Image
from IPython.display import display
from pyspark.ml.recommendation import ALS 
from surprise import NormalPredictor
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType


# Build Spark Session

In [2]:
# Initiate spark session
spark = SparkSession.builder.appName('recommender').getOrCreate()

In [7]:
spark

# Import Data

In [3]:
import os
os.chdir(r'C:\Users\WALEED AMJAD ALI\Data_Analytics\Recommender Systems\ml-1m')

In [12]:
# Import ratings dataset
ratings = spark.read.csv(r'C:\Users\WALEED AMJAD ALI\Data_Analytics\Recommender Systems\ml-1m\ratings.csv', header=True,inferSchema=True)

# Import users dataset
users = spark.read.csv(r'C:\Users\WALEED AMJAD ALI\Data_Analytics\Recommender Systems\ml-1m\users.csv', header=True,inferSchema=True)

# Import movies dataset
movies = spark.read.csv(r'C:\Users\WALEED AMJAD ALI\Data_Analytics\Recommender Systems\ml-1m\movies.csv', header=True,inferSchema=True)

# EDA

In [19]:
# Examine variables and Datatypes
ratings.printSchema(), users.printSchema(), movies.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- movie_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: integer (nullable = true)

root
 |-- user_id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- age_desc: integer (nullable = true)
 |-- occ_desc: string (nullable = true)

root
 |-- movie_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



(None, None, None)

In [23]:
ratings.show()

+-------+--------+------+
|user_id|movie_id|rating|
+-------+--------+------+
|      1|    1193|     5|
|      1|     661|     3|
|      1|     914|     3|
|      1|    3408|     4|
|      1|    2355|     5|
|      1|    1197|     3|
|      1|    1287|     5|
|      1|    2804|     5|
|      1|     594|     4|
|      1|     919|     4|
|      1|     595|     5|
|      1|     938|     4|
|      1|    2398|     4|
|      1|    2918|     4|
|      1|    1035|     5|
|      1|    2791|     4|
|      1|    2687|     3|
|      1|    2018|     4|
|      1|    3105|     5|
|      1|    2797|     4|
+-------+--------+------+
only showing top 20 rows



# Data Preprocessing

In [22]:
# Drop the timestamp column as its currently irrelevant
ratings = ratings.drop('Timestamp')

In [25]:
# Splitting the dataset into training and testing at a ratio of 0.8 to 0.2
training, testing = ratings.randomSplit([0.8, 0.2])

# Predictive Modelling

In [26]:
# Instantiate the Alternating Least Squares Model
als = ALS(maxIter = 10,
         regParam = 0.1,
         rank = 4,
         userCol = "user_id",
         itemCol ="movie_id",
         ratingCol = "rating",
         coldStartStrategy="drop")

In [27]:
# Training the model
model = als.fit(training)

In [28]:
# Testing the model
predictions = model.transform(testing)

# Evaluation

In [29]:
# Evaluation metrics
evaluator = RegressionEvaluator(metricName = "rmse", labelCol = "rating", predictionCol = "prediction")

In [30]:
# RMSE is the metric used to for model evaluation
rmse = evaluator.evaluate(predictions)

In [54]:
# Baseline Alternating Least Squares RMSE 
print(f"Root Mean Square Error = {rmse}")

Root Mean Square Error = 0.8755804278146311


In [32]:
evaluator2 = RegressionEvaluator(metricName = "r2", labelCol = "rating", predictionCol = "prediction")

In [56]:
r2 = evaluator.evaluate(predictions)

In [57]:
print(r2)

0.8755804278146311


# Hyperparametric Tuning

In [55]:
# Instantiate gridsearch for hyperparametric tuning

# Hyperparameters
grid_search = ParamGridBuilder() \ 
.addGrid(als.regParam, [ 0.1, 0.2, 0.7 ] ) \
.addGrid(als.maxIter, [15, 17, 19])\
.addGrid(als.rank, range(2,8)) \
.build()


# Evaluation Metric
metric = RegressionEvaluator(metricName = ("rmse"), labelCol = "rating", predictionCol = "prediction")

# GridSeach CV
cv = CrossValidator(estimator = als,
                                  estimatorParamMaps = grid,
                                  evaluator = metric,
                                  numFolds=2)
cv_model = crossval.fit(training)
print("Mean absolute Error = " + str(rmse))

NameError: name 'iterations' is not defined

In [31]:
predictions

DataFrame[user_id: int, movie_id: int, rating: int, prediction: float]

In [35]:
predictions = predictions.filter(col('prediction') != np.nan)

In [36]:
print ("Root Mean Square Error = " + str(rmse))

Root Mean Square Error = 0.8755804278146311


In [37]:
# Now that the best model is retrieved, I will pull out 10 recommendations to compare actual and predicted values.
predictions.show(n = 10)

+-------+--------+------+----------+
|user_id|movie_id|rating|prediction|
+-------+--------+------+----------+
|   5333|     148|     3| 1.9733444|
|   3184|     148|     4| 3.1712599|
|   1242|     148|     3| 2.8761709|
|   3829|     148|     2| 2.6082404|
|   4169|     463|     2| 2.6110835|
|   5047|     463|     3| 2.1900086|
|   1069|     463|     2| 1.1038731|
|    202|     463|     3|   2.62006|
|   5511|     463|     2|  3.248498|
|    524|     463|     3|  2.302901|
+-------+--------+------+----------+
only showing top 10 rows



In [39]:
# Lets map these results back to their respective movie genres and title
predictions.join(movies, "movie_id").select("user_id","title","genres","prediction").show(10)

+-------+--------------------+--------------------+----------+
|user_id|               title|              genres|prediction|
+-------+--------------------+--------------------+----------+
|   5333|Awfully Big Adven...|               Drama| 1.9733444|
|   3184|Awfully Big Adven...|               Drama| 3.1712599|
|   1242|Awfully Big Adven...|               Drama| 2.8761709|
|   3829|Awfully Big Adven...|               Drama| 2.6082404|
|   4169|Guilty as Sin (1993)|Crime|Drama|Thriller| 2.6110835|
|   5047|Guilty as Sin (1993)|Crime|Drama|Thriller| 2.1900086|
|   1069|Guilty as Sin (1993)|Crime|Drama|Thriller| 1.1038731|
|    202|Guilty as Sin (1993)|Crime|Drama|Thriller|   2.62006|
|   5511|Guilty as Sin (1993)|Crime|Drama|Thriller|  3.248498|
|    524|Guilty as Sin (1993)|Crime|Drama|Thriller|  2.302901|
+-------+--------------------+--------------------+----------+
only showing top 10 rows



In [40]:
# Lets randomly choose a user to see how the actual movie recommendations for that user.
random_user_predictions = predictions.filter(col("user_id")== 1756).join(movies, "movie_id")
random_user_predictions.show(10)

+--------+-------+------+----------+--------------------+--------------------+
|movie_id|user_id|rating|prediction|               title|              genres|
+--------+-------+------+----------+--------------------+--------------------+
|    2572|   1756|     4| 3.2375116|10 Things I Hate ...|      Comedy|Romance|
|    3071|   1756|     5|  3.511016|Stand and Deliver...|               Drama|
|    3752|   1756|     1| 1.7478164|Me, Myself and Ir...|              Comedy|
|    2763|   1756|     4|  3.510744|Thomas Crown Affa...|     Action|Thriller|
|    3534|   1756|     3| 2.9733672|      28 Days (2000)|              Comedy|
|    2394|   1756|     4|  3.696369|Prince of Egypt, ...|   Animation|Musical|
|    2541|   1756|     1| 2.7659678|Cruel Intentions ...|               Drama|
|     260|   1756|     4| 4.1978264|Star Wars: Episod...|Action|Adventure|...|
|    3623|   1756|     5| 3.0280197|Mission: Impossib...|     Action|Thriller|
|    3160|   1756|     1| 2.2307444|     Magnolia (1

In [40]:
# Lets randomly choose a user to see how the actual movie recommendations for that user.
random_user_predictions = predictions.filter(col("user_id")== 2000).join(movies_df, "movie_id")
random_user_predictions.show(10)

+--------+-------+------+----------+--------------------+--------------------+
|movie_id|user_id|rating|prediction|               title|              genres|
+--------+-------+------+----------+--------------------+--------------------+
|      34|   2000|     4| 4.0843754|         Babe (1995)|Children's|Comedy...|
|    2355|   2000|     4| 3.8953161|Bug's Life, A (1998)|Animation|Childre...|
|    2791|   2000|     3|  3.800591|    Airplane! (1980)|              Comedy|
|       1|   2000|     4| 4.2299886|    Toy Story (1995)|Animation|Childre...|
|    2391|   2000|     4|  3.727282|Simple Plan, A (1...|      Crime|Thriller|
|    2694|   2000|     3| 2.6532254|    Big Daddy (1999)|              Comedy|
|    2761|   2000|     4| 4.0472383|Iron Giant, The (...|Animation|Children's|
|    2987|   2000|     3| 3.5692427|Who Framed Roger ...|Adventure|Animati...|
|    3717|   2000|     2| 2.5891695|Gone in 60 Second...|        Action|Crime|
|    3745|   2000|     3| 3.2322736|   Titan A.E. (2

In [41]:
# Produce top 5 recommendations for every user
userRecommend = model.recommendForAllUsers(5)

# Produce top 5 user recommendations for each movie
movieRecommends = model.recommendForAllItems(5)

In [47]:
x = userRecommend.select("user_id", "recommendations.movie_id").show(10, False)

+-------+-----------------------------+
|user_id|movie_id                     |
+-------+-----------------------------+
|1580   |[557, 2309, 989, 787, 3338]  |
|4900   |[557, 2197, 572, 3233, 3140] |
|5300   |[557, 2309, 787, 989, 2503]  |
|471    |[557, 2309, 3245, 787, 1664] |
|1591   |[557, 572, 989, 787, 1851]   |
|4101   |[776, 557, 2127, 887, 572]   |
|1342   |[557, 572, 1851, 989, 3172]  |
|2122   |[557, 572, 1851, 3245, 1664] |
|2142   |[557, 572, 1851, 1664, 3172] |
|463    |[557, 2309, 2342, 1664, 3245]|
+-------+-----------------------------+
only showing top 10 rows



In [48]:
x.show(n=20, truncate=50, vertical=False)

AttributeError: 'NoneType' object has no attribute 'show'

In [44]:
users = ratings.select("user_id").distinct().limit(5);
users.show()

+-------+
|user_id|
+-------+
|    148|
|    463|
|    471|
|    496|
|    833|
+-------+



In [50]:
userSubsetRecs = model.recommendForUserSubset(users,10)

In [52]:
userSubsetRecs.show(10, 50)

+-------+--------------------------------------------------+
|user_id|                                   recommendations|
+-------+--------------------------------------------------+
|    471|[[557, 5.465485], [2309, 4.926101], [3245, 4.66...|
|    463|[[557, 4.790712], [2309, 4.449182], [2342, 4.31...|
|    833|[[572, 5.86089], [557, 5.3887033], [2197, 5.317...|
|    496|[[557, 6.0280986], [572, 5.5311384], [989, 5.34...|
|    148|[[572, 5.6659465], [557, 5.5376205], [3314, 4.9...|
+-------+--------------------------------------------------+



In [53]:
userSubsetRecs.select("user_id", "recommendations.movie_id").show(10, False)

+-------+----------------------------------------------------------+
|user_id|movie_id                                                  |
+-------+----------------------------------------------------------+
|471    |[557, 2309, 3245, 787, 1664, 3542, 1901, 2019, 2503, 2905]|
|463    |[557, 2309, 2342, 1664, 3245, 2760, 2905, 670, 2019, 787] |
|833    |[572, 557, 2197, 2562, 985, 2776, 1780, 3314, 3233, 404]  |
|496    |[557, 572, 989, 2127, 776, 3338, 3172, 887, 318, 2571]    |
|148    |[572, 557, 3314, 2127, 1851, 3607, 404, 318, 3147, 2762]  |
+-------+----------------------------------------------------------+

