In [1]:
# This tutorial for recommender engine usingCollaborative Filtering

In [2]:
# it uses python version 3.5.6

In [3]:
! python --version

Python 3.5.6 :: Anaconda, Inc.


In [4]:
# importing needed packages 

In [5]:
import os

import urllib.request

import zipfile

from pyspark.context import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.types import IntegerType
from pyspark.sql.types import FloatType

from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

from time import time

from pyspark.sql.functions import lit





##################################################
####              UNUSED IMPORTS             ####
#################################################
# from pandas import Series, DataFrame
# import pandas as pd
# import math
# from pyspark.sql.functions import udf, lit
# from pyspark.sql.types import  BooleanType, StringType
# import pyspark
# from pyspark import SparkConf
# from pyspark.mllib.recommendation import ALS
# from pyspark.sql import Row
# from pyspark.mllib.recommendation import MatrixFactorizationModel

In [6]:
# Dataset info
#     .Small: 100,000 ratings and 2,488 tag applications applied to 8,570 movies by 706 users. Last updated 4/2015.
#     .Full: 21,000,000 ratings and 470,000 tag applications applied to 27,000 movies by 230,000 users. Last updated 4/2015.

In [7]:
complete_dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-latest.zip'
small_dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

In [8]:
# Defining dataset location

In [9]:
if not os.path.exists('datasets'):
    os.makedirs('datasets')
cwd = os.getcwd()
datasets_path = os.path.join(cwd, 'datasets')
complete_dataset_path = os.path.join(datasets_path, 'ml-latest.zip')
small_dataset_path = os.path.join(datasets_path, 'ml-latest-small.zip')

In [10]:
# Download dataset
#     . small dataset size = 955 kb 
#     . complete dataset size = 264 mb
#    in this tutorial we will use the small dataset

In [11]:
if not os.path.isfile(small_dataset_path):
    small_f = urllib.request.urlretrieve(small_dataset_url,small_dataset_path)
# complete_f = urllib.request.urlretrieve (complete_dataset_url, complete_dataset_path)

In [12]:
# Unzip datasets

In [13]:
with zipfile.ZipFile(small_dataset_path, "r") as z:
    z.extractall(datasets_path)

# with zipfile.ZipFile(complete_dataset_path, "r") as z:
#     z.extractall(datasets_path)

In [14]:
# Intializing spark context

In [15]:
spark = SparkSession \
   .builder \
    .master("local[*]") \
   .appName("Recommender-system") \
   .getOrCreate()

sc =  spark.sparkContext

In [16]:
# Load data set 
# this tables we will use from dataset

# Each line in the ratings dataset (ratings.csv) is formatted as: userId,movieId,rating,timestamp

# Each line in the movies (movies.csv) dataset is formatted as: movieId,title,genres

In [17]:
# find ratings.csv path 
small_ratings_file = os.path.join(datasets_path, 'ml-latest-small', 'ratings.csv')
# load ratings.csv file to dataframe and drop timespamp column 
rating_df = spark.read.format("csv").option("header", "true").load(small_ratings_file).drop('timestamp')

In [18]:
# Ensuring that rating dataframe is loaded correctly 
rating_df.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
|     1|    163|   5.0|
|     1|    216|   5.0|
|     1|    223|   3.0|
|     1|    231|   5.0|
|     1|    235|   4.0|
|     1|    260|   5.0|
|     1|    296|   3.0|
|     1|    316|   3.0|
|     1|    333|   5.0|
|     1|    349|   4.0|
+------+-------+------+
only showing top 20 rows



In [19]:
# Casting rating dataframe coloumns to int

In [20]:
# casting userId to int
rating_df = rating_df.withColumn('userId', rating_df['userId'].cast(IntegerType()))
# casting movieId to int 
rating_df = rating_df.withColumn('movieId', rating_df['movieId'].cast(IntegerType()))
#casting rating to float 
rating_df = rating_df.withColumn('rating', rating_df['rating'].cast(FloatType()))


In [21]:
# print schema of rating dataframe 
rating_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)



In [22]:
# load movies.csv file

In [23]:
# finding movies.csv file 
movies_file = os.path.join(datasets_path, 'ml-latest-small', 'movies.csv')
# load movies.csv file to dataframe and drop genres column 
movies_df = spark.read.format("csv").option("header", "true").load(movies_file).drop('genres')

In [24]:
# ensuring that movies dataframe is loaded correctly
movies_df.show()

+-------+--------------------+
|movieId|               title|
+-------+--------------------+
|      1|    Toy Story (1995)|
|      2|      Jumanji (1995)|
|      3|Grumpier Old Men ...|
|      4|Waiting to Exhale...|
|      5|Father of the Bri...|
|      6|         Heat (1995)|
|      7|      Sabrina (1995)|
|      8| Tom and Huck (1995)|
|      9| Sudden Death (1995)|
|     10|    GoldenEye (1995)|
|     11|American Presiden...|
|     12|Dracula: Dead and...|
|     13|        Balto (1995)|
|     14|        Nixon (1995)|
|     15|Cutthroat Island ...|
|     16|       Casino (1995)|
|     17|Sense and Sensibi...|
|     18|   Four Rooms (1995)|
|     19|Ace Ventura: When...|
|     20|  Money Train (1995)|
+-------+--------------------+
only showing top 20 rows



In [25]:
#casting movies dataframe to int 

In [26]:
movies_df = movies_df.withColumn('movieId' ,movies_df['movieId'].cast(IntegerType()))

In [27]:
# print movies_df schema 
movies_df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)



In [28]:
# Parsing and spliting  dataset  to rdds 
#     split dataset into
#         . Traning dataset = 60%
#         . Validation dataset = 20%
#         . Test Dataset = 20%
# note that we removed actual ratings from testing and validation dataset 

In [29]:
training_df,test_df = rating_df.randomSplit([0.8, 0.2])

In [30]:
# Configuring and tunning our model

In [31]:
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop")

param_grid = ParamGridBuilder().addGrid(
    als.rank,
    [10, 15],
).addGrid(
    als.maxIter,
    [10, 15],
).addGrid(
    als.regParam,
    [0.1,0.01,0.2],
).build()

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
)
tvs = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
)


model = tvs.fit(training_df)

bestmodel=model.bestModel


In [32]:
# saving best model parameters

In [33]:
best_rank=bestmodel.rank
best_regParm=bestmodel._java_obj.parent().getRegParam()
best_iterations=bestmodel._java_obj.parent().getMaxIter()


In [34]:
# Saving the BEST model for future use 

In [35]:
df= bestmodel.transform(test_df)
df.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|    91|    471|   1.0| 3.1514268|
|   409|    471|   3.0| 3.3944638|
|   372|    471|   3.0| 3.1064243|
|   599|    471|   2.5| 2.8177652|
|   603|    471|   4.0| 3.2178159|
|   474|    471|   3.0|  3.315065|
|   462|    471|   2.5| 2.9463058|
|   217|    471|   2.0| 2.7720299|
|   520|    471|   5.0| 3.6111546|
|   136|    471|   4.0| 3.6572232|
|   609|    833|   3.0|  1.665521|
|    20|   1088|   4.5| 3.4505277|
|   169|   1088|   4.5| 4.0843716|
|   563|   1088|   4.0| 3.2047427|
|   555|   1088|   4.0| 3.4016178|
|   221|   1088|   3.0| 3.1049845|
|    68|   1088|   3.5| 3.0921006|
|   600|   1088|   3.5| 2.4753342|
|   517|   1088|   1.0|  2.470663|
|    19|   1238|   3.0| 3.1355453|
+------+-------+------+----------+
only showing top 20 rows



In [36]:
# Test model using test dataset

In [37]:
predictions_df = model.transform(test_df)

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                    predictionCol="prediction")
error = evaluator.evaluate(predictions_df)


In [38]:
# printing ther error of our model
print ('For testing data the RMSE is {}'.format(error))

For testing data the RMSE is 0.877753701089013


In [39]:
################################################################################
###             BUILDING MODEL USING COMPLETE DATASET                       ####
################################################################################
#N.B -> here we alos using the small one beacuse of memory 

In [40]:
# Load the complete dataset file
############################################################################################
## TAKE CARE HERE WE CHANGED USAGE OF COMPLETE PATH TO USE SMALL DUE TO MEMORY LIMITATION ##
#############################################################################################
complete_ratings_file = os.path.join(datasets_path, 'ml-latest-small', 'ratings.csv')


In [41]:
# load ratings file to dataframe and removing timestamp column 
complete_ratings_df = spark.read.format("csv").option("header", "true").load(complete_ratings_file).drop('timestamp')
# casting userId to int
complete_ratings_df = complete_ratings_df.withColumn('userId', complete_ratings_df['userId'].cast(IntegerType()))
# casting movieId to int 
complete_ratings_df = complete_ratings_df.withColumn('movieId', complete_ratings_df['movieId'].cast(IntegerType()))
#casting rating to float 
complete_ratings_df = complete_ratings_df.withColumn('rating', complete_ratings_df['rating'].cast(FloatType()))
    
print ("There are {} ratings in the complete dataset".format((complete_ratings_df.count())))

There are 100836 ratings in the complete dataset


In [42]:
# Train model using complete dataset with chosen parameters

# here we did't need validation dataset we need only test to find the error of our model so we splited our dataset to Training -> 70% test -> 30%

In [43]:
training_df, test_df = complete_ratings_df.randomSplit([0.8,0.2])
# building model
als = ALS(maxIter=best_iterations, regParam=best_regParm,rank=best_rank, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop")
model = als.fit(training_df)


In [44]:
# CALCULATING MODEL ERROR  

In [45]:
predictions = model.transform(test_df)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction")
error = evaluator.evaluate(predictions)

print ('For testing data the RMSE is {}'.format(error))

For testing data the RMSE is 0.8893666775012877


In [46]:
################################################################################
###               RECOMMENDATION ENGINE STARTS HERE                         ####
################################################################################
#N.B -> here we alos using the small one beacuse of memory 

In [47]:
# find movies.csv file location 
complete_movies_file = os.path.join(datasets_path, 'ml-latest-small', 'movies.csv')
#load movies.csv filt to dataframe and removing genres coloumn 
complete_movies_df = spark.read.format("csv").option("header", "true").load(complete_movies_file).drop('genres')
# casting movieId to integer 
complete_movies_df = complete_movies_df.withColumn('movieId' ,complete_movies_df['movieId'].cast(IntegerType()))

In [48]:
# counting movies 
print ("There are {} movies in the complete dataset ".format(complete_movies_df.count()))

There are 9742 movies in the complete dataset 


In [49]:
# Calculate average rating for each movie

In [50]:
movie_ID_with_avg_ratings_df=complete_ratings_df.groupby('movieId').agg({'rating':'avg'})
movies_rating_counts_df=complete_ratings_df.groupby("movieId").count()

In [51]:
# Add New User to our model 

In [52]:
new_user_ID = 1

# The format of each line is (userID, movieID, rating)
new_user_ratings = [
     (1,2,4), # Star Wars (1977)
     (1,1,3), # Toy Story (1995)
     (1,16,2), # Casino (1995)
     (1,25,3), # Leaving Las Vegas (1995)
     (1,32,4), # Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
     (1,335,4), # Flintstones, The (1994)
     (1,379,3), # Timecop (1994)
     (1,296,2), # Pulp Fiction (1994)
     (1,858,5) , # Godfather, The (1972)
     (1,50,3) # Usual Suspects, The (1995)
    ]

# creating new_user_rating dataframe
new_user_ratings_df = spark.createDataFrame(new_user_ratings)

#casting userId,MovieId, rating to their appropriate datatypes
new_user_ratings_df = new_user_ratings_df.withColumn('userId', new_user_ratings_df[0].cast(IntegerType()))
new_user_ratings_df = new_user_ratings_df.withColumn('movieId', new_user_ratings_df[1].cast(IntegerType()))
new_user_ratings_df = new_user_ratings_df.withColumn('rating', new_user_ratings_df[2].cast(FloatType()))

# removing irrelevant data from new_user_ratings_df
new_user_ratings_df = new_user_ratings_df.selectExpr("userId","movieId","rating")


In [53]:
#ensuring that new_user_ratings_df loaded correctly and checking schema
new_user_ratings_df.show()
new_user_ratings_df.printSchema

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      2|   4.0|
|     1|      1|   3.0|
|     1|     16|   2.0|
|     1|     25|   3.0|
|     1|     32|   4.0|
|     1|    335|   4.0|
|     1|    379|   3.0|
|     1|    296|   2.0|
|     1|    858|   5.0|
|     1|     50|   3.0|
+------+-------+------+



<bound method DataFrame.printSchema of DataFrame[userId: int, movieId: int, rating: float]>

In [54]:
# MERGE NEW USER RATINGS WITH THE COMPLETE RATINGS
complete_data_with_new_ratings_df = complete_ratings_df.union(new_user_ratings_df)



In [55]:
print(complete_ratings_df.count())
print(complete_data_with_new_ratings_df.count())


100836
100846


In [56]:
#checking that new_user_ratings added correctly
#N.B recall that number of rating was less than this number by 10 ...
complete_data_with_new_ratings_df.count()

100846

In [57]:
complete_data_with_new_ratings_df.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
|     1|    163|   5.0|
|     1|    216|   5.0|
|     1|    223|   3.0|
|     1|    231|   5.0|
|     1|    235|   4.0|
|     1|    260|   5.0|
|     1|    296|   3.0|
|     1|    316|   3.0|
|     1|    333|   5.0|
|     1|    349|   4.0|
+------+-------+------+
only showing top 20 rows



In [58]:
# TRAIN NEW MODEL WITH NEW ADDED DATA 
t0 = time()
als = ALS(maxIter=best_iterations, regParam=best_regParm,rank=best_rank, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop")
new_ratings_model = als.fit(complete_data_with_new_ratings_df)
tt = time() - t0

print ("New model trained in {} seconds".format(round(tt,3)))

New model trained in 1.777 seconds


In [59]:
# taking invalid predictions 

In [60]:
# extracting new rated movies into a list 
invalid_predictions = list(map(lambda x: x[1], new_user_ratings))
# extracting user_id from new_user_ratings_df
user_id=new_user_ratings_df.select('userId').take(1)[0]['userId']

In [61]:
#print pre-rated movies (invalid)
print(invalid_predictions)

[2, 1, 16, 25, 32, 335, 379, 296, 858, 50]


In [62]:
# new_movies_df 
new_movies_df=complete_movies_df.filter(~complete_movies_df['movieId'].isin(invalid_predictions)).withColumn('userId', lit(user_id)).drop('title')

In [63]:
# print new_movies_df (this dataframe should be all from valid movies)
new_movies_df=new_movies_df.select('userId', 'movieId').cache()
new_movies_df.show()

+------+-------+
|userId|movieId|
+------+-------+
|     1|      3|
|     1|      4|
|     1|      5|
|     1|      6|
|     1|      7|
|     1|      8|
|     1|      9|
|     1|     10|
|     1|     11|
|     1|     12|
|     1|     13|
|     1|     14|
|     1|     15|
|     1|     17|
|     1|     18|
|     1|     19|
|     1|     20|
|     1|     21|
|     1|     22|
|     1|     23|
+------+-------+
only showing top 20 rows



In [64]:
new_movies_df.show()
new_movies_df.printSchema()

+------+-------+
|userId|movieId|
+------+-------+
|     1|      3|
|     1|      4|
|     1|      5|
|     1|      6|
|     1|      7|
|     1|      8|
|     1|      9|
|     1|     10|
|     1|     11|
|     1|     12|
|     1|     13|
|     1|     14|
|     1|     15|
|     1|     17|
|     1|     18|
|     1|     19|
|     1|     20|
|     1|     21|
|     1|     22|
|     1|     23|
+------+-------+
only showing top 20 rows

root
 |-- userId: integer (nullable = false)
 |-- movieId: integer (nullable = true)



In [65]:
new_ratings_model.userFactors.show()

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[-0.5427709, 0.29...|
| 20|[-0.16825935, -0....|
| 30|[-0.3532909, -0.2...|
| 40|[-0.72978, 0.0269...|
| 50|[-0.26852262, 0.0...|
| 60|[-0.4512421, -0.1...|
| 70|[-0.4393683, -0.0...|
| 80|[-0.61166096, 0.0...|
| 90|[-0.46840835, 0.0...|
|100|[-0.5718834, -0.1...|
|110|[-0.31890944, 0.1...|
|120|[-0.48787358, -0....|
|130|[-0.49742514, -0....|
|140|[-0.54451835, 0.0...|
|150|[-0.5248455, -0.0...|
|160|[-0.37851775, 0.5...|
|170|[-0.40053913, -0....|
|180|[-0.4262179, 0.06...|
|190|[-0.33076447, -0....|
|200|[-0.35692716, -0....|
+---+--------------------+
only showing top 20 rows



In [66]:
# finding new user recommendations 
new_user_recommendations_df = bestmodel.transform(new_movies_df)
new_user_recommendations_df.show()

+------+-------+----------+
|userId|movieId|prediction|
+------+-------+----------+
|     1|    148| 4.3434587|
|     1|    471|  4.290055|
|     1|    496| 4.3434587|
|     1|    833| 2.1454613|
|     1|   1088| 3.7730947|
|     1|   1238|  4.607239|
|     1|   1342|  3.102829|
|     1|   1580| 4.0160565|
|     1|   1591| 3.0741577|
|     1|   1645| 3.8810644|
|     1|   1829| 3.8038137|
|     1|   1959| 4.3276644|
|     1|   2122| 3.0483038|
|     1|   2142| 3.4739676|
|     1|   2366|  3.997889|
|     1|   2659|  2.011827|
|     1|   2866| 3.7860649|
|     1|   3175|  4.028168|
|     1|   3794| 2.9218812|
|     1|   3918| 3.7580264|
+------+-------+----------+
only showing top 20 rows



In [67]:
# joing new_user_recommendations_df with movies_rating_counts_df in order in order to recommend the highly rated movies
new_user_rating_recommendation_df =new_user_recommendations_df.join(movies_rating_counts_df, on='movieId')

In [68]:
# Top 25 moveis recommended for user 
top_movies=new_user_rating_recommendation_df[(new_user_rating_recommendation_df['count'] >= 25)].head(25)

In [69]:
# Prining Top 25 moveis recommended for user 
top_movies

[Row(movieId=471, userId=1, prediction=4.290054798126221, count=40),
 Row(movieId=1088, userId=1, prediction=3.773094654083252, count=42),
 Row(movieId=1580, userId=1, prediction=4.016056537628174, count=165),
 Row(movieId=1591, userId=1, prediction=3.07415771484375, count=26),
 Row(movieId=1645, userId=1, prediction=3.8810644149780273, count=51),
 Row(movieId=2366, userId=1, prediction=3.9978890419006348, count=25),
 Row(movieId=3175, userId=1, prediction=4.028168201446533, count=75),
 Row(movieId=1025, userId=1, prediction=4.284240245819092, count=25),
 Row(movieId=1084, userId=1, prediction=4.723385810852051, count=35),
 Row(movieId=1127, userId=1, prediction=4.0520405769348145, count=62),
 Row(movieId=1721, userId=1, prediction=3.7387449741363525, count=140),
 Row(movieId=2580, userId=1, prediction=4.54417085647583, count=39),
 Row(movieId=3698, userId=1, prediction=3.9495725631713867, count=34),
 Row(movieId=48780, userId=1, prediction=4.631921768188477, count=90),
 Row(movieId=69