# This tutorial for recommender engine usingCollaborative Filtering

it uses python 3.5.5 

In [1]:
! python --version

Python 3.5.6 :: Anaconda, Inc.


# importing needed packages 

In [2]:
import os
import pyspark
import urllib.request
import zipfile
from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.recommendation import ALS
import math
from time import time
from pyspark.mllib.recommendation import MatrixFactorizationModel

# Dataset info
    .Small: 100,000 ratings and 2,488 tag applications applied to 8,570 movies by 706 users. Last updated 4/2015.
    .Full: 21,000,000 ratings and 470,000 tag applications applied to 27,000 movies by 230,000 users. Last updated 4/2015.

In [3]:
complete_dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-latest.zip'
small_dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

# Defining dataset location

In [4]:
if not os.path.exists('datasets'):
    os.makedirs('datasets')
cwd = os.getcwd()
datasets_path = os.path.join(cwd, 'datasets')
complete_dataset_path = os.path.join(datasets_path, 'ml-latest.zip')
small_dataset_path = os.path.join(datasets_path, 'ml-latest-small.zip')

# Download dataset
    . small dataset size = 955 kb 
    . complete dataset size = 264 mb
   in this tutorial we will use the small dataset

In [5]:
small_f = urllib.request.urlretrieve(small_dataset_url,small_dataset_path)
# complete_f = urllib.request.urlretrieve (complete_dataset_url, complete_dataset_path)

# Unzip datasets

In [6]:

with zipfile.ZipFile(small_dataset_path, "r") as z:
    z.extractall(datasets_path)

# with zipfile.ZipFile(complete_dataset_path, "r") as z:
#     z.extractall(datasets_path)

# Intializing spark context

In [7]:
spark = SparkSession \
   .builder \
    .master("local[*]") \
   .appName("Recommender-system") \
   .getOrCreate()

sc =  spark.sparkContext

# Load data set 
this tables we will use from dataset

Each line in the ratings dataset (ratings.csv) is formatted as: userId,movieId,rating,timestamp

Each line in the movies (movies.csv) dataset is formatted as: movieId,title,genres

In [8]:
# find ratings.csv path 
small_ratings_file = os.path.join(datasets_path, 'ml-latest-small', 'ratings.csv')
# load ratings.csv file to rdd 
small_ratings_raw_data = sc.textFile(small_ratings_file)
# find ratings.csv header
small_ratings_raw_data_header = small_ratings_raw_data.take(1)[0]

# parsing and removing irrelevant data from rating.csv 

In [9]:
# add file to rdd and removing header from it and removing timestamp from each row 
small_ratings_data = small_ratings_raw_data.filter(lambda line: line!=small_ratings_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1],tokens[2])).cache()

# Ensuring that data loaded correctly 
    RDD format -> ('user_id','movie_id','actual rating')

In [10]:
small_ratings_data.take(3)

[('1', '1', '4.0'), ('1', '3', '4.0'), ('1', '6', '4.0')]

# load movies.csv file

In [11]:
# finding movies.csv file 
small_movies_file = os.path.join(datasets_path, 'ml-latest-small', 'movies.csv')
# load movies.csv file to rdd
small_movies_raw_data = sc.textFile(small_movies_file)
# find rating .csv header 
small_movies_raw_data_header = small_movies_raw_data.take(1)[0]

# parsing and removing irrelevant data from movies.csv to RDD

In [12]:
# add file to rdd and removing header and genere from it 
small_movies_data = small_movies_raw_data.filter(lambda line: line!=small_movies_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (tokens[0],tokens[1])).cache()

# Ensuring that movies.cs file is loaded correctly 
    RDD formate -> ('movie_id','movie_name(production_year)')

In [13]:
small_movies_data.take(3)

[('1', 'Toy Story (1995)'),
 ('2', 'Jumanji (1995)'),
 ('3', 'Grumpier Old Men (1995)')]

# Parsing and spliting  dataset  to rdds 
    split dataset into
        . Traning dataset = 60%
        . Validation dataset = 20%
        . Test Dataset = 20%
note that we removed actual ratings from testing and validation dataset 

In [14]:
# split data randomly to training, validation and test data sets with ratio 60-20-20
training_RDD, validation_RDD, test_RDD = small_ratings_data.randomSplit([6, 2, 2], seed=0)
# remove actual user rating from validation dataset
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
# remove actual user rating from testing dataset
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

# Configuring and tunning our model

In [15]:
# seed is used for reproducability 
seed = 5
iterations = 10
# is used to avoid over and under fitting 
regularization_parameter = 0.1
tolerance = 0.02
ranks = [4, 8, 10]
min_error = float('inf')
best_rank = -1
best_model = None

for rank in ranks:
    model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations, lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    print ('For rank {} the RMSE is {}'.format(rank, error))
    
    if error < min_error:
        min_error = error
        best_rank = rank
        best_model = model
        
print ('The best model was trained with rank {}'.format(best_rank))

For rank 4 the RMSE is 0.9190356728088455
For rank 8 the RMSE is 0.9187811173021727
For rank 10 the RMSE is 0.9162188124353527
The best model was trained with rank 10


# Saving the BEST model for future use 

In [16]:
# model_path = os.path.join('/home/tabdalla/Development/code/col-filtering_recommender-system', 'models', 'movie_lens_als_2')
# best_model.save(sc, model_path)

In [17]:
# model_path = os.path.join('/home/tabdalla/Development/code/col-filtering_recommender-system', 'models', 'movie_lens_als_2')
# loaded_model = MatrixFactorizationModel.load(sc, model_path)
# predictions = loaded_model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
# predictions.take(5)

# See predected results
    RDD format ((user_id,movie_is),predicted_rating)

In [18]:
predictions.take(5)

[((599, 45208), 1.5325057468429124),
 ((368, 3272), 2.6486584653325935),
 ((603, 3272), 3.038943126586739),
 ((182, 3272), 2.654129777267586),
 ((560, 52328), 3.1635993554993203)]

# Compare between predicted result and actual result in dataset
    RDD format ((user_id,movie_id),(actual_rating,predected_rating))

In [19]:
rates_and_preds.take(5)

[((140, 4322), (4.0, 3.274917309363992)),
 ((462, 148626), (3.5, 2.919451170962174)),
 ((427, 555), (4.5, 3.542172834664159)),
 ((279, 115569), (4.5, 3.786988737089138)),
 ((455, 539), (4.0, 3.602474727748841))]

# Test model using test dataset

In [20]:
model = ALS.train(training_RDD, best_rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    
print ('For testing data the RMSE is {}'.format(error))

For testing data the RMSE is 0.9172323011798893


# Building the  model using the complete dataset instead of small data set
  in order to get better results we shall use the complete dataset but here we used the small one due to memory limitation 
  all we need to change 'ml-latest-small' -> 'ml-latest'

In [21]:
# Load the complete dataset file
## TAKE CARE HERE WE CHANGED USAGE OF COMPLETE PATH TO USE SMALL DUE TO MEMORY LIMITATION 
complete_ratings_file = os.path.join(datasets_path, 'ml-latest-small', 'ratings.csv')
complete_ratings_raw_data = sc.textFile(complete_ratings_file)
complete_ratings_raw_data_header = complete_ratings_raw_data.take(1)[0]


# Parse and modify ratings.csv file and parse it to rdd 
modification here means convert string results to int as we did before for accurate computation 

In [22]:

complete_ratings_data = complete_ratings_raw_data.filter(lambda line: line!=complete_ratings_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),int(tokens[1]),float(tokens[2]))).cache()
    
print ("There are {} ratings in the small dataset".format((complete_ratings_data.count())))

There are 100836 ratings in the small dataset


# Train model using complete dataset with chosen parameters
here we did't need validation dataset we need only test to find the error of our model so we splited our dataset to 
    Training -> 70%
    test -> 30% 

In [23]:
# split dataset to training and testing data sets with ration 70% - 30%  
training_RDD, test_RDD = complete_ratings_data.randomSplit([7, 3], seed=0)

complete_model = ALS.train(training_RDD, best_rank, seed=seed, 
                           iterations=iterations, lambda_=regularization_parameter)

# Test using testing dataset to calculate error

In [24]:
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

predictions = complete_model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    
print ('For testing data the RMSE is {}'.format(error))

For testing data the RMSE is 0.897018309116098


In [25]:
################################################################################################
#   HERE ALSO I CHANGED THE USAGE OF COMPLETE DATASET TO SMALL ONE DUE TO MEMORY LIMITATOIN    #
################################################################################################

In [26]:
complete_movies_file = os.path.join(datasets_path, 'ml-latest-small', 'movies.csv')
complete_movies_raw_data = sc.textFile(complete_movies_file)
complete_movies_raw_data_header = complete_movies_raw_data.take(1)[0]

# parsing movies.csv file 

In [27]:
complete_movies_data = complete_movies_raw_data.filter(lambda line: line!=complete_movies_raw_data_header)\
    .map(lambda line: line.split(",")).map(lambda tokens: (int(tokens[0]),tokens[1],tokens[2])).cache()

complete_movies_titles = complete_movies_data.map(lambda x: (int(x[0]),x[1]))
    
print ("There are {}movies in the complete dataset ".format(complete_movies_titles.count()))


There are 9742movies in the complete dataset 


# Calculate average rating for each movie

In [28]:
def get_counts_and_averages(ID_and_ratings_tuple):
    nratings = len(ID_and_ratings_tuple[1])
    return ID_and_ratings_tuple[0], (nratings, float(sum(x for x in ID_and_ratings_tuple[1]))/nratings)

movie_ID_with_ratings_RDD = (complete_ratings_data.map(lambda x: (x[1], x[2])).groupByKey())
movie_ID_with_avg_ratings_RDD = movie_ID_with_ratings_RDD.map(get_counts_and_averages)
movie_rating_counts_RDD = movie_ID_with_avg_ratings_RDD.map(lambda x: (x[0], x[1][0]))

In [29]:
# Add new user to use it in our model 

In [30]:
new_user_ID = 1

# The format of each line is (userID, movieID, rating)
new_user_ratings = [
     (0,260,4), # Star Wars (1977)
     (0,1,3), # Toy Story (1995)
     (0,16,2), # Casino (1995)
     (0,25,3), # Leaving Las Vegas (1995)
     (0,32,4), # Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
     (0,335,4), # Flintstones, The (1994)
     (0,379,3), # Timecop (1994)
     (0,296,2), # Pulp Fiction (1994)
     (0,858,5) , # Godfather, The (1972)
     (0,50,3) # Usual Suspects, The (1995)
    ]
new_user_ratings_RDD = sc.parallelize(new_user_ratings)
print ('New user ratings: {}'.format(new_user_ratings_RDD.take(10)))

New user ratings: [(0, 260, 4), (0, 1, 3), (0, 16, 2), (0, 25, 3), (0, 32, 4), (0, 335, 4), (0, 379, 3), (0, 296, 2), (0, 858, 5), (0, 50, 3)]


# Join new user rdd with oold one 

In [31]:
complete_data_with_new_ratings_RDD = complete_ratings_data.union(new_user_ratings_RDD)

In [32]:
# Train new ALS Model 

In [33]:
t0 = time()
new_ratings_model = ALS.train(complete_data_with_new_ratings_RDD, best_rank, seed=seed, 
                              iterations=iterations, lambda_=regularization_parameter)
tt = time() - t0

print ("New model trained in {} seconds".format(round(tt,3)))

New model trained in 1.611 seconds


# predict ratings for user id 

In [34]:
new_user_ratings_ids = map(lambda x: x[1], new_user_ratings)

new_user_unrated_movies_RDD = (complete_movies_data.filter(lambda x: x[0] not in new_user_ratings_ids).map(lambda x: (new_user_ID, x[0])))

# Use the input RDD, new_user_unrated_movies_RDD, with new_ratings_model.predictAll() to predict new ratings for the movies
new_user_recommendations_RDD = new_ratings_model.predictAll(new_user_unrated_movies_RDD)


# Transform new_user_recommendations_RDD into pairs of the form (Movie ID, Predicted Rating)

In [35]:
new_user_recommendations_rating_RDD = new_user_recommendations_RDD.map(lambda x: (x.product, x.rating))
new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_RDD.join(complete_movies_titles).join(movie_rating_counts_RDD)
new_user_recommendations_rating_title_and_count_RDD.take(3)

[(2052, ((3.3494157407042504, 'Hocus Pocus (1993)'), 23)),
 (80880, ((3.9365597171089948, 'Stone (2010)'), 2)),
 (4104, ((1.4549844191723624, 'Ernest Goes to Camp (1987)'), 5))]

# New user recommendation RDD 

In [36]:
new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_title_and_count_RDD.map(lambda r: (r[1][0][1], r[1][0][0], r[1][1]))

In [37]:
# Find top rated movies with more than 25 ratings 

In [38]:
top_movies = new_user_recommendations_rating_title_and_count_RDD.filter(lambda r: r[2]>=25).takeOrdered(25, key=lambda x: -x[1])

print ('TOP recommended movies (with more than 25 reviews):\n%s' %
        '\n'.join(map(str, top_movies)))

TOP recommended movies (with more than 25 reviews):
('"Shawshank Redemption', 5.285185716267375, 317)
('"Grand Day Out with Wallace and Gromit', 5.223192281612513, 28)
('Harold and Maude (1971)', 5.19906152114903, 26)
('Star Wars: Episode V - The Empire Strikes Back (1980)', 5.133441825730453, 211)
("Rosemary's Baby (1968)", 5.105022183979428, 32)
('"Great Escape', 5.0930324621117276, 43)
('"Philadelphia Story', 5.075125732289349, 29)
('Star Wars: Episode IV - A New Hope (1977)', 5.061964838059363, 251)
("Schindler's List (1993)", 5.060758112073332, 220)
('"Princess Bride', 5.05355591202288, 142)
('Wallace & Gromit: The Best of Aardman Animation (1996)', 5.045302936015783, 27)
('Wallace & Gromit: The Wrong Trousers (1993)', 5.032148118594586, 56)
('Forrest Gump (1994)', 5.030008197569898, 329)
('"Godfather: Part II', 5.008653445361541, 129)
('American History X (1998)', 5.005440471513096, 129)
("One Flew Over the Cuckoo's Nest (1975)", 5.004983065406305, 133)
('Fight Club (1999)', 5.00

In [39]:
# Predict rating for user ID = 0

In [40]:
my_movie = sc.parallelize([(0, 500)]) # Quiz Show (1994)
individual_movie_rating_RDD = new_ratings_model.predictAll(new_user_unrated_movies_RDD)
individual_movie_rating_RDD.take(1)

[Rating(user=1, product=45208, rating=2.349938151874632)]

In [41]:
#save model for future use 

In [42]:

model_path = os.path.join(cwd, 'models', 'movie_lens_als')

# Save and load model
model.save(sc, model_path)
same_model = MatrixFactorizationModel.load(sc, model_path)