In [13]:
from pyspark.mllib.recommendation import ALS
import math
from time import time

# Some constants
DATASET_RATINGS_PATH = "./ratings_full.csv"
DATASET_RATINGS_SMALL_PATH = "./ratings_small.csv"
DATASET_MOVIES_PATH = "./movies_full.csv"
DATASET_MOVIES_SMALL_PATH = "./movies_small.csv"

# The minimum number of reviews a movie must have, to recommend it reliably
REVIEW_MIN_AMOUNT = 20

# Define some helper functions
def readCSV(fname, removeHeader=False, separator=','):
    print("Loading file ", fname, "...")
    rdd = sc.textFile(fname)
    if removeHeader:
        firstline = rdd.first()
        rdd = rdd.filter(lambda x: x != firstline)
    return rdd.map(lambda x: x.split(separator))

# Load the ratings data
ratings_data = readCSV(DATASET_RATINGS_PATH, removeHeader=True)
small_ratings_data = readCSV(DATASET_RATINGS_SMALL_PATH, removeHeader=True)

# Load the movies data
movies_data = readCSV(DATASET_MOVIES_PATH, removeHeader=True)
small_movies_data = readCSV(DATASET_MOVIES_SMALL_PATH, removeHeader=True)

print("Parsing datasets...")

# Parse the ratings data
# [user_id, movie_id, rating, timestamp] -> (user_id, movie_id, rating)
ratings_data = ratings_data.map(lambda x: (int(x[0]), int(x[1]), float(x[2])))\
    .cache()
print("There are", ratings_data.count(), "ratings in the complete dataset.")
      
small_ratings_data = small_ratings_data.map(lambda x: (int(x[0]), int(x[1]), float(x[2])))\
    .cache()
print("There are", small_ratings_data.count(), "ratings in the small dataset.")

# Parse the movies data
# [id, title, genres[]] -> (id, title)
movies_data = movies_data.map(lambda x: (int(x[0]), x[1]))\
    .cache()
print("There are", movies_data.count(), "movies in the complete dataset.")
      
small_movies_data = small_movies_data.map(lambda x: (int(x[0]), x[1]))\
    .cache()
print("There are", small_movies_data.count(), "movies in the small dataset.")

print("Start training with small dataset...")

# Create some training sets based on the small data
training_RDD, validation_RDD, test_RDD = small_ratings_data.randomSplit([6, 2, 2], seed=0)
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

# Define some machine learning parameters
seed = 5
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

min_error = float("inf")
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print("For rank", rank, "the RMSE is", error)
    if error < min_error:
        min_error = error
        best_rank = rank

print("The best model was trained with rank", best_rank)


print("Start training on complete dataset...")

training_RDD, test_RDD = ratings_data.randomSplit([7, 3], seed=0)

complete_model = ALS.train(training_RDD, best_rank, seed = seed,\
                          iterations = iterations, lambda_ = regularization_parameter)



test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

predictions = complete_model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    
print("For testing data the RMSE is", error)




def get_counts_and_averages(ID_and_ratings_tuple):
    nratings = len(ID_and_ratings_tuple[1])
    return ID_and_ratings_tuple[0], (nratings, float(sum(x for x in ID_and_ratings_tuple[1]))/nratings)

# Count the number of ratings per movie
movie_ID_with_ratings_RDD = (ratings_data.map(lambda x: (x[1], x[2])).groupByKey())
movie_ID_with_avg_ratings_RDD = movie_ID_with_ratings_RDD.map(get_counts_and_averages)
movie_rating_counts_RDD = movie_ID_with_avg_ratings_RDD.map(lambda x: (x[0], x[1][0]))

Loading file  ./ratings_full.csv ...
Loading file  ./ratings_small.csv ...
Loading file  ./movies_full.csv ...
Loading file  ./movies_small.csv ...
Parsing datasets...
There are 24404096 ratings in the complete dataset.
There are 100004 ratings in the small dataset.
There are 40110 movies in the complete dataset.
There are 9125 movies in the small dataset.
Start training with small dataset...
For rank 4 the RMSE is 0.9405925542574993
For rank 8 the RMSE is 0.9451745059144596
For rank 12 the RMSE is 0.9435903947376889
The best model was trained with rank 4
Start training on complete dataset...
For testing data the RMSE is 0.8318525567660949


In [21]:
new_user_ID = ratings_data.first()[0]
print("User ID:", new_user_ID)

# Get the list of movies watched by the user
new_user_watched_ids = ratings_data.filter(lambda x: x[0] == new_user_ID).map(lambda x: x[0]).collect()

# keep just those not on the ID list (thanks Lei Li for spotting the error!)
new_user_unrated_movies_RDD = (movies_data.filter(lambda x: x[0] not in new_user_watched_ids).map(lambda x: (new_user_ID, x[0])))

# Use the input RDD, new_user_unrated_movies_RDD, with new_ratings_model.predictAll() to predict new ratings for the movies
new_user_recommendations_RDD = complete_model.predictAll(new_user_unrated_movies_RDD)

# Transform new_user_recommendations_RDD into pairs of the form (Movie ID, Predicted Rating)
new_user_recommendations_rating_RDD = new_user_recommendations_RDD.map(lambda x: (x.product, x.rating))
new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_RDD.join(movies_data).join(movie_rating_counts_RDD)
#print(new_user_recommendations_rating_title_and_count_RDD.take(5))

new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_title_and_count_RDD.map(lambda r: (r[1][0][1], r[1][0][0], r[1][1]))
    
top_movies = new_user_recommendations_rating_title_and_count_RDD.filter(lambda r: r[2]>=REVIEW_MIN_AMOUNT).takeOrdered(25, key=lambda x: -x[1])

print("Best recommended movies:")
print("".join(map(str, top_movies)))


User ID: 1
TOP recommended movies (with more than 25 reviews):
('The Adventures of Sherlock Holmes and Doctor Watson: King of Blackmailers (1980)', 5.171305424121109, 26)('Napoléon (1927)', 5.0639180907073955, 28)('Promises (2001)', 5.0509586479029895, 148)('"Personal Journey with Martin Scorsese Through American Movies', 4.975808456516342, 33)('Touki Bouki (1973)', 4.972044433527637, 41)('Landscape in the Mist (Topio stin omichli) (1988)', 4.909311548773586, 40)('"Bonheur', 4.901570823758208, 99)('State of Siege (État de siège) (1972)', 4.901173313419776, 28)('"Wind', 4.900077300812809, 54)('God Grew Tired of Us (2006)', 4.860796246125785, 40)('Eight Deadly Shots (Kahdeksan surmanluotia) (1972)', 4.854013396170991, 29)('Mother and Son (Mat i syn) (1997)', 4.8491116759581985, 45)('Kiwi! (2006)', 4.848803656491755, 39)('"Triumph of the Nerds', 4.8425595238338985, 29)("Eu Não Quero Voltar Sozinho (I Don't Want to Go Back Alone) (2010)", 4.837718537656624, 49)('Patton Oswalt: Werewolves a