In [27]:
from pyspark.mllib.recommendation import ALS
import math
from time import time




########################################
# Configuration.                       #
########################################

# Paths to the available data sets
DATASET_RATINGS_PATH = "./ratings_full.csv"
DATASET_RATINGS_SMALL_PATH = "./ratings_small.csv"
DATASET_MOVIES_PATH = "./movies_full.csv"
DATASET_MOVIES_SMALL_PATH = "./movies_small.csv"

# The minimum number of reviews a movie must have, to recommend it reliably
REVIEW_MIN_AMOUNT = 20

# Split ratio for the small training/calibration data
TRAIN_SMALL_SPLIT = [6, 2, 2]

# Split ratio for the complete training data
TRAIN_COMPLETE_SPLIT = [7, 3]

# Seed to use for training
TRAIN_SEED = 5

# Ranks to use for training
TRAIN_RANKS = [4, 8, 10, 12]

# Number of iterations to train for
TRAIN_ITERATIONS = 12



########################################
# Function definitions.                #
########################################

# Define some helper functions
def readCSV(fname, removeHeader=False, separator=','):
    print("Loading file ", fname, "...")
    rdd = sc.textFile(fname)
    if removeHeader:
        firstline = rdd.first()
        rdd = rdd.filter(lambda x: x != firstline)
    return rdd.map(lambda x: x.split(separator))




########################################
# Load datasets.                       #
########################################

print("Loading all dataset files...")

# Load the ratings data
ratings_data = readCSV(DATASET_RATINGS_PATH, removeHeader=True)
small_ratings_data = readCSV(DATASET_RATINGS_SMALL_PATH, removeHeader=True)

# Load the movies data
movies_data = readCSV(DATASET_MOVIES_PATH, removeHeader=True)
small_movies_data = readCSV(DATASET_MOVIES_SMALL_PATH, removeHeader=True)




########################################
# Parse datasets.                      #
########################################

print("Parsing datasets...")

# Parse the complete ratings set
# [user_id, movie_id, rating, timestamp] -> (user_id, movie_id, rating)
ratings_data = ratings_data.map(lambda x: (int(x[0]), int(x[1]), float(x[2])))\
    .cache()
print("There are", ratings_data.count(), "ratings in the complete dataset.")

# Parse the small ratings set
small_ratings_data = small_ratings_data.map(lambda x: (int(x[0]), int(x[1]), float(x[2])))\
    .cache()
print("There are", small_ratings_data.count(), "ratings in the small dataset.")

# Parse the complete movies set
# [id, title, genres[]] -> (id, title)
movies_data = movies_data.map(lambda x: (int(x[0]), x[1]))\
    .cache()
print("There are", movies_data.count(), "movies in the complete dataset.")
      
# Parse the small ratings set
small_movies_data = small_movies_data.map(lambda x: (int(x[0]), x[1]))\
    .cache()
print("There are", small_movies_data.count(), "movies in the small dataset.")




########################################
# Calibrate machine learning.          #
########################################

print("Calibrating by training on small dataset...")

# Some parameters
regularization_parameter = 0.1
errors = [0, 0, 0]
err = 0

# Create some training sets based on the small data
training_set, validation_set, test_set = small_ratings_data\
        .randomSplit(TRAIN_SMALL_SPLIT, seed = 0)
prediction_validation = validation_set\
        .map(lambda x: tuple(x[0:2]))
prediction_test = test_set\
        .map(lambda x: tuple(x[0:2]))

# Rememer the minimum error value, best rank and iteration
min_error = float("inf")
best_rank = -1
best_iteration = -1

# Loop through each rank to train
for rank in TRAIN_RANKS:
    # Train on the selected rank
    model = ALS.train(training_set,\
                      rank,\
                      seed = TRAIN_SEED,\
                      iterations = TRAIN_ITERATIONS,\
                      lambda_ = regularization_parameter)
    
    # Make some predictions to test
    predictions = model\
            .predictAll(prediction_validation)\
            .map(lambda x: (tuple(x[0:2]), x[2]))
    rates_predictions = validation_set\
            .map(lambda x: ((int(x[0]), int(x[1])), float(x[2])))\
            .join(predictions)
            
    # Calculate the predeiction error value, and store it
    error = math.sqrt(rates_predictions\
                      .map(lambda x: (x[1][0] - x[1][1]) ** 2)\
                      .mean())
    errors[err] = error
    err += 1
    
    # Update the minimum error value and the best rank
    print("Training small dataset with rank", rank, "which has RMSE:", error)
    if error < min_error:
        min_error = error
        best_rank = rank

print("Trained small dataset calibration, best rank:", best_rank)




########################################
# Train on complete dataset.           #
########################################

print("Training complete dataset...")

# Randomly split the data, to use for training and testing
training_set, test_set = ratings_data.randomSplit(TRAIN_COMPLETE_SPLIT,\
                                                  seed = 0)

# Train using the data sets
complete_model = ALS.train(training_set,\
                           best_rank,\
                           seed = seed,\
                           iterations = iterations,\
                           lambda_ = regularization_parameter)




########################################
# Test trained data.                   #
########################################

print("Testing trained complete dataset...")

# Create a test prediction set
test_predict_set = test_set.map(lambda x: tuple(x[0:2]))

# Make come predictions to test
predictions = complete_model\
        .predictAll(test_predict_set)\
        .map(lambda x: (tuple(x[0:2]), x[2]))
rates_predictions = test_set\
        .map(lambda x: ((int(x[0]), int(x[1])), float(x[2])))\
        .join(predictions)

# Calculate the error value
error = math.sqrt(\
                  rates_predictions\
                          .map(lambda x: (x[1][0] - x[1][1]) ** 2)\
                          .mean())
    
print("Trained complete dataset test, RMSE:", error)




########################################
# Calculate movie review count.        #
########################################

# Helper method to count the number of ratings for each movie
def calc_avg_count(id_ratings):
    number = len(id_ratings[1])
    return id_ratings[0],\
        (number, float(sum(x for x in id_ratings[1])) / number)

# Count the number of ratings per movie
movie_id_ratings = (ratings_data\
                            .map(lambda x: (x[1], x[2]))\
                            .groupByKey())
movie_id_ratings_avg = movie_id_ratings.map(calc_avg_count)
movie_id_ratings_count = movie_id_ratings_avg.map(lambda x: (x[0], x[1][0]))

Loading all dataset files...
Loading file  ./ratings_full.csv ...
Loading file  ./ratings_small.csv ...
Loading file  ./movies_full.csv ...
Loading file  ./movies_small.csv ...
Parsing datasets...
There are 24404096 ratings in the complete dataset.
There are 100004 ratings in the small dataset.
There are 40110 movies in the complete dataset.
There are 9125 movies in the small dataset.
Calibrating by training on small dataset...
Training small dataset with  4 , has RMSE: 0.9405925542574993
Training small dataset with  8 , has RMSE: 0.9451745059144596
Training small dataset with  12 , has RMSE: 0.9435903947376889
Trained small dataset calibration, best rank: 4
Training complete dataset...
Testing trained complete dataset...
Trained complete dataset test, RMSE: 0.8318525567660949


In [25]:
new_user_ID = ratings_data.first()[0]
print("User ID:", new_user_ID)

# Get the list of movies watched by the user
new_user_watched_ids = ratings_data.filter(lambda x: x[0] == new_user_ID).map(lambda x: x[0]).collect()

# keep just those not on the ID list (thanks Lei Li for spotting the error!)
new_user_unrated_movies_RDD = (movies_data.filter(lambda x: x[0] not in new_user_watched_ids).map(lambda x: (new_user_ID, x[0])))

# Use the input RDD, new_user_unrated_movies_RDD, with new_ratings_model.predictAll() to predict new ratings for the movies
new_user_recommendations_RDD = complete_model.predictAll(new_user_unrated_movies_RDD)

# Transform new_user_recommendations_RDD into pairs of the form (Movie ID, Predicted Rating)
new_user_recommendations_rating_RDD = new_user_recommendations_RDD.map(lambda x: (x.product, x.rating))
new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_RDD.join(movies_data).join(movie_id_ratings_count)
#print(new_user_recommendations_rating_title_and_count_RDD.take(5))

new_user_recommendations_rating_title_and_count_RDD = \
    new_user_recommendations_rating_title_and_count_RDD.map(lambda r: (r[1][0][1], r[1][0][0], r[1][1]))
    
top_movies = new_user_recommendations_rating_title_and_count_RDD.filter(lambda r: r[2]>=REVIEW_MIN_AMOUNT).takeOrdered(25, key=lambda x: -x[1])

print("Best recommended movies:")
print("".join(map(str, top_movies)))


User ID: 1
Best recommended movies:
('"Lonely Wife', 5.24120978478798, 25)('Napoléon (1927)', 5.2280430548581815, 28)("Shall We Kiss? (Un baiser s'il vous plait) (2007)", 5.126939654881557, 22)('Landscape in the Mist (Topio stin omichli) (1988)', 5.0258980522406915, 40)('Connections (1978)', 5.022598078110561, 47)('Dancemaker (1998)', 5.0115251363085775, 45)('"Decalogue', 4.966019628457392, 498)('Laurence Anyways (2012)', 4.95170892040607, 90)('"Personal Journey with Martin Scorsese Through American Movies', 4.950990091268236, 33)('"Century of the Self', 4.943931901519441, 138)("Eu Não Quero Voltar Sozinho (I Don't Want to Go Back Alone) (2010)", 4.936002665483571, 49)('High School (1968)', 4.905992633030817, 30)('Girlhood (2003)', 4.904927636708862, 25)('"Great War', 4.886188509874742, 20)('Overlord (1975)', 4.875542161536053, 20)('Four Minutes (Vier Minuten) (2006)', 4.872331143616371, 55)('Powers of Ten (1977)', 4.867633081599967, 46)('"Crucified Lovers', 4.856042214780258, 21)('"St