In [1]:
from pyspark.mllib.recommendation import ALS
import math
import re
from time import time




########################################
# Configuration.                       #
########################################

# Paths to the available data sets (a full and a small variant of both the
# ratings file and the movies file); each is loaded through readCSV()
DATASET_RATINGS_PATH = "./ratings_full.csv"
DATASET_RATINGS_SMALL_PATH = "./ratings_small.csv"
DATASET_MOVIES_PATH = "./movies_full.csv"
DATASET_MOVIES_SMALL_PATH = "./movies_small.csv"

# The minimum number of reviews a movie must have, to recommend it reliably.
# Movies with fewer ratings than this are filtered out before recommending.
REVIEW_MIN_AMOUNT = 30

# Split ratio for the small training/calibration data.
# The three weights are unpacked as training, validation and test sets.
TRAIN_SMALL_SPLIT = [6, 2, 2]

# Split ratio for the complete training data.
# The two weights are unpacked as training and test sets.
TRAIN_COMPLETE_SPLIT = [7, 3]

# Define whether to test the complete data set.
# When True, the RMSE of the final model is computed on the held-out test set.
TRAIN_COMPLETE_TEST = False

# Seed to use for training (passed to ALS.train; the random splits use their
# own fixed seed)
TRAIN_SEED = 5

# Ranks to use for training.
# Each rank is tried on the small data set; the one with the lowest RMSE is
# used for the complete data set.
TRAIN_RANKS = [4, 6, 8, 10, 12]

# Number of iterations to train for
TRAIN_ITERATIONS = 10

# The number of movies to recommend for the user
RECOMMENDATION_AMOUNT = 12



########################################
# Function definitions.                #
########################################

# Define some helper functions
# Load a text file through the Spark context and split every line into fields.
#
# Parameters:
# - fname: Path of the file to load.
# - removeHeader: When True, every line equal to the file's first line is
#   dropped before splitting.
# - separator: Accepted for call-site compatibility only.
#   TODO: this parameter is not doing anything right now; lines are always
#   split with the comma-based pattern below.
#
# Returns an RDD with one list of fields per remaining line.
def readCSV(fname, removeHeader=False, separator=','):
    print("Loading file", fname, "...")
    lines = sc.textFile(fname)

    if removeHeader:
        header = lines.first()
        lines = lines.filter(lambda line: line != header)

    # Comma-based split pattern; presumably meant to keep quoted fields that
    # contain commas in one piece -- verify against the data files
    pattern = r',(?=(?:"[^"]*?(?: [^"]*)*))|,(?=[^",]+(?:,|$))'
    return lines.map(lambda line: re.split(pattern, line))

# Benchmark start time, read by benchmark_get().
# Initialised right away so that benchmark_get() also works when it is called
# before the first benchmark_reset() (a bare module-level `global` statement
# does not create the variable).
benchmark = time()

# Reset the benchmark.
#
# Stores the current time() in the module-level `benchmark` variable, which
# becomes the new starting point for benchmark_get().
def benchmark_reset():
    global benchmark
    benchmark = time()
    
# Report the time elapsed since the benchmark start time was last set.
#
# Returns the elapsed seconds, rounded to two decimals, as a string that ends
# in " s".
def benchmark_get():
    elapsed = round(time() - benchmark, 2)
    return str(elapsed) + " s"




########################################
# Load datasets.                       #
########################################

print("Preparing to start making suggestions...")

print("Loading all dataset files...")
benchmark_reset()

# Load the ratings data.
# NOTE: no separator is passed on purpose. readCSV() ignores its separator
# argument and always splits on commas; the '::' value that used to be passed
# for the full ratings file was never applied and would break parsing as soon
# as readCSV() starts honouring the argument.
ratings_data = readCSV(DATASET_RATINGS_PATH, removeHeader=True)
small_ratings_data = readCSV(DATASET_RATINGS_SMALL_PATH, removeHeader=True)

# Load the movies data
movies_data = readCSV(DATASET_MOVIES_PATH, removeHeader=True)
small_movies_data = readCSV(DATASET_MOVIES_SMALL_PATH, removeHeader=True)

print("Datasets loaded, took", benchmark_get(), "\n")




########################################
# Parse datasets.                      #
########################################

print("Parsing datasets...")
benchmark_reset()


# Convert one split ratings row into a typed tuple:
# [user_id, movie_id, rating, timestamp] -> (user_id, movie_id, rating)
def _to_rating(fields):
    return (int(fields[0]), int(fields[1]), float(fields[2]))


# Convert one split movies row into a typed tuple:
# [id, title, genres] -> (id, title, genres[])
# Double quotes are stripped from the title, the "(no genres listed)" marker
# is blanked out, and the genres are split on "|".
def _to_movie(fields):
    movie_id = int(fields[0])
    title = fields[1].replace('\"', '')
    genres = fields[2].replace("(no genres listed)", "").split("|")
    return (movie_id, title, genres)


# Parse the complete ratings set
ratings_data = ratings_data.map(_to_rating).cache()
print("There are", ratings_data.count(), "ratings in the complete dataset.")

# Parse the small ratings set
small_ratings_data = small_ratings_data.map(_to_rating).cache()
print("There are", small_ratings_data.count(), "ratings in the small dataset.")

# Parse the complete movies set
movies_data = movies_data.map(_to_movie).cache()
print("There are", movies_data.count(), "movies in the complete dataset.")

# Parse the small movies set; the title is kept as-is
# [id, title, ...] -> (id, title)
small_movies_data = (
    small_movies_data
    .map(lambda fields: (int(fields[0]), fields[1]))
    .cache()
)
print("There are", small_movies_data.count(), "movies in the small dataset.")

print("Parsing took", benchmark_get(), "\n")




########################################
# Create users database.               #
########################################

print("Creating set of users...")
benchmark_reset()

# Collect every distinct user ID (the first field of each rating tuple) into
# a plain Python list
user_ids = (
    ratings_data
    .keys()
    .distinct()
    .collect()
)

print("Got", len(user_ids), "users, took", benchmark_get(), "\n")




########################################
# Calculate movie review count.        #
########################################

# Announce the step and restart the benchmark timer for it
print("Calculate movie ratings amount...")
benchmark_reset()

# Summarise the ratings of a single movie.
#
# Parameters:
# - id_ratings: A (key, ratings) pair; ratings must support len() and
#   iteration.
#
# Returns (key, (rating_count, average_rating)).
def calc_avg_count(id_ratings):
    key = id_ratings[0]
    ratings = id_ratings[1]
    count = len(ratings)
    average = float(sum(ratings)) / count
    return key, (count, average)

# Group all ratings by movie: (movie_id, ratings)
movie_id_ratings = (
    ratings_data
    .map(lambda row: (row[1], row[2]))
    .groupByKey()
)

# Reduce every group to (movie_id, (review_count, average_rating))
movie_id_ratings_avg = movie_id_ratings.map(calc_avg_count)

# Keep only the movies that reached the review amount threshold, as
# (movie_id, review_count) tuples
movie_id_ratings_count = (
    movie_id_ratings_avg
    .filter(lambda entry: entry[1][0] >= REVIEW_MIN_AMOUNT)
    .map(lambda entry: (entry[0], entry[1][0]))
    .cache()
)

# NOTE: only transformations are defined above, no action is run here, so the
# time printed below does not include the actual computation
print("Calculation done, took", benchmark_get(), "\n")




########################################
# Filter movies without many ratings.  #
########################################

print("Filtering movies with less than", REVIEW_MIN_AMOUNT, "ratings as not relevant...")
benchmark_reset()

# Create a list of movie IDs that are allowed
allowed_movie_ids = movie_id_ratings_count\
        .map(lambda x: x[0])\
        .collect()

# The filter below tests membership once per movie. A frozenset makes each
# test a hash lookup instead of a scan through the whole list of allowed IDs.
allowed_movie_id_set = frozenset(allowed_movie_ids)

# Filter the movies, keeping only those with enough ratings
movies_data = movies_data\
        .filter(lambda x: x[0] in allowed_movie_id_set)\
        .cache()

print("Filtered movies, took", benchmark_get(), "\n")




########################################
# Calibrate machine learning.          #
########################################

print("Calibrating by training on small dataset...")
benchmark_reset()

# Some parameters
regularization_parameter = 0.1

# RMSE of each trained model, in the same order as TRAIN_RANKS
errors = []

# Split the small data into training, validation and test sets
training_set, validation_set, test_set = small_ratings_data\
        .randomSplit(TRAIN_SMALL_SPLIT, seed = 0)

# (user_id, movie_id) pairs to request predictions for
prediction_validation = validation_set\
        .map(lambda x: tuple(x[0:2]))
prediction_test = test_set\
        .map(lambda x: tuple(x[0:2]))

# Actual validation ratings keyed by (user_id, movie_id).
# This does not depend on the rank, so it is built once outside the loop and
# cached instead of being rebuilt for every trained model.
validation_ratings = validation_set\
        .map(lambda x: ((int(x[0]), int(x[1])), float(x[2])))\
        .cache()

# Remember the minimum error value and the best rank
min_error = float("inf")
best_rank = -1
# NOTE: kept for compatibility; nothing in this section updates it
best_iteration = -1

# Loop through each rank to train
for rank in TRAIN_RANKS:
    # Train on the selected rank
    model = ALS.train(training_set,
                      rank,
                      seed = TRAIN_SEED,
                      iterations = TRAIN_ITERATIONS,
                      lambda_ = regularization_parameter)

    # Predict the validation pairs, keyed like validation_ratings:
    # ((user_id, movie_id), predicted_rating)
    predictions = model\
            .predictAll(prediction_validation)\
            .map(lambda x: (tuple(x[0:2]), x[2]))

    # ((user_id, movie_id), (actual_rating, predicted_rating))
    rating_predictions = validation_ratings.join(predictions)

    # Calculate the prediction error value (RMSE), and store it
    error = math.sqrt(rating_predictions
                      .map(lambda x: (x[1][0] - x[1][1])**2)
                      .mean())
    errors.append(error)

    # Update the minimum error value and the best rank
    print("Training small dataset with rank", rank, "which has RMSE:", error)
    if error < min_error:
        min_error = error
        best_rank = rank

print("Trained small dataset calibration, took", benchmark_get(), " with a best rank:", best_rank, "\n")




########################################
# Train on complete dataset.           #
########################################

print("Training complete dataset...")
benchmark_reset()

# Split the complete ratings into a training part and a test part, using the
# configured weights and a fixed seed
training_set, test_set = ratings_data.randomSplit(
    weights=TRAIN_COMPLETE_SPLIT,
    seed=0,
)

# Train the final model with the rank that came out best during calibration
trained_model = ALS.train(
    ratings=training_set,
    rank=best_rank,
    iterations=TRAIN_ITERATIONS,
    lambda_=regularization_parameter,
    seed=TRAIN_SEED,
)

print("Training took", benchmark_get(), "\n")




########################################
# Test trained data.                   #
########################################

if TRAIN_COMPLETE_TEST:
    print("Testing trained complete dataset...")
    benchmark_reset()

    # (user_id, movie_id) pairs of the held-out test ratings
    test_predict_set = test_set.map(lambda row: (row[0], row[1]))

    # Predictions for those pairs: ((user_id, movie_id), predicted_rating)
    predictions = (
        trained_model
        .predictAll(test_predict_set)
        .map(lambda p: ((p[0], p[1]), p[2]))
    )

    # Pair every actual rating with its prediction:
    # ((user_id, movie_id), (actual_rating, predicted_rating))
    rates_predictions = (
        test_set
        .map(lambda row: ((int(row[0]), int(row[1])), float(row[2])))
        .join(predictions)
    )

    # Root of the mean squared difference between actual and predicted rating
    error = math.sqrt(
        rates_predictions
        .map(lambda pair: (pair[1][0] - pair[1][1])**2)
        .mean()
    )

    print("Trained complete dataset test, took", benchmark_get(), " with an RMSE:", error, "\n")



print("Ready to suggest!")

Preparing to start making suggestions...
Loading all dataset files...
Loading file ./ratings_full.csv ...
Loading file ./ratings_small.csv ...
Loading file ./movies_full.csv ...
Loading file ./movies_small.csv ...
Datasets loaded, took 1.31 s 

Parsing datasets...
There are 24404208 ratings in the complete dataset.
There are 100004 ratings in the small dataset.
There are 40110 movies in the complete dataset.
There are 9125 movies in the small dataset.
Parsing took 40.26 s 

Creating set of users...
Got 259138 users, took 8.47 s 

Calculate movie ratings amount...
Calculation done, took 0.02 s 

Filtering movies with less than 30 ratings as not relevant...
Filtered movies, took 13.15 s 

Calibrating by training on small dataset...
Training small dataset with rank 4 which has RMSE: 0.9405925542574993
Training small dataset with rank 6 which has RMSE: 0.9425347727952912
Training small dataset with rank 8 which has RMSE: 0.9451745059144596
Training small dataset with rank 10 which has RMSE

In [2]:
########################################
# Function definitions.                #
########################################

# Predict the ratings for all movies the user hasn't rated yet.
#
# Only movies still present in movies_data are considered, and only movies
# that have an entry in movie_id_ratings_count end up in the result.
#
# Parameters:
# - user_id: The ID of the user to predict ratings for.
#
# Returns an RDD of (title, predicted_rating, genres, rating_count) tuples.
def predict_movie_ratings(user_id):
    # Create a set of IDs of movies already rated by the user.
    # Rating tuples are (user_id, movie_id, rating), so the movie ID is the
    # second field. A set keeps the membership test below cheap.
    rated_ids = set(
        ratings_data
        .filter(lambda x: x[0] == user_id)
        .map(lambda x: x[1])
        .collect()
    )

    # Get all movie IDs that haven't been rated by the user,
    # as (user_id, movie_id) pairs
    unrated_ids = (
        movies_data
        .filter(lambda x: x[0] not in rated_ids)
        .map(lambda x: (user_id, x[0]))
    )

    # Predict the recommendation value for all unrated movies for this user
    predictions = trained_model.predictAll(unrated_ids)

    # Transform the prediction result into proper tuples
    # (movie_id, predicted_rating)
    predictions = predictions.map(lambda x: (x.product, x.rating))

    # Key the movie details by ID explicitly, so the join does not depend on
    # how (id, title, genres) triples are treated as key/value pairs
    # (movie_id, (title, genres))
    movie_details = movies_data.map(lambda x: (x[0], (x[1], x[2])))

    # Saturate the list of tuples with the movie details and number of ratings
    # (movie_id, ((predicted_rating, (title, genres)), rating_count))
    predictions = (
        predictions
        .join(movie_details)
        .join(movie_id_ratings_count)
    )

    # Remap the recommendations to get usable tuples:
    # (title, predicted_rating, genres, rating_count)
    return predictions.map(lambda x: (
        x[1][0][1][0],
        x[1][0][0],
        x[1][0][1][1],
        x[1][1],
    ))

        
        
# Predict the top movies to watch for the given user.
#
# Prints a status line before and a timing line after the prediction.
#
# Parameters:
# - user_id: The ID of the user to predict movies for.
#
# Returns a list with at most RECOMMENDATION_AMOUNT of the tuples produced by
# predict_movie_ratings(), ordered by descending second field (the predicted
# rating).
def predict_top_movies(user_id):
    print("Predicting top movies for user", user_id, "...")
    benchmark_reset()

    # Ordering on the negated rating puts the highest predictions first
    ranked = predict_movie_ratings(user_id).takeOrdered(
        RECOMMENDATION_AMOUNT,
        key=lambda prediction: -prediction[1],
    )

    print("Top movies predicted, took", benchmark_get(), "\n")
    return ranked




########################################
# Recommendation process.              #
########################################

# Maximum number of users to print recommendations for
RECOMMENDATION_USER_LIMIT = 25

# Put user 9999999 in front of the list of user IDs.
# Any existing occurrence is removed first, so the user is handled only once
# and re-running this cell does not keep stacking duplicates at the front.
user_ids = [9999999] + [uid for uid in user_ids if uid != 9999999]

# Loop through the first users of the list, and get movie recommendations for
# each one of them
for user_id in user_ids[:RECOMMENDATION_USER_LIMIT]:
    # Collect the results: (title, predicted_rating, genres, rating_count)
    result = predict_top_movies(user_id)

    print("{:<45}{:<8}{:<9}{}".format("MOVIE NAME", "GUESS", "RATINGS", "GENRES"))
    for movie_name, rating_guess, genres, ratings in result:
        # Titles are cut to 43 characters to fit the 45-wide name column
        print("{:<45}{:<8}{:<9}{}".format(movie_name[:43], round(rating_guess, 4), ratings, ", ".join(genres)))
    print("\n")

print("...")

Predicting top movies for user 9999999 ...
Top movies predicted, took 11.54 s 

MOVIE NAME                                   GUESS   RATINGS  GENRES
George Carlin: Back in Town (1996)           5.1659  59       Comedy
Cat Came Back, The (1988)                    5.0717  32       Action, Animation, Comedy
The Night Manager (2016)                     4.9453  70       Crime, Drama, Mystery, Thriller
The Man Who Knew Infinity (2016)             4.9137  84       Drama
Matrix, The (1999)                           4.9113  71450    Action, Sci-Fi, Thriller
Hell or High Water (2016)                    4.8824  64       Crime, Drama
Bang Boom Bang - Ein todsicheres Ding (1999  4.8822  30       Action, Comedy
Doctor Who: A Christmas Carol (2010)         4.8735  97       Sci-Fi
Stranger Things                              4.8695  704      Drama
Forklift Driver Klaus: The First Day on the  4.8688  38       Comedy, Horror
Piper (2016)                                 4.8679  88       Animation
Jim Gaf