In [None]:
from pyspark.mllib.recommendation import ALS
import math
from time import time




########################################
# Configuration.                       #
########################################

# Paths to the CSV files read by readCSV(): ratings and movies, each in a
# complete ("full") and a reduced ("small") variant
DATASET_RATINGS_PATH = "./ratings_full.csv"
DATASET_RATINGS_SMALL_PATH = "./ratings_small.csv"
DATASET_MOVIES_PATH = "./movies_full.csv"
DATASET_MOVIES_SMALL_PATH = "./movies_small.csv"

# Minimum number of ratings a movie needs before it may be recommended;
# predictions for movies with fewer ratings are filtered out
REVIEW_MIN_AMOUNT = 20

# randomSplit weights for the small dataset, unpacked as
# (training, validation, test)
TRAIN_SMALL_SPLIT = [6, 2, 2]

# randomSplit weights for the complete dataset, unpacked as (training, test)
TRAIN_COMPLETE_SPLIT = [7, 3]

# Seed handed to ALS.train (the randomSplit calls use their own fixed seed)
TRAIN_SEED = 5

# Candidate ALS ranks tried during calibration; the rank with the lowest
# validation RMSE is kept
TRAIN_RANKS = [4, 6, 8, 10, 12]

# Number of ALS iterations handed to ALS.train
TRAIN_ITERATIONS = 16

# How many top-rated predictions are returned for a user
RECOMMENDATION_AMOUNT = 10



########################################
# Function definitions.                #
########################################

# Load a delimited text file into an RDD of field lists.
#
# Parameters:
# - fname: Path of the file, loaded through the SparkContext `sc`.
# - removeHeader: When True, every line equal to the file's first line is
#   dropped.
# - separator: Field separator. A single-character separator is parsed with
#   the csv module, so quoted fields may contain the separator itself
#   (e.g. a quoted title with a comma). Any other separator falls back to
#   plain str.split().
#
# Returns an RDD whose elements are lists of the string fields of each line.
def readCSV(fname, removeHeader=False, separator=','):
    import csv

    print("Loading file", fname, "...")
    rdd = sc.textFile(fname)
    if removeHeader:
        firstline = rdd.first()
        rdd = rdd.filter(lambda x: x != firstline)

    # csv.reader only accepts a one-character delimiter; keep the plain split
    # for everything else (multi-character separators, None, ...)
    if not (isinstance(separator, str) and len(separator) == 1):
        return rdd.map(lambda x: x.split(separator))

    def parse_line(line):
        fields = next(csv.reader([line], delimiter=separator), [])
        # A blank line yields no fields; mirror str.split(), which gives ['']
        return fields if fields else ['']

    return rdd.map(parse_line)




########################################
# Load datasets.                       #
########################################

print("Loading all dataset files...")

# Ratings: the complete file first, then the small one (header line dropped)
ratings_data, small_ratings_data = (
    readCSV(ratings_path, removeHeader=True)
    for ratings_path in (DATASET_RATINGS_PATH, DATASET_RATINGS_SMALL_PATH)
)

# Movies: same order, complete file first, then the small one
movies_data, small_movies_data = (
    readCSV(movies_path, removeHeader=True)
    for movies_path in (DATASET_MOVIES_PATH, DATASET_MOVIES_SMALL_PATH)
)




########################################
# Parse datasets.                      #
########################################

print("Parsing datasets...")


# Convert the string fields of a ratings line into typed values.
# [user_id, movie_id, rating, timestamp] -> (user_id, movie_id, rating)
def _parse_rating(fields):
    return (int(fields[0]), int(fields[1]), float(fields[2]))


# Convert the string fields of a movies line, keeping only ID and title.
# [id, title, genres[]] -> (id, title)
def _parse_movie(fields):
    return (int(fields[0]), fields[1])


# Complete ratings set
ratings_data = ratings_data.map(_parse_rating).cache()
print("There are", ratings_data.count(), "ratings in the complete dataset.")

# Small ratings set
small_ratings_data = small_ratings_data.map(_parse_rating).cache()
print("There are", small_ratings_data.count(), "ratings in the small dataset.")

# Complete movies set
movies_data = movies_data.map(_parse_movie).cache()
print("There are", movies_data.count(), "movies in the complete dataset.")

# Small movies set
small_movies_data = small_movies_data.map(_parse_movie).cache()
print("There are", small_movies_data.count(), "movies in the small dataset.")




########################################
# Calibrate machine learning.          #
########################################

print("Calibrating by training on small dataset...")

# Regularization strength handed to ALS.train as lambda_
regularization_parameter = 0.1

# Split the small ratings into training, validation and test parts
training_set, validation_set, test_set = small_ratings_data.randomSplit(
    TRAIN_SMALL_SPLIT, seed=0)

# (user_id, movie_id) pairs to request predictions for
prediction_validation = validation_set.map(lambda x: tuple(x[0:2]))
# Built alongside the validation pairs; not consumed by the calibration loop
prediction_test = test_set.map(lambda x: tuple(x[0:2]))

# Actual validation ratings keyed by (user_id, movie_id). This does not depend
# on the rank being tried, so it is defined once instead of once per iteration.
validation_actual = validation_set.map(
    lambda x: ((int(x[0]), int(x[1])), float(x[2])))

# Remember the minimum error value and the best rank.
# best_iteration is initialised here but never updated by this section.
min_error = float("inf")
best_rank = -1
best_iteration = -1

# Validation RMSE per candidate rank, in the order of TRAIN_RANKS
errors = []

# Loop through each rank to train
for rank in TRAIN_RANKS:
    # Train on the selected rank
    model = ALS.train(training_set,
                      rank,
                      seed=TRAIN_SEED,
                      iterations=TRAIN_ITERATIONS,
                      lambda_=regularization_parameter)

    # Predict the validation pairs and key them like validation_actual:
    # ((user_id, movie_id), predicted_rating)
    predictions = model\
        .predictAll(prediction_validation)\
        .map(lambda x: (tuple(x[0:2]), x[2]))

    # ((user_id, movie_id), (actual_rating, predicted_rating))
    rates_predictions = validation_actual.join(predictions)

    # Calculate the prediction error (RMSE) and store it
    error = math.sqrt(rates_predictions
                      .map(lambda x: (x[1][0] - x[1][1]) ** 2)
                      .mean())
    errors.append(error)

    # Update the minimum error value and the best rank
    print("Training small dataset with rank", rank, "which has RMSE:", error)
    if error < min_error:
        min_error = error
        best_rank = rank

print("Trained small dataset calibration, best rank:", best_rank)




########################################
# Train on complete dataset.           #
########################################

print("Training complete dataset...")

# Randomly split the complete ratings into a training and a test part
training_set, test_set = ratings_data.randomSplit(TRAIN_COMPLETE_SPLIT,
                                                  seed=0)

# Train the final model with the rank selected during calibration.
# The seed and iteration count come from the configuration constants; the
# bare names `seed` and `iterations` are not defined anywhere in this script.
trained_model = ALS.train(training_set,
                          best_rank,
                          seed=TRAIN_SEED,
                          iterations=TRAIN_ITERATIONS,
                          lambda_=regularization_parameter)




########################################
# Test trained data.                   #
########################################

print("Testing trained complete dataset...")

# (user_id, movie_id) pairs of the held-out ratings
test_predict_set = test_set.map(lambda rating: tuple(rating[0:2]))

# Predicted ratings keyed by (user_id, movie_id)
predictions = (
    trained_model
    .predictAll(test_predict_set)
    .map(lambda predicted: (tuple(predicted[0:2]), predicted[2]))
)

# Pair every actual rating with its prediction:
# ((user_id, movie_id), (actual_rating, predicted_rating))
rates_predictions = (
    test_set
    .map(lambda rating: ((int(rating[0]), int(rating[1])), float(rating[2])))
    .join(predictions)
)

# Root of the mean squared difference between actual and predicted rating
squared_errors = rates_predictions.map(
    lambda pair: (pair[1][0] - pair[1][1]) ** 2)
error = math.sqrt(squared_errors.mean())

print("Trained complete dataset test, RMSE:", error)




########################################
# Calculate movie review count.        #
########################################

# Summarize the ratings of a single movie.
#
# Parameters:
# - id_ratings: A pair whose first item is the movie ID and whose second item
#   is a sized collection of that movie's rating values.
#
# Returns (movie_id, (rating_count, average_rating)).
def calc_avg_count(id_ratings):
    ratings = id_ratings[1]
    count = len(ratings)
    movie_id = id_ratings[0]
    average = float(sum(ratings)) / count
    return movie_id, (count, average)

# Collect all rating values of the complete dataset under their movie ID
movie_rating_pairs = ratings_data.map(lambda rating: (rating[1], rating[2]))
movie_id_ratings = movie_rating_pairs.groupByKey()

# (movie_id, (rating_count, average_rating))
movie_id_ratings_avg = movie_id_ratings.map(calc_avg_count)

# (movie_id, rating_count)
movie_id_ratings_count = movie_id_ratings_avg.map(
    lambda summary: (summary[0], summary[1][0]))

Loading all dataset files...
Loading file  ./ratings_full.csv ...
Loading file  ./ratings_small.csv ...
Loading file  ./movies_full.csv ...
Loading file  ./movies_small.csv ...
Parsing datasets...
There are 24404096 ratings in the complete dataset.
There are 100004 ratings in the small dataset.
There are 40110 movies in the complete dataset.
There are 9125 movies in the small dataset.
Calibrating by training on small dataset...
Training small dataset with rank 4 which has RMSE: 0.9393526035943128
Training small dataset with rank 8 which has RMSE: 0.9437873274872436
Training small dataset with rank 10 which has RMSE: 0.938198826757663
Training small dataset with rank 12 which has RMSE: 0.9430925559243571
Trained small dataset calibration, best rank: 10
Training complete dataset...
Testing trained complete dataset...


In [30]:
########################################
# Function definitions.                #
########################################

# Predict the ratings for all movies the user hasn't rated yet.
#
# Parameters:
# - user_id: The ID of the user to predict ratings for.
#
# Returns an RDD of (title, predicted_rating, rating_count) tuples.
def predict_movie_ratings(user_id):
    # Collect the IDs of the movies already rated by the user.
    # Ratings are (user_id, movie_id, rating), so the movie ID is x[1]; a set
    # keeps the membership test in the filter below cheap.
    rated_ids = set(ratings_data
                    .filter(lambda x: x[0] == user_id)
                    .map(lambda x: x[1])
                    .collect())

    # Get all movie IDs that haven't been rated by the user, as
    # (user_id, movie_id) pairs
    unrated_ids = (movies_data
                   .filter(lambda x: x[0] not in rated_ids)
                   .map(lambda x: (user_id, x[0])))

    # Predict the recommendation value for all unrated movies for this user
    predictions = trained_model.predictAll(unrated_ids)

    # Transform the prediction result into proper tuples
    # (movie_id, predicted_rating)
    predictions = predictions.map(lambda x: (x.product, x.rating))

    # Enrich the tuples with the movie titles and number of ratings:
    # (movie_id, ((predicted_rating, title), rating_count))
    predictions = predictions\
        .join(movies_data)\
        .join(movie_id_ratings_count)

    # Remap the recommendations to get usable tuples:
    # (title, predicted_rating, rating_count)
    return predictions\
        .map(lambda x: (x[1][0][1], x[1][0][0], x[1][1]))

        
        
# Pick the best-rated predictions for the given user.
#
# Parameters:
# - user_id: The ID of the user to predict movies for.
#
# Returns a list of at most RECOMMENDATION_AMOUNT tuples
# (title, predicted_rating, rating_count), highest predicted rating first.
def predict_top_movies(user_id):
    # Report which user is being processed
    print("Processing movies for user", user_id)

    # Rating predictions for the movies this user has not rated
    candidates = predict_movie_ratings(user_id)

    # Keep only movies with at least REVIEW_MIN_AMOUNT ratings
    reliable = candidates.filter(lambda entry: entry[2] >= REVIEW_MIN_AMOUNT)

    # Negating the predicted rating makes takeOrdered return the highest first
    return reliable.takeOrdered(RECOMMENDATION_AMOUNT,
                                key=lambda entry: -entry[1])




########################################
# Recommendation process.              #
########################################

# Take the user of the first rating in the dataset to recommend movies for
selected_user = ratings_data.first()[0]

# Print the recommended movies for the selected user, one
# (title, predicted_rating, rating_count) tuple per line; joining on an empty
# string would run all tuples together on a single line
print("Recommended movies for user", selected_user, ":")
print("\n".join(map(str, predict_top_movies(selected_user))))


Recommended movies for user 1 :
Processing movies for user 1


NameError: name 'RECOMMENDATION_AMOUNT' is not defined