In [9]:
from pyspark.mllib.recommendation import ALS
import math

# Paths to the input CSV files, relative to the working directory.
# Ratings and movies each come in a "full" and a "small" variant.
DATASET_RATINGS_PATH = "./ratings_full.csv"
DATASET_RATINGS_SMALL_PATH = "./ratings_small.csv"
DATASET_MOVIES_PATH = "./movies_full.csv"
DATASET_MOVIES_SMALL_PATH = "./movies_small.csv"

# Define some helper functions
def readCSV(fname, removeHeader=False, separator=','):
    """Load a delimited text file as an RDD of field lists.

    Args:
        fname: Path passed to ``sc.textFile``.
        removeHeader: If true, the first line of the file is read and every
            line equal to it is filtered out.
        separator: Field delimiter. A single character is parsed with the
            ``csv`` module so quoted fields that contain the delimiter
            (e.g. ``"American President, The (1995)"``) stay in one piece.
            Longer separators fall back to a plain ``str.split``.

    Returns:
        An RDD in which each element is the list of string fields of one line.
    """
    import csv  # function-scope import keeps this helper self-contained

    print("Loading file ", fname, "...")
    rdd = sc.textFile(fname)
    if removeHeader:
        firstline = rdd.first()
        rdd = rdd.filter(lambda x: x != firstline)

    if len(separator) != 1:
        # csv.reader only accepts one-character delimiters.
        return rdd.map(lambda x: x.split(separator))

    def parse_line(line):
        # Parse one line at a time; textFile has already split on newlines.
        row = next(csv.reader([line], delimiter=separator), None)
        # A blank line parses to no fields; keep the [''] that str.split
        # would have produced so downstream slicing behaves the same.
        return row if row else ['']

    return rdd.map(parse_line)

# Read the ratings files (full, then small), skipping the header line
ratings_data = readCSV(DATASET_RATINGS_PATH, removeHeader=True)
small_ratings_data = readCSV(DATASET_RATINGS_SMALL_PATH, removeHeader=True)

# Read the movies files (full, then small), skipping the header line
movies_data = readCSV(DATASET_MOVIES_PATH, removeHeader=True)
small_movies_data = readCSV(DATASET_MOVIES_SMALL_PATH, removeHeader=True)

print("Parsing datasets...")

# Ratings rows: drop the trailing field and keep the rest as a tuple
# [user_id, movie_id, rating, timestamp] -> (user_id, movie_id, rating)
ratings_data = (
    ratings_data
    .map(lambda fields: tuple(fields[:-1]))
    .cache()
)
print("There are", ratings_data.count(), "ratings in the full dataset.")

small_ratings_data = (
    small_ratings_data
    .map(lambda fields: tuple(fields[:-1]))
    .cache()
)
print("There are", small_ratings_data.count(), "ratings in the small dataset.")

# Movies rows: same treatment, the trailing field is dropped
# [id, title, genres[]] -> (id, title)
movies_data = (
    movies_data
    .map(lambda fields: tuple(fields[:-1]))
    .cache()
)
print("There are", movies_data.count(), "movies in the full dataset.")

small_movies_data = (
    small_movies_data
    .map(lambda fields: tuple(fields[:-1]))
    .cache()
)
print("There are", small_movies_data.count(), "movies in the small dataset.")

print("Start training with small dataset...")

# Split the small ratings set into training / validation / test parts with
# weights 6:2:2; the fixed seed makes the split reproducible.
training_RDD, validation_RDD, test_RDD = small_ratings_data.randomSplit([6, 2, 2], seed=0)
# Prediction inputs only carry the (user_id, movie_id) pair, not the rating.
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

# Define some machine learning parameters
seed = 5
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
# One RMSE per entry of `ranks`, filled in the same order as `ranks`.
errors = []
# Number of ranks evaluated so far.
err = 0
tolerance = 0.02  # not used in this cell

# Actual validation ratings keyed by (user_id, movie_id). This does not
# depend on the rank, so it is built once and cached for reuse in the loop.
validation_ratings_RDD = validation_RDD.map(
    lambda r: ((int(r[0]), int(r[1])), float(r[2]))).cache()

min_error = float('inf')
best_rank = -1
best_iteration = -1  # not used in this cell
for rank in ranks:
    model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    # Key the predictions the same way as the actual ratings so they can be joined.
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    # After the join each element is ((user, movie), (actual, predicted)).
    rates_and_preds = validation_ratings_RDD.join(predictions)
    # Root-mean-square error over the joined pairs.
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors.append(error)
    err = len(errors)
    print("For rank", rank, "the RMSE is", error)
    if error < min_error:
        min_error = error
        best_rank = rank

print("The best model was trained with rank", best_rank)


Loading file  ./ratings_full.csv ...
Loading file  ./ratings_small.csv ...
Loading file  ./movies_full.csv ...
Loading file  ./movies_small.csv ...
Parsing datasets...
There are 24404096 ratings in the full dataset.
There are 100004 ratings in the small dataset.
There are 40110 movies in the full dataset.
There are 9125 movies in the small dataset.
Start training with small dataset...
For rank 4 the RMSE is 0.9405925542574993
For rank 8 the RMSE is 0.9451745059144596
For rank 12 the RMSE is 0.9435903947376889
The best model was trained with rank 4
