In [36]:
# Define some helper functions
def readCSV(fname, removeHeader=False, separator=','):
    """Load a delimited text file into an RDD of per-line field lists.

    Relies on a module-level ``sc`` that is not defined in this cell
    (presumably the active SparkContext).

    Args:
        fname: Path handed to ``sc.textFile``.
        removeHeader: When true, every line equal to the file's first line
            is dropped (so a repeated header row is removed as well).
        separator: Field delimiter. A single-character separator is parsed
            with the ``csv`` module, so a quoted field that contains the
            separator (e.g. ``"American President, The (1995)"``) stays one
            field and loses its surrounding quotes. Any other separator
            (e.g. ``'::'``) is handled with a plain ``str.split``.

    Returns:
        An RDD holding one list of string fields per remaining line.
    """
    # Local import keeps the cell self-contained; no file-level import needed.
    import csv

    print("Loading file ", fname, "...")
    rdd = sc.textFile(fname)
    if removeHeader:
        firstline = rdd.first()
        rdd = rdd.filter(lambda x: x != firstline)

    # The csv module only accepts a single-character delimiter, and that
    # delimiter must not be the quote character or a line terminator.
    if len(separator) != 1 or separator in ('"', '\r', '\n'):
        return rdd.map(lambda x: x.split(separator))

    def parse_line(line):
        # csv.reader yields [] for a blank line whereas str.split yields
        # ['']; keep the latter so blank lines look the same as before.
        return next(csv.reader([line], delimiter=separator), None) or ['']

    return rdd.map(parse_line)

# Read both datasets, dropping their header rows
movies = readCSV("./movies.csv", removeHeader=True)
ratings = readCSV("./ratings_train.csv", removeHeader=True)

# Sanity check: show one raw record from each dataset
print("First movie:", movies.first())
print("First rating:", ratings.first())


def _parse_movie(fields):
    """Return [id, name, genres[]]; the third raw field is '|'-separated."""
    return [fields[0], fields[1], fields[2].split('|')]


def _parse_rating(fields):
    """Return [user_id, movie_id, rating, timestamp] from the '::'-joined first field."""
    return fields[0].split('::')


# Expand the genre string of every movie into a list
movies = movies.map(_parse_movie)
print("First movie, processed:", movies.first())

# Rating lines use '::' between values, so the whole line arrives as field 0
ratings = ratings.map(_parse_rating)
print("First rating, processed:", ratings.first())

# Recommend for whichever user wrote the first rating record
client = ratings.first()
print("Determining movie suggestions for user", client[0], "...")

def addToSet(input_set, value):
    """Insert ``value`` into ``input_set`` in place and return that same set.

    Used below as the per-record function of ``aggregateByKey``, which
    expects the (possibly mutated) accumulator to be returned.
    """
    input_set.update((value,))
    return input_set

# Collect each user's ratings into one set per user key:
#   (user_id, {(movie_id, rating, timestamp), ...})
user_ratings = (
    ratings
    .map(lambda row: (row[0], tuple(row[1:])))
    .aggregateByKey(
        set(),                                   # starting accumulator for a key
        addToSet,                                # fold one rating into an accumulator
        lambda left, right: left.union(right),   # merge two accumulators
    )
)

# Pull out the ratings that belong to the selected client
client_ratings = user_ratings.lookup(client[0])
print("Client ratings:", client_ratings)

# TEST: Calculate rating averages
# print("Calculating ratings average...")
# user_ratings = ratings.map(lambda x: float(x[2]))
# print("Avg rating:", user_ratings.sum()/user_ratings.count())

Loading file  ./movies.csv ...
Loading file  ./ratings_train.csv ...
First movie: ['1', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy']
First rating: ['11973::11::3.0::943354625']
First movie, processed: ['1', 'Toy Story (1995)', ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']]
First rating, processed: ['11973', '11', '3.0', '943354625']
Determining movie suggestions for user 11973 ...
[{('529', '4.0', '943354197'), ('14', '4.0', '943353278'), ('2336', '4.0', '943354197'), ('11', '3.0', '943354625'), ('1721', '5.0', '943353411'), ('2268', '4.0', '943354292'), ('590', '3.0', '943354241'), ('161', '4.0', '943354197'), ('36', '3.0', '943354345'), ('1358', '4.0', '943354345'), ('994', '4.0', '943354503'), ('110', '5.0', '943354384'), ('608', '5.0', '943354345'), ('1213', '5.0', '943353614'), ('1784', '5.0', '943354241'), ('2433', '3.0', '943354503'), ('300', '4.0', '943354345'), ('1357', '3.0', '943354292'), ('1594', '5.0', '943354345'), ('1095', '5.0', '9433