In [None]:
from pyspark.sql.functions import *

# Bounds of the rating scale and the width of that scale
RATING_MIN, RATING_MAX = 0.5, 5.0
RATING_RANGE = RATING_MAX - RATING_MIN

# Switch on Spark SQL's cross-join support before any join is built
print("Configuring Spark...")
spark.conf.set("spark.sql.crossJoin.enabled", True)

# Define some helper functions
def _split_csv_line(line, separator):
    """Split one text line into fields, honouring double-quoted fields."""
    # Local import: this helper is shipped to and executed on Spark workers.
    import csv

    # csv.reader only accepts a single-character delimiter; for anything else
    # keep the plain string split so existing callers see unchanged results.
    if separator is None or len(separator) != 1:
        return line.split(separator)
    # A blank line produces an empty row from csv.reader, whereas str.split
    # returns ['']; keep the latter so downstream `x[0]` indexing still works.
    return next(csv.reader([line], delimiter=separator), None) or ['']


def readCSV(fname, removeHeader=False, separator=','):
    """Load a delimited text file as an RDD of field lists.

    Args:
        fname: Path handed to ``sc.textFile``.
        removeHeader: When true, every line equal to the first line of the
            file is dropped (not only the first occurrence).
        separator: Field delimiter. Single-character delimiters are parsed
            with the ``csv`` module, so a quoted field may contain the
            delimiter; any other delimiter falls back to ``str.split``.

    Returns:
        An RDD whose elements are lists of strings, one list per line.
    """
    print("Loading file ", fname, "...")
    rdd = sc.textFile(fname)
    if removeHeader:
        # take(1) returns [] for an empty file, where first() would raise.
        head = rdd.take(1)
        if head:
            firstline = head[0]
            rdd = rdd.filter(lambda x: x != firstline)
    return rdd.map(lambda x: _split_csv_line(x, separator))

# Read both input files, dropping the header row of each
movies = readCSV("./movies.csv", removeHeader=True)
ratings = readCSV("./ratings_train.csv", removeHeader=True)

# Sanity check: show one raw record from each data set
print("First movie:", movies.first())
print("First rating:", ratings.first())

# Movie records become [id, name, genres[]]; the genre field is '|'-separated
movies = movies.map(lambda fields: [fields[0], fields[1], fields[2].split('|')])
print("First movie, processed:", movies.first())

# Rating records become [user_id, movie_id, rating, timestamp]; the whole
# record sits in the first field and is '::'-separated
ratings = ratings.map(lambda fields: fields[0].split('::'))
print("First rating, processed:", ratings.first())

print("Caching ratings...")
ratings = ratings.cache()

# The user of the first rating is the one we generate suggestions for
client = ratings.first()
client_id = client[0]
#print("Determining movie suggestions for user", client[0], "...")
#
#def addToSet(input_set, value):
#    input_set.add(value)
#    return input_set
#
## Group all ratings by their user keys
#user_ratings = ratings.map(lambda x: (x[0], tuple(x[1:])))\
#                   .aggregateByKey(\
#                       set(), # initial value for an accumulator \
#                       addToSet, # function to add a value to an accumulator \
#                       lambda r1, r2: r1.union(r2) # function to merge two accumulators \
#                   )
#
## Get the ratings for the selected client
#client_ratings = user_ratings.lookup(client_id)
#print("Client ratings:", client_ratings)
#
## The selected client must not be in the list of user ratings
#user_ratings = user_ratings.filter(lambda x: x[0] != client_id)
#
#print("Caching user ratings...")
#user_ratings = user_ratings.cache()

# Build the recommendation pipeline for `client_id`:
#   1. pair the client's ratings with other users' ratings of the same movie,
#   2. aggregate the per-movie rating distances into one weight per user,
#   3. score every movie by the weight-weighted mean of the other users'
#      ratings, rounded up to the next half point.
# NOTE: `abs` and `sum` below are the pyspark.sql.functions versions brought
# in by the star import at the top of the file, not the Python builtins.

# Create a data frame with all the ratings.
# NOTE(review): the rating fields come from str.split, so the columns are
# strings; the arithmetic below presumably relies on Spark's implicit
# string-to-number casts -- confirm, or cast explicitly.
ratings_df = spark.createDataFrame(ratings, ['user_id', 'movie_id', 'rating', 'timestamp'])
client_ratings_df = ratings_df.filter(ratings_df.user_id == client_id).alias("client")
user_ratings_df = ratings_df.filter(ratings_df.user_id != client_id).alias("other")
movies_df = (
    spark.createDataFrame(movies, ['id', 'name', 'genres'])
    .select("id", "name")
    .alias("movies")
)

#####################################
print("user_ratings_df:")

# Join the movies watched by the client and the other user
aaa = user_ratings_df.join(client_ratings_df, on="movie_id", how="inner")

# Determine the rating distance for each user/movie pair and shift it by half
# the rating range so that it is centred around zero.
# NOTE(review): the resulting values can be negative as well as positive, so
# the per-user sums used as weights further down may cancel out or be zero --
# verify that this weighting is intended.
aaa = aaa.select(
    col("client.user_id").alias("sim_client_user_id"),
    col("other.user_id").alias("sim_other_user_id"),
    "movie_id",
    (abs(col("client.rating") - col("other.rating")) - RATING_RANGE / 2).alias("rating_dist_norm"),
)

#####################################
print("aaa:")

# For each client, calculate the similarity to the other users
users_similarity = (
    aaa.groupBy("sim_client_user_id", "sim_other_user_id")
    .agg(sum("rating_dist_norm").alias("similarity"))
    .alias("similarity")
)

#####################################
print("users_similarity:")

# Create a list of users
print("Creating list of users...")
user_list_df = ratings_df.select(col("user_id").alias("list_user_id")).distinct()
# TODO: Do not limit to 1 in production!!!
# Every row of users_similarity carries client_id as sim_client_user_id, so
# filter on client_id directly instead of running a Spark job (and failing
# on an empty result) just to read that value back.
user_list_df = user_list_df.where(col("list_user_id") == client_id)
user_list_df = user_list_df.cache()

#####################################
print("user_list_df:")

# Pair every listed user with every movie id. This is an explicit cross join;
# the previous `sug_movie_id != True` join predicate compared a string column
# with a boolean and depended on Spark's type coercion rules.
suggestions = user_list_df.crossJoin(
    movies_df.select(col("id").alias("sug_movie_id"))
)

#####################################
print("suggestions:")

# Attach every other user's rating of each candidate movie
suggestions = suggestions.join(
    user_ratings_df,
    on=[
        col("sug_movie_id") == col("movie_id"),
        col("list_user_id") != col("user_id"),
    ],
    how="inner",
)

######################################
print("suggestions2:")

# Attach the similarity between the listed user and the rating's author
suggestions = suggestions.join(
    users_similarity,
    on=[
        col("list_user_id") == col("sim_client_user_id"),
        col("user_id") == col("sim_other_user_id"),
    ],
    how="inner",
)

#####################################
print("suggestions3:")

# Weight each rating by the similarity of the user who gave it
suggestions = suggestions.select("*", (col("rating") * col("similarity")).alias("rating_mul"))

#####################################
print("suggestions4:")

# Weighted mean rating per user/movie, rounded up to the next multiple of 0.5
suggestions = (
    suggestions
    .groupBy("list_user_id", "sug_movie_id")
    .agg((ceil((sum("rating_mul") / sum("similarity")) * 2) / 2).alias("rating_norm"))
)

#####################################
print("suggestions5:")

######################################
print("suggestions6:")

# TODO: Filter movies that have been watched already here

suggestions.sort(desc("rating_norm")).show(10000)

print("DONE")

Configuring Spark...
Loading file  ./movies.csv ...
Loading file  ./ratings_train.csv ...
First movie: ['1', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy']
First rating: ['11973::11::3.0::943354625']
First movie, processed: ['1', 'Toy Story (1995)', ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']]
First rating, processed: ['11973', '11', '3.0', '943354625']
Caching ratings...
user_ratings_df:
aaa:
users_similarity:
Creating list of users...
user_list_df:
suggestions:
suggestions2:
suggestions3:
suggestions4:
suggestions5:
suggestions6:
