In [1]:
from pyspark.sql.functions import *

# Define some constants
RATING_MIN = 0.5
RATING_MAX = 5.0
RATING_RANGE = RATING_MAX - RATING_MIN

# Enable crossjoins
print("Configuring Spark...")
spark.conf.set("spark.sql.crossJoin.enabled", True)

# Define some helper functions
def readCSV(fname, removeHeader=False, separator=','):
    print("Loading file ", fname, "...")
    rdd = sc.textFile(fname)
    if removeHeader:
        firstline = rdd.first()
        rdd = rdd.filter(lambda x: x != firstline)
    return rdd.map(lambda x: x.split(separator))

# Load the movies and ratings database
ratings = readCSV("./ratings_train.csv", removeHeader=True)
movies = readCSV("./movies.csv", removeHeader=True)

# Parse the rating data
# [user_id, movie_id, rating, timestamp]
print("Parsing ratings...")
ratings = ratings.map(lambda x: x[0].split('::'))

# Parse the movie genres
# [id, name, genres[]]
print("Parsing movies...")
movies = movies.map(lambda x: [x[0], x[1], x[2].split('|')])

# Create a dataframe for the ratings
print("Create ratings dataframe...")
ratings_df = spark.createDataFrame(ratings, ['user_id', 'movie_id', 'rating', 'timestamp'])

# Cache the ratings
print("Caching ratings dataframe...")
ratings_df = ratings_df.cache()

# TODO: Remove this query
#client_ratings_df = ratings_df.filter(ratings_df.user_id == client_id).alias("client")
# TODO: Remove this query
#user_ratings_df = ratings_df.filter(ratings_df.user_id != client_id).alias("other")

# Create a dataframe for the movies, and filter it
print("Create movies dataframe...")
movies_df = spark.createDataFrame(\
                                  movies,\
                                  ['id', 'name', 'genres'])\
    .select(\
            "id",\
            "name")\
    .alias("movies")

# Cache list of movies
print("Caching movies dataframe...")
movies_df = movies_df.cache()

## Create a list of users
print("Creating list of users...")
user_list = ratings_df.select(col("user_id").alias("list_user_id")).distinct()
#user_list_df = user_list_df.where(col("list_user_id") == users_similarity.first().sim_client_user_id)

# Cache the list of users
print("Caching list of users...")
user_list = user_list.cache()

def user_iter(user):
    print("Printing user...")
    print("UserID: ", user.list_user_id)

user_list.foreach(user_iter)

# Join the movies watched by the client and the other user
print("Creating ratings relation map...")
client_df = ratings_df.alias("client");
other_df = ratings_df.alias("other");
ratings_map = client_df\
    .join(other_df,\
          on = [col("client.movie_id") == col("other.movie_id"),\
                col("client.user_id") != col("other.user_id")],\
          how = "inner")

# Determine the rating distance for each user/movie pair, and normalize it
print("Filtering ratings map, and calculating ratings distance...")
ratings_map = ratings_map.select(\
                 col("client.user_id").alias("sim_client_user_id"),\
                 col("other.user_id").alias("sim_other_user_id"),\
                 "client.movie_id",\
                 (abs(col("client.rating") - col("other.rating")) - RATING_RANGE / 2).alias("rating_dist_norm")\
                )

# Cache the ratings map
#print("Caching ratings map...")
#ratings_map = ratings_map.cache()

# Calculate the user relation similarity
print("Calculating user relation similarity...")
users_similarity = ratings_map\
    .groupBy(\
             "sim_client_user_id",\
             "sim_other_user_id")\
    .agg(\
         sum("rating_dist_norm")\
             .alias("similarity"))\
    .alias("similarity")
    
# Cache the user similarities
#print("Caching user similarities...")
#user_similarity = user_similarity.cache()

# Join all movies to all users 
print("Relating all users to all movies to map suggestions...")
suggestion_map = user_list.join(\
                                movies_df.select(\
                                                 col("id").alias("sug_movie_id")\
                                                ),\
                                on = col("sug_movie_id") != True,\
                                how = "inner")

# Map all ratings to movies
print("Mapping all ratings to movies...")
suggestion_map = suggestion_map.join(ratings_df,\
                               on = [col("sug_movie_id") == col("movie_id"),\
                                     col("list_user_id") != col("user_id")],\
                               how = "inner")

# Add user similarity values
print("Add user similarity values to ratings...")
suggestion_map = suggestion_map.join(\
                               users_similarity,\
                               on = [col("list_user_id") == col("sim_client_user_id"),\
                                    col("user_id") == col("sim_other_user_id")],\
                              how = "inner")

# Multiply the rating values by their similarity
print("Expanding ratings based on user similarity...")
suggestion_map = suggestion_map\
    .select(\
            "*",\
            (col("rating") * col("similarity"))\
                .alias("rating_mul"))

# Normalize the expanded rating values
print("Normalizing expanded ratings...")
suggestion_map = suggestion_map\
    .groupBy("list_user_id", "sug_movie_id")\
    .agg((ceil((sum("rating_mul") / sum("similarity")) * 2) / 2).alias("rating_norm"))
    
# Cache the normalized ratings
suggestion_map = suggestion_map.cache()

# Create and output a list of ratings
print("Creating list of ratings to print...")
suggestion_map.sort(desc("rating_norm")).show(10000)

print("DONE")

Configuring Spark...
Loading file  ./ratings_train.csv ...
Loading file  ./movies.csv ...
Parsing ratings...
Parsing movies...
Create ratings dataframe...
Caching ratings dataframe...
Create movies dataframe...
Caching movies dataframe...
Creating list of users...
Caching list of users...


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 5.0 failed 1 times, most recent failure: Lost task 3.0 in stage 5.0 (TID 24, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 167, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/pyspark/rdd.py", line 2371, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 2371, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 2371, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 317, in func
    return f(iterator)
  File "/usr/lib/spark/python/pyspark/rdd.py", line 748, in processPartition
    f(x)
  File "<ipython-input-1-818c5040bace>", line 69, in user_iter
NameError: name 'sys' is not defined

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
	at org.apache.spark.scheduler.Task.run(Task.scala:86)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1454)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1442)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1441)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1441)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:811)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:811)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1667)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1622)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1611)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1890)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1903)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1916)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1930)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:912)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:911)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:453)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:497)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 172, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 167, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/pyspark/rdd.py", line 2371, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 2371, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 2371, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "/usr/lib/spark/python/pyspark/rdd.py", line 317, in func
    return f(iterator)
  File "/usr/lib/spark/python/pyspark/rdd.py", line 748, in processPartition
    f(x)
  File "<ipython-input-1-818c5040bace>", line 69, in user_iter
NameError: name 'sys' is not defined

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
	at org.apache.spark.scheduler.Task.run(Task.scala:86)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
