In [1]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,col,collect_list
from pyspark.sql.types import StringType, ArrayType, DoubleType,IntegerType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RankingEvaluator
from HR import HitRate

from pyspark.sql.types import *
from pyspark.sql import functions as F

#For windows user only
import os 
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.spark_splitters import spark_random_split
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation
from recommenders.utils.spark_utils import start_or_get_spark

# Create spark session

In [3]:
spark = SparkSession.builder.master("local[*]") \
                    .config('spark.ui.showConsoleProgress', 'false')\
                    .config('spark.driver.memory', '12g')\
                    .config('spark.executor.memory', '2g')\
                    .appName('MovieRecomender') \
                    .getOrCreate()
                    
spark.sparkContext.setCheckpointDir('checkpoint/')

22/10/08 17:53:38 WARN Utils: Your hostname, lap15450-ThinkPad-X13-Gen-2i resolves to a loopback address: 127.0.1.1; using 192.168.0.193 instead (on interface wlp0s20f3)
22/10/08 17:53:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/10/08 17:53:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
schema =             StructType([
                    StructField('UserID', LongType(), True),
                     StructField('MovieID', LongType(), True),
                     StructField('Rating', IntegerType(), True),
                     StructField('Timestamp', LongType(), True),
                     ])

In [5]:
df = spark.read.option("sep", "::").schema(schema).csv("data/ml-10m/ratings.dat")
df = df.toDF(*["UserID", "MovieID", "Rating", "Timestamp"])
df.createOrReplaceTempView("dataset");
df = df.dropna()
df.persist().count() #Force persist due to size

7951824

# Model config

In [6]:
rank = 20
iter = 15
regParam = 0.1

# Hit Rate

In [7]:
def evaluate_hit_rate(als, left_out_df, keep_one_df, full_matrix, n_users):
    hr_evaluator = HitRate(predictionCol='prediction', labelCol='Rating', userCol='UserID', itemCol = "MovieID")
    value = hr_evaluator.eval(als, left_out_df, keep_one_df, full_matrix, n_users)
    return value

# RMSE

In [8]:
def evaluate_rmse(model, test):
    rmse = RegressionEvaluator(metricName="rmse", labelCol="Rating", predictionCol="prediction")       
    predictions=model.transform(test).na.drop()
    return rmse.evaluate(predictions)

# NCDG at K

In [10]:
def evaluate_ndcg_at_k(model, full_matrix, train, test, k = 10):
    recommendations = model.transform(full_matrix)
    rank_eval = SparkRankingEvaluation(test, recommendations, k = k, col_user="UserID", col_item="MovieID", 
                                    col_rating="Rating", col_prediction="prediction", 
                                    relevancy_method="top_k")
    return rank_eval.ndcg_at_k()

# Evaluation

In [11]:
#Leave one out for calculating hit rate
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

windowSpec  = Window.partitionBy("UserID").orderBy(F.col("Rating").desc())
tmp = df.withColumn("row_number", row_number().over(windowSpec))      
left_out_dataframe = tmp.filter(F.col("row_number") != 1)
keep_one_dataframe = tmp.filter(F.col("row_number") == 1)
left_out_count = left_out_dataframe.persist().count() #Force persist due to size
keep_out_count = keep_one_dataframe.persist().count() #Force persist
print("Left out {}, training to evaluate hit rate on {}.".format(keep_out_count, left_out_count))

Left out 69873, training to evaluate hit rate on 7881951.


In [12]:
user_df = df.select("UserID").distinct()
movie_df = df.select("MovieID").distinct()

user_df.persist().count() #Force persist
movie_df.persist().count() #Force persist
full_matrix = user_df.crossJoin(movie_df)
full_matrix.persist().count() #Force persist due to size

#Count
n_users = user_df.count()
n_items = movie_df.count()

als = ALS(
    rank=rank,
    maxIter=iter,
    regParam=regParam,
    
    userCol="UserID",
    itemCol="MovieID",
    ratingCol="Rating",
    implicitPrefs=False,
    coldStartStrategy='drop',
    nonnegative=False,
)            
# NDCG and RMSE
(train, test) = df.randomSplit([0.8, 0.2])  
model = als.fit(train)

22/10/08 17:56:33 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/10/08 17:56:33 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/10/08 17:56:34 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
22/10/08 17:56:34 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


In [13]:
ndcg = (evaluate_ndcg_at_k(model, full_matrix, train, test, k = 10))

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/lap15450/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 1207, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/lap15450/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 1033, in send_command
    response = connection.send_command(command)
  File "/home/lap15450/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 1211, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:42305)
Traceback (most recent call last):
  File "/home/lap15450/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 33

Py4JError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob

In [None]:
rmse = (evaluate_rmse(model, test))

In [None]:
hit_rate = (evaluate_hit_rate(als, left_out_dataframe, keep_one_dataframe, \
                                     full_matrix, n_users))

In [None]:
print("Evaluating, rank: {}, iter: {}, regParam: {}".format(rank, iter, regParam))
print("NDCG: {}, RMSE: {}, Hit: {}".format(ndcg, rmse, hit_rate))