In [1]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,col,collect_list
from pyspark.sql.types import StringType, ArrayType, DoubleType,IntegerType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RankingEvaluator
from HR import HitRate

from pyspark.sql.types import *
from pyspark.sql import functions as F

#For windows user only
import os 
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Create spark session

In [2]:
spark = SparkSession.builder.master("local[*]") \
                    .config('spark.ui.showConsoleProgress', 'false')\
                    .appName('MovieRecomender') \
                    .getOrCreate()
                    
spark.sparkContext.setCheckpointDir('checkpoint/')

22/10/08 14:56:58 WARN Utils: Your hostname, lap15450-ThinkPad-X13-Gen-2i resolves to a loopback address: 127.0.1.1; using 192.168.0.193 instead (on interface wlp0s20f3)
22/10/08 14:56:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/10/08 14:56:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
schema =             StructType([
                    StructField('UserID', LongType(), True),
                     StructField('MovieID', LongType(), True),
                     StructField('Rating', IntegerType(), True),
                     StructField('Timestamp', LongType(), True),
                     ])

In [4]:
df = spark.read.option("sep", "::").schema(schema).csv("data/ml-1m/ratings.dat")
df = df.toDF(*["UserID", "MovieID", "Rating", "Timestamp"])
df.createOrReplaceTempView("dataset");
df = df.dropna()
df.cache().count() #Force cache

1000209

# Model config

In [6]:
rankRange = [15, 20]
iterRange = [10, 15]
regParamRange = [0.05, 0.1]

# Hit Rate

In [7]:
def evaluate_hit_rate(als, left_out_df, keep_one_df, full_matrix, n_users):
    hr_evaluator = HitRate(predictionCol='prediction', labelCol='Rating', userCol='UserID', itemCol = "MovieID")
    value = hr_evaluator.eval(als, left_out_df, keep_one_df, full_matrix, n_users)
    return value

# RMSE

In [8]:
def evaluate_rmse(model, test):
    rmse = RegressionEvaluator(metricName="rmse", labelCol="Rating", predictionCol="prediction")       
    predictions=model.transform(test).na.drop()
    return rmse.evaluate(predictions)

# NCDG at K

In [9]:
def getRank(a):
    ret=[]
    for i in a:
        ret.append(float(i.MovieID))
    return ret
def toDouble(a):
    return [float(i) for i in a]

convertUDF = udf(lambda z: getRank(z),ArrayType(DoubleType()))
toDoubleUDF = udf(lambda z: toDouble(z),ArrayType(DoubleType()))

In [10]:
def evaluate_ndcg_at_k(model, df, n_items, k = 10):
    ndcg = RankingEvaluator(labelCol="RealRank", predictionCol="recommendations",metricName="ndcgAtK", k=k)
    tempt=df.sort(col('Rating').desc()).groupBy("UserID").agg(collect_list('MovieID').alias("RealRank"))
    tempt=tempt.withColumn("RealRank",toDoubleUDF(col("RealRank")))

    rec=model.recommendForAllUsers(n_items).join(tempt,"UserID","inner")
    rec=rec.withColumn("recommendations",convertUDF(col("recommendations")))
    return ndcg.evaluate(rec)

# Params testing

In [11]:
#Leave one out for calculating hit rate
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

windowSpec  = Window.partitionBy("UserID").orderBy(F.col("Rating").desc())
tmp = df.withColumn("row_number", row_number().over(windowSpec))      
left_out_dataframe = tmp.filter(F.col("row_number") != 1)
keep_one_dataframe = tmp.filter(F.col("row_number") == 1)
left_out_count = left_out_dataframe.persist().count() #Force persist due to size
keep_out_count = keep_one_dataframe.persist().count() #Force persist
print("Left out {}, training to evaluate hit rate on {}.".format(keep_out_count, left_out_count))

Left out 6040, training to evaluate hit rate on 994169.


In [12]:
user_df = df.select("UserID").distinct()
movie_df = df.select("MovieID").distinct()

user_df.cache().count() #Force cache
movie_df.cache().count() #Force cache
full_matrix = user_df.crossJoin(movie_df)
full_matrix.persist().count() #Force persist due to size

#Count
n_users = user_df.count()
n_items = movie_df.count()
for rank in rankRange:
    for iter in iterRange:
        for regParam in regParamRange:
            als = ALS(
                rank=rank,
                maxIter=iter,
                regParam=regParam,
                
                userCol="UserID",
                itemCol="MovieID",
                ratingCol="Rating",
                implicitPrefs=False,
                coldStartStrategy='drop',
                nonnegative=False,
            )
            hit_rate = (evaluate_hit_rate(als, left_out_dataframe, keep_one_dataframe, \
                                     full_matrix, n_users))
            
            # NDCG and RMSE
            
            (train, test) = df.randomSplit([0.8, 0.2])  
            model = als.fit(train)
            ndcg = (evaluate_ndcg_at_k(model, df, n_items))
            rmse = (evaluate_rmse(model, test))
            
            print("Evaluating, rank: {}, iter: {}, regParam: {}".format(rank, iter, regParam))
            print("NDCG: {}, RMSE: {}, Hit: {}".format(ndcg, rmse, hit_rate))

22/10/08 14:57:21 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/10/08 14:57:21 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/10/08 14:57:21 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
22/10/08 14:57:21 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


Evaluating, rank: 15, iter: 10, regParam: 0.05
NDCG: 0.1055040196139647, RMSE: 0.8603844835046167, Hit: 0.013079470198675497
Evaluating, rank: 15, iter: 10, regParam: 0.1
NDCG: 0.13300829955180124, RMSE: 0.8704314971448491, Hit: 0.015562913907284768
Evaluating, rank: 15, iter: 15, regParam: 0.05
NDCG: 0.1242452442105532, RMSE: 0.8591483659474212, Hit: 0.014403973509933774
Evaluating, rank: 15, iter: 15, regParam: 0.1
NDCG: 0.13943534281728534, RMSE: 0.8645012563503666, Hit: 0.017218543046357615
Evaluating, rank: 20, iter: 10, regParam: 0.05
NDCG: 0.14237618062389668, RMSE: 0.8634282235303143, Hit: 0.019867549668874173
Evaluating, rank: 20, iter: 10, regParam: 0.1
NDCG: 0.12828178958824524, RMSE: 0.8674850341093442, Hit: 0.016059602649006622
Evaluating, rank: 20, iter: 15, regParam: 0.05
NDCG: 0.16406296732020675, RMSE: 0.8594721721304373, Hit: 0.019867549668874173
Evaluating, rank: 20, iter: 15, regParam: 0.1
NDCG: 0.15443455123896804, RMSE: 0.861736985872194, Hit: 0.01705298013245033
