In [None]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from hit_rate import HitRate
import pandas as pd

In [None]:
import os
import sys
# Point both the Spark driver and the worker processes at the interpreter
# that is running this kernel, so they use the same Python environment.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [None]:
# Create (or reuse) a local Spark session that runs on all available cores,
# with the console progress bar disabled to keep cell output readable.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config('spark.ui.showConsoleProgress', 'false')
    .appName('MovieRecomender')
    .getOrCreate()
)

# Sanity check: show the underlying context and the registered app name.
print(spark.sparkContext)
print("Spark App Name : " + spark.sparkContext.appName)

In [None]:
# Load the raw ratings with pandas. The file uses the multi-character
# separator '::', which requires the (slower) python parsing engine.
ratings_pd = pd.read_csv(
    "data/ratings.dat",
    engine='python',
    sep='::',
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)

# Keep only the columns ALS needs and drop incomplete rows. This must be done
# with the pandas API: `.select(...)` and `.na.drop()` are Spark DataFrame
# methods and raise AttributeError on a pandas DataFrame.
ratings_pd = ratings_pd[["UserID", "MovieID", "Rating"]].dropna()

# Convert to a Spark DataFrame and expose it to Spark SQL as the `dataset` view.
df = spark.createDataFrame(ratings_pd)
df.createOrReplaceTempView("dataset")

# Cache the DataFrame and run an action so the cache is materialized now
# instead of during the first expensive job that reads it.
df = df.cache()
df.count()  # force cache

In [None]:
# Build the full user x movie matrix from the `dataset` temp view:
#   * the two inner subqueries select the distinct UserID and distinct MovieID
#     values; listing them comma-separated in FROM yields every
#     (UserID, MovieID) combination (a cross join), aliased as A;
#   * A is then left-outer-joined with `dataset` (aliased as B) on the
#     (UserID, MovieID) pair, so Rating is NULL for pairs that have no
#     matching row in `dataset`.
# NOTE(review): the result has at least (#distinct users x #distinct movies)
# rows, which can be very large for a real ratings file.
sql = '''
select 
  A.UserID, A.MovieID, Rating
from 
  (
    select 
      * 
    from 
      (
        select 
          distinct(UserID) 
        from 
          dataset
      ), 
      (
        select 
          distinct(MovieID) 
        from 
          dataset
      )
  ) as A left outer join dataset as B
  on (A.UserID, A.MovieID) = (B.UserID, B.MovieID)
'''
# Run the query and preview the first rows of the result.
full_matrix = spark.sql(sql)
full_matrix.show()

In [None]:
# Number of rows in the full user x movie matrix. `count()` is a Spark action,
# so this executes the cross-join query defined for `full_matrix`.
full_matrix.count()

Leave-one-out: hold out one entry from each group in `full_matrix`.

loo_matrix = full_matrix

In [None]:
# 80/20 train/test split of the ratings. A fixed seed makes the split (and all
# metrics computed from it) reproducible across kernel restarts; without it
# every run trains and evaluates on different rows.
(train, test) = df.randomSplit([0.8, 0.2], seed=42)

In [None]:
# ALS estimator for explicit ratings. Factors are constrained to be
# non-negative, and predictions for users/items unseen during fitting are
# dropped (coldStartStrategy="drop") rather than returned as NaN.
als = ALS(
    userCol="UserID",
    itemCol="MovieID",
    ratingCol="Rating",
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop",
)

In [None]:
# Hyper-parameter grid for ALS: 2 ranks x 3 iteration counts x 1 regularization
# value = 6 candidate configurations.
grid_search = (
    ParamGridBuilder()
    .addGrid(als.rank, [30, 50])
    .addGrid(als.maxIter, [15, 20, 50])
    .addGrid(als.regParam, [0.05])
    .build()
)

In [None]:
# RMSE between the true "Rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Rating",
    predictionCol="prediction",
)

In [None]:
# 5-fold cross-validation over the ALS parameter grid, scored with `evaluator`.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=grid_search,
    evaluator=evaluator,
    numFolds=5,
)

In [None]:
# Register a checkpoint directory on the SparkContext before fitting.
# NOTE(review): presumably needed so ALS can checkpoint during the longer
# (up to 50 iteration) runs in the grid — confirm.
spark.sparkContext.setCheckpointDir('checkpoint/')
# Run the cross-validated grid search on the training split; this is the
# expensive step of the notebook (one fit per fold per parameter combination).
cv_fitted=cv.fit(train)

In [None]:
# Report the winning hyper-parameters. The rank is exposed on the fitted model;
# maxIter and regParam are read from the parent estimator through the
# underlying JVM object.
best_model = cv_fitted.bestModel
best_estimator_java = best_model._java_obj.parent()
print(
    best_model.rank,
    best_estimator_java.getMaxIter(),
    best_estimator_java.getRegParam(),
)

In [None]:
# Score the held-out split with the cross-validated model, discarding any rows
# that contain nulls before computing RMSE.
test_predictions = cv_fitted.transform(test).na.drop()
evaluator.evaluate(test_predictions)

In [None]:
# Hit-rate evaluation of an ALS model on the held-out split.
# The column names passed here must match the DataFrame schema used throughout
# this notebook ("UserID", "Rating"). The previous 'userId' / 'rating'
# spellings only resolve if the evaluator happens to rely on case-insensitive
# column lookup, so they are aligned with the real column names.
hr_evaluator = HitRate(predictionCol='prediction', labelCol='Rating', userCol='UserID')

# NOTE(review): this refits `als` with the parameters it was constructed with,
# not the tuned configuration found by cross-validation
# (`cv_fitted.bestModel`) — confirm that is intended.
model = als.fit(train)
predictions = model.transform(test)
hr_evaluator.evaluate(predictions)