In [7]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from hit_rate import HitRate
import pandas as pd

In [8]:
import os
import sys

# Make Spark's worker and driver processes use the same Python interpreter
# that is running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [9]:
# Create (or reuse) a local Spark session that uses all available cores,
# with the console progress bar switched off to keep cell output readable.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config('spark.ui.showConsoleProgress', 'false')
    .appName('MovieRecomender')
    .getOrCreate()
)

# Show which context we are attached to and under which application name.
spark_context = spark.sparkContext
print(spark_context)
print("Spark App Name : " + spark_context.appName)

<SparkContext master=local[*] appName=MovieRecomender>
Spark App Name : MovieRecomender


In [10]:
# Read the '::'-separated ratings file with pandas. The multi-character
# separator requires the (slower) python parsing engine.
ratings_pd = pd.read_csv(
    "data/ratings.dat",
    engine='python',
    sep='::',
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)

# Timestamp is not used by the recommender. dropna() returns a NEW frame, so its
# result must be kept - the original call discarded it and removed nothing.
ratings_pd = ratings_pd.drop(columns=["Timestamp"]).dropna()

# Hand the cleaned ratings to Spark and expose them to SQL as the "dataset" view.
df = spark.createDataFrame(ratings_pd)
df.createOrReplaceTempView("dataset")

# Mark the frame as cached; count() is an action, so it forces the cache to be
# materialised now. Its value (the number of ratings) is the cell's output.
df = df.cache()
df.count()

22/10/06 09:31:42 WARN TaskSetManager: Stage 2 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.


1000209

In [11]:
# Build the full user x movie grid:
#   1. A = every distinct UserID paired with every distinct MovieID (the two
#      comma-separated sub-queries form a cross product).
#   2. Left-outer-join the observed ratings (B) onto A by (UserID, MovieID), so
#      pairs without a rating in "dataset" come back with Rating = null.
# NOTE(review): the join condition compares (UserID, MovieID) tuples rather than
# two separate column equalities - confirm Spark plans this as an equi-join; the
# count() on this frame in the next cell takes roughly three minutes per the log timestamps.
sql = '''
select 
  A.UserID, A.MovieID, Rating
from 
  (
    select 
      * 
    from 
      (
        select 
          distinct(UserID) 
        from 
          dataset
      ), 
      (
        select 
          distinct(MovieID) 
        from 
          dataset
      )
  ) as A left outer join dataset as B
  on (A.UserID, A.MovieID) = (B.UserID, B.MovieID)
'''
full_matrix = spark.sql(sql)
# Preview the first 20 rows of the grid (show() prints; it does not return data).
full_matrix.show()

22/10/06 09:31:48 WARN TaskSetManager: Stage 4 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/06 09:31:48 WARN TaskSetManager: Stage 5 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/06 09:31:48 WARN TaskSetManager: Stage 7 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.


+------+-------+------+
|UserID|MovieID|Rating|
+------+-------+------+
|     1|     94|  null|
|     1|    185|  null|
|     1|    328|  null|
|     1|    354|  null|
|     1|    367|  null|
|     1|   1086|  null|
|     1|   1160|  null|
|     1|   1500|  null|
|     1|   1620|  null|
|     1|   2393|  null|
|     1|   2597|  null|
|     1|   2857|  null|
|     1|   2868|  null|
|     1|   2984|  null|
|     1|   3003|  null|
|     1|   3134|  null|
|     1|   3437|  null|
|     1|   3462|  null|
|     1|   3808|  null|
|     2|    702|  null|
+------+-------+------+
only showing top 20 rows



In [12]:
# Count the rows of the user x movie grid. This is an action, so it evaluates the
# whole cross-product + join query behind full_matrix.
# NOTE(review): full_matrix is not cached, so each action on it re-runs that query.
full_matrix.count()

22/10/06 09:34:51 WARN TaskSetManager: Stage 9 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/06 09:34:51 WARN TaskSetManager: Stage 10 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/06 09:34:51 WARN TaskSetManager: Stage 12 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.


22384240

Leave one out for each group in full_matrix

loo_matrix = full_matrix

In [13]:
# Randomly split the ratings into ~80% training and ~20% held-out test rows.
# The fixed seed pins the split for a given input frame, so a Restart & Run All
# reproduces the same train/test sets and therefore comparable metrics.
(train, test) = df.randomSplit([0.8, 0.2], seed=42)

In [14]:
# ALS matrix-factorisation estimator for explicit ratings.
#   - userCol / itemCol / ratingCol name the columns of the ratings frame.
#   - implicitPrefs=False: ratings are treated as explicit feedback.
#   - nonnegative=True: constrain the learned factors to be non-negative.
#   - coldStartStrategy="drop": rows whose prediction would be NaN (user or movie
#     not seen during fit) are dropped from transform() output.
#   - seed: pins the random factor initialisation so repeated runs are comparable.
als = ALS(
    userCol="UserID",
    itemCol="MovieID",
    ratingCol="Rating",
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop",
    seed=42,
)

In [15]:
# Hyper-parameter candidates for cross-validation. Change the lists here,
# re-run the notebook, and record the results for the report.
grid_search = (
    ParamGridBuilder()
    .addGrid(als.rank, [50])
    .addGrid(als.maxIter, [15])
    .addGrid(als.regParam, [0.05])
    .build()
)

In [16]:
# RMSE between the true "Rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="Rating",
    predictionCol="prediction",
    metricName="rmse",
)

In [17]:
# 5-fold cross-validation of the ALS estimator over the parameter grid, scored
# with the RMSE evaluator. The seed fixes how rows are assigned to folds, so the
# cross-validated metrics are reproducible across runs.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=grid_search,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

In [18]:
# Register a checkpoint directory (relative to the working directory) before the
# long-running fit. Presumably this is for ALS's periodic checkpointing - TODO confirm.
spark.sparkContext.setCheckpointDir('checkpoint/')

# Run the cross-validated fit on the training split; this is the expensive step.
cv_fitted = cv.fit(train)

22/10/06 09:38:42 WARN TaskSetManager: Stage 15 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/06 09:38:42 WARN TaskSetManager: Stage 16 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/06 09:38:45 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/10/06 09:38:45 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/10/06 09:38:59 WARN TaskSetManager: Stage 118 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/06 09:39:03 WARN TaskSetManager: Stage 123 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/06 09:39:03 WARN TaskSetManager: Stage 124 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/06 09:39:19 WARN TaskSetManager: Stage 256 contains a task of very lar

In [19]:
# Report the winning hyper-parameters: rank, maxIter, regParam.
# rank is read from the fitted model; maxIter and regParam are read from the
# object returned by parent() on the model's underlying Java object
# (note: _java_obj is a private attribute).
best_model = cv_fitted.bestModel
best_model_parent = best_model._java_obj.parent()
print(best_model.rank, best_model_parent.getMaxIter(), best_model_parent.getRegParam())

50 15 0.05


In [20]:
# Score the held-out test split with the cross-validated model, discard any rows
# containing nulls, and report the RMSE as the cell's output.
test_predictions = cv_fitted.transform(test).na.drop()
evaluator.evaluate(test_predictions)

22/10/06 09:41:44 WARN TaskSetManager: Stage 1348 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.


0.8563465051361246

In [21]:
# Hit-rate evaluation using the project-local HitRate evaluator (hit_rate module).
#
# NOTE(review): per the output recorded under this cell, it currently fails with
#   TypeError: _evaluate() missing 1 required positional argument: 'gt'
# so HitRate._evaluate expects an extra `gt` argument that evaluate(predictions)
# does not supply. The fix belongs in hit_rate.py or in how HitRate is invoked here -
# confirm the intended API before relying on this cell.
#
# NOTE(review): labelCol='rating' / userCol='userId' differ in case from the columns
# used elsewhere in this notebook ('Rating', 'UserID') - verify HitRate resolves
# column names case-insensitively.
hr_evaluator = HitRate(predictionCol='prediction', labelCol='rating', userCol='userId')
# Fits `als` directly on the training split. This is a separate fit from the
# cross-validated model (cv_fitted); presumably the rank/maxIter/regParam values from
# the grid are not applied here - TODO confirm, or reuse cv_fitted.bestModel.
model = als.fit(train)
# Score the held-out test split with that model.
predictions = model.transform(test)
hr_evaluator.evaluate(predictions)

22/10/06 09:41:47 WARN TaskSetManager: Stage 1353 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/06 09:41:47 WARN TaskSetManager: Stage 1354 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.


TypeError: _evaluate() missing 1 required positional argument: 'gt'