In [1]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf,col,collect_list
from pyspark.sql.types import StringType, ArrayType, DoubleType,IntegerType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RankingEvaluator
from hit_rate import HitRate
import pandas as pd

In [2]:
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [3]:
spark = SparkSession.builder.master("local[*]") \
                    .config('spark.ui.showConsoleProgress', 'false')\
                    .appName('MovieRecomender') \
                    .getOrCreate()
print(spark.sparkContext)
print("Spark App Name : "+ spark.sparkContext.appName)

22/10/07 20:13:39 WARN Utils: Your hostname, mt-pc resolves to a loopback address: 127.0.1.1; using 192.168.31.100 instead (on interface eno1)
22/10/07 20:13:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/10/07 20:13:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


<SparkContext master=local[*] appName=MovieRecomender>
Spark App Name : MovieRecomender


In [4]:
df = pd.read_csv("data/ratings.dat", engine='python', sep='::', names=["UserID", "MovieID", "Rating", "Timestamp"])
df=df.drop(["Timestamp"],axis=1)
df.dropna()
df = spark.createDataFrame(df)
df.createOrReplaceTempView("dataset");
df = df.cache()
df.count() #force cache

22/10/07 20:14:01 WARN TaskSetManager: Stage 0 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.


1000209

In [5]:
sql = '''
select 
  A.UserID, A.MovieID, Rating
from 
  (
    select 
      * 
    from 
      (
        select 
          distinct(UserID) 
        from 
          dataset
      ), 
      (
        select 
          distinct(MovieID) 
        from 
          dataset
      )
  ) as A left outer join dataset as B
  on (A.UserID, A.MovieID) = (B.UserID, B.MovieID)
'''
#full_matrix = spark.sql(sql)
#full_matrix.show()

In [6]:
#full_matrix.count()

Leave one out for each group in full_matrix

loo_matrix = full_matrix

In [7]:
(train, test) = df.randomSplit([0.8, 0.2])

In [8]:
als = ALS(userCol="UserID", itemCol="MovieID", ratingCol="Rating", nonnegative = True, implicitPrefs = False,coldStartStrategy="drop",rank=50,maxIter=15,regParam=0.05)

In [9]:
grid_search = ParamGridBuilder().addGrid(als.rank,[50]).addGrid(als.maxIter,[15]).addGrid(als.regParam, [0.05] ).build()
#thay đổi hyperparams ở đây và chạy lấy kết quả viết báo cáo

In [16]:
rmse = RegressionEvaluator(metricName="rmse", labelCol="Rating", predictionCol="prediction") 
ndcg = RankingEvaluator(labelCol="RealRank", predictionCol="recommendations",metricName="ndcgAtK", k=10)

In [11]:
cv = CrossValidator(estimator=als, estimatorParamMaps=grid_search, evaluator=rmse, numFolds=5)

In [12]:
spark.sparkContext.setCheckpointDir('checkpoint/')
#model=cv.fit(train)
model=als.fit(train)

22/10/07 20:14:02 WARN TaskSetManager: Stage 2 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/07 20:14:03 WARN TaskSetManager: Stage 3 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/07 20:14:06 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/10/07 20:14:06 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [47]:

def getRank(a):
    ret=[]
    for i in a:
        ret.append(float(i.MovieID))
    return ret
convertUDF = udf(lambda z: getRank(z),ArrayType(DoubleType()))

def toDouble(a):
    return [float(i) for i in a]
toDoubleUDF = udf(lambda z: toDouble(z),ArrayType(DoubleType()))

tempt=df.sort(col('Rating').desc()).groupBy("UserID").agg(collect_list('MovieID').alias("RealRank"))
tempt=tempt.withColumn("RealRank",toDoubleUDF(col("RealRank")))

rec=model.recommendForAllUsers(3952).join(tempt,"UserID","inner")
rec=rec.withColumn("recommendations",convertUDF(col("recommendations")))
rec.show()

22/10/07 20:48:22 WARN TaskSetManager: Stage 290 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/07 20:48:22 WARN TaskSetManager: Stage 291 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.


+------+--------------------+--------------------+
|UserID|     recommendations|            RealRank|
+------+--------------------+--------------------+
|    26|[572.0, 3106.0, 1...|[1682.0, 1259.0, ...|
|    29|[572.0, 1997.0, 3...|[2993.0, 1262.0, ...|
|   474|[2905.0, 1068.0, ...|[1250.0, 589.0, 1...|
|   964|[3092.0, 2997.0, ...|[2997.0, 588.0, 1...|
|  1677|[37.0, 2571.0, 23...|[1617.0, 2628.0, ...|
|  1697|[572.0, 3114.0, 3...|[2987.0, 3798.0, ...|
|  1806|[572.0, 598.0, 30...|[3793.0, 2058.0, ...|
|  1950|[1199.0, 608.0, 3...|[720.0, 1.0, 595....|
|  2040|[3870.0, 2129.0, ...|[3798.0, 3948.0, ...|
|  2214|[3092.0, 608.0, 2...|[1.0, 2064.0, 126...|
|  2250|[2997.0, 2959.0, ...|[1252.0, 2997.0, ...|
|  2453|[572.0, 527.0, 67...|[1249.0, 1250.0, ...|
|  2509|[1545.0, 326.0, 3...|[858.0, 3178.0, 5...|
|  2529|[572.0, 2562.0, 3...|[2987.0, 1250.0, ...|
|  2927|[110.0, 572.0, 37...|[1275.0, 1291.0, ...|
|  3091|[572.0, 213.0, 11...|[593.0, 2263.0, 1...|
|  3506|[1206.0, 1423.0, ...|[2

In [14]:
#print(cv_fitted.bestModel.rank, cv_fitted.bestModel._java_obj.parent().getMaxIter(),cv_fitted.bestModel._java_obj.parent().getRegParam())

In [48]:
predictions=model.transform(test).na.drop()
print(rmse.evaluate(predictions),ndcg.evaluate(rec))

22/10/07 20:48:38 WARN TaskSetManager: Stage 322 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/07 20:48:41 WARN TaskSetManager: Stage 327 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/07 20:48:41 WARN TaskSetManager: Stage 328 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.


0.8565193235068412 0.2677876035593543


In [None]:
#hr_evaluator = HitRate(predictionCol='prediction', labelCol='rating', userCol='userId')
#model = als.fit(train)
#predictions = model.transform(test)
#hr_evaluator.evaluate(predictions)

22/10/07 18:38:29 WARN TaskSetManager: Stage 667 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/07 18:38:29 WARN TaskSetManager: Stage 668 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.


TypeError: _evaluate() missing 1 required positional argument: 'gt'

In [46]:
tempt=df.sort(col('Rating').desc()).groupBy("UserID").agg(collect_list('MovieID').alias("RealRank"))
tempt=tempt.withColumn("RealRank",toDoubleUDF(col("RealRank")))
tempt.show()

22/10/07 20:48:09 WARN TaskSetManager: Stage 286 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.
22/10/07 20:48:10 WARN TaskSetManager: Stage 287 contains a task of very large size (1581 KiB). The maximum recommended task size is 1000 KiB.


+------+--------------------+
|UserID|            RealRank|
+------+--------------------+
|    26|[1682.0, 1259.0, ...|
|    29|[2993.0, 1262.0, ...|
|   474|[1250.0, 589.0, 1...|
|   964|[2997.0, 588.0, 1...|
|  1677|[1617.0, 2628.0, ...|
|  1697|[2987.0, 3798.0, ...|
|  1806|[3793.0, 2058.0, ...|
|  1950|[720.0, 1.0, 595....|
|  2040|[3798.0, 3948.0, ...|
|  2214|[1.0, 2064.0, 126...|
|  2250|[1252.0, 2997.0, ...|
|  2453|[1249.0, 1250.0, ...|
|  2509|[858.0, 3178.0, 5...|
|  2529|[2987.0, 1250.0, ...|
|  2927|[1275.0, 1291.0, ...|
|  3091|[593.0, 2263.0, 1...|
|  3506|[2997.0, 3000.0, ...|
|  3764|[3006.0, 923.0, 2...|
|  4590|[296.0, 50.0, 226...|
|  4823|[3794.0, 2064.0, ...|
+------+--------------------+
only showing top 20 rows

