# DEMO RECOMMENDER SYSTEM

In [1]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext

## Nhập dữ liệu

In [2]:
# Build the Spark session through the builder so that re-running this cell
# reuses the live session instead of failing with
# "Cannot run multiple SparkContexts at once".
# Note: master/appName only take effect when a new session is actually created.
ss = (
    SparkSession.builder
    .master('local')
    .appName('Chapter 9 - Demo recommender system')
    .getOrCreate()
)
# Keep `sc` defined for any cell that works with the underlying SparkContext.
sc = ss.sparkContext

In [3]:
import os

# Location of the ratings CSV. Set the MOVIELENS_RATINGS_CSV environment
# variable to point at your own copy; the fallback is the original
# machine-specific path, so existing behaviour is unchanged when it is unset.
path = os.environ.get(
    'MOVIELENS_RATINGS_CSV',
    '/Users/vovanthuong/Desktop/9 - Big Data in Machine Learning/Data/Chapter9/LDS9_Data_Day_7/movielens_ratings.csv',
)
# header=True reads column names from the first row; inferSchema=True asks
# Spark to infer column types rather than loading every column as a string.
data = ss.read.csv(path, header=True, inferSchema=True)

In [4]:
# Print the first 3 rows as a quick check of the loaded columns.
data.show(3)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
+-------+------+------+
only showing top 3 rows



In [5]:
# Number of distinct values in the 'userId' column.
userId= data.select('userId').distinct()
userId.count()

30

In [6]:
# Number of distinct values in the 'movieId' column.
movieId= data.select('movieId').distinct()
movieId.count()

100

In [7]:
# Total number of rows in the loaded ratings DataFrame.
data.count()

1501

## Tạo dữ liệu train và test

In [13]:
# 80/20 random split. The seed is fixed so that the same rows land in
# train/test on every run over the same input, which keeps the metrics
# reported below comparable between kernel restarts.
train, test = data.randomSplit([0.8, 0.2], seed=42)

## Xây dựng mô hình

In [10]:
from pyspark.ml.recommendation import ALS

In [147]:
# coldStartStrategy='drop': a user or movie that appears only in the test
# split has no learned factors, so transform() yields a NaN prediction for it
# and the RMSE of the whole split becomes NaN. 'drop' removes those rows from
# the prediction output instead.
# seed: fixes the random initialisation of the factors for repeatable fits.
als = ALS(
    maxIter=20,
    regParam=0.05,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop',
    seed=42,
)
als_model = als.fit(train)

In [148]:
# Score the training rows with the fitted model; transform() appends a
# 'prediction' column next to the original rating.
train_prediction = als_model.transform(train)
train_prediction.show(n=10)

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|     31|   1.0|    27| 1.0425134|
|     31|   4.0|    12| 3.5470302|
|     31|   1.0|    13| 1.1550571|
|     31|   1.0|     5| 1.0841353|
|     31|   1.0|    19|  1.071628|
|     31|   1.0|     4| 1.1552005|
|     31|   3.0|     8| 2.8769984|
|     31|   3.0|     7|  2.699551|
|     31|   2.0|    25| 2.1879303|
|     31|   3.0|    14| 2.7744582|
+-------+------+------+----------+
only showing top 10 rows



## Đánh giá mô hình

In [143]:
from pyspark.ml.evaluation import RegressionEvaluator
# Evaluator that compares the 'rating' column with the 'prediction' column
# using root-mean-squared error.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction',
)

### Tập train

In [149]:
# RMSE on the predictions for the training split (the data the model was fit on).
evaluator.evaluate(train_prediction)

0.22459211817314698

### Tập test

In [150]:
# Score the held-out test split with the model fitted on the training split.
test_prediction= als_model.transform(test)

In [151]:
# RMSE on the predictions for the held-out test split.
evaluator.evaluate(test_prediction)

0.9825128438573202

## Thực hiện gợi ý cho 1 user

In [203]:
from pyspark.sql.functions import posexplode, col
def recommend_for_user(als_model, userId, numItems):
    """Return the top ``numItems`` recommended movies for a single user.

    Parameters
    ----------
    als_model
        Fitted ALS model; must provide ``recommendForAllUsers``.
    userId
        Value matched against the ``userId`` column to select the user.
    numItems
        Number of recommendations to generate per user.

    Returns
    -------
    DataFrame
        One row per recommendation with columns ``userId``, ``No`` (0-based
        position within the recommendation list), ``movieId`` and ``rating``
        (the score stored in the recommendation struct).
    """
    # Generate the top-N items for every user, then keep only the requested one.
    all_recs = als_model.recommendForAllUsers(numItems=numItems)
    user_recs = all_recs.where(col('userId') == userId)
    # Turn the 'recommendations' array into one row per element, keeping its
    # position. Uses the directly imported `posexplode`; the previous
    # `f.posexplode` relied on an alias `f` that is never imported.
    exploded = user_recs.select(
        'userId',
        posexplode('recommendations').alias('No', 'recommendations'),
    )
    # Flatten the (movieId, rating) struct into plain columns.
    return exploded.select(
        'userId',
        'No',
        col('recommendations')['movieId'].alias('movieId'),
        col('recommendations')['rating'].alias('rating'),
    )

In [204]:
# Show the 10 recommended movies for the user whose userId is 1.
recommend_for_user(als_model= als_model, userId= 1, numItems= 10).show()

+------+---+-------+---------+
|userId| No|movieId|   rating|
+------+---+-------+---------+
|     1|  0|     68|3.5432656|
|     1|  1|     22|3.3865113|
|     1|  2|     62|3.3731127|
|     1|  3|     77|2.8645623|
|     1|  4|     51|2.7663949|
|     1|  5|     75|2.7491271|
|     1|  6|      9|2.6783535|
|     1|  7|     28|2.6719494|
|     1|  8|     23|2.6468697|
|     1|  9|     90|2.5930696|
+------+---+-------+---------+

