# CHAPTER 9 - EXERCISE 3: MUSICAL INSTRUMENTS

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession

## Nhập dữ liệu

In [2]:
# Build (or reuse) the local Spark session. `getOrCreate()` keeps this cell
# re-runnable: constructing SparkContext directly raises an error when a
# context is already active in the kernel.
ss = (
    SparkSession.builder
    .master('local')
    .appName('Chapter 9 - Exercise 3')
    .getOrCreate()
)
# Expose the underlying SparkContext under the name the notebook already uses.
sc = ss.sparkContext

In [7]:
import os

# Location of the Musical_Instruments_5.json review file.
# The MUSICAL_INSTRUMENTS_JSON environment variable overrides the location so
# the notebook can run on another machine; the original local path stays the
# default, so existing behaviour is unchanged when the variable is not set.
path = os.environ.get(
    'MUSICAL_INSTRUMENTS_JSON',
    '/Users/vovanthuong/Desktop/9 - Big Data in Machine Learning/Data/Chapter9/LDS9_Data_Day_7/Musical_Instruments_5.json',
)
# Read the JSON file into a Spark DataFrame (no explicit schema is supplied).
df = ss.read.json(path)

In [9]:
# Print the column names and types of the loaded review DataFrame.
df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [8]:
# Preview the first 3 rows of the review DataFrame.
df.show(3)

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
only showing top 3 rows



In [10]:
# Number of reviews (rows in the DataFrame)
df.count()

10261

In [12]:
# Number of items (distinct `asin` values)
df.select('asin').distinct().count()

900

In [13]:
# Number of users (distinct `reviewerID` values)
df.select('reviewerID').distinct().count()

1429

## Tạo tập train và test

In [14]:
# 80/20 random split. A fixed seed makes the split (and therefore every metric
# computed below) reproducible across kernel restarts.
train, test = df.randomSplit([0.8, 0.2], seed=42)

## Xây dựng mô hình

In [133]:
from pyspark.ml.feature import SQLTransformer, StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml import Pipeline

# Stage 1: keep only the columns the recommender needs (item id, user id, rating).
select = SQLTransformer(statement='SELECT asin, reviewerID, overall FROM __THIS__')

# Stage 2: map the string item id (`asin`) to a numeric index.
# handleInvalid='skip' drops rows whose id was not seen during fit instead of
# raising (the default is 'error'); a random train/test split can leave ids
# that appear only in the test set.
asin_laber_to_indexer = StringIndexer(inputCol='asin', outputCol='asin_idx',
                                      handleInvalid='skip')

# Stage 3: map the string user id (`reviewerID`) to a numeric index, with the
# same handling of unseen ids as the item indexer.
reviewerID_laber_to_indexer = StringIndexer(inputCol='reviewerID', outputCol='reviewerID_idx',
                                            handleInvalid='skip')

# Stage 4: ALS collaborative filtering on the indexed ids.
# coldStartStrategy='drop' removes rows for which no prediction can be made, so
# the evaluation metric is not turned into NaN (the default strategy is 'nan').
# The seed makes the factor initialisation reproducible.
als = ALS(maxIter=25, regParam=0.05,
          userCol='reviewerID_idx', itemCol='asin_idx', ratingCol='overall',
          coldStartStrategy='drop', seed=42)

# Full pipeline: column selection -> item indexer -> user indexer -> ALS.
pipe_als = Pipeline(stages=[select, asin_laber_to_indexer, reviewerID_laber_to_indexer, als])

In [134]:
# Fit every pipeline stage (both indexers and the ALS model) on the training split.
pipe_als_model= pipe_als.fit(train)

## Đánh giá kết quả của mô hình

In [135]:
from pyspark.ml.evaluation import RegressionEvaluator
# RMSE between the observed rating (`overall`) and the model output (`prediction`).
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='overall',
    predictionCol='prediction',
)

### Trên tập train

In [136]:
# Score the training split with the fitted pipeline and report its RMSE.
train_result= pipe_als_model.transform(train)
evaluator.evaluate(train_result)

0.0923985878743525

### Trên tập test

In [137]:
# Score the held-out test split with the fitted pipeline and report its RMSE.
test_result= pipe_als_model.transform(test)
evaluator.evaluate(test_result)

1.1790525368814895

## Recommend cho toàn bộ user

In [147]:
# Top-10 item recommendations for every user, taken from the fitted ALS model
# (stage index 3, i.e. the last of the four pipeline stages).
recomment_for_all= pipe_als_model.stages[3].recommendForAllUsers(numItems= 10)

## Recommend cho 1 user

In [176]:
# Cách 1: Chuyển reviewerID_idx thành reviewerID
from pyspark.ml.feature import IndexToString
from pyspark.sql.functions import posexplode, col

# Inverse of the user StringIndexer: turns `reviewerID_idx` back into the
# original `reviewerID` string, using the labels of the fitted indexer
# (pipeline stage index 2).
reviewerID_indexer_to_label = IndexToString(inputCol="reviewerID_idx", 
                                            outputCol="reviewerID", 
                                            labels= pipe_als_model.stages[2].labels)

def recommend_for_user(als_model, reviewerID, numItems):
    """Return the top recommendations for one reviewer as a flat DataFrame.

    Recommendations are generated for every user, the user index is decoded
    back to the original ``reviewerID`` string with the notebook-level
    ``reviewerID_indexer_to_label`` transformer, and only the rows of the
    requested reviewer are kept.

    Parameters
    ----------
    als_model : fitted ALS model exposing ``recommendForAllUsers``.
    reviewerID : str
        Original (string) reviewer id to look up.
    numItems : int
        Number of recommendations to request per user.

    Returns
    -------
    DataFrame with columns ``reviewerID``, ``reviewerID_idx``, ``No``
    (position of the item in the recommendation list), ``asin_idx`` and
    ``rating``; one row per recommended item.
    """
    # Top-N items for every user known to the model.
    all_users_recs = als_model.recommendForAllUsers(numItems=numItems)
    # Decode reviewerID_idx back to the reviewerID string.
    labelled_recs = reviewerID_indexer_to_label.transform(all_users_recs)
    # Keep only the reviewer of interest.
    user_recs = labelled_recs.where(col('reviewerID') == reviewerID)
    # One row per recommendation, together with its position in the list.
    exploded = user_recs.select(
        'reviewerID', 'reviewerID_idx',
        posexplode('recommendations').alias('No', 'recommendations'),
    )
    # Unpack the (asin_idx, rating) struct into plain columns.
    recommendation = col('recommendations')
    return exploded.select(
        'reviewerID', 'reviewerID_idx', 'No',
        recommendation.getField('asin_idx').alias('asin_idx'),
        recommendation.getField('rating').alias('rating'),
    )

In [175]:
recommend_for_user(als_model= pipe_als_model.stages[-1], 
                   reviewerID= 'A2IBPI20UZIR0U', numItems= 5).show()

+--------------+--------------+---+--------+---------+
|    reviewerID|reviewerID_idx| No|asin_idx|   rating|
+--------------+--------------+---+--------+---------+
|A2IBPI20UZIR0U|            81|  0|     433|5.7016926|
|A2IBPI20UZIR0U|            81|  1|     401| 5.538135|
|A2IBPI20UZIR0U|            81|  2|     124|5.4247503|
|A2IBPI20UZIR0U|            81|  3|     273|  5.42354|
|A2IBPI20UZIR0U|            81|  4|     161|5.4059787|
+--------------+--------------+---+--------+---------+



In [220]:
# Cách 2: Chuyển reviewerID thành reviewerID_idx
from pyspark.ml.feature import IndexToString
from pyspark.sql.functions import posexplode, col

# Inverse of the user StringIndexer (reviewerID_idx -> reviewerID), built from
# the labels of the fitted indexer (pipeline stage index 2).
# NOTE(review): the `recommend_for_user` defined in this cell does not reference
# this transformer; it appears to be kept only as a copy from the previous approach.
reviewerID_indexer_to_label = IndexToString(inputCol="reviewerID_idx", 
                                            outputCol="reviewerID", 
                                            labels= pipe_als_model.stages[2].labels)

def recommend_for_user(als_model, reviewerID, numItems):
    """Return the top recommendations for one reviewer as a flat DataFrame.

    The reviewer id is first converted to its numeric index with the fitted
    user indexer (``pipe_als_model.stages[2]``), and recommendations are then
    computed for that single user only via ``recommendForUserSubset`` instead
    of scoring every user and filtering afterwards.

    Note: this definition has the same name and signature as the
    ``recommend_for_user`` defined earlier in the notebook and replaces it
    once this cell is executed.

    Parameters
    ----------
    als_model : fitted ALS model exposing ``recommendForUserSubset``.
    reviewerID : str
        Original (string) reviewer id to look up.
    numItems : int
        Number of recommendations to return.

    Returns
    -------
    DataFrame with columns ``reviewerID``, ``reviewerID_idx``, ``No``
    (position of the item in the recommendation list), ``asin_idx`` and
    ``rating``; one row per recommended item. The result is empty when the
    model has no factors for the reviewer.

    An id unknown to the indexer is handled according to the indexer's
    ``handleInvalid`` setting; with 'error' the exception surfaces when the
    returned DataFrame is evaluated, because nothing is collected here.
    """
    # One-row lookup frame holding reviewerID and its reviewerID_idx.
    # A separate name is used so the `reviewerID` argument is not rebound
    # from a string to a DataFrame.
    reviewer_lookup = ss.createDataFrame([(reviewerID,)], ['reviewerID'])
    reviewer_lookup = pipe_als_model.stages[2].transform(reviewer_lookup)

    # Score only the requested user; the subset frame must expose the user
    # column under the name the model was trained with (reviewerID_idx).
    user_recs = als_model.recommendForUserSubset(
        reviewer_lookup.select('reviewerID_idx'), numItems)

    # Bring the original reviewerID string back next to its index.
    user_recs = user_recs.join(reviewer_lookup, on=['reviewerID_idx'], how='inner')

    # One row per recommendation, together with its position in the list.
    exploded = user_recs.select(
        'reviewerID', 'reviewerID_idx',
        posexplode('recommendations').alias('No', 'recommendations'))

    # Unpack the (asin_idx, rating) struct into plain columns.
    return exploded.select(
        'reviewerID', 'reviewerID_idx', 'No',
        (col('recommendations')['asin_idx']).alias('asin_idx'),
        (col('recommendations')['rating']).alias('rating'))

In [221]:
recommend_for_user(als_model= pipe_als_model.stages[-1], 
                   reviewerID= 'A2IBPI20UZIR0U', numItems= 5).show()

+--------------+--------------+---+--------+---------+
|    reviewerID|reviewerID_idx| No|asin_idx|   rating|
+--------------+--------------+---+--------+---------+
|A2IBPI20UZIR0U|            81|  0|     433|5.7016926|
|A2IBPI20UZIR0U|            81|  1|     401| 5.538135|
|A2IBPI20UZIR0U|            81|  2|     124|5.4247503|
|A2IBPI20UZIR0U|            81|  3|     273|  5.42354|
|A2IBPI20UZIR0U|            81|  4|     161|5.4059787|
+--------------+--------------+---+--------+---------+

