#### Final project: Computing the average rating of Amazon products using PySpark


In [None]:
# Docker command used to start the Jupyter + PySpark notebook server.
# The local "work" directory is bind-mounted into the container's home directory,
# and the notebook port 8888 is published on the host.
# The host path is given as an absolute path ("$(pwd)/work") because Docker
# releases before 23.0 reject relative bind-mount sources such as ./work.
docker run -it \
  -v "$(pwd)/work":/home/jovyan/work \
  --user root \
  -e CHOWN_HOME=yes \
  -e CHOWN_HOME_OPTS='-R' \
  -p 8888:8888 \
  jupyter/pyspark-notebook

In [1]:
import pyspark
import json
import numpy as np

In [2]:
# Main dataset (the one loaded by the cells below)
DATA_PATH = "./work/Grocery_and_Gourmet_Food_5.json"
# Smaller dataset, kept around for quick test runs
DATA_PATH_FASHION = "./work/AMAZON_FASHION_5.json"

In [3]:
# Create (or reuse) the Spark session, running locally with master "local[*]"
spark = (
    pyspark.sql.SparkSession.builder
    .master("local[*]")
    .appName("Amazon Fashion")
    .getOrCreate()
)

# Show which Spark version the session is running
spark.version

'3.5.0'

In [4]:
# Load the dataset into a Spark Resilient Distributed Dataset (RDD):
# read the file as text, then decode every element from JSON
sc = spark.sparkContext
json_rdd = sc.textFile(DATA_PATH)
data_rdd = json_rdd.map(json.loads)

In [5]:
# Check the number of reviews in the dataset
# Result: 1 143 860 records
# Run time: about 5.1 seconds
data_rdd.count()

1143860

In [6]:
# Data format: fetch a single record to see which fields a review contains

data_rdd.take(1)

[{'overall': 5.0,
  'verified': True,
  'reviewTime': '11 19, 2014',
  'reviewerID': 'A1QVBUH9E1V6I8',
  'asin': '4639725183',
  'reviewerName': 'Jamshed Mathur',
  'reviewText': 'No adverse comment.',
  'summary': 'Five Stars',
  'unixReviewTime': 1416355200}]

#### Example 1: Average rating of Amazon products

1. Map each review to a tuple of the form `(productID, (rating, 1))`
2. Reduce the tuples by `productID` key to obtain `(productID, (sumOfRatings, totalRatingsCount))`
3. Map each tuple to `(productID, sumOfRatings/totalRatingsCount)` to obtain the average rating per product

In [7]:
# Average rating of every product
# Run time: about 6.4 seconds

# Step 1: key each review by product ID, carrying (rating, 1)
rating_and_count = data_rdd.map(
    lambda review: (review['asin'], (review['overall'], 1))
)
# Step 2: per product, add up the ratings and the number of ratings
rating_totals = rating_and_count.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
# Step 3: divide the rating sum by the rating count
avg_rating_by_product = rating_totals.map(
    lambda entry: (entry[0], entry[1][0] / entry[1][1])
)

avg_rating_by_product.take(5)

[('B000052X2S', 4.555555555555555),
 ('B0000CDBPT', 4.5),
 ('B0000D9MQV', 4.428571428571429),
 ('B0000DHZY1', 4.791666666666667),
 ('B0000DID5R', 4.4523809523809526)]

#### Example 2: Filtering best rated products

1. Use the previous result (it is possible to chain the operations in a single pipeline)
2. Filter the products with average rating greater than or equal to `4.5`
3. Map products to show only the `productID`.
4. Collect the results in a list 

In [11]:
# Keep only the products whose average rating is at least 4.5
# and collect their product IDs into a Python list
# Run time: about 0.8 seconds
top_rated = avg_rating_by_product.filter(lambda pair: pair[1] >= 4.5)
best_products = top_rated.keys().collect()

# How many products qualified, followed by a short sample of their IDs
print(len(best_products))
print(best_products[:5])

20571
['B000052X2S', 'B0000CDBPT', 'B0000DHZY1', 'B0000E5L25', 'B0000TW1NU']


#### Example 3: Finding the favourite product by user

1. Map each review to a tuple of the form `(userID, (productID, rating))`
2. Group by key to obtain `(userID, [(productID, rating), (productID, rating), ...])`
3. Map each tuple to `(userID, productID)` to obtain the favourite product of each user, selecting the product with the maximum rating.

P.S. This is more of a proof of concept, because a user may have given several products a rating of 5.0. A more realistic approach would be to select an iterable of all such products.

In [25]:
# Favourite (highest-rated) product of each user
# Run time: about 1 minute
# NOTE(review): groupByKey ships every (product, rating) pair through the
# shuffle; if this cell is too slow, a reduceByKey that keeps the higher-rated
# pair is worth trying.
reviews_by_user = data_rdd.map(
    lambda review: (review['reviewerID'], (review['asin'], review['overall']))
).groupByKey()

# max() keeps the first pair with the top rating, so ties between equally
# rated products are resolved by iteration order
favourite_product_by_user = reviews_by_user.map(
    lambda user_reviews: (
        user_reviews[0],
        max(user_reviews[1], key=lambda product_rating: product_rating[1])[0],
    )
)

favourite_product_by_user.take(5)

[('A3QHVBQYDV7Z6U', '4639725183'),
 ('A3KDQ6A29REK5K', '5463213682'),
 ('A1KGNAFK85SNID', '5463213682'),
 ('A1BJVYTBOS2AGM', '9742356831'),
 ('A31GYNW4UGVZNK', '9742356831')]