# Recommender System

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer

# Create the Spark session used by the rest of the notebook (getOrCreate
# returns an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName('red')
    .getOrCreate()
)

## Read in 12 datasets (21.78GB)

In [3]:
# Explicit schema for the review TSV files: one (column name, Spark type) pair
# per column, in file order. Every column is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("marketplace",       StringType()),
        ("customer_id",       StringType()),
        ("review_id",         StringType()),
        ("product_id",        StringType()),
        ("product_parent",    StringType()),
        ("product_title",     StringType()),
        ("product_category",  StringType()),
        ("star_rating",       IntegerType()),
        ("helpful_votes",     IntegerType()),
        ("total_votes",       IntegerType()),
        ("vine",              StringType()),
        ("verified_purchase", StringType()),
        ("review_headline",   StringType()),
        ("review_body",       StringType()),
        ("review_date",       TimestampType()),
    ]
])

# One TSV file per product category (12 files).
path = [
    'archive/amazon_reviews_us_Apparel_v1_00.tsv',
    'archive/amazon_reviews_us_Automotive_v1_00.tsv',
    'archive/amazon_reviews_us_Baby_v1_00.tsv',
    'archive/amazon_reviews_us_Beauty_v1_00.tsv',
    'archive/amazon_reviews_us_Books_v1_02.tsv',
    'archive/amazon_reviews_us_Camera_v1_00.tsv',
    'archive/amazon_reviews_us_Electronics_v1_00.tsv',
    'archive/amazon_reviews_us_Furniture_v1_00.tsv',
    'archive/amazon_reviews_us_Sports_v1_00.tsv',
    'archive/amazon_reviews_us_Grocery_v1_00.tsv',
    'archive/amazon_reviews_us_Personal_Care_Appliances_v1_00.tsv',
    'archive/amazon_reviews_us_Music_v1_00.tsv',
]

# Load all files into a single DataFrame. The files are tab-separated with a
# header row; mode='DROPMALFORMED' asks Spark to discard rows that cannot be
# parsed against the schema instead of failing the read.
data = spark.read.csv(
    path,
    sep='\t',
    header=True,
    schema=schema,
    mode='DROPMALFORMED',
)

## Filter out records whose product_category is not a valid product category

In [4]:
# Whitelist of category values; rows whose product_category is anything else
# are removed.
product_category = [
    'Sports', 'Baby', 'Apparel', 'Grocery', 'Electronics', 'Automotive',
    'Books', 'Music', 'Furniture', 'Personal_Care_Appliances', 'Camera', 'Beauty',
]
data_filter = data.where(col('product_category').isin(product_category))

In [5]:
# After filtering, how many records remain?
data_filter.count()

37172391

## Find the 20 customers who posted the most reviews

In [6]:
# Number of reviews written by each customer.
customer_count = data_filter.groupBy('customer_id').count()
# Bring the 20 rows with the largest review counts back to the driver.
top20 = customer_count.orderBy(col('count').desc()).head(20)
# Keep only the customer_id, which is the first field of each returned Row.
top20_l = [row[0] for row in top20]

In [7]:
# Customer IDs of the 20 customers with the most reviews, highest count first.
top20_l

['50122160',
 '50732546',
 '50736950',
 '38214553',
 '51184997',
 '18116317',
 '23267387',
 '50345651',
 '52496677',
 '14539589',
 '15725862',
 '50913245',
 '50881246',
 '20018062',
 '22073263',
 '52615377',
 '19380211',
 '51381678',
 '37455882',
 '50441674']

## Calculate each customer's average rating for each product category

In [8]:
# Restrict to the top-20 reviewers, then average star_rating for every
# (customer, product category) pair. The result column is "avg(star_rating)".
avg_c_p = (
    data_filter
    .where(col('customer_id').isin(top20_l))
    .groupBy('customer_id', 'product_category')
    .mean('star_rating')
)
avg_c_p.show()

+-----------+----------------+-----------------+
|customer_id|product_category| avg(star_rating)|
+-----------+----------------+-----------------+
|   23267387|           Books|              5.0|
|   23267387|          Beauty|              4.0|
|   38214553|          Sports|              4.0|
|   50913245|      Automotive|3.923076923076923|
|   18116317|         Grocery|4.309523809523809|
|   50345651|           Books|4.886363636363637|
|   22073263|       Furniture|              4.8|
|   22073263|          Sports|              5.0|
|   22073263|          Beauty|4.841269841269841|
|   50736950|     Electronics|              5.0|
|   23267387|      Automotive|              4.5|
|   52496677|         Apparel|              5.0|
|   14539589|           Music|4.989986187845304|
|   50913245|       Furniture|              4.0|
|   15725862|           Music|4.750527797325827|
|   50441674|          Beauty|              2.5|
|   20018062|          Beauty|              5.0|
|   52496677|       

In [9]:
# Number of (customer, product category) pairs that have an average rating.
avg_c_p.count()

109

## Convert string columns to numeric types and split the data into training and test sets

In [10]:
# customer_id was read as a string; replace it with its integer value so it can
# serve as the numeric user column of the recommender.
avg_c_p = avg_c_p.withColumn(
    "customer_id",
    col("customer_id").cast(IntegerType()),
)

# Encode each product category as a numeric index stored in "product_id",
# which is used as the item column of the recommender.
indexer = StringIndexer(inputCol="product_category", outputCol="product_id")
cus_prod = (
    indexer
    .fit(avg_c_p)
    .transform(avg_c_p)
)

### Retrieve the mapping between index and product category

In [22]:
# List the category labels of the fitted indexer; a label's position in this
# list is the numeric product_id it was assigned (e.g. 'Music' -> 0.0).
# NOTE(review): this fits the indexer a second time just to read the labels.
indexer.fit(avg_c_p).labels

['Music',
 'Books',
 'Electronics',
 'Sports',
 'Grocery',
 'Beauty',
 'Camera',
 'Apparel',
 'Automotive',
 'Furniture',
 'Baby',
 'Personal_Care_Appliances']

### Split the data into training and test sets

In [11]:
# Roughly 80% / 20% random split into training and test sets. A fixed seed is
# passed so that re-running the notebook is intended to reproduce the same
# split, and with it the same model and metrics downstream.
(training, test) = cus_prod.randomSplit([0.8, 0.2], seed=42)

In [12]:
# Preview the first rows of the training split.
training.show()

+-----------+--------------------+-----------------+----------+
|customer_id|    product_category| avg(star_rating)|product_id|
+-----------+--------------------+-----------------+----------+
|   23267387|               Books|              5.0|       1.0|
|   23267387|              Beauty|              4.0|       5.0|
|   18116317|             Grocery|4.309523809523809|       4.0|
|   22073263|           Furniture|              4.8|       9.0|
|   22073263|              Sports|              5.0|       3.0|
|   50345651|               Books|4.886363636363637|       1.0|
|   22073263|              Beauty|4.841269841269841|       5.0|
|   50736950|         Electronics|              5.0|       2.0|
|   14539589|               Music|4.989986187845304|       0.0|
|   52496677|             Apparel|              5.0|       7.0|
|   50913245|           Furniture|              4.0|       9.0|
|   15725862|               Music|4.750527797325827|       0.0|
|   50441674|              Beauty|      

## Build the recommendation model using ALS on the training data

In [13]:
# Collaborative-filtering model: customers are the users, the indexed product
# categories are the items, and the per-pair average star rating is the rating.
# ALS starts from randomly initialised factors, so a fixed seed is supplied to
# make the fitted model (and the predictions/recommendations derived from it)
# repeatable across runs.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="customer_id",
    itemCol="product_id",
    ratingCol="avg(star_rating)",
    seed=42,
)
model = als.fit(training)

## Evaluate the model by computing the RMSE on the test data

In [18]:
# Predict ratings for the held-out (customer, category) pairs. With ALS's
# default cold-start strategy ("nan"), pairs whose customer or category was
# not seen during training get a NaN prediction; dropna() removes those rows
# so they do not turn the metric into NaN.
predictions = model.transform(test).dropna()
predictions.show()

# RMSE must compare the predicted rating with the TRUE rating column,
# "avg(star_rating)". Using "product_id" (the category index) as the label
# would measure the distance between a rating and an arbitrary index.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="avg(star_rating)",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

+-----------+----------------+-----------------+----------+----------+
|customer_id|product_category| avg(star_rating)|product_id|prediction|
+-----------+----------------+-----------------+----------+----------+
|   50122160|           Books|4.997992883860961|       1.0|  4.687139|
|   51184997|           Books|              5.0|       1.0|  3.203855|
|   50881246|           Books|3.958036421219319|       1.0| 5.5383763|
|   20018062|           Books|4.964788732394366|       1.0| 4.7496414|
|   22073263|           Books|              4.0|       1.0|   2.48031|
|   50881246|          Camera|              4.0|       6.0| 2.4901977|
|   18116317|          Camera|              5.0|       6.0| 3.2218025|
|   23267387|          Camera|4.833333333333333|       6.0| 1.9963455|
|   23267387|          Sports|              5.0|       3.0| 3.7034066|
|   38214553|          Sports|              4.0|       3.0| 3.3653195|
|   50736950|          Beauty|             4.95|       5.0|  4.606846|
|   50

In [19]:
K = 3
userRecs = model.recommendForAllUsers(K)
userRecs.show(userRecs.count(), False)

+-----------+-------------------------------------------------+
|customer_id|recommendations                                  |
+-----------+-------------------------------------------------+
|52496677   |[[7, 5.0227623], [4, 5.0160265], [10, 5.00542]]  |
|51184997   |[[0, 4.144959], [2, 3.8595881], [4, 3.5448449]]  |
|52615377   |[[1, 4.466326], [4, 3.5931022], [7, 3.2025504]]  |
|18116317   |[[10, 4.9862623], [9, 4.4411407], [2, 4.3707666]]|
|14539589   |[[2, 5.009161], [5, 5.003492], [9, 4.980434]]    |
|20018062   |[[2, 5.0005], [5, 4.9974456], [4, 4.9720297]]    |
|50732546   |[[1, 4.683359], [4, 4.1587687], [2, 3.8515213]]  |
|37455882   |[[0, 4.062229], [2, 3.442569], [4, 2.8250575]]   |
|50345651   |[[2, 5.206237], [4, 5.006421], [3, 4.994061]]    |
|50736950   |[[7, 5.0055695], [3, 4.98691], [2, 4.9804196]]   |
|50441674   |[[8, 4.982744], [2, 4.73892], [3, 4.665714]]     |
|38214553   |[[0, 4.131078], [2, 3.8466623], [4, 3.5329733]]  |
|51381678   |[[0, 3.8819609], [2, 3.6146