# Amazon Review ETL

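This notebook is an ETL pipeline for Amazon pet product review data: it extracts the raw TSV file, transforms a sample of it into review, product, customer, and vine tables, and loads those tables into a PostgreSQL database on RDS.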

## Dependencies

In [1]:
# Download a Postgres driver to allow Spark to interact with Postgres
!curl -O https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  979k  100  979k    0     0   896k      0  0:00:01  0:00:01 --:--:--  896k


In [2]:
# Locate Spark
# findspark.init() runs before any pyspark import in this cell; presumably it is
# what makes the pyspark package importable here, so keep this ordering — TODO confirm
import findspark
findspark.init()

# Dependencies
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType
from pyspark.sql.functions import col
from config import rds, db_props

# Create (or reuse) the 'amz' Spark session with the Postgres JDBC jar on the
# driver classpath. The jar path is relative, so the file is expected to sit in
# the working directory.
spark = (
    SparkSession.builder
    .appName('amz')
    .config('spark.driver.extraClassPath', 'postgresql-42.2.16.jar')
    .getOrCreate()
)
spark

## Extract

1. [Download the data](https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Pet_Products_v1_00.tsv.gz)
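2. Unzip the downloaded file, rename the extracted `amazon_reviews_us_Pet_Products_v1_00.tsv` to `pet_product_reviews.tsv` (the name the read cell below expects), and move it to the same directory as this notebook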

Source: https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt

In [3]:
# Load the tab-separated review file; the first line is the header and column
# types are inferred from the data
df = (
    spark.read
    .options(sep='\t', header=True, inferSchema=True)
    .csv('pet_product_reviews.tsv')
)
# Preview two records, one field per line, without truncating long text
df.show(n=2, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 marketplace       | US                                                                                                                                                                                                                                                           
 customer_id       | 28794885                                                                                                                                                                                                                                                     
 review_id         | REAKC26P07MDN                                                                                                                                             

In [4]:
# Print the inferred column types, then display the total number of rows
df.printSchema()
total_rows = df.count()
total_rows

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)



2643619

In [5]:
# Draw a random sample of roughly 4% of the rows; the fixed seed keeps the
# sample reproducible. The last line displays the sampled row count.
sample_fraction, sample_seed = 0.04, 0
df_sample = df.sample(fraction=sample_fraction, seed=sample_seed)
df_sample.count()

105294

## Transform

In [6]:
# Review table: one row per sampled review with its customer/product keys and
# the review date cast to a date type, sorted by review_id
review_date_col = df_sample['review_date'].cast(DateType())
review_cols = ['review_id', 'customer_id', 'product_id', 'product_parent', review_date_col]
review_df = df_sample.select(*review_cols).orderBy('review_id')

# Row count first, then a five-row preview
print(review_df.count())
review_df.show(5)

105294
+--------------+-----------+----------+--------------+-----------+
|     review_id|customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
|R1001LGHA5NQKS|   37666116|B00CKFL93K|     338879043| 2014-04-14|
|R1003FAIY018FM|   16601850|B000FOX6A2|     673480362| 2013-04-05|
|R10057PI119T31|   28024191|B00CUH36ZM|     532130864| 2015-02-05|
|R10083Z2KCFFNC|   26901020|B005DCCSO6|     833354736| 2012-05-17|
|R1008Y8VDYJY5U|    1059494|B000G6QGIS|     817765635| 2015-08-18|
+--------------+-----------+----------+--------------+-----------+
only showing top 5 rows



In [7]:
# Product table: distinct (product_id, product_title) pairs, sorted by product_id.
# NOTE(review): rows are de-duplicated on the pair, so a product_id that appears
# with more than one title would still occur more than once — confirm this is
# acceptable if product_id is meant to be the table key.
product_df = (
    df_sample
    .select('product_id', 'product_title')
    .distinct()
    .orderBy('product_id')
)

# Row count first, then a five-row preview
print(product_df.count())
product_df.show(5)

42358
+----------+--------------------+
|product_id|       product_title|
+----------+--------------------+
|039480001X|  The Cat in the Hat|
|0763004227|Golden Retriever ...|
|0876051468|The Stray Cat Han...|
|0983794804|Pathway Dog Journ...|
|1223000893|Cat Sitter DVD Tr...|
+----------+--------------------+
only showing top 5 rows



In [8]:
# Customer table: number of sampled reviews per customer, sorted by customer_id
customer_df = (
    df_sample
    .groupBy('customer_id')
    .count()
    .withColumnRenamed('count', 'review_count')
    .orderBy('customer_id')
)

# Row count first, then a five-row preview
print(customer_df.count())
customer_df.show(5)

98111
+-----------+------------+
|customer_id|review_count|
+-----------+------------+
|      10689|           1|
|      11125|           1|
|      12423|           1|
|      13141|           1|
|      13303|           1|
+-----------+------------+
only showing top 5 rows



In [9]:
# Vine table: rating, vote counts and the vine / verified_purchase flags for
# each sampled review, sorted by review_id
vine_cols = [
    'review_id',
    'star_rating',
    'helpful_votes',
    'total_votes',
    'vine',
    'verified_purchase',
]
vine_df = df_sample.select(*vine_cols).orderBy('review_id')

# Row count first, then a five-row preview
print(vine_df.count())
vine_df.show(5)

105294
+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R1001LGHA5NQKS|          5|            0|          0|   N|                Y|
|R1003FAIY018FM|          5|            0|          0|   N|                Y|
|R10057PI119T31|          5|            1|          1|   N|                Y|
|R10083Z2KCFFNC|          2|            0|          0|   N|                Y|
|R1008Y8VDYJY5U|          1|            0|          0|   N|                Y|
+--------------+-----------+-------------+-----------+----+-----------------+
only showing top 5 rows



## Load

In [10]:
# JDBC URL for the Postgres database on RDS, assembled from the imported config
jdbc_url = f"jdbc:postgresql://{rds['endpoint']}:{rds['port']}/{db_props['name']}"

# Write mode and connection properties, all read from db_props
mode = db_props['mode']
properties = {key: db_props[key] for key in ('user', 'password', 'driver')}

In [None]:
# Write each transformed DataFrame to its own database table, in this order
table_frames = (
    ('reviews', review_df),
    ('products', product_df),
    ('customers', customer_df),
    ('vine', vine_df),
)
for table_name, table_df in table_frames:
    table_df.write.jdbc(url=jdbc_url, table=table_name, mode=mode, properties=properties)

In [None]:
# Read the 'reviews' table back over JDBC, using the same user/password/driver
# properties as the writes, and show five rows to confirm the load
loaded_reviews = spark.read.jdbc(url=jdbc_url, table='reviews', properties=properties)
loaded_reviews.show(5)