# Amazon Review ETL

This is an ETL pipeline for Amazon pet product reviews data.

## Dependencies

In [None]:
# Download a Postgres driver to allow Spark to interact with Postgres
!curl -O https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

In [None]:
# Locate Spark
import findspark
findspark.init()

# Dependencies
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType
from pyspark.sql.functions import col
from config import rds, db_props

# Build (or reuse) the Spark session with the Postgres JDBC jar on the driver classpath.
# The jar path is relative — presumably resolved against the notebook's working
# directory, where the download cell saves it; confirm if the kernel starts elsewhere.
spark = (
    SparkSession.builder
    .appName('amz')
    .config('spark.driver.extraClassPath', 'postgresql-42.2.16.jar')
    .getOrCreate()
)
spark

## Extract

1. [Download the data](https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Pet_Products_v1_00.tsv.gz)
2. Unzip the downloaded file, rename it to `pet_product_reviews.tsv` (the name the read cell below expects), and move it to the same directory as this notebook

Source: https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt

In [None]:
# Load the tab-separated reviews file: the first line supplies the column names
# and Spark infers each column's type from the data.
df = (
    spark.read
    .format('csv')
    .options(sep='\t', header=True, inferSchema=True)
    .load('pet_product_reviews.tsv')
)
# Preview the first two records, one field per line, without truncating long values
df.show(n=2, truncate=False, vertical=True)

In [None]:
# Print the inferred schema, then display the total number of rows read
df.printSchema()
n_rows = df.count()
n_rows

## Transform

In [None]:
# Review table: identifying columns for each review, with review_date cast to a date
review_cols = [
    'review_id',
    'customer_id',
    'product_id',
    'product_parent',
    df.review_date.cast(DateType()),
]
review_df = df.select(*review_cols).sort('review_id')

# Row count first, then a small preview
print(review_df.count())
review_df.show(5)

In [None]:
# Product table: distinct (product_id, product_title) pairs, sorted by product_id.
# NOTE(review): de-duplication is on the pair, so a product_id that appears with
# more than one title produces more than one row — confirm this is intended.
product_df = (
    df.select('product_id', 'product_title')
    .distinct()
    .sort('product_id')
)

# Row count first, then a small preview
print(product_df.count())
product_df.show(5)

In [None]:
# Customer table: number of reviews per customer_id, sorted by customer_id
customer_df = (
    df.groupBy('customer_id')
    .count()
    .sort('customer_id')
    .withColumnRenamed('count', 'review_count')  # groupBy().count() names its column 'count'
)

# Row count first, then a small preview
print(customer_df.count())
customer_df.show(5)

In [None]:
# Vine table: per-review rating, vote counts, and the vine / verified_purchase columns
vine_cols = [
    'review_id',
    'star_rating',
    'helpful_votes',
    'total_votes',
    'vine',
    'verified_purchase',
]
vine_df = df.select(*vine_cols).sort('review_id')

# Row count first, then a small preview
print(vine_df.count())
vine_df.show(5)

## Load

In [None]:
# JDBC URL for the Postgres database, assembled from the values in config.py
jdbc_url = f"jdbc:postgresql://{rds['endpoint']}:{rds['port']}/{db_props['name']}"

# Save mode passed to every JDBC write below
mode = db_props['mode']

# Connection properties handed to the JDBC writer, copied from db_props
properties = {key: db_props[key] for key in ("user", "password", "driver")}

In [None]:
# Write each transformed DataFrame to its Postgres table, in this order,
# using the shared JDBC URL, save mode and connection properties.
tables_to_write = (
    ('reviews', review_df),
    ('products', product_df),
    ('customers', customer_df),
    ('vine', vine_df),
)
for table_name, frame in tables_to_write:
    frame.write.jdbc(url=jdbc_url, table=table_name, mode=mode, properties=properties)