In [None]:
%pyspark
from pyspark import SparkFiles

# Fetch a small sample of the reviews data over HTTPS and register it with Spark,
# then read the registered copy as a tab-separated file with a header row.
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/sample_us.tsv"
spark.sparkContext.addFile(url)

# SparkFiles.get resolves the path of the file that addFile registered above.
sample_path = SparkFiles.get("sample_us.tsv")
sample_reader = spark.read.option('header', 'true')
sample_usa_df = sample_reader.csv(sample_path, inferSchema=True, sep='\t')

# Preview the first rows.
sample_usa_df.show(10)

In [None]:
%pyspark
# Inspect the inferred (column name, type) pairs of the sample DataFrame so they can
# be compared with the column types required by the database schema.
sample_usa_df.dtypes

In [None]:
%pyspark
from pyspark import SparkFiles

# Load the gzipped, tab-separated shoes reviews file from S3 into a DataFrame.
# The first line is treated as the header and column types are inferred from the data.
shoes_path = "s3a://amazon-reviews-pds/tsv/amazon_reviews_us_Shoes_v1_00.tsv.gz"
shoes_reader = spark.read.option('header', 'true')
shoes_df = shoes_reader.csv(shoes_path, inferSchema=True, sep='\t')

# Preview the first rows.
shoes_df.show(10)

In [None]:
%pyspark
from pyspark.sql.functions import to_date

# Add a "date" column holding review_date parsed as a date (format: yyyy-MM-dd).
# The pattern letters are case-sensitive: uppercase "MM" is month-of-year, whereas
# lowercase "mm" is minute-of-hour, so "yyyy-mm-dd" would not read the month field.
# To experiment on the small sample instead, replace shoes_df with sample_usa_df.
date_df = shoes_df.withColumn("date", to_date("review_date", "yyyy-MM-dd"))
date_df.show(10)

In [None]:
%pyspark
from pyspark.sql.functions import col

# Build a DataFrame shaped like review_id_table:
#   CREATE TABLE review_id_table (
#       review_id TEXT PRIMARY KEY NOT NULL,
#       customer_id INTEGER,
#       product_id TEXT,
#       product_parent INTEGER,
#       review_date DATE  -- this should be in the format yyyy-mm-dd
#   );
# The parsed "date" column is exposed under the table's column name, review_date.
review_columns = ["review_id", "customer_id", "product_id", "product_parent"]
review_df = date_df.select(*review_columns, col("date").alias("review_date"))
review_df.show(10)

In [None]:
%pyspark
# Number of reviews, i.e. the row count of review_df.
review_df.count()

In [None]:
%pyspark
# Build a DataFrame shaped like the products table, keeping only unique rows:
#   CREATE TABLE products (product_id TEXT PRIMARY KEY NOT NULL UNIQUE, product_title TEXT);
# NOTE(review): distinct() de-duplicates (product_id, product_title) pairs, not
# product_id alone — confirm no product_id appears with more than one title, since
# product_id is the table's primary key.
product_pairs_df = date_df.select("product_id", "product_title")
products_df = product_pairs_df.distinct()
products_df.show(10)

In [None]:
%pyspark
# Number of products, i.e. the row count of products_df.
# The trailing value is presumably the result recorded from an earlier run.
products_df.count()   # => 1901053

In [None]:
%pyspark
# Created data frame to match customer table -- Customer table for first data set
# CREATE TABLE customers (customer_id INT PRIMARY KEY NOT NULL UNIQUE,  customer_count INT);
counts_df = date_df.groupBy("customer_id").count().orderBy("customer_id")
counts_df.show(10)

In [None]:
%pyspark
# Check the data types
counts_df.dtypes

In [None]:
%pyspark
# Number of customers
customers_df.count()   # => 2816830

In [None]:
%pyspark
# Build a DataFrame shaped like vine_table:
#   CREATE TABLE vine_table (
#       review_id TEXT PRIMARY KEY,
#       star_rating INTEGER,
#       helpful_votes INTEGER,
#       total_votes INTEGER,
#       vine TEXT
#   );
vine_columns = ["review_id", "star_rating", "helpful_votes", "total_votes", "vine"]
vine_df = date_df.select(*vine_columns)
vine_df.show(10)

In [None]:
%pyspark
# Number of rows in vine_df.
# The trailing value is presumably the result recorded from an earlier run.
vine_df.count()  # => 4366916

In [None]:
%pyspark
# Connection settings for RDS, shared by the JDBC write cells.
# The <...> tokens are placeholders and must be replaced with real values before
# running; avoid committing real credentials with the notebook.
jdbc_url = "jdbc:postgresql://<endpoiny>:<port>/<db>"

# Save mode passed to every write.
mode = "append"

# Connection properties handed to the JDBC driver.
config = {
    "user": "<user>",
    "password": "<pwd>",
    "driver": "org.postgresql.Driver",
}

In [None]:
%pyspark
# Append the rows of review_df to review_id_table in RDS over JDBC.
review_df.write.jdbc(
    url=jdbc_url,
    table='review_id_table',
    mode=mode,
    properties=config,
)

In [None]:
%pyspark
# Write the rows of products_df to the products table in RDS over JDBC.
products_df.write.jdbc(
    url=jdbc_url,
    table='products',
    mode=mode,
    properties=config,
)

In [None]:
%pyspark
# Write dataframe to customers table in RDS.
# customers_df is derived here from counts_df (the per-customer row counts) so this
# cell does not rely on a name that was never assigned; the aggregate column "count"
# is renamed to customer_count to match the customers table definition.
customers_df = counts_df.withColumnRenamed("count", "customer_count")
customers_df.write.jdbc(url=jdbc_url, table='customers', mode=mode, properties=config)

In [None]:
%pyspark
# Write the rows of vine_df to vine_table in RDS over JDBC.
vine_df.write.jdbc(
    url=jdbc_url,
    table='vine_table',
    mode=mode,
    properties=config,
)