# Setup

In [1]:
""" PySpark setup """

# Find the latest version of spark 3.0 from http://www-us.apache.org/dist/spark/ and enter as the spark version environment variable
# The release name is kept in a Python variable AND exported as SPARK_VERSION,
# because the shell lines below expand $SPARK_VERSION while the SPARK_HOME
# path further down is built from the Python variable.
import os
spark_version = 'spark-3.0.2'
os.environ['SPARK_VERSION'] = spark_version

# Install Java and Spark
# - apt-get install runs quietly (-qq) with its stdout discarded
# - the Spark tarball is fetched into the current working directory and unpacked there
# - findspark is installed with pip (no version pinned)
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set environment variables
# NOTE(review): JAVA_HOME is presumably where the openjdk-11 package above gets installed — confirm.
# SPARK_HOME assumes the tarball was unpacked under /content, i.e. that the
# notebook's working directory is /content — confirm when running elsewhere.
os.environ["JAVA_HOME"] = '/usr/lib/jvm/java-11-openjdk-amd64'
os.environ["SPARK_HOME"] = f'/content/{spark_version}-bin-hadoop2.7'

# Locate Spark
# Called with no arguments; presumably findspark relies on the SPARK_HOME set
# above to make pyspark importable — verify against the findspark docs.
import findspark
findspark.init()

Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease    
Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease    
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease                        
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.net/gra

In [2]:
""" Postgres setup """

# Download a Postgres driver to allow Spark to interact with Postgres
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

# Path to driver
driver_path = '/content/postgresql-42.2.16.jar'

# # Set spark environment variable for driver
# os.environ["SPARK_CLASSPATH"] = driver_path

--2021-03-07 18:36:34--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar.5’


2021-03-07 18:36:35 (5.97 MB/s) - ‘postgresql-42.2.16.jar.5’ saved [1002883/1002883]



In [3]:
""" Spark session """

# Dependencies
from pyspark import SparkFiles
from pyspark.sql import SparkSession
# NOTE: the `from config import db_password` import is commented out, so this
# cell does not define db_password.

# Build (or reuse) the 'etl' session with the Postgres driver jar on the
# driver's classpath
spark = (
    SparkSession.builder
    .appName('etl')
    .config('spark.driver.extraClassPath', driver_path)
    .getOrCreate()
)

# Last expression of the cell: display the session
spark

# Extract

In [4]:
# Where the source files live
bucket = 'https://dv-m16.s3-us-west-1.amazonaws.com/' # s3 bucket url
user_data = 'user_data.csv' # file name in bucket

# Register the remote file with Spark
spark.sparkContext.addFile(bucket + user_data)

# Load Spark's copy of the file: comma-separated, first row is the header,
# column types inferred from the data
user_df = (
    spark.read
    .options(sep=',', header=True, inferSchema=True)
    .csv(SparkFiles.get(user_data))
)
user_df.show(5)

+---+----------+---------+-----------+-------------------+--------------+---------+
| id|first_name|last_name|active_user|     street_address|         state| username|
+---+----------+---------+-----------+-------------------+--------------+---------+
|  1|    Cletus|  Lithcow|      false|78309 Riverside Way|      Virginia|ibearham0|
|  2|       Caz|   Felgat|      false|83 Hazelcrest Place|       Alabama| wwaller1|
|  3|     Kerri|  Crowson|      false|     112 Eliot Pass|North Carolina|ichesnut2|
|  4|   Freddie|    Caghy|      false|    15 Merchant Way|      New York|  tsnarr3|
|  5|   Sadella|    Deuss|      false|   079 Acker Avenue|     Tennessee|fwherrit4|
+---+----------+---------+-----------+-------------------+--------------+---------+
only showing top 5 rows



In [5]:
# Register the payment file (same bucket as the user data) with Spark
user_payment = 'user_payment.csv'
spark.sparkContext.addFile(bucket + user_payment)

# Load Spark's copy of the file: comma-separated, first row is the header,
# column types inferred from the data
payment_df = (
    spark.read
    .options(sep=',', header=True, inferSchema=True)
    .csv(SparkFiles.get(user_payment))
)
payment_df.show(5)

+----------+---------+--------------------+
|billing_id| username|        cc_encrypted|
+----------+---------+--------------------+
|         1|ibearham0|a799fcafe47d7fb19...|
|         2| wwaller1|a799fcafe47d7fb19...|
|         3|ichesnut2|a799fcafe47d7fb19...|
|         4|  tsnarr3|a799fcafe47d7fb19...|
|         5|fwherrit4|a799fcafe47d7fb19...|
+----------+---------+--------------------+
only showing top 5 rows



# Transform

In [6]:
# Inner-join users with their payment rows on the shared username column
df = user_df.join(payment_df, 'username', 'inner')

# Row count of the joined frame, then a preview
print(df.count()) # num rows
df.show(5)

1000
+---------+---+----------+---------+-----------+-------------------+--------------+----------+--------------------+
| username| id|first_name|last_name|active_user|     street_address|         state|billing_id|        cc_encrypted|
+---------+---+----------+---------+-----------+-------------------+--------------+----------+--------------------+
|ibearham0|  1|    Cletus|  Lithcow|      false|78309 Riverside Way|      Virginia|         1|a799fcafe47d7fb19...|
| wwaller1|  2|       Caz|   Felgat|      false|83 Hazelcrest Place|       Alabama|         2|a799fcafe47d7fb19...|
|ichesnut2|  3|     Kerri|  Crowson|      false|     112 Eliot Pass|North Carolina|         3|a799fcafe47d7fb19...|
|  tsnarr3|  4|   Freddie|    Caghy|      false|    15 Merchant Way|      New York|         4|a799fcafe47d7fb19...|
|fwherrit4|  5|   Sadella|    Deuss|      false|   079 Acker Avenue|     Tennessee|         5|a799fcafe47d7fb19...|
+---------+---+----------+---------+-----------+-------------------

In [7]:
# Discard rows containing nulls (default settings), then report how many remain.
# NOTE: this rebinds df, so re-running the cell operates on the already-cleaned frame.
df = df.na.drop()
df.count()

939

In [8]:
# Keep only the rows whose active_user flag is true
df = df.where('active_user == true')

# Remaining row count, then a preview
print(df.count())
df.show(5)

475
+------------+---+----------+---------+-----------+--------------------+--------------------+----------+--------------------+
|    username| id|first_name|last_name|active_user|      street_address|               state|billing_id|        cc_encrypted|
+------------+---+----------+---------+-----------+--------------------+--------------------+----------+--------------------+
|  fstappard5|  6|    Fraser|  Korneev|       true|  76084 Novick Court|           Minnesota|         6|a799fcafe47d7fb19...|
|  lhambling6|  7|    Demott|   Rapson|       true|    86320 Dahle Park|District of Columbia|         7|a799fcafe47d7fb19...|
|   wheinerte| 15|   Sadella|    Jaram|       true|7528 Waxwing Terrace|         Connecticut|        15|a799fcafe47d7fb19...|
|droughsedgeg| 17|    Hewitt|  Trammel|       true|    2455 Corry Alley|      North Carolina|        17|a799fcafe47d7fb19...|
|   ydudeniei| 19|       Ted|  Knowlys|       true|      31 South Drive|                Ohio|        19|a799fcafe4

In [9]:
# User table: identity columns of the filtered frame
user_cols = ['id', 'first_name', 'last_name', 'username']
active_user_df = df.select(*user_cols)
active_user_df.show(5)

+---+----------+---------+------------+
| id|first_name|last_name|    username|
+---+----------+---------+------------+
|  6|    Fraser|  Korneev|  fstappard5|
|  7|    Demott|   Rapson|  lhambling6|
| 15|   Sadella|    Jaram|   wheinerte|
| 17|    Hewitt|  Trammel|droughsedgeg|
| 19|       Ted|  Knowlys|   ydudeniei|
+---+----------+---------+------------+
only showing top 5 rows



In [10]:
# Billing table: billing id, address columns and the username
billing_cols = ['billing_id', 'street_address', 'state', 'username']
billing_info_df = df.select(*billing_cols)
billing_info_df.show(5)

+----------+--------------------+--------------------+------------+
|billing_id|      street_address|               state|    username|
+----------+--------------------+--------------------+------------+
|         6|  76084 Novick Court|           Minnesota|  fstappard5|
|         7|    86320 Dahle Park|District of Columbia|  lhambling6|
|        15|7528 Waxwing Terrace|         Connecticut|   wheinerte|
|        17|    2455 Corry Alley|      North Carolina|droughsedgeg|
|        19|      31 South Drive|                Ohio|   ydudeniei|
+----------+--------------------+--------------------+------------+
only showing top 5 rows



In [11]:
# Payment table: billing id paired with the cc_encrypted column
payment_cols = ['billing_id', 'cc_encrypted']
payment_info_df = df.select(*payment_cols)
payment_info_df.show(5)

+----------+--------------------+
|billing_id|        cc_encrypted|
+----------+--------------------+
|         6|a799fcafe47d7fb19...|
|         7|a799fcafe47d7fb19...|
|        15|a799fcafe47d7fb19...|
|        17|a799fcafe47d7fb19...|
|        19|a799fcafe47d7fb19...|
+----------+--------------------+
only showing top 5 rows



# Load

In [13]:
""" RDS connection """

# Resolve the database password inside this cell.
# The `from config import db_password` line earlier in the notebook is
# commented out, so on a fresh kernel nothing defines db_password and the
# properties dict below would raise NameError. Lookup order:
#   1. a db_password already defined in this session (keeps older workflows working)
#   2. the DB_PASSWORD environment variable
#   3. an interactive prompt (input is not echoed and never stored in the notebook)
import os
from getpass import getpass

db_password = (
    globals().get('db_password')
    or os.environ.get('DB_PASSWORD')
    or getpass('Postgres password: ')
)

# Connection to database on RDS
connection_string = "dv-m16.cqzw4eyr0cqo.us-west-1.rds.amazonaws.com"
db_name = "company"
jdbc_url = f"jdbc:postgresql://{connection_string}:5432/{db_name}"

# Configure RDS settings
# NOTE: "append" adds rows to the existing tables, so running the load cell
# more than once inserts the same rows again.
mode = "append" # action for the db table
properties = {
    "user": "postgres",
    "password": db_password,
    "driver": "org.postgresql.Driver"
}

In [14]:
# Write each frame to its table in the RDS database, in the same order as before
frames_by_table = {
    'active_user': active_user_df,
    'billing_info': billing_info_df,
    'payment_info': payment_info_df,
}
for table_name, frame in frames_by_table.items():
    frame.write.jdbc(url=jdbc_url, table=table_name, mode=mode, properties=properties)

# Read active_user back over the same JDBC connection settings to confirm the load
spark.read.jdbc(url=jdbc_url, table='active_user', properties=properties).show(5)

+---+----------+---------+------------+
| id|first_name|last_name|    username|
+---+----------+---------+------------+
|  6|    Fraser|  Korneev|  fstappard5|
|  7|    Demott|   Rapson|  lhambling6|
| 15|   Sadella|    Jaram|   wheinerte|
| 17|    Hewitt|  Trammel|droughsedgeg|
| 19|       Ted|  Knowlys|   ydudeniei|
+---+----------+---------+------------+
only showing top 5 rows

