In [1]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Build (or reuse) the notebook-wide Spark session. The PostgreSQL JDBC jar is
# registered through 'spark.jars'; its path is relative, so it is resolved
# against the directory the kernel was started from.
spark = (
    SparkSession.builder
    .appName("ETL-with-spark")
    .config('spark.jars', 'driver/postgresql-42.6.0.jar')
    .getOrCreate()
)

In [2]:
import json
def read_data(spark, table_name: str,
              config_path: str = 'config/config.json',
              section: str = 'staging') -> 'DataFrame':
    """Load one PostgreSQL table through the Spark JDBC reader.

    Connection settings are re-read from the JSON config file on every call.

    Args:
        spark: Object exposing ``read.jdbc(url=..., table=..., properties=...)``;
            in this notebook that is the active SparkSession.
        table_name: Value handed to the JDBC reader as ``table``.
        config_path: Location of the JSON config file. Defaults to the path
            that used to be hard-coded here.
        section: Top-level key of the config whose value holds ``host``,
            ``port``, ``database``, ``user`` and ``password``. Defaults to
            the previously hard-coded ``'staging'``.

    Returns:
        Whatever ``spark.read.jdbc`` returns (a DataFrame for a real session).

    Raises:
        OSError: The config file cannot be opened.
        json.JSONDecodeError: The config file is not valid JSON.
        KeyError: ``section`` is absent, or it lacks a required setting.
    """
    # The return annotation is quoted on purpose: an unquoted name is evaluated
    # when the ``def`` runs, which makes defining this function fail wherever
    # ``DataFrame`` has not been bound in the namespace.
    required = ('host', 'port', 'database', 'user', 'password')

    # JSON text is UTF-8; do not let the platform locale pick the codec.
    with open(config_path, 'r', encoding='utf-8') as config_file:
        settings = json.load(config_file)[section]

    # Report every missing setting at once instead of failing on the first.
    missing = [key for key in required if key not in settings]
    if missing:
        raise KeyError(
            f'config section {section!r} in {config_path!r} is missing: {missing}'
        )

    url = 'jdbc:postgresql://{0}:{1}/{2}'.format(
        settings['host'], settings['port'], settings['database']
    )
    properties = {
        'user': settings['user'],
        'password': settings['password'],
        'driver': 'org.postgresql.Driver',
    }
    df = spark.read.jdbc(url=url, table=table_name, properties=properties)
    print(f'Read data from {table_name} successfully!')
    return df

In [None]:
# Load the green-taxi trips table and preview it.
green_taxi = read_data(spark, table_name='green_taxi')
green_taxi.show()

In [5]:
# Load the zone lookup table (LocationID -> Borough / Zone / service_zone).
taxi_zone = read_data(spark, table_name='taxi_zone_lookup')
taxi_zone.show()

Read data from taxi_zone_lookup successfully!
+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Pa

In [8]:
# Attach the pickup zone and borough to each trip. The join has no `how`
# argument, so it is an inner join: trips whose PULocationID has no match in
# the lookup are dropped. The lookup's LocationID column is kept in the result.
df = (
    green_taxi
    .join(
        taxi_zone.select('LocationID', 'Zone', 'Borough'),
        on=green_taxi.PULocationID == taxi_zone.LocationID,
    )
    .withColumnRenamed('Zone', 'pickup_zone')
    .withColumnRenamed('Borough', 'pickup_borough')
)
df.show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+-----------------+-------------------+----------------+----------+--------------+--------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|trip_time_in_mins|trip_distance_in_km|average_velocity|LocationID|   pickup_zone| Borough|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+-----

In [None]:
# Preview the joined frame again (same call as the previous cell), with the
# default row limit and truncation spelled out.
df.show(n=20, truncate=True)