In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Build (or reuse) the Spark session for this notebook. The PostgreSQL JDBC
# driver jar is put on spark.jars so JDBC reads against Postgres can load it.
spark = (
    SparkSession.builder
    .appName("ETL-with-spark")
    .config('spark.jars', 'driver/postgresql-42.6.0.jar')
    .getOrCreate()
)

In [2]:
import json
def read_data(spark, table_name: str,
              config_path: str = 'config/config.json',
              section: str = 'staging') -> "DataFrame":
    """Load a PostgreSQL table into a Spark DataFrame over JDBC.

    Connection settings are read from one section of a JSON config file. The
    section must provide the keys ``host``, ``port``, ``database``, ``user``
    and ``password``.

    Args:
        spark: Object exposing ``read.jdbc`` (the notebook's SparkSession).
        table_name: Name of the table to read.
        config_path: Path to the JSON config file. Defaults to the
            notebook's ``config/config.json``.
        section: Top-level key of the config file holding the connection
            settings. Defaults to ``'staging'``.

    Returns:
        Whatever ``spark.read.jdbc`` returns for the requested table.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        KeyError: If ``section`` or one of the required keys is missing.
    """
    # The return annotation is quoted so that defining this function does not
    # depend on ``DataFrame`` having been pulled in by a star import.
    with open(config_path, 'r') as config_file:
        config_data = json.load(config_file)

    settings = config_data[section]
    required = ('host', 'port', 'database', 'user', 'password')
    missing = [key for key in required if key not in settings]
    if missing:
        raise KeyError(
            "config section '{0}' is missing: {1}".format(section, ', '.join(missing))
        )

    url = 'jdbc:postgresql://{0}:{1}/{2}'.format(
        settings['host'], settings['port'], settings['database']
    )
    properties = {
        'user': settings['user'],
        'password': settings['password'],
        'driver': 'org.postgresql.Driver',
    }
    df = spark.read.jdbc(url=url, table=table_name, properties=properties)
    print(f'Read data from {table_name} successfully!')
    return df

In [3]:
# Load the green-taxi trips table through the shared JDBC reader.
green_taxi = read_data(spark=spark, table_name='green_taxi')
# green_taxi.show()

Read data from green_taxi successfully!


In [4]:
# Load the zone lookup table and preview its first rows.
taxi_zone = read_data(spark=spark, table_name='taxi_zone_lookup')
taxi_zone.show()

Read data from taxi_zone_lookup successfully!
+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Pa

In [5]:
# Attach the pickup zone and borough to every green-taxi trip.
# The zone lookup side carries a broadcast hint so the join does not need to
# shuffle the trip table; Spark may not pick a broadcast join on its own for a
# JDBC-backed frame (assumes taxi_zone is small enough to broadcast).
df1 = (
    green_taxi
    .join(
        taxi_zone.select('LocationID', 'Zone', 'Borough').hint('broadcast'),
        green_taxi['PULocationID'] == taxi_zone['LocationID'],
    )
    .withColumnRenamed('Zone', 'pickup_zone')
    .withColumnRenamed('Borough', 'pickup_borough')
    .withColumnRenamed('LocationID', 'pickup_locationid')
)
df1.show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+-----------------+-------------------+----------------+-----------------+--------------+--------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|trip_time_in_mins|trip_distance_in_km|average_velocity|pickup_locationid|   pickup_zone|pickup_borough|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+--

In [10]:
# Load the yellow-taxi trips table through the shared JDBC reader.
yellow_taxi = read_data(spark=spark, table_name='yellow_taxi')


Read data from yellow_taxi successfully!


In [12]:
# Print the column names and types of the yellow-taxi frame loaded over JDBC.
yellow_taxi.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- trip_time_in_mins: double (nullable = true)
 |-- trip_distance_in_km: double (nullable = true)
 |-- average_velocity: double (nullabl

In [13]:
# Attach the pickup zone and borough to every yellow-taxi trip.
# The zone lookup side carries a broadcast hint so the join does not need to
# shuffle the trip table; Spark may not pick a broadcast join on its own for a
# JDBC-backed frame (assumes taxi_zone is small enough to broadcast).
df2 = (
    yellow_taxi
    .join(
        taxi_zone.select('LocationID', 'Zone', 'Borough').hint('broadcast'),
        yellow_taxi.PULocationID == taxi_zone.LocationID,
    )
    .withColumnRenamed('Zone', 'pickup_zone')
    .withColumnRenamed('Borough', 'pickup_borough')
    .withColumnRenamed('LocationID', 'pickup_locationid')
)
df2.show()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "C:\tools\spark-3.3.2-bin-hadoop3\python\lib\py4j-0.10.9.5-src.zip\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "C:\tools\spark-3.3.2-bin-hadoop3\python\lib\py4j-0.10.9.5-src.zip\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "c:\Users\My PC\AppData\Local\Programs\Python\Python310\lib\socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 