In [2]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Create the Spark session for this notebook, or reuse one that is already
# running under the same application name.
spark = (
    SparkSession.builder
    .appName("ETL-with-spark")
    .getOrCreate()
)

In [3]:
# Load the green taxi trip records from the local parquet dataset.
green_taxi_path = 'data/green_taxi'
df = spark.read.parquet(green_taxi_path)

In [4]:
# These columns are re-typed as integers before any further processing.
integer_columns = ('passenger_count', 'RateCodeID', 'payment_type', 'trip_type')
for column_name in integer_columns:
    as_integer = col(column_name).cast(IntegerType())
    df = df.withColumn(column_name, as_integer)

In [6]:
# Count the trips whose recorded distance exceeds the threshold below.
long_trip_threshold = 200
df1 = df.filter(df['trip_distance'] > long_trip_threshold)
df1.count()

141

In [5]:
# Preview the first rows of the trip data (defaults spelled out explicitly).
df.show(n=20, truncate=True)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RateCodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2022-01-01 07:14:21|  2022-01-01 07:15:33|                 N|         1|          42|          42|              1|         0.44|        3.5|  0.5|    0.

In [7]:
# Derive trip duration (minutes), distance (km) and average speed (km/h),
# then keep only trips whose average speed does not exceed the limit below.
KM_PER_MILE = 1.609344          # exact length of one international mile in km
MAX_AVERAGE_VELOCITY = 120      # km/h; faster trips are treated as bad records

# NOTE: despite its name, this expression is the trip duration in *seconds*
# (difference of two unix timestamps). The name is kept so that any other
# cell referring to it keeps working; it is divided by 60 right below.
trip_time_in_mins = unix_timestamp(
    col('lpep_dropoff_datetime')) - unix_timestamp(col('lpep_pickup_datetime'))
df = df.withColumn('trip_time_in_mins', round(trip_time_in_mins / 60, 2))

# convert miles into km
df = df.withColumn('trip_distance_in_km', round(col('trip_distance') * KM_PER_MILE, 2))

time_in_hours = col('trip_time_in_mins') / 60
# calculate average speed; zero-duration trips get NULL instead of dividing
# by zero, which raises an error when Spark runs in ANSI SQL mode.
df = df.withColumn(
    'average_velocity',
    when(time_in_hours != 0,
         round(col('trip_distance_in_km') / time_in_hours, 2)))

# Rows with a NULL average_velocity are dropped by this comparison as well.
df = df.filter(col('average_velocity') <= MAX_AVERAGE_VELOCITY)
df.show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+-----------------+-------------------+----------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RateCodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|trip_time_in_mins|trip_distance_in_km|average_velocity|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+-----------------+-------------------+----------------

In [9]:
# Preview the trips whose average speed is at or below the limit.
velocity_limit = 120
df1 = df.filter(df['average_velocity'] <= velocity_limit)
df1.show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+-----------------+-------------------+----------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RateCodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|trip_time_in_mins|trip_distance_in_km|average_velocity|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+-----------------+-------------------+----------------

In [19]:
# Preview the trips longer than the kilometre threshold below.
long_trip_km_threshold = 200
df1 = df.filter(df['trip_distance_in_km'] > long_trip_km_threshold)
df1.show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+-------------------+-----------------+----------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|trip_distance_in_km|trip_time_in_mins|average_velocity|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+-------------------+-----------------+----------------

In [None]:
# Read the zone lookup CSV (header row, inferred column types) and discard
# the rows whose borough is 'Unknown'.
df_zone = (
    spark.read.csv('data', header=True, inferSchema=True)
    .filter(col('Borough') != 'Unknown')
)
df_zone.show()

In [3]:
# Write the taxi zone lookup table to PostgreSQL over JDBC.
#
# Connection settings are read from the standard PostgreSQL environment
# variables so that no credential is stored in the notebook. Host, port,
# database and user fall back to the local development values; the password
# has no fallback, so a missing PGPASSWORD fails with a KeyError here instead
# of a less obvious authentication error later.
host = os.environ.get('PGHOST', 'localhost')
port = os.environ.get('PGPORT', '5432')
database = os.environ.get('PGDATABASE', 'etl_with_spark_nytaxi')
user = os.environ.get('PGUSER', 'postgres')
password = os.environ['PGPASSWORD']
url = 'jdbc:postgresql://{0}:{1}/{2}'.format(host, port, database)
properties = {
    'user': user,
    'password': password,
    'driver': 'org.postgresql.Driver'}
# Write df_zone rather than df: when the notebook is run top to bottom, df
# holds the trip records, not the zone lookup this table is named after.
df_zone.write.jdbc(url=url, table='taxi_zone_lookup', mode='overwrite',
                   properties=properties)

In [4]:
# Show the zone lookup frame. When the notebook is run top to bottom, `df`
# holds the trip records, while the output recorded for this cell is the
# zone lookup table.
df_zone.show()

+----------+-------------+--------------------+------------+
|locationid|      borough|                zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|  Green Zone|
|         3|        Bronx|Allerton/Pelham G...|  Green Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|  Green Zone|
|         6|Staten Island|Arrochar/Fort Wad...|  Green Zone|
|         7|       Queens|             Astoria|  Green Zone|
|         8|       Queens|        Astoria Park|  Green Zone|
|         9|       Queens|          Auburndale|  Green Zone|
|        10|       Queens|        Baisley Park|  Green Zone|
|        11|     Brooklyn|          Bath Beach|  Green Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly