In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Create the Spark session for this notebook, or reuse the one that is
# already active in the kernel (getOrCreate does not start a second one).
spark = (
    SparkSession.builder
    .appName("ETL-with-spark")
    .getOrCreate()
)

In [10]:
# Load the green-taxi trip records from Parquet.
parquet_file_path = 'data/green_taxi'
df = spark.read.parquet(parquet_file_path)

# Cast the count/code columns to 32-bit integers so they can be used as ids.
# NOTE(review): presumably these arrive as floating point in the Parquet
# files -- confirm against df.printSchema().
for int_column in ('passenger_count', 'RateCodeID', 'payment_type', 'trip_type'):
    df = df.withColumn(int_column, col(int_column).cast('int'))

In [11]:
# Payment-type dimension table: one row per distinct, non-null payment_type
# code present in the trips (column `id`), plus a readable `payment_type` label.
#
# Codes 1-6 are mapped explicitly. A code outside that range gets a null label
# instead of being silently reported as 'Voided trip', which is how the
# trip-type and vendor lookups in this notebook already treat unmapped ids.
payment_type_mapping = (
    when(col('id') == 1, 'Credit card')
    .when(col('id') == 2, 'Cash')
    .when(col('id') == 3, 'No charge')
    .when(col('id') == 4, 'Dispute')
    .when(col('id') == 5, 'Unknown')
    .when(col('id') == 6, 'Voided trip')
)
df_payment = df.select('payment_type').withColumnRenamed('payment_type', 'id').distinct().dropna().orderBy(col('id'))
df_payment = df_payment.withColumn('payment_type', payment_type_mapping)

# Trip-type dimension table: distinct, non-null trip_type codes (as `id`),
# sorted by id, with a readable `trip_type` label. Ids other than 1 and 2
# get a null label because the mapping has no otherwise() branch.
df_triptype = (
    df.select(col('trip_type').alias('id'))
    .distinct()
    .dropna()
    .orderBy(col('id'))
    .withColumn(
        'trip_type',
        when(col('id') == 1, 'Street-hail')
        .when(col('id') == 2, 'Dispatch'),
    )
)

# Rate-code dimension table: one row per distinct, non-null RateCodeID present
# in the trips (column `id`), plus a readable `rate_type` label.
#
# Codes 1-6 are mapped explicitly. Any other code (for example a placeholder
# value used for missing data) gets a null label instead of being silently
# reported as 'Group ride'.
rate_code_mapping = (
    when(col('id') == 1, 'Standard rate')
    .when(col('id') == 2, 'JFK')
    .when(col('id') == 3, 'Newark')
    .when(col('id') == 4, 'Nassau or Westchester')
    .when(col('id') == 5, 'Negotiated fare')
    .when(col('id') == 6, 'Group ride')
)
df_ratecode = df.select('RateCodeID').withColumnRenamed('RateCodeID', 'id').distinct().dropna().orderBy(col('id'))
df_ratecode = df_ratecode.withColumn('rate_type', rate_code_mapping)

# Vendor dimension table: distinct, non-null VendorID values (as `id`),
# sorted by id, with the vendor name in `vendor`. Ids other than 1 and 2
# get a null name because the mapping has no otherwise() branch.
df_vendor = (
    df.select(col('VendorID').alias('id'))
    .distinct()
    .dropna()
    .orderBy(col('id'))
    .withColumn(
        'vendor',
        when(col('id') == 1, 'Creative Mobile Technologies, LLC')
        .when(col('id') == 2, 'VeriFone Inc.'),
    )
)

# Trip duration: dropoff minus pickup, both converted to epoch seconds.
pickup_epoch_secs = unix_timestamp(col('lpep_pickup_datetime'))
dropoff_epoch_secs = unix_timestamp(col('lpep_dropoff_datetime'))

# NOTE: despite its name, this expression is the elapsed time in SECONDS;
# the division by 60 below converts it to minutes.
trip_time_in_mins = dropoff_epoch_secs - pickup_epoch_secs

# `round` here is pyspark.sql.functions.round (brought in by the star import),
# not the Python builtin -- it rounds the column to 2 decimal places.
df = df.withColumn(
    'trip_time_in_mins',
    round(trip_time_in_mins / 60, 2),
)

In [14]:
# Taxi-zone lookup table: read the CSV with a header row and inferred column
# types, then keep only rows whose Borough is not the literal 'Unknown'.
# Rows with a null Borough are dropped too, because the comparison evaluates
# to null for them and the filter keeps only rows where it is true.
df_zone = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data/taxi_zone_lookup.csv')
    .where(col('Borough') != 'Unknown')
)
df_zone.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [12]:
# Preview the enriched trips (including trip_time_in_mins): first 20 rows,
# long cell values truncated -- these are show()'s defaults, spelled out.
df.show(n=20, truncate=True)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+-----------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RateCodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|trip_time_in_mins|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+-----------------+
|       2| 2022-02-01 07:20:21|  2022-02-01 07:24:30|                 N|         1|          43|         238|