In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime

greentaxiDF = spark.read.load(
    "abfss://employeefs285@datalakefs285.dfs.core.windows.net/GreenTaxiData/2021_Green_Taxi_Trip_Data.csv",
    format='csv',
    inferSchema=True,
    header=True
)

display(greentaxiDF.show())

StatementMeta(spark, 2, 3, Finished, Available, Finished)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2|01/01/2021 12:15:...| 01/01/2021 12:19:...|                 N|         1|          43|         151|              1|         1.01|        5.5|  0.5|    0.

In [14]:
greentaxiDF = greentaxiDF.where(col('passenger_count') > 0)\
    .filter(col('trip_distance') > 0.0)\
    .dropDuplicates()

StatementMeta(spark, 2, 4, Finished, Available, Finished)

In [15]:
greentaxiDF.printSchema()

StatementMeta(spark, 2, 5, Finished, Available, Finished)

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- lpep_dropoff_datetime: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: string (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [16]:
spark.conf.set("spark.sql.session.timeZone", "UTC")

greentaxiDF = greentaxiDF.withColumn('lpep_pickup_datetime', 
    to_timestamp('lpep_pickup_datetime', "MM/dd/yyyy hh:mm:ss a"))

greentaxiDF = greentaxiDF.withColumn('lpep_dropoff_datetime', 
    to_timestamp('lpep_dropoff_datetime', "MM/dd/yyyy hh:mm:ss a"))


StatementMeta(spark, 2, 6, Finished, Available, Finished)

In [17]:
greentaxiDF1 = greentaxiDF.select(
    col("VendorID"),
    col("passenger_count").alias("PassengerCount"),
    col("trip_distance").alias("TripDistance"),
    col("lpep_pickup_datetime").alias("PickupTime"),
    col("lpep_dropoff_datetime").alias("DropTime"),
    col('PULocationID').alias("PickupLocationId"),
    col('DOLocationID').alias("DropLocationId"),
    col('RatecodeID'),
    col('total_amount').alias("TotalAmount"),
    col('payment_type').alias("PaymentType"))\
    .withColumn("TripYear", year("PickupTime"))\
    .withColumn("TripMonth", month("PickupTime"))\
    .withColumn("TripDay", dayofmonth("PickupTime"))\
    .withColumn("TripDurationinMinutes", round(unix_timestamp(col('DropTime')) - unix_timestamp(col('PickupTime'))))\
    .withColumn("TripType", when(col("RatecodeID") == 6,"SharedTrip").otherwise("SoloTrip"))\
    .drop("RatecodeID")

greentaxiDF1.printSchema()

StatementMeta(spark, 2, 7, Finished, Available, Finished)

root
 |-- VendorID: integer (nullable = true)
 |-- PassengerCount: integer (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- PickupTime: timestamp (nullable = true)
 |-- DropTime: timestamp (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- DropLocationId: integer (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- PaymentType: integer (nullable = true)
 |-- TripYear: integer (nullable = true)
 |-- TripMonth: integer (nullable = true)
 |-- TripDay: integer (nullable = true)
 |-- TripDurationinMinutes: long (nullable = true)
 |-- TripType: string (nullable = false)



In [18]:
greentaxiDF1.toPandas()

StatementMeta(spark, 2, 8, Finished, Available, Finished)

Unnamed: 0,VendorID,PassengerCount,TripDistance,PickupTime,DropTime,PickupLocationId,DropLocationId,TotalAmount,PaymentType,TripYear,TripMonth,TripDay,TripDurationinMinutes,TripType
0,2,1,1.44,2021-01-01 08:59:00,2021-01-01 09:04:13,74,75,8.76,1,2021,1,1,313,SoloTrip
1,1,1,0.80,2021-01-01 11:56:17,2021-01-01 11:58:47,74,75,6.50,1,2021,1,1,150,SoloTrip
2,2,2,1.10,2021-01-01 12:15:47,2021-01-01 12:23:19,7,179,7.80,2,2021,1,1,452,SoloTrip
3,2,1,7.15,2021-01-01 15:46:21,2021-01-01 16:25:28,130,61,29.30,1,2021,1,1,2347,SoloTrip
4,2,1,1.77,2021-01-01 21:51:36,2021-01-01 21:59:26,74,168,9.30,1,2021,1,1,470,SoloTrip
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
621005,2,1,2.25,2021-12-31 17:30:04,2021-12-31 17:42:01,41,168,14.76,1,2021,12,31,717,SoloTrip
621006,1,1,1.50,2021-12-31 19:45:01,2021-12-31 19:51:45,43,151,9.35,1,2021,12,31,404,SoloTrip
621007,2,1,1.16,2021-12-31 20:43:03,2021-12-31 20:51:26,75,74,8.30,2,2021,12,31,503,SoloTrip
621008,2,1,0.20,2021-12-31 22:43:13,2021-12-31 22:44:24,41,41,4.30,2,2021,12,31,71,SoloTrip


In [19]:
spark.sql("CREATE DATABASE IF NOT EXISTS NYCTAXIDB")

greentaxiDF1.repartition(4).write.mode("overwrite").saveAsTable("NYCTAXIDB.greentaxitbl")

StatementMeta(spark, 2, 9, Finished, Available, Finished)