In [37]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime

greentaxiDF = spark.read.load("abfss://datalakefilesystem@azuresdatalake.dfs.core.windows.net/GreenTaxiData/2021_Green_Taxi_Trip_Data.csv",
format= "csv", inferSchema = True, header = True)

display(greentaxiDF.show(100))

# Filter the Record


StatementMeta(Spark, 1, 38, Finished, Available, Finished)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2|01/01/2021 12:15:...| 01/01/2021 12:19:...|                 N|         1|          43|         151|              1|         1.01|        5.5|  0.5|    0.

In [38]:
greentaxiDF = greentaxiDF.where(col('passenger_count') > 0).filter(col('trip_distance') > 0.0)\
.dropDuplicates()

StatementMeta(Spark, 1, 39, Finished, Available, Finished)

In [39]:
greentaxiDF.printSchema()

StatementMeta(Spark, 1, 40, Finished, Available, Finished)

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- lpep_dropoff_datetime: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: string (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [40]:
spark.conf.set("spark.sql.session.timeZone", "UTC")

greentaxiDF = greentaxiDF.withColumn('lpep_pickup_datetime', to_timestamp('lpep_pickup_datetime', "MM/dd/yyyy HH:mm:ss a"))
greentaxiDF = greentaxiDF.withColumn('lpep_dropoff_datetime', to_timestamp('lpep_dropoff_datetime', "MM/dd/yyyy HH:mm:ss a"))

greentaxiDF1 = greentaxiDF.select(col("VendorID"),
                                 col("passenger_count").alias("PassengerCount"),
                                 col("trip_distance").alias("TripDistance"),
                                 col("lpep_pickup_datetime").alias("PickupTime"),
                                 col("lpep_dropoff_datetime").alias("DropTime"),
                                 col('PULocationID').alias("PickupLocationId"),
                                 col('DOLocationID').alias("DropLocationId"),
                                 col('RatecodeID'),
                                 col('total_amount').alias("TotalAmount"),
                                 col('payment_type').alias("PaymentType"))\
                                 .withColumn("TripYear", year("PickupTime"))\
                                 .withColumn("TripMonth", month("PickupTime"))\
                                 .withColumn("TripDay", dayofmonth("PickupTime"))\
                                 .withColumn("TripDurationinMinutes", round(unix_timestamp(col('DropTime')) - unix_timestamp(col('PickupTime'))))\
                                 .withColumn("TripType", when(col("RatecodeID") == 6,"SharedTrip").otherwise("SoloTrip"))\
                                 .drop("RatecodeID")

greentaxiDF1.printSchema()

StatementMeta(Spark, 1, 41, Finished, Available, Finished)

root
 |-- VendorID: integer (nullable = true)
 |-- PassengerCount: integer (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- PickupTime: timestamp (nullable = true)
 |-- DropTime: timestamp (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- DropLocationId: integer (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- PaymentType: integer (nullable = true)
 |-- TripYear: integer (nullable = true)
 |-- TripMonth: integer (nullable = true)
 |-- TripDay: integer (nullable = true)
 |-- TripDurationinMinutes: long (nullable = true)
 |-- TripType: string (nullable = false)



In [41]:
greentaxiDF1.show()

StatementMeta(Spark, 1, 42, Finished, Available, Finished)

+--------+--------------+------------+-------------------+-------------------+----------------+--------------+-----------+-----------+--------+---------+-------+---------------------+--------+
|VendorID|PassengerCount|TripDistance|         PickupTime|           DropTime|PickupLocationId|DropLocationId|TotalAmount|PaymentType|TripYear|TripMonth|TripDay|TripDurationinMinutes|TripType|
+--------+--------------+------------+-------------------+-------------------+----------------+--------------+-----------+-----------+--------+---------+-------+---------------------+--------+
|       2|             1|        1.44|2021-01-01 08:59:00|2021-01-01 09:04:13|              74|            75|       8.76|          1|    2021|        1|      1|                  313|SoloTrip|
|       1|             1|         0.8|2021-01-01 11:56:17|2021-01-01 11:58:47|              74|            75|        6.5|          1|    2021|        1|      1|                  150|SoloTrip|
|       2|             2|         1