In [4]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar -xzf spark-3.5.0-bin-hadoop3.tgz

!pip install -q findspark


import os
import findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"
findspark.init()


In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import avg
from pyspark.sql.functions import count
from pyspark.sql.functions import to_timestamp, to_date, unix_timestamp, sum as _sum

# Create the Spark session for this notebook; getOrCreate() hands back the
# already-active session instead of starting a second one when the cell is re-run.
spark = (
    SparkSession.builder
    .appName("NYC Yellow Taxi Analysis")
    .getOrCreate()
)


In [25]:
# Load the raw trip records from the parquet file, then print the column types
# and a 10-row preview.
trips_path = "/content/yellow_tripdata_2025-02.parquet"
df = spark.read.parquet(trips_path)

df.printSchema()
df.show(n=10)

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)
 |-- cbd_congestion_fee: double (nullable = true)

+--------+--------------------+---------------------+---------------+------

In [26]:
# Cleaning pass over the raw frame:
#   1. drop rows missing either timestamp or the fare,
#   2. keep only rows whose distance and fare are both strictly positive,
#   3. shorten the two timestamp column names for the cells that follow.
required_cols = ["tpep_pickup_datetime", "tpep_dropoff_datetime", "fare_amount"]
positive_distance_and_fare = (col("trip_distance") > 0) & (col("fare_amount") > 0)

df_clean = (
    df.dropna(subset=required_cols)
    .filter(positive_distance_and_fare)
    .withColumnRenamed("tpep_pickup_datetime", "pickup_datetime")
    .withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime")
)


In [27]:
# Mean fare for each passenger_count value, listed in ascending passenger_count
# order. Rows with a NULL passenger_count form their own group.
mean_fare = avg("fare_amount").alias("avg_fare_amount")
trips_by_passenger_count = df_clean.groupBy("passenger_count")

avg_fare_by_passenger = (
    trips_by_passenger_count
    .agg(mean_fare)
    .orderBy("passenger_count")
)

avg_fare_by_passenger.show()


+---------------+------------------+
|passenger_count|   avg_fare_amount|
+---------------+------------------+
|           NULL| 19.01169359560233|
|              0|15.587368796688333|
|              1| 17.41567037526011|
|              2|19.839337503387068|
|              3|19.493705196295647|
|              4|21.883914760002654|
|              5|16.307732906112065|
|              6|17.202291528002316|
|              7|              72.5|
|              8|              82.8|
+---------------+------------------+



In [28]:
# Count trips per pickup location ID and keep the five busiest.
trips_per_pickup = (
    df_clean.groupBy("PULocationID")
    .agg(count("*").alias("trip_count"))
)

top_pickups = (
    trips_per_pickup
    .orderBy(col("trip_count").desc())
    .limit(5)
)

top_pickups.show()


+------------+----------+
|PULocationID|trip_count|
+------------+----------+
|         161|    152706|
|         237|    151036|
|         236|    141102|
|         132|    115313|
|         162|    108204|
+------------+----------+



In [29]:
# Trip length in minutes: difference of the two epoch-second values, divided by 60.
# The expression is applied after both columns have been passed through to_timestamp.
trip_minutes = (
    unix_timestamp("dropoff_datetime") - unix_timestamp("pickup_datetime")
) / 60

df_time = (
    df_clean
    .withColumn("pickup_datetime", to_timestamp("pickup_datetime"))
    .withColumn("dropoff_datetime", to_timestamp("dropoff_datetime"))
    .withColumn("trip_duration_min", trip_minutes)
)

# Per pickup date: total distance travelled and mean trip duration, oldest date first.
trips_by_date = (
    df_time
    .withColumn("trip_date", to_date("pickup_datetime"))
    .groupBy("trip_date")
)

daily_summary = trips_by_date.agg(
    _sum("trip_distance").alias("total_trip_distance"),
    avg("trip_duration_min").alias("avg_trip_duration_min"),
).orderBy("trip_date")

# Note: show() prints only the first 20 rows by default.
daily_summary.show()


+----------+-------------------+---------------------+
| trip_date|total_trip_distance|avg_trip_duration_min|
+----------+-------------------+---------------------+
|2025-01-31| 113.92000000000002|   14.971111111111112|
|2025-02-01|  499441.1599999991|     14.6213505767715|
|2025-02-02|  889323.0300000012|   14.053696340444032|
|2025-02-03| 441829.38000000326|   15.703906269686064|
|2025-02-04|  592972.2300000058|   15.614737492132418|
|2025-02-05| 354700.30999999767|   16.029875050922985|
|2025-02-06|   649475.240000006|   15.828898243823442|
|2025-02-07|   549300.379999995|   15.831258945304025|
|2025-02-08|   925505.860000002|   15.059022976695546|
|2025-02-09|  780118.5200000054|   13.943378179524972|
|2025-02-10| 442250.15000000515|      16.336298904771|
|2025-02-11|  547174.8600000065|   15.575351862362217|
|2025-02-12|  959934.8600000043|   15.143362758628527|
|2025-02-13|  603698.9100000062|   16.243934328830075|
|2025-02-14|  908971.7900000163|    16.94685059564565|
|2025-02-1