In [3]:
import pyspark
print(pyspark.__version__)


3.5.3


In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("NYCTaxiAnalysis").getOrCreate()
df_taxi = spark.read.option("header", "true").csv("data/Sample NYC Data.csv")


In [5]:
df_taxi.printSchema()
df_taxi.show(5)


root
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- vendor_id: string (nullable = true)
 |-- rate_code: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- pickup_longitude: string (nullable = true)
 |-- pickup_latitude: string (nullable = true)
 |-- dropoff_longitude: string (nullable = true)
 |-- dropoff_latitude: string (nullable = true)

+--------------------+--------------------+---------+---------+------------------+---------------+----------------+---------------+----------------+---------------+-----------------+----------------+
|           medallion|        hack_license|vendor_id|rate_code|store_and_fwd_flag|pickup_datetime|dropoff_datetime|passenger_count|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|
+--------------------+--------------------+

In [7]:
pip install shapely

Collecting shapely
  Downloading shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: shapely
Successfully installed shapely-2.0.7
Note: you may need to restart the kernel to use updated packages.


In [6]:
import json
from shapely.geometry import shape

with open("data/nyc-boroughs.geojson", "r") as f:
    geojson_data = json.load(f)

# Extract features
features = geojson_data["features"]

# Sort features by borough code and area (descending)
def feature_sort_key(f):
    borough_code = f["properties"]["boroughCode"]
    polygon_area = shape(f["geometry"]).area
    return (borough_code, polygon_area * -1)  # negative for descending on area

features_sorted = sorted(features, key=feature_sort_key)


In [7]:
# Broadcast the GeoJSON
bc_features = spark.sparkContext.broadcast(features_sorted)


In [10]:
from pyspark.sql.functions import unix_timestamp, col

df_taxi = df_taxi.withColumn(
    "pickup_ts",
    unix_timestamp(col("pickup_datetime"), "dd-MM-yy HH:mm")
)

df_taxi = df_taxi.withColumn(
    "dropoff_ts",
    unix_timestamp(col("dropoff_datetime"), "dd-MM-yy HH:mm")
)

df_taxi = df_taxi.withColumn(
    "duration_sec",
    col("dropoff_ts") - col("pickup_ts")
)


In [11]:
df_taxi.select("duration_sec").summary("count", "min", "max", "mean", "stddev").show()


+-------+------------------+
|summary|      duration_sec|
+-------+------------------+
|  count|             99999|
|    min|                 0|
|    max|              9180|
|   mean| 650.6807068070681|
| stddev|469.78651920883175|
+-------+------------------+



In [12]:
from pyspark.sql.functions import col

# Remove negative durations, zero durations, and those above 10800 seconds (3 hours).
df_taxi = df_taxi.filter((col("duration_sec") > 0) & (col("duration_sec") <= 10800))


In [13]:
# Define a UDF to Map Coordinates → Borough
from shapely.geometry import Point
from pyspark.sql.functions import udf

def get_borough(lon, lat, features):
    if lon is None or lat is None:
        return None
    point = Point(float(lon), float(lat))
    for f in features:
        polygon = shape(f["geometry"])
        if polygon.contains(point):
            return f["properties"]["borough"]
    return None

def udf_get_borough(lon, lat):
    return get_borough(lon, lat, bc_features.value)

borough_udf = udf(udf_get_borough)


In [14]:
# Apply the UDF
from pyspark.sql.functions import col

df_taxi = df_taxi.withColumn("pickup_borough",
    borough_udf(col("pickup_longitude"), col("pickup_latitude"))
)

df_taxi = df_taxi.withColumn("dropoff_borough",
    borough_udf(col("dropoff_longitude"), col("dropoff_latitude"))
)


In [15]:
# caching the DataFrame
df_taxi.cache()
df_taxi.count()  # trigger the cache


99549

In [16]:
# partion and sort
from pyspark.sql.window import Window
import pyspark.sql.functions as F

window_spec = Window.partitionBy("medallion").orderBy("pickup_ts")


In [17]:
# Compute Idle Time
# Compare the pickup time of the current trip to the dropoff time of the previous trip:
df_taxi = df_taxi.withColumn(
  "prev_dropoff_ts",
  F.lag("dropoff_ts").over(window_spec)
)
df_taxi = df_taxi.withColumn(
  "idle_time_sec",
  col("pickup_ts") - col("prev_dropoff_ts")
)
# Replace null with 0 for the first trip
df_taxi = df_taxi.fillna({"idle_time_sec": 0})


In [18]:
# ignore very large idle times (e.g., >4 hours) as new sessions. Maybe we do not need this.
df_taxi = df_taxi.withColumn(
  "idle_time_sec",
  F.when(col("idle_time_sec") < 14400, col("idle_time_sec")).otherwise(0)
)


In [19]:
### Required Queries:

In [20]:
# Utilization (per Taxi/Driver)
# “Fraction of time a taxi is occupied.”
#Summation of driving time vs. total time (driving + idle)

df_trip_time = df_taxi.groupBy("medallion") \
    .agg(F.sum("duration_sec").alias("sum_trip_time"))

df_idle_time = df_taxi.groupBy("medallion") \
    .agg(F.sum("idle_time_sec").alias("sum_idle_time"))

df_util = df_trip_time.join(df_idle_time, on="medallion") \
    .withColumn(
        "utilization",
        F.col("sum_trip_time") / (F.col("sum_trip_time") + F.col("sum_idle_time"))
    )


In [21]:
# Average Time to Next Fare (per Destination Borough)

# "lead" so we can shift the idle time back one row
df_taxi = df_taxi.withColumn(
  "idle_time_for_this_dropoff",
  F.lag("idle_time_sec").over(window_spec)
)
# group by dropoff_borough
df_avg_idle = df_taxi.groupBy("dropoff_borough") \
    .agg(F.avg("idle_time_for_this_dropoff").alias("avg_idle_sec"))
df_avg_idle.show()


+---------------+------------------+
|dropoff_borough|      avg_idle_sec|
+---------------+------------------+
|         Queens|1186.9237217099749|
|           NULL|1124.9278460716193|
|       Brooklyn|1163.2585433206766|
|  Staten Island|            2736.0|
|      Manhattan|1011.4870562796565|
|          Bronx| 1261.851851851852|
+---------------+------------------+



In [22]:
# Number of Trips that Start and End in the Same Borough

df_same = df_taxi.filter(col("pickup_borough") == col("dropoff_borough"))
count_same = df_same.count()

# broken down by borough
df_same.groupBy("pickup_borough").count().show()



+--------------+-----+
|pickup_borough|count|
+--------------+-----+
|        Queens| 1369|
|      Brooklyn| 1062|
| Staten Island|    1|
|     Manhattan|83463|
|         Bronx|   49|
+--------------+-----+



In [23]:
# Number of Trips from One Borough to Another

df_diff = df_taxi.filter(col("pickup_borough") != col("dropoff_borough"))
count_diff = df_diff.count()

# Per borough pair
df_diff.groupBy("pickup_borough", "dropoff_borough").count().show()


+--------------+---------------+-----+
|pickup_borough|dropoff_borough|count|
+--------------+---------------+-----+
|      Brooklyn|      Manhattan|  773|
|        Queens|          Bronx|  100|
|      Brooklyn|         Queens|  115|
|        Queens|  Staten Island|    2|
|     Manhattan|  Staten Island|    9|
|     Manhattan|       Brooklyn| 1923|
|     Manhattan|         Queens| 3943|
|     Manhattan|          Bronx|  244|
|        Queens|      Manhattan| 3697|
|         Bronx|      Manhattan|   25|
|        Queens|       Brooklyn|  597|
|         Bronx|         Queens|    2|
| Staten Island|         Queens|    1|
+--------------+---------------+-----+



In [None]:
### Save

In [25]:
df_util.coalesce(1).write.csv("save", header=True)
