## Jupyter Notebook

### Data Ingestion and Combination

In [95]:
import pandas as pd

# Load the sample NYC taxi trips into an in-memory pandas DataFrame.
# NOTE(review): `df` is not referenced by any later cell visible here (the Spark
# pipeline reads the same CSV itself) — confirm this load is still needed.
df = pd.read_csv(filepath_or_buffer="input/sample_nyc_data.csv")

In [96]:
from pyspark.sql import SparkSession

# Columns kept for the analysis: taxi id plus pickup/dropoff time and coordinates.
trip_columns = [
    "medallion",
    "pickup_datetime", "pickup_longitude", "pickup_latitude",
    "dropoff_datetime", "dropoff_longitude", "dropoff_latitude",
]

# Create (or reuse) the Spark session used by the rest of the notebook.
spark = SparkSession.builder.appName("NYC Taxi Analysis").getOrCreate()

# Read the CSV with a header row and inferred column types, and mark the
# resulting DataFrame for caching.
df_spark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("input/sample_nyc_data.csv")
    .cache()
)

df_selected = df_spark.select(*trip_columns)

In [97]:
import geopandas as gpd

# Load the borough polygons and order them by ascending borough code; this is
# the order in which the lookup list below is scanned.
boroughs = (
    gpd.read_file("input/nyc-boroughs.geojson")
    .sort_values(by="boroughCode", ascending=True)
)

# Plain (borough name, geometry) pairs, one per row of the GeoDataFrame.
boroughs_list = list(zip(boroughs["borough"], boroughs["geometry"]))

# Broadcast the lookup list through the active Spark context so the borough
# lookup function can read it via `boroughs_bc.value`.
sc = SparkSession.builder.getOrCreate().sparkContext
boroughs_bc = sc.broadcast(boroughs_list)



In [98]:
from shapely.geometry import Point

def get_borough(lat, lon):
    """Look up the borough whose polygon contains a coordinate pair.

    Args:
        lat: Latitude of the location, or None.
        lon: Longitude of the location, or None.

    Returns:
        The name of the first entry in ``boroughs_bc.value`` whose polygon
        contains the point, or "Unknown" when either coordinate is None or
        no polygon contains it.
    """
    if lat is None or lon is None:
        return "Unknown"

    # The point is built as (lon, lat), i.e. x = longitude, y = latitude.
    location = Point(lon, lat)

    # First match wins, in the order the broadcast list was built.
    return next(
        (name for name, shape in boroughs_bc.value if shape.contains(location)),
        "Unknown",
    )

In [99]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Wrap the Python borough lookup as a Spark UDF that yields a string column.
get_borough_udf = udf(f=get_borough, returnType=StringType())

In [100]:
from pyspark.sql.functions import col

# Tag every trip with its pickup and dropoff borough, then drop the trips whose
# dropoff location did not fall inside any borough polygon.
df_selected = (
    df_selected
    .withColumn("pickup_borough", get_borough_udf(col("pickup_latitude"), col("pickup_longitude")))
    .withColumn("dropoff_borough", get_borough_udf(col("dropoff_latitude"), col("dropoff_longitude")))
    .filter(col("dropoff_borough") != "Unknown")
)

# Sanity check: which dropoff boroughs remain after the filter.
df_selected.select("dropoff_borough").distinct().show(truncate=False)

# Number of distinct taxis (medallions) left in the data.
df_selected.select("medallion").distinct().count()

+---------------+
|dropoff_borough|
+---------------+
|Queens         |
|Brooklyn       |
|Staten Island  |
|Manhattan      |
|Bronx          |
+---------------+



6373

### QUERY 1: Utilization per taxi — occupied time / (occupied + idle time)

In [101]:
from pyspark.sql.functions import unix_timestamp, to_timestamp, col

# Parse the pickup/dropoff strings into timestamps, then derive integer Unix
# timestamps (seconds) from the parsed columns for the interval arithmetic below.
df_selected = (
    df_selected
    .withColumn("pickup_datetime", to_timestamp(col("pickup_datetime"), "dd-MM-yy HH:mm"))
    .withColumn("dropoff_datetime", to_timestamp(col("dropoff_datetime"), "dd-MM-yy HH:mm"))
    .withColumn("pickup_ts", unix_timestamp(col("pickup_datetime")))
    .withColumn("dropoff_ts", unix_timestamp(col("dropoff_datetime")))
)


In [102]:
from pyspark.sql.functions import col, lag, sum as spark_sum, when

# Trip duration is the difference of the two Unix timestamps; afterwards keep
# only the columns the queries below work with.
trip_seconds = col("dropoff_ts") - col("pickup_ts")
df_selected = df_selected.withColumn("duration", trip_seconds).select(
    "medallion", "pickup_borough", "dropoff_borough", "pickup_ts", "dropoff_ts", "duration"
)

# The distinct-taxi count should be unchanged by the projection.
df_selected.select("medallion").distinct().count()

6373

In [103]:
from pyspark.sql.window import Window
# `when` is imported here explicitly: the cell uses it, and previously it only
# resolved because an earlier cell happened to import it.
from pyspark.sql.functions import col, lag, sum as spark_sum, when

# Gaps longer than this many seconds (4 hours) are not counted as idle time.
MAX_IDLE_SECONDS = 14400

# Each taxi's trips in pickup order, so lag() yields the previous trip's dropoff.
window_spec = Window.partitionBy("medallion").orderBy("pickup_ts")

df_utilization = df_selected.withColumn("prev_dropoff", lag("dropoff_ts").over(window_spec))

# Idle time = gap between the previous dropoff and this pickup. It is left NULL
# for a taxi's first trip (no previous dropoff) and for gaps above the threshold.
# NOTE(review): negative gaps (pickup before the previous dropoff) also satisfy
# the `<=` test and are summed as-is — confirm that is intended.
gap_since_prev_dropoff = col("pickup_ts") - col("prev_dropoff")
df_utilization = df_utilization.withColumn("idle_time", when(
    (col("prev_dropoff").isNotNull()) & (gap_since_prev_dropoff <= MAX_IDLE_SECONDS),
    gap_since_prev_dropoff).otherwise(None))

# Per-taxi idle total; taxis with no qualifying gap get 0 instead of NULL.
idle_time = df_utilization.groupBy("medallion").agg(spark_sum("idle_time").alias("total_idle_time"))
idle_time = idle_time.fillna(0, subset=["total_idle_time"])

In [104]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum as spark_sum, col

# The unused `window_spec = Window.partitionBy("medallion")` assignment was
# removed: nothing read it, and it silently replaced the ordered window
# specification defined by the idle-time cell.

# Total trip duration per taxi; a NULL sum is replaced by 0.
occupied_time = df_selected.groupBy("medallion").agg(spark_sum("duration").alias("total_occupied_time"))
occupied_time = occupied_time.fillna(0, subset=["total_occupied_time"])

# NOTE(review): this re-derives `idle_time` WITHOUT the fillna(0) applied in the
# previous cell, so a taxi with no qualifying gap keeps a NULL total here —
# confirm that overriding the zero-filled version is intended.
idle_time = df_utilization.groupBy("medallion").agg(spark_sum("idle_time").alias("total_idle_time"))

# One row per taxi with both totals.
df_final_utilization = occupied_time.join(idle_time, on="medallion", how="inner")

df_final_utilization = df_final_utilization.cache()

df_final_utilization.select("medallion",'total_idle_time', "total_occupied_time").distinct().show(10)

+--------------------+---------------+-------------------+
|           medallion|total_idle_time|total_occupied_time|
+--------------------+---------------+-------------------+
|0F621E366CFE63044...|          30600|              19020|
|223670562219093D6...|          14580|               6120|
|496036713FC662D71...|           8520|               3660|
|4F4CA97166A04A455...|          21900|               9360|
|59DF6039EC312EE6D...|          23040|              15900|
|5CCB4924B158F945B...|          23520|              18780|
|618BB39CEEAE5E9A6...|           8640|              12000|
|6AFD7E44A278CFD00...|           8640|               3960|
|72EAFBA3FB9F0507C...|          13800|              11580|
|73039762E0F4B253E...|          23400|              14400|
+--------------------+---------------+-------------------+
only showing top 10 rows



In [105]:
# Start again from a fresh join of the per-taxi totals.
df_final_utilization = occupied_time.join(idle_time, on="medallion", how="inner")

# utilization = occupied / (occupied + idle); 0 when the denominator is not
# positive (which includes the case where it is NULL).
occupied = col("total_occupied_time")
tracked_total = occupied + col("total_idle_time")
df_final_utilization = df_final_utilization.withColumn(
    "utilization",
    when(tracked_total > 0, occupied / tracked_total).otherwise(0),
)

# In the per-trip frame, show missing idle gaps as 0 rather than NULL.
df_utilization = df_utilization.withColumn(
    "idle_time",
    when(col("idle_time").isNotNull(), col("idle_time")).otherwise(0),
)
df_utilization.show(10)

+--------------------+--------------+---------------+----------+----------+--------+------------+---------+
|           medallion|pickup_borough|dropoff_borough| pickup_ts|dropoff_ts|duration|prev_dropoff|idle_time|
+--------------------+--------------+---------------+----------+----------+--------+------------+---------+
|002E3B405B6ABEA23...|     Manhattan|      Manhattan|1358047920|1358048340|     420|        NULL|        0|
|002E3B405B6ABEA23...|     Manhattan|      Manhattan|1358049900|1358051700|    1800|  1358048340|     1560|
|002E3B405B6ABEA23...|     Manhattan|      Manhattan|1358052180|1358052720|     540|  1358051700|      480|
|002E3B405B6ABEA23...|        Queens|      Manhattan|1358079060|1358080920|    1860|  1358052720|        0|
|002E3B405B6ABEA23...|     Manhattan|      Manhattan|1358081940|1358082840|     900|  1358080920|     1020|
|002E3B405B6ABEA23...|     Manhattan|      Manhattan|1358083320|1358084040|     720|  1358082840|      480|
|002E3B405B6ABEA23...|     M

In [106]:
from pyspark.sql.functions import round

# NOTE: the import above replaces Python's built-in round() with Spark's
# column function from this cell onward.

# Keep only taxis that have an idle-time total, and round the ratio to 2 decimals.
df_final_utilization = (
    df_final_utilization
    .filter(col("total_idle_time").isNotNull())
    .withColumn("utilization", round(col("utilization"), 2))
)

df_final_utilization.select("medallion", "total_occupied_time", "total_idle_time", "utilization").show(10)

+--------------------+-------------------+---------------+-----------+
|           medallion|total_occupied_time|total_idle_time|utilization|
+--------------------+-------------------+---------------+-----------+
|0F621E366CFE63044...|              19020|          30600|       0.38|
|223670562219093D6...|               6120|          14580|        0.3|
|496036713FC662D71...|               3660|           8520|        0.3|
|4F4CA97166A04A455...|               9360|          21900|        0.3|
|59DF6039EC312EE6D...|              15900|          23040|       0.41|
|5CCB4924B158F945B...|              18780|          23520|       0.44|
|618BB39CEEAE5E9A6...|              12000|           8640|       0.58|
|6AFD7E44A278CFD00...|               3960|           8640|       0.31|
|72EAFBA3FB9F0507C...|              11580|          13800|       0.46|
|73039762E0F4B253E...|              14400|          23400|       0.38|
+--------------------+-------------------+---------------+-----------+
only showing top 10 rows

### QUERY 2: Average wait for the next fare, by dropoff borough

In [107]:
from pyspark.sql.window import Window
# `when` and Spark's `round` are imported explicitly (the latter under an alias)
# so this cell no longer depends on names leaked from earlier cells or on the
# built-in round() having been shadowed.
from pyspark.sql.functions import lead, avg, col, when, round as spark_round

# Waits longer than this many seconds (4 hours) are excluded from the average.
MAX_WAIT_SECONDS = 14400

# Each taxi's trips in dropoff order, so lead() yields the next trip's pickup.
window_spec = Window.partitionBy("medallion").orderBy("dropoff_ts")
df_selected = df_selected.withColumn("next_pickup_ts", lead("pickup_ts").over(window_spec))
df_selected = df_selected.withColumn("wait_time", col("next_pickup_ts") - col("dropoff_ts"))

# NULL out waits above the threshold, then drop every row without a usable wait
# (this also drops each taxi's last trip, whose next pickup is NULL).
# NOTE(review): negative waits pass the `<=` test and enter the average —
# confirm that is intended.
df_selected_wait = df_selected.withColumn("wait_time",
    when(col("wait_time") <= MAX_WAIT_SECONDS, col("wait_time")).otherwise(None)
)
df_selected_wait = df_selected_wait.filter(col("wait_time").isNotNull())
df_selected_wait = df_selected_wait.filter(col("dropoff_borough") != "Unknown")

# Mean wait per dropoff borough, rounded to 2 decimals.
avg_wait_time = df_selected_wait.groupBy("dropoff_borough").agg(
    spark_round(avg("wait_time"), 2).alias("avg_wait_time")
)
results = avg_wait_time.collect()

for row in results:
    print(f"Borough: {row['dropoff_borough']}, Average Wait Time: {row['avg_wait_time']} seconds")

Borough: Queens, Average Wait Time: 3054.28 seconds
Borough: Brooklyn, Average Wait Time: 2634.88 seconds
Borough: Staten Island, Average Wait Time: 4710.0 seconds
Borough: Manhattan, Average Wait Time: 1047.16 seconds
Borough: Bronx, Average Wait Time: 2575.73 seconds


### QUERY 3: Trips that start and end in the same borough

In [108]:
# Count trips whose pickup and dropoff boroughs are both known and identical.
both_known = (col("pickup_borough") != "Unknown") & (col("dropoff_borough") != "Unknown")
same_borough = col("pickup_borough") == col("dropoff_borough")
same_borough_count = df_selected.where(same_borough & both_known).count()

print(f"Total trips that started and ended in the same borough: {same_borough_count}")

Total trips that started and ended in the same borough: 86074


### QUERY 4: Trips that start in one borough and end in another

In [109]:
# Count trips whose pickup and dropoff boroughs are both known but differ.
both_known = (col("pickup_borough") != "Unknown") & (col("dropoff_borough") != "Unknown")
crosses_boroughs = col("pickup_borough") != col("dropoff_borough")
different_borough_count = df_selected.where(crosses_boroughs & both_known).count()

print(f"Total trips that started in one borough and ended in another: {different_borough_count}")


Total trips that started in one borough and ended in another: 11433
