## Nupyter Jotebook

### Data Ingestion and Combination

In [1]:
import pandas as pd

df = pd.read_csv("input/sample_nyc_data.csv")

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("NYC Taxi Analysis").getOrCreate()

df_spark = spark.read.csv("input/sample_nyc_data.csv", header=True, inferSchema=True).cache()

df_selected = df_spark.select("medallion", 
                        "pickup_datetime", "pickup_longitude", "pickup_latitude", 
                        "dropoff_datetime", "dropoff_longitude", "dropoff_latitude")

In [3]:
!pip install geopandas
import geopandas as gpd

boroughs = gpd.read_file("input/nyc-boroughs.geojson")
boroughs = boroughs.sort_values(by="boroughCode", ascending=True)

boroughs_list = [(row["borough"], row["geometry"]) for _, row in boroughs.iterrows()]

sc = SparkSession.builder.getOrCreate().sparkContext
boroughs_bc = sc.broadcast(boroughs_list)

boroughs[["borough", "boroughCode", "geometry"]].head()

Collecting geopandas
  Using cached geopandas-1.0.1-py3-none-any.whl.metadata (2.2 kB)
Collecting pyogrio>=0.7.2 (from geopandas)
  Downloading pyogrio-0.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (5.5 kB)
Collecting pyproj>=3.3.0 (from geopandas)
  Downloading pyproj-3.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)
Collecting shapely>=2.0.0 (from geopandas)
  Downloading shapely-2.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached geopandas-1.0.1-py3-none-any.whl (323 kB)
Downloading pyogrio-0.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading pyproj-3.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m0

Unnamed: 0,borough,boroughCode,geometry
51,Manhattan,1,"POLYGON ((-74.01675 40.69334, -74.0154 40.6930..."
72,Manhattan,1,"POLYGON ((-73.92641 40.87762, -73.9263 40.8774..."
71,Manhattan,1,"POLYGON ((-73.92134 40.80085, -73.92031 40.799..."
70,Manhattan,1,"POLYGON ((-73.93805 40.78083, -73.93779 40.780..."
69,Manhattan,1,"POLYGON ((-73.9418 40.76905, -73.94286 40.7683..."


In [4]:
from shapely.geometry import Point

def get_borough(lat, lon):
    """Returns the borough name for given latitude and longitude."""
    if lat is None or lon is None:  
        return "Unknown"
    
    point = Point(lon, lat)
    
    for borough_name, polygon in boroughs_bc.value:
        if polygon.contains(point):
            return borough_name
    return "Unknown"

In [5]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

get_borough_udf = udf(get_borough, StringType())

In [6]:
df_selected = df_selected.withColumn("pickup_borough", get_borough_udf(df_selected["pickup_latitude"], df_selected["pickup_longitude"]))
df_selected = df_selected.withColumn("dropoff_borough", get_borough_udf(df_selected["dropoff_latitude"], df_selected["dropoff_longitude"]))
df_selected.select("dropoff_borough").distinct().show(truncate=False)

df_selected.select("medallion").distinct().count()

+---------------+
|dropoff_borough|
+---------------+
|Queens         |
|Unknown        |
|Brooklyn       |
|Staten Island  |
|Manhattan      |
|Bronx          |
+---------------+



6444

### QUERY 1

In [7]:
df_selected.select("pickup_datetime").show(10, truncate=False)
df_selected.printSchema()

df_selected.select("pickup_datetime").distinct().show(20, truncate=False)

+---------------+
|pickup_datetime|
+---------------+
|01-01-13 15:11 |
|06-01-13 00:18 |
|05-01-13 18:49 |
|07-01-13 23:54 |
|07-01-13 23:25 |
|07-01-13 15:27 |
|08-01-13 11:01 |
|07-01-13 12:39 |
|07-01-13 18:15 |
|07-01-13 15:33 |
+---------------+
only showing top 10 rows

root
 |-- medallion: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- dropoff_borough: string (nullable = true)

+---------------+
|pickup_datetime|
+---------------+
|13-01-13 04:09 |
|13-01-13 11:30 |
|13-01-13 10:44 |
|13-01-13 10:42 |
|13-01-13 09:35 |
|13-01-13 08:21 |
|13-01-13 04:55 |
|13-01-13 03:37 |
|13-01-13 00:49 |
|13-01-13 03:16 |
|13-01-13 12:47 |
|10-01-13 15:34 |
|13-01-13 05:24 |
|13-01-

In [None]:
from pyspark.sql.functions import unix_timestamp, to_timestamp, col

df_selected = df_selected.withColumn("pickup_datetime", to_timestamp(col("pickup_datetime"), "dd-MM-yy HH:mm"))
df_selected = df_selected.withColumn("dropoff_datetime", to_timestamp(col("dropoff_datetime"), "dd-MM-yy HH:mm"))

df_selected = df_selected.withColumn("pickup_ts", unix_timestamp(col("pickup_datetime")))
df_selected = df_selected.withColumn("dropoff_ts", unix_timestamp(col("dropoff_datetime")))

df_selected.select("pickup_datetime", "pickup_ts", "dropoff_datetime", "dropoff_ts").show(10, truncate=False)


+-------------------+----------+-------------------+----------+
|pickup_datetime    |pickup_ts |dropoff_datetime   |dropoff_ts|
+-------------------+----------+-------------------+----------+
|2013-01-01 15:11:00|1357053060|2013-01-01 15:18:00|1357053480|
|2013-01-06 00:18:00|1357431480|2013-01-06 00:22:00|1357431720|
|2013-01-05 18:49:00|1357411740|2013-01-05 18:54:00|1357412040|
|2013-01-07 23:54:00|1357602840|2013-01-07 23:58:00|1357603080|
|2013-01-07 23:25:00|1357601100|2013-01-07 23:34:00|1357601640|
|2013-01-07 15:27:00|1357572420|2013-01-07 15:38:00|1357573080|
|2013-01-08 11:01:00|1357642860|2013-01-08 11:08:00|1357643280|
|2013-01-07 12:39:00|1357562340|2013-01-07 13:10:00|1357564200|
|2013-01-07 18:15:00|1357582500|2013-01-07 18:20:00|1357582800|
|2013-01-07 15:33:00|1357572780|2013-01-07 15:49:00|1357573740|
+-------------------+----------+-------------------+----------+
only showing top 10 rows



In [None]:
from pyspark.sql.functions import col, lag, sum as spark_sum, when

df_selected = df_selected.withColumn("duration", col("dropoff_ts") - col("pickup_ts"))
df_selected = df_selected.select("medallion", "pickup_borough", "dropoff_borough", "pickup_ts", "dropoff_ts", "duration")

df_selected.select("medallion").distinct().count()

6444

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag, sum as spark_sum

window_spec = Window.partitionBy("medallion").orderBy("pickup_ts")

df_utilization = df_selected.withColumn("prev_dropoff", lag("dropoff_ts").over(window_spec))
df_utilization = df_utilization.withColumn("idle_time", when(col("prev_dropoff").isNotNull(), col("pickup_ts") - col("prev_dropoff")).otherwise(None))

idle_time = df_utilization.groupBy("medallion").agg(spark_sum("idle_time").alias("total_idle_time"))

df_selected.select("medallion").distinct().count()

6444

In [11]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum as spark_sum, col

window_spec = Window.partitionBy("medallion")

occupied_time = df_selected.groupBy("medallion").agg(spark_sum("duration").alias("total_occupied_time"))
idle_time = df_utilization.groupBy("medallion").agg(spark_sum("idle_time").alias("total_idle_time"))

df_final_utilization = occupied_time.join(idle_time, on="medallion", how="inner")

df_final_utilization = df_final_utilization.cache()

df_final_utilization.select("medallion", "total_occupied_time").distinct().show(10)
df_final_utilization.select("medallion").distinct().count()

+--------------------+-------------------+
|           medallion|total_occupied_time|
+--------------------+-------------------+
|0F621E366CFE63044...|              19020|
|223670562219093D6...|               6120|
|496036713FC662D71...|               3660|
|4F4CA97166A04A455...|               9360|
|5803D6EAD49AEAA82...|               1620|
|59DF6039EC312EE6D...|              15900|
|5CCB4924B158F945B...|              18780|
|618BB39CEEAE5E9A6...|              12000|
|6AFD7E44A278CFD00...|               3960|
|72EAFBA3FB9F0507C...|              11580|
+--------------------+-------------------+
only showing top 10 rows



6444

In [None]:
df_final_utilization = occupied_time.join(idle_time, on="medallion", how="inner")

df_final_utilization = df_final_utilization.withColumn(
    "utilization",
    when(col("total_occupied_time") + col("total_idle_time") > 0,
         col("total_occupied_time") / (col("total_occupied_time") + col("total_idle_time"))
    ).otherwise(0)
)


df_utilization.show(10)

+--------------------+--------------+---------------+----------+----------+--------+------------+---------+
|           medallion|pickup_borough|dropoff_borough| pickup_ts|dropoff_ts|duration|prev_dropoff|idle_time|
+--------------------+--------------+---------------+----------+----------+--------+------------+---------+
|002E3B405B6ABEA23...|     Manhattan|      Manhattan|1358047920|1358048340|     420|        NULL|     NULL|
|002E3B405B6ABEA23...|     Manhattan|      Manhattan|1358049900|1358051700|    1800|  1358048340|     1560|
|002E3B405B6ABEA23...|     Manhattan|      Manhattan|1358052180|1358052720|     540|  1358051700|      480|
|002E3B405B6ABEA23...|        Queens|      Manhattan|1358079060|1358080920|    1860|  1358052720|    26340|
|002E3B405B6ABEA23...|     Manhattan|      Manhattan|1358081940|1358082840|     900|  1358080920|     1020|
|002E3B405B6ABEA23...|     Manhattan|      Manhattan|1358083320|1358084040|     720|  1358082840|      480|
|002E3B405B6ABEA23...|     M

In [None]:
from pyspark.sql.functions import round

df_final_utilization = df_final_utilization.withColumn("utilization", round(col("utilization"), 2))

df_final_utilization.select("medallion", "total_occupied_time", "total_idle_time", "utilization").show(10)

+--------------------+-------------------+---------------+-----------+
|           medallion|total_occupied_time|total_idle_time|utilization|
+--------------------+-------------------+---------------+-----------+
|0F621E366CFE63044...|              19020|          30600|       0.38|
|223670562219093D6...|               6120|          14580|        0.3|
|496036713FC662D71...|               3660|          57120|       0.06|
|4F4CA97166A04A455...|               9360|          38820|       0.19|
|5803D6EAD49AEAA82...|               1620|           NULL|        0.0|
|59DF6039EC312EE6D...|              15900|          38880|       0.29|
|5CCB4924B158F945B...|              18780|          23520|       0.44|
|618BB39CEEAE5E9A6...|              12000|          34800|       0.26|
|6AFD7E44A278CFD00...|               3960|           8640|       0.31|
|72EAFBA3FB9F0507C...|              11580|          13800|       0.46|
+--------------------+-------------------+---------------+-----------+
only s

### QUERY 2

In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lead, avg, col

window_spec = Window.partitionBy("medallion").orderBy("dropoff_ts")

df_selected = df_selected.withColumn("next_pickup_ts", lead("pickup_ts").over(window_spec))

df_selected = df_selected.withColumn("wait_time", col("next_pickup_ts") - col("dropoff_ts"))

df_selected = df_selected.filter(col("wait_time").isNotNull())

df_selected.select("medallion", "dropoff_borough", "dropoff_ts", "next_pickup_ts", "wait_time").show(10)

+--------------------+---------------+----------+--------------+---------+
|           medallion|dropoff_borough|dropoff_ts|next_pickup_ts|wait_time|
+--------------------+---------------+----------+--------------+---------+
|000318C2E3E638158...|      Manhattan|1358052900|    1358070300|    17400|
|000318C2E3E638158...|        Unknown|1358070420|    1358070960|      540|
|000318C2E3E638158...|        Unknown|1358071440|    1358071500|       60|
|000318C2E3E638158...|        Unknown|1358071860|    1358073060|     1200|
|000318C2E3E638158...|        Unknown|1358073660|    1358074020|      360|
|000318C2E3E638158...|        Unknown|1358074320|    1358074560|      240|
|000318C2E3E638158...|        Unknown|1358075040|    1358075400|      360|
|000318C2E3E638158...|        Unknown|1358076420|    1358077020|      600|
|000318C2E3E638158...|        Unknown|1358077380|    1358077920|      540|
|000318C2E3E638158...|        Unknown|1358078280|    1358079060|      780|
+--------------------+---

In [None]:
avg_wait_time = df_selected.groupBy("dropoff_borough").agg(
    avg("wait_time").alias("avg_wait_time")
)

avg_wait_time.show(10)

+---------------+------------------+
|dropoff_borough|     avg_wait_time|
+---------------+------------------+
|         Queens| 6368.423432682425|
|        Unknown|12206.935332708528|
|       Brooklyn| 6554.840325610519|
|  Staten Island|           13935.0|
|      Manhattan|2048.9211563256895|
|          Bronx| 4973.719008264463|
+---------------+------------------+



In [16]:
df_selected.groupBy("dropoff_borough").count().show()

+---------------+-----+
|dropoff_borough|count|
+---------------+-----+
|         Queens| 4865|
|        Unknown| 2134|
|       Brooklyn| 3194|
|  Staten Island|   12|
|      Manhattan|82987|
|          Bronx|  363|
+---------------+-----+



### QUERY 3 

In [17]:
from pyspark.sql.functions import col

same_borough_count = df_selected.filter(
    (col("pickup_borough") == col("dropoff_borough")) &
    (col("pickup_borough") != "Unknown") & 
    (col("dropoff_borough") != "Unknown")
).count()

print(f"Total trips that started and ended in the same borough: {same_borough_count}")

Total trips that started and ended in the same borough: 81053


### QUERY 4

In [18]:
different_borough_count = df_selected.filter(
    (col("pickup_borough") != col("dropoff_borough")) &
    (col("pickup_borough") != "Unknown") & 
    (col("dropoff_borough") != "Unknown")
).count()

print(f"Total trips that started in one borough and ended in another: {different_borough_count}")


Total trips that started in one borough and ended in another: 10240
