## Jupyter Notebook

### Data Ingestion and Combination

In [230]:
import pandas as pd

# Load the NYC taxi sample CSV into a pandas DataFrame.
# NOTE(review): `df` is not referenced by any later cell visible here, and the
# Spark pipeline below reads the same file again — confirm this load is still needed.
df = pd.read_csv("input/sample_nyc_data.csv")

In [231]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this analysis
spark = (
    SparkSession.builder
    .appName("NYC Taxi Analysis")
    .getOrCreate()
)

# Read the trip CSV into a Spark DataFrame; the first row is the header and
# column types are inferred from the data
df_spark = spark.read.csv(
    "input/sample_nyc_data.csv",
    header=True,
    inferSchema=True,
)

# Keep only the taxi id plus the pickup/drop-off time and coordinates
trip_columns = [
    "medallion",
    "pickup_datetime", "pickup_longitude", "pickup_latitude",
    "dropoff_datetime", "dropoff_longitude", "dropoff_latitude",
]
df_selected = df_spark.select(*trip_columns)

In [232]:
import geopandas as gpd

# Read the borough polygons from GeoJSON and order them by borough code
# (ascending is the sort_values default)
boroughs = gpd.read_file("input/nyc-boroughs.geojson").sort_values("boroughCode")

# Preview the columns used by the point-in-polygon lookup
boroughs[["borough", "boroughCode", "geometry"]].head()

Unnamed: 0,borough,boroughCode,geometry
51,Manhattan,1,"POLYGON ((-74.01675 40.69334, -74.0154 40.6930..."
72,Manhattan,1,"POLYGON ((-73.92641 40.87762, -73.9263 40.8774..."
71,Manhattan,1,"POLYGON ((-73.92134 40.80085, -73.92031 40.799..."
70,Manhattan,1,"POLYGON ((-73.93805 40.78083, -73.93779 40.780..."
69,Manhattan,1,"POLYGON ((-73.9418 40.76905, -73.94286 40.7683..."


In [233]:
from shapely.geometry import Point

def get_borough(lat, lon):
    """Return the name of the borough whose polygon contains the given point.

    Args:
        lat: Latitude of the point. May be None (e.g. a null value coming
            from a nullable Spark column when called through the UDF).
        lon: Longitude of the point. May be None.

    Returns:
        The ``borough`` value of the first row of ``boroughs`` whose geometry
        contains the point, or "Unknown" when a coordinate is missing or no
        polygon contains the point.
    """
    # A missing coordinate cannot be turned into a Point; report it as
    # unmatched instead of raising inside the UDF.
    if lat is None or lon is None:
        return "Unknown"

    point = Point(lon, lat)  # x = longitude, y = latitude

    # Walk the two columns directly: same row order as iterrows(), without
    # building a Series for every polygon on every call.
    for geometry, borough in zip(boroughs["geometry"], boroughs["borough"]):
        if geometry.contains(point):
            return borough
    return "Unknown"

In [234]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Wrap get_borough as a Spark UDF that returns a string, so it can be applied
# to DataFrame columns (it is called with latitude first, then longitude).
get_borough_udf = udf(get_borough, StringType())

In [235]:
# Borough lookups for both ends of each trip (the UDF takes latitude, longitude)
pickup_borough_col = get_borough_udf(
    df_selected["pickup_latitude"], df_selected["pickup_longitude"]
)
dropoff_borough_col = get_borough_udf(
    df_selected["dropoff_latitude"], df_selected["dropoff_longitude"]
)

df_selected = (
    df_selected
    .withColumn("pickup_borough", pickup_borough_col)
    .withColumn("dropoff_borough", dropoff_borough_col)
)

# List the distinct drop-off boroughs that were assigned
df_selected.select("dropoff_borough").distinct().show(truncate=False)

# Number of distinct taxis in the data
df_selected.select("medallion").distinct().count()

+---------------+
|dropoff_borough|
+---------------+
|Queens         |
|Unknown        |
|Brooklyn       |
|Staten Island  |
|Manhattan      |
|Bronx          |
+---------------+



6444

### QUERY 1

In [236]:
# Peek at the raw pickup timestamps and the column types of the frame
pickup_times = df_selected.select("pickup_datetime")
pickup_times.show(n=10, truncate=False)
df_selected.printSchema()

# Distinct raw values, to see which date format the strings use
pickup_times.distinct().show(n=20, truncate=False)

+---------------+
|pickup_datetime|
+---------------+
|01-01-13 15:11 |
|06-01-13 00:18 |
|05-01-13 18:49 |
|07-01-13 23:54 |
|07-01-13 23:25 |
|07-01-13 15:27 |
|08-01-13 11:01 |
|07-01-13 12:39 |
|07-01-13 18:15 |
|07-01-13 15:33 |
+---------------+
only showing top 10 rows

root
 |-- medallion: string (nullable = true)
 |-- pickup_datetime: string (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_datetime: string (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- dropoff_borough: string (nullable = true)

+---------------+
|pickup_datetime|
+---------------+
|13-01-13 04:09 |
|13-01-13 11:30 |
|13-01-13 10:44 |
|13-01-13 10:42 |
|13-01-13 09:35 |
|13-01-13 08:21 |
|13-01-13 04:55 |
|13-01-13 03:37 |
|13-01-13 00:49 |
|13-01-13 03:16 |
|13-01-13 12:47 |
|10-01-13 15:34 |
|13-01-13 05:24 |
|13-01-

In [237]:
from pyspark.sql.functions import unix_timestamp, to_timestamp, col

df_selected = (
    df_selected
    # Parse the raw "dd-MM-yy HH:mm" strings into TimestampType columns
    .withColumn("pickup_datetime", to_timestamp(col("pickup_datetime"), "dd-MM-yy HH:mm"))
    .withColumn("dropoff_datetime", to_timestamp(col("dropoff_datetime"), "dd-MM-yy HH:mm"))
    # Derive Unix epoch time in whole seconds (unix_timestamp yields seconds,
    # not milliseconds)
    .withColumn("pickup_ts", unix_timestamp(col("pickup_datetime")))
    .withColumn("dropoff_ts", unix_timestamp(col("dropoff_datetime")))
)

# Show the parsed timestamps next to their epoch values
df_selected.select(
    "pickup_datetime", "pickup_ts", "dropoff_datetime", "dropoff_ts"
).show(10, truncate=False)


+-------------------+----------+-------------------+----------+
|pickup_datetime    |pickup_ts |dropoff_datetime   |dropoff_ts|
+-------------------+----------+-------------------+----------+
|2013-01-01 15:11:00|1357053060|2013-01-01 15:18:00|1357053480|
|2013-01-06 00:18:00|1357431480|2013-01-06 00:22:00|1357431720|
|2013-01-05 18:49:00|1357411740|2013-01-05 18:54:00|1357412040|
|2013-01-07 23:54:00|1357602840|2013-01-07 23:58:00|1357603080|
|2013-01-07 23:25:00|1357601100|2013-01-07 23:34:00|1357601640|
|2013-01-07 15:27:00|1357572420|2013-01-07 15:38:00|1357573080|
|2013-01-08 11:01:00|1357642860|2013-01-08 11:08:00|1357643280|
|2013-01-07 12:39:00|1357562340|2013-01-07 13:10:00|1357564200|
|2013-01-07 18:15:00|1357582500|2013-01-07 18:20:00|1357582800|
|2013-01-07 15:33:00|1357572780|2013-01-07 15:49:00|1357573740|
+-------------------+----------+-------------------+----------+
only showing top 10 rows



In [238]:
from pyspark.sql.functions import col

# A trip is kept when it lasts more than 0 s and at most 14400 s (4 hours)
valid_duration = (col("duration") > 0) & (col("duration") <= 14400)

df_selected = (
    df_selected
    # Trip duration in seconds: drop-off epoch minus pickup epoch
    .withColumn("duration", col("dropoff_ts") - col("pickup_ts"))
    .filter(valid_duration)
    # Only these columns are carried forward
    .select("medallion", "pickup_borough", "dropoff_borough", "pickup_ts", "dropoff_ts", "duration")
)

# Count the distinct taxis left after filtering
df_selected.select("medallion").distinct().count()

6435

In [239]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag, sum as spark_sum

# Longest gap (in seconds) between two consecutive trips of one taxi that is
# still counted as idle time: 4 hours = 14400 s.
MAX_IDLE_SECONDS = 4 * 60 * 60

# Define window partitioned by taxi (medallion) and ordered by pickup time
window_spec = Window.partitionBy("medallion").orderBy("pickup_ts")

# Get previous trip's drop-off time (null for a taxi's first trip)
df_previous_trip = df_selected.withColumn("prev_dropoff", lag("dropoff_ts").over(window_spec))

# Compute idle time (time between last drop-off and next pickup)
df_previous_trip = df_previous_trip.withColumn("idle_time", col("pickup_ts") - col("prev_dropoff"))

# Keep only plausible gaps. A negative gap means the next pickup is recorded
# before the previous drop-off (overlapping or duplicate records); summing it
# would silently reduce the taxi's total idle time, so it is excluded along
# with gaps longer than MAX_IDLE_SECONDS.
df_previous_trip = df_previous_trip.filter(
    col("idle_time").isNotNull()
    & (col("idle_time") >= 0)
    & (col("idle_time") <= MAX_IDLE_SECONDS)
)

# Total idle seconds per taxi
idle_time = df_previous_trip.groupBy("medallion").agg(spark_sum("idle_time").alias("total_idle_time"))

# Distinct taxis in the input frame (df_selected itself is not modified here)
df_selected.select("medallion").distinct().count()

6435

In [240]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum as spark_sum, col

# One unordered window per taxi, so the aggregate spans all of its trips
window_spec = Window.partitionBy("medallion")

# Total occupied time: the taxi's summed trip durations, repeated on each of
# its rows
occupied_sum_col = spark_sum("duration").over(window_spec)
df_selected_sum = df_selected.withColumn("total_occupied_time", occupied_sum_col)

# One row per taxi after de-duplicating the repeated totals
(
    df_selected_sum
    .select("medallion", "total_occupied_time")
    .distinct()
    .show(10)
)

# Distinct taxis in the input frame
df_selected.select("medallion").distinct().count()

+--------------------+-------------------+
|           medallion|total_occupied_time|
+--------------------+-------------------+
|000318C2E3E638158...|              13920|
|002B4CFC5B8920A87...|              11100|
|002E3B405B6ABEA23...|              10260|
|0030AD2648D81EE87...|               1980|
|0035520A854E4F276...|               8700|
|0036961468659D0BF...|              11700|
|003889E315BFDD985...|               4740|
|0038EF45118925A51...|              10920|
|003D87DB553C6F00F...|              12780|
|003EEA559FA618008...|              14580|
+--------------------+-------------------+
only showing top 10 rows



6435

In [241]:
# One row per taxi with its total occupied time
occupied_by_taxi = df_selected_sum.select("medallion", "total_occupied_time").distinct()

# Attach each taxi's total idle time. The inner join keeps only medallions
# that appear in both inputs.
df_utilization = occupied_by_taxi.join(idle_time, on="medallion", how="inner")

# Show merged data
df_utilization.show(10)

+--------------------+-------------------+---------------+
|           medallion|total_occupied_time|total_idle_time|
+--------------------+-------------------+---------------+
|000318C2E3E638158...|              13920|          17400|
|002B4CFC5B8920A87...|              11100|          17700|
|002E3B405B6ABEA23...|              10260|          16140|
|0030AD2648D81EE87...|               1980|            720|
|0035520A854E4F276...|               8700|          14880|
|0036961468659D0BF...|              11700|          19740|
|003889E315BFDD985...|               4740|           9480|
|0038EF45118925A51...|              10920|          15120|
|003D87DB553C6F00F...|              12780|          12180|
|003EEA559FA618008...|              14580|          38640|
+--------------------+-------------------+---------------+
only showing top 10 rows



In [242]:
from pyspark.sql.functions import round

# Utilization = occupied / (occupied + idle), a fraction rounded to 2 decimals.
# `round` here is pyspark.sql.functions.round (imported above), not the builtin.
busy = col("total_occupied_time")
idle = col("total_idle_time")
df_utilization = df_utilization.withColumn("utilization", round(busy / (busy + idle), 2))

# Show the utilization fraction next to the totals it was derived from
df_utilization.select(
    "medallion", "total_occupied_time", "total_idle_time", "utilization"
).show(10)

+--------------------+-------------------+---------------+-----------+
|           medallion|total_occupied_time|total_idle_time|utilization|
+--------------------+-------------------+---------------+-----------+
|000318C2E3E638158...|              13920|          17400|       0.44|
|002B4CFC5B8920A87...|              11100|          17700|       0.39|
|002E3B405B6ABEA23...|              10260|          16140|       0.39|
|0030AD2648D81EE87...|               1980|            720|       0.73|
|0035520A854E4F276...|               8700|          14880|       0.37|
|0036961468659D0BF...|              11700|          19740|       0.37|
|003889E315BFDD985...|               4740|           9480|       0.33|
|0038EF45118925A51...|              10920|          15120|       0.42|
|003D87DB553C6F00F...|              12780|          12180|       0.51|
|003EEA559FA618008...|              14580|          38640|       0.27|
+--------------------+-------------------+---------------+-----------+
only s

### QUERY 2

In [243]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lead, avg, col

# Per-taxi window with trips ordered by drop-off time
window_spec = Window.partitionBy("medallion").orderBy("dropoff_ts")

# NOTE(review): df_selected is overwritten here; re-running this cell on its
# own output drops each taxi's last remaining trip again, so run it once per
# fresh kernel.
# NOTE(review): negative or very long gaps are not filtered here, unlike the
# idle-time step in Query 1 which caps gaps at 14400 s — confirm this is intended.
df_selected = (
    df_selected
    # Pickup time of the same taxi's next trip (null for its last trip)
    .withColumn("next_pickup_ts", lead("pickup_ts").over(window_spec))
    # Wait for the next fare: next pickup minus this drop-off
    .withColumn("wait_time", col("next_pickup_ts") - col("dropoff_ts"))
    # Drop trips that have no following trip
    .filter(col("wait_time").isNotNull())
)

# Show results
df_selected.select(
    "medallion", "dropoff_borough", "dropoff_ts", "next_pickup_ts", "wait_time"
).show(10)

+--------------------+---------------+----------+--------------+---------+
|           medallion|dropoff_borough|dropoff_ts|next_pickup_ts|wait_time|
+--------------------+---------------+----------+--------------+---------+
|000318C2E3E638158...|      Manhattan|1358052900|    1358070300|    17400|
|000318C2E3E638158...|        Unknown|1358070420|    1358070960|      540|
|000318C2E3E638158...|        Unknown|1358071440|    1358071500|       60|
|000318C2E3E638158...|        Unknown|1358071860|    1358073060|     1200|
|000318C2E3E638158...|        Unknown|1358073660|    1358074020|      360|
|000318C2E3E638158...|        Unknown|1358074320|    1358074560|      240|
|000318C2E3E638158...|        Unknown|1358075040|    1358075400|      360|
|000318C2E3E638158...|        Unknown|1358076420|    1358077020|      600|
|000318C2E3E638158...|        Unknown|1358077380|    1358077920|      540|
|000318C2E3E638158...|        Unknown|1358078280|    1358079060|      780|
+--------------------+---

In [244]:
# Mean wait for the next fare, grouped by the borough where the trip ended
avg_wait_time = (
    df_selected
    .groupBy("dropoff_borough")
    .agg(avg("wait_time").alias("avg_wait_time"))
)

# Show results
avg_wait_time.show(10)

+---------------+------------------+
|dropoff_borough|     avg_wait_time|
+---------------+------------------+
|         Queens|           6423.75|
|        Unknown|12119.191374663073|
|       Brooklyn| 6589.554579673777|
|  Staten Island|           13935.0|
|      Manhattan|  2052.45890956484|
|          Bronx| 4989.473684210527|
+---------------+------------------+



In [245]:
# Number of rows (trips) per drop-off borough, i.e. how many observations
# are in each group
df_selected.groupBy("dropoff_borough").count().show()

+---------------+-----+
|dropoff_borough|count|
+---------------+-----+
|         Queens| 4832|
|        Unknown| 1855|
|       Brooklyn| 3188|
|  Staten Island|   12|
|      Manhattan|82866|
|          Bronx|  361|
+---------------+-----+

