# Project 2

### Imports

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import max, col, unix_timestamp, lit


### Query 0 - Data Cleansing and Setup

In [2]:
spark = SparkSession.builder \
    .appName("Project") \
    .getOrCreate()

In [3]:

df_small = spark.read.csv("input/sorted_data_smaller.csv", header=False, inferSchema=True)

In [4]:
columns = [
    "medallion",
    "hack_license",
    "pickup_datetime",
    "dropoff_datetime",
    "trip_time_in_secs",
    "trip_distance",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "payment_type",
    "fare_amount",
    "surcharge",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "total_amount"
]
df_small = df_small.toDF(*columns)

In [5]:
df_small.show(5)  # See the first 5 rows
df_small.printSchema()

+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|           medallion|        hack_license|    pickup_datetime|   dropoff_datetime|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|payment_type|fare_amount|surcharge|mta_tax|tip_amount|tolls_amount|total_amount|
+--------------------+--------------------+-------------------+-------------------+-----------------+-------------+----------------+---------------+-----------------+----------------+------------+-----------+---------+-------+----------+------------+------------+
|5EE2C4D3BF57BDB45...|E96EF8F6E6122591F...|2013-01-01 00:00:09|2013-01-01 00:00:36|               26|          0.1|       -73.99221|      40.725124|       -73.991646|       40.726658|         CSH|        2.5|

In [6]:

# Remove rows with missing or 0.0 coordinates
df_clean = df_small.filter(
    (col("pickup_longitude").isNotNull()) & (col("pickup_longitude") != 0.0) &
    (col("pickup_latitude").isNotNull()) & (col("pickup_latitude") != 0.0) &
    (col("dropoff_longitude").isNotNull()) & (col("dropoff_longitude") != 0.0) &
    (col("dropoff_latitude").isNotNull()) & (col("dropoff_latitude") != 0.0)
)

# Remove rows with missing medallions or licenses
df_clean = df_clean.filter(
    (col("medallion").isNotNull()) & (col("medallion") != "") &
    (col("hack_license").isNotNull()) & (col("hack_license") != "")
)
# Tme Model - this supports time-based queries
df_clean = df_clean.withColumn("pickup_ts", unix_timestamp("pickup_datetime")) \
                   .withColumn("dropoff_ts", unix_timestamp("dropoff_datetime"))

In [7]:
print("Original row count:", df_small.count())
print("Cleaned row count:", df_clean.count())

Original row count: 2443340
Cleaned row count: 2395511


### Query 1: Frequent Routes

In [8]:
import math
from pyspark.sql.functions import col, udf, desc
from pyspark.sql.types import StringType

def to_cell(lat, lon):
    if not (40.5 <= lat <= 41.8 and -74.25 <= lon <= -73.7):
        return None
    lat0 = 41.474937
    lon0 = -74.913585
    meters_per_deg_lat = 111320
    meters_per_deg_lon = 40075000 * math.cos(math.radians(lat0)) / 360
    cell_x = int((lon - lon0) * meters_per_deg_lon / 500) + 1
    cell_y = int((lat0 - lat) * meters_per_deg_lat / 500) + 1
    if 1 <= cell_x <= 300 and 1 <= cell_y <= 300:
        return f"{cell_x}.{cell_y}"
    return None

to_cell_udf = udf(to_cell, StringType())

df_filtered = df_clean.withColumn("pickup_cell", to_cell_udf(col("pickup_latitude"), col("pickup_longitude"))) \
                      .withColumn("dropoff_cell", to_cell_udf(col("dropoff_latitude"), col("dropoff_longitude"))) \
                      .filter(col("pickup_cell").isNotNull() & col("dropoff_cell").isNotNull())

routes = df_filtered.groupBy("pickup_cell", "dropoff_cell").count()

top10_routes = routes.orderBy(desc("count")).limit(10)

top10_routes.select(
    col("pickup_cell").alias("start_cell"),
    col("dropoff_cell").alias("end_cell"),
    col("count").alias("Number_of_Rides")
).show(truncate=False)


+----------+--------+---------------+
|start_cell|end_cell|Number_of_Rides|
+----------+--------+---------------+
|155.160   |154.162 |2200           |
|154.162   |155.160 |2042           |
|156.159   |154.162 |2004           |
|157.161   |154.162 |1935           |
|154.162   |156.161 |1854           |
|154.162   |156.159 |1763           |
|154.162   |157.161 |1751           |
|158.159   |157.161 |1711           |
|155.160   |156.159 |1606           |
|154.162   |155.164 |1578           |
+----------+--------+---------------+



In [9]:
### PART 2


In [15]:
from datetime import timedelta
import time

# As the data isn't a stream meaning we don't have a Kafka or something set up, this is the next best thing: It starts sliding the window to imitate 
# the streaming data that is coming and updates the routes accordingly.

# Uncomment this line to achieve longer run time with full data (i stopped at 23 minutes)
# df_sample = df_clean


df_sample = df_clean.limit(10000)

df_pd = df_sample.withColumn("pickup_cell", to_cell_udf(col("pickup_latitude"), col("pickup_longitude"))) \
                 .withColumn("dropoff_cell", to_cell_udf(col("dropoff_latitude"), col("dropoff_longitude"))) \
                 .filter(col("pickup_cell").isNotNull() & col("dropoff_cell").isNotNull()) \
                 .orderBy("dropoff_datetime") \
                 .select("pickup_datetime", "dropoff_datetime", "pickup_cell", "dropoff_cell") \
                 .collect()

print("Collected sample into memory:", len(df_pd), "rows")

window_minutes = 30
results = []
prev_top10_keys = []

for i, row in enumerate(df_pd):
    now = row["dropoff_datetime"]
    pickup_time = row["pickup_datetime"]
    window_start = now - timedelta(minutes=window_minutes)

    j = i
    while j >= 0 and df_pd[j]["dropoff_datetime"] >= window_start:
        j -= 1
    window_rows = df_pd[j+1:i+1]

    route_counts = {}
    for r in window_rows:
        key = (r["pickup_cell"], r["dropoff_cell"])
        route_counts[key] = route_counts.get(key, 0) + 1

    top10 = sorted(route_counts.items(), key=lambda x: -x[1])[:10]
    top10_keys = [route for route, _ in top10]

    if top10_keys != prev_top10_keys:
        prev_top10_keys = top10_keys.copy()

        while len(top10) < 10:
            top10.append(((None, None), None))

        delay = round(time.time() - time.mktime(now.timetuple()), 3)

        row_out = [pickup_time, now]
        for (start, end), count in top10:
            row_out.extend([start, end, count])
        row_out.append(delay)
        results.append(row_out)

        if len(results) <= 5:
            print(f"\nUpdate #{len(results)}")
            print("Pickup:", pickup_time)
            print("Dropoff:", now)
            for idx, ((s, e), c) in enumerate(top10, 1):
                print(f"  Route {idx}: {s} → {e} — {c} rides")
            print("Delay:", delay, "seconds")

print("\nDone. Total updates triggered:", len(results))

if len(results) > 5:
    print("\nLast 5 updates:")
    for row in results[-5:]:
        print(f"\nPickup: {row[0]}, Dropoff: {row[1]}")
        for i in range(2, 32, 3):
            print(f"  Route {(i - 2)//3 + 1}: {row[i]} → {row[i+1]} — {row[i+2]} rides")
        print("Delay:", row[-1], "seconds")


Collected sample into memory: 9997 rows

Update #1
Pickup: 2013-01-01 00:00:09
Dropoff: 2013-01-01 00:00:36
  Route 1: 154.167 → 154.167 — 1 rides
  Route 2: None → None — None rides
  Route 3: None → None — None rides
  Route 4: None → None — None rides
  Route 5: None → None — None rides
  Route 6: None → None — None rides
  Route 7: None → None — None rides
  Route 8: None → None — None rides
  Route 9: None → None — None rides
  Route 10: None → None — None rides
Delay: 385920091.993 seconds

Update #2
Pickup: 2013-01-01 00:00:00
Dropoff: 2013-01-01 00:03:00
  Route 1: 154.167 → 154.167 — 1 rides
  Route 2: 151.172 → 158.152 — 1 rides
  Route 3: None → None — None rides
  Route 4: None → None — None rides
  Route 5: None → None — None rides
  Route 6: None → None — None rides
  Route 7: None → None — None rides
  Route 8: None → None — None rides
  Route 9: None → None — None rides
  Route 10: None → None — None rides
Delay: 385919947.993 seconds

Update #3
Pickup: 2013-01-01 00:02