## Start Spark and Load Data

In [1]:
from pyspark.sql import SparkSession

# One Spark session for the notebook; getOrCreate() reuses a live session if any.
spark = SparkSession.builder.appName("Walrus_NYC_FootTraffic_311").getOrCreate()

# Raw extracts on the local disk.
turnstile_path = "/home/wlevine/Walrus/RawData/MTA_Subway_Hourly_Ridership__2020-2024_20251124 (1).csv"
sr311_path = "/home/wlevine/Walrus/RawData/311_Service_Requests_from_2010_to_Present_20251124.csv"

# Week of turnstile data: first row is the header, column types are inferred.
turnstile_raw = spark.read.options(header="true", inferSchema="true").csv(turnstile_path)

# Same-week 311 data, read with the same options.
sr311_raw = spark.read.options(header="true", inferSchema="true").csv(sr311_path)

# Preview the first five rows of each raw frame.
turnstile_raw.show(5)
sr311_raw.show(5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/24 11:48:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/24 11:48:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/11/24 11:48:53 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/11/24 11:48:53 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
25/11/24 11:48:53 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
25/11/24 11:48:53 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
                                                                                

+--------------------+------------+------------------+--------------------+---------+--------------+--------------------+---------+---------+---------+---------+--------------------+
|   transit_timestamp|transit_mode|station_complex_id|     station_complex|  borough|payment_method| fare_class_category|ridership|transfers| latitude|longitude|        Georeference|
+--------------------+------------+------------------+--------------------+---------+--------------+--------------------+---------+---------+---------+---------+--------------------+
|02/05/2024 12:00:...|      subway|               349|       Junius St (3)| Brooklyn|     metrocard|Metrocard - Full ...|        4|        1|40.663513|-73.90245|POINT (-73.90245 ...|
|02/05/2024 12:00:...|      subway|               272|         36 St (M,R)|   Queens|     metrocard|Metrocard - Unlim...|        1|        0| 40.75204|-73.92878|POINT (-73.92878 ...|
|02/05/2024 12:00:...|      subway|                84|85 St-Forest Pkwy...|   Queens|

25/11/24 11:48:59 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+--------------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+--------------------+-----------------+-----------------+---------------------+---------------------+------------+----------------+--------------------+-------------+------+--------+----------------------+------------------------------+---------------+----------+---------+--------------------------+--------------------------+----------------------+------------------+------------+------------+--------------------+---------------------+-------------------+------------------------+---------+----------------------+--------------+---------------+--------------------+
|Unique Key|        Created Date|         Closed Date|Agency|         Agency Name|      Complaint Type|          Descriptor|       Location Type|Incident Zip|    Incident Address|         Street Name|   Cross Street 1|   Cross Street 2|Intersection Str

## Turnstile Data

In [2]:
from pyspark.sql.functions import (
    col,
    to_timestamp,
    to_date,
    hour,
    dayofweek,
    when
)

# Narrow the raw turnstile pull down to the columns used by the rest of the notebook.
turnstile_sel = turnstile_raw.select(
    [
        "transit_timestamp",
        "transit_mode",
        "station_complex_id",
        "station_complex",
        "borough",
        "payment_method",
        "fare_class_category",
        "ridership",
        "transfers",
        "latitude",
        "longitude",
        "Georeference",
    ]
)

# Convert the timestamp text into a real timestamp column named "dt".
# Expected layout, e.g.: 02/05/2024 12:00:00 AM
# Rows left with a null dt (timestamp did not parse) are discarded right away.
turnstile_sel = (
    turnstile_sel
        .withColumn("dt", to_timestamp(col("transit_timestamp"), "MM/dd/yyyy hh:mm:ss a"))
        .where(col("dt").isNotNull())
)

# Calendar features derived from dt; later cells group and join on these.
turnstile_sel = turnstile_sel.withColumn("date", to_date(col("dt")))
turnstile_sel = turnstile_sel.withColumn("hour", hour(col("dt")))
# dayofweek() numbering: 1 = Sun ... 7 = Sat
turnstile_sel = turnstile_sel.withColumn("weekday_num", dayofweek(col("dt")))
# Weekend flag: 1 for Sunday (1) or Saturday (7), otherwise 0.
turnstile_sel = turnstile_sel.withColumn(
    "is_weekend",
    when(col("weekday_num").isin([1, 7]), 1).otherwise(0),
)

# Force ridership / transfers to integers, then add them into one flow metric.
turnstile_sel = turnstile_sel.withColumn("ridership", col("ridership").cast("int"))
turnstile_sel = turnstile_sel.withColumn("transfers", col("transfers").cast("int"))
turnstile_sel = turnstile_sel.withColumn("total_flow", col("ridership") + col("transfers"))

# Coarse time-of-day label for visuals:
# hours 7-9 -> "AM Peak", hours 16-18 -> "PM Peak", everything else -> "Off-Peak".
turnstile_sel = turnstile_sel.withColumn(
    "tod_bucket",
    when((col("hour") >= 7) & (col("hour") < 10), "AM Peak")
    .when((col("hour") >= 16) & (col("hour") < 19), "PM Peak")
    .otherwise("Off-Peak"),
)

# Final column order for the turnstile table: time fields first, then the
# station / fare descriptors, then the counts and coordinates.
turnstile_final = turnstile_sel.select(
    [
        "dt", "date", "hour", "weekday_num", "is_weekend", "tod_bucket",
        "transit_mode", "station_complex_id", "station_complex", "borough",
        "payment_method", "fare_class_category",
        "ridership", "transfers", "total_flow",
        "latitude", "longitude", "Georeference",
    ]
)

# Sanity check: first 20 rows untruncated, followed by the schema.
turnstile_final.show(20, truncate=False)
turnstile_final.printSchema()

+-------------------+----------+----+-----------+----------+----------+------------+------------------+---------------------------------+---------+--------------+--------------------------------+---------+---------+----------+---------+----------+----------------------------+
|dt                 |date      |hour|weekday_num|is_weekend|tod_bucket|transit_mode|station_complex_id|station_complex                  |borough  |payment_method|fare_class_category             |ridership|transfers|total_flow|latitude |longitude |Georeference                |
+-------------------+----------+----+-----------+----------+----------+------------+------------------+---------------------------------+---------+--------------+--------------------------------+---------+---------+----------+---------+----------+----------------------------+
|2024-02-05 00:00:00|2024-02-05|0   |2          |0         |Off-Peak  |subway      |349               |Junius St (3)                    |Brooklyn |metrocard     |Metroca

In [3]:
from pyspark.sql import functions as F

# working copy of the turnstile table; the feature cells below keep reassigning `ts`
ts = turnstile_final

In [4]:
# Missing ridership / transfers become 0; a missing total_flow is rebuilt
# from the (already filled) ridership + transfers.
ts = ts.withColumn("ridership", F.coalesce(F.col("ridership"), F.lit(0)))
ts = ts.withColumn("transfers", F.coalesce(F.col("transfers"), F.lit(0)))
ts = ts.withColumn(
    "total_flow",
    F.coalesce(F.col("total_flow"), F.col("ridership") + F.col("transfers")),
)

# Keep only rows with strictly positive total flow.
ts = ts.where(F.col("total_flow") > 0)

In [5]:
# Short weekday label for chart axes (pattern "E").
# NOTE(review): the 311 frame builds its day_name with "EEEE" — confirm the
# two label styles are meant to differ.
ts = ts.withColumn(
    "day_name",
    F.date_format(F.col("date"), "E"),
)

In [6]:
# Human-readable counterpart of the 0/1 is_weekend flag, handy when slicing.
ts = ts.withColumn(
    "week_part",
    F.when(F.col("is_weekend") == 1, "Weekend").otherwise("Weekday"),
)

In [7]:
# hour bucket across the whole week (mon 0:00 = 0, ..., sun 23:00 = 167)
# weekday_num comes from dayofweek(), which numbers days 1 = Sun ... 7 = Sat.
# Multiplying it by 24 directly would put Sunday 0:00 at 24 and Monday 0:00 at 48,
# so first shift it to a Monday-based index: Mon -> 0, Tue -> 1, ..., Sun -> 6.
ts = ts.withColumn(
    "hour_of_week",
    ((F.col("weekday_num") + F.lit(5)) % F.lit(7)) * F.lit(24) + F.col("hour")
)

In [8]:
# Normalise the payment method text: lower-case it, then strip outer whitespace.
ts = ts.withColumn(
    "payment_method_clean",
    F.trim(F.lower(F.col("payment_method"))),
)

# Coarse payment grouping so plots stay readable; anything that is not exactly
# "metrocard" or "omny" after cleaning falls into "Other".
ts = ts.withColumn(
    "payment_group",
    F.when(F.col("payment_method_clean") == F.lit("metrocard"), F.lit("MetroCard"))
     .when(F.col("payment_method_clean") == F.lit("omny"), F.lit("OMNY"))
     .otherwise(F.lit("Other")),
)

In [9]:
# Map the long fare_class_category strings onto a handful of buckets.
# Rules are tried in order; the first LIKE pattern that matches wins and
# anything unmatched becomes "Other".
fare_rules = [
    ("%Full Fare%", "Full Fare"),
    ("%Unlimited%", "Unlimited"),
    ("%Discount%", "Discount/Reduced"),
    ("%Fair Fare%", "Fair Fares"),
]

first_pattern, first_label = fare_rules[0]
fare_bucket_expr = F.when(F.col("fare_class_category").like(first_pattern), first_label)
for like_pattern, bucket_label in fare_rules[1:]:
    fare_bucket_expr = fare_bucket_expr.when(
        F.col("fare_class_category").like(like_pattern), bucket_label
    )

ts = ts.withColumn("fare_bucket", fare_bucket_expr.otherwise("Other"))

In [10]:
# Collapse every run of whitespace in the station name into a single space.
ts = ts.withColumn(
    "station_clean",
    F.regexp_replace(F.col("station_complex"), r"\s+", " "),
)

# Coordinates rounded to 4 decimals, meant as a coarse key for grouping or
# joining with the 311 data later on.
ts = ts.withColumn("lat_4", F.round(F.col("latitude"), 4))
ts = ts.withColumn("lon_4", F.round(F.col("longitude"), 4))

In [11]:
# this is the version i'll use going forward
# NOTE(review): the join / export cells further down start from turnstile_final,
# not turnstile_model_df, so the columns and filters added to `ts` do not reach
# the exported panel — confirm which frame is intended.
turnstile_model_df = ts

# inspect the schema and the first 5 rows without truncating cell values
turnstile_model_df.printSchema()
turnstile_model_df.show(5, truncate=False)

root
 |-- dt: timestamp (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- weekday_num: integer (nullable = true)
 |-- is_weekend: integer (nullable = false)
 |-- tod_bucket: string (nullable = false)
 |-- transit_mode: string (nullable = true)
 |-- station_complex_id: string (nullable = true)
 |-- station_complex: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- fare_class_category: string (nullable = true)
 |-- ridership: integer (nullable = false)
 |-- transfers: integer (nullable = false)
 |-- total_flow: integer (nullable = false)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Georeference: string (nullable = true)
 |-- day_name: string (nullable = true)
 |-- week_part: string (nullable = false)
 |-- hour_of_week: integer (nullable = true)
 |-- payment_method_clean: string (nullable = true)
 |-- payment_group: string (nullable = false)

## 311 Data

In [12]:
from pyspark.sql.functions import (
    to_timestamp, to_date, hour, dayofweek, date_format,
    trim, lower, regexp_replace, col, when,
    substring_index, concat_ws, round
)
from pyspark.sql.types import DoubleType

# Take the raw 311 frame and keep only the columns this project uses.
df311 = sr311_raw.select(
    [
        "Created Date", "Closed Date",
        "Agency", "Agency Name",
        "Complaint Type", "Descriptor",
        "Location Type",
        "Incident Zip", "Incident Address", "Street Name",
        "Cross Street 1", "Cross Street 2",
        "Intersection Street 1", "Intersection Street 2",
        "City", "Borough",
        "Latitude", "Longitude",
    ]
)

# Shorter snake_case names for the columns that are reused below.
# Columns not listed here (e.g. Agency, City, Borough) keep their original names.
for old_name, new_name in [
    ("Created Date", "created_dt"),
    ("Closed Date", "closed_dt"),
    ("Complaint Type", "complaint"),
    ("Descriptor", "descriptor"),
    ("Location Type", "location_type"),
    ("Incident Zip", "zip"),
    ("Incident Address", "address"),
    ("Street Name", "street"),
    ("Cross Street 1", "cross1"),
    ("Cross Street 2", "cross2"),
    ("Intersection Street 1", "inter1"),
    ("Intersection Street 2", "inter2"),
]:
    df311 = df311.withColumnRenamed(old_name, new_name)

# Parse the creation time and derive the same calendar fields as the turnstile table.
# Unlike the turnstile cell, rows with a null dt are not filtered out here.
df311 = df311.withColumn("dt", to_timestamp(col("created_dt"), "MM/dd/yyyy hh:mm:ss a"))
df311 = df311.withColumn("date", to_date(col("dt")))
df311 = df311.withColumn("hour", hour(col("dt")))
df311 = df311.withColumn("weekday_num", dayofweek(col("dt")))
df311 = df311.withColumn("day_name", date_format(col("dt"), "EEEE"))

# Weekend flag: 1 when weekday_num is 1 or 7, otherwise 0.
df311 = df311.withColumn(
    "is_weekend",
    when(col("weekday_num").isin([1, 7]), 1).otherwise(0),
)

# Coarse time-of-day bucket:
# hours 7-9 -> "AM Peak", hours 16-18 -> "PM Peak", everything else -> "Off-Peak".
df311 = df311.withColumn(
    "tod_bucket",
    when((col("hour") >= 7) & (col("hour") < 10), "AM Peak")
    .when((col("hour") >= 16) & (col("hour") < 19), "PM Peak")
    .otherwise("Off-Peak"),
)

# normalize complaint type categories: trim, lower-case, whitespace runs -> "_"
df311 = (
    df311.withColumn("complaint_clean", lower(trim(col("complaint"))))
         .withColumn("complaint_clean", regexp_replace("complaint_clean", r"\s+", "_"))
)

# group complaints into higher-level buckets (rules are tried top to bottom,
# the first regex that matches anywhere in complaint_clean wins)
# "rat" is only accepted as a standalone token (rat / rats, delimited by a
# non-letter or the string boundary); as a bare substring it would also fire
# inside unrelated words such as "literature" or "operations".
df311 = df311.withColumn(
    "complaint_group",
    when(col("complaint_clean").rlike("noise|music|party"), "noise")
    .when(col("complaint_clean").rlike("sanitation|dirty|sweeping"), "sanitation")
    .when(col("complaint_clean").rlike(r"rodent|mouse|(^|[^a-z])rats?([^a-z]|$)"), "rodent")
    .when(col("complaint_clean").rlike("heat|hot_water"), "heat_hot_water")
    .otherwise("other")
)

# Borough text: strip outer whitespace, then lower-case.
df311 = df311.withColumn("borough_clean", lower(trim(col("Borough"))))

# Coordinates cast to doubles, plus copies rounded to 4 decimals.
# `round` is the pyspark.sql.functions import from this cell, not Python's builtin.
df311 = df311.withColumn("lat", col("Latitude").cast(DoubleType()))
df311 = df311.withColumn("lon", col("Longitude").cast(DoubleType()))
df311 = df311.withColumn("lat_4", round(col("lat"), 4))
df311 = df311.withColumn("lon_4", round(col("lon"), 4))

# Final column order for the 311 table: time fields, complaint fields, agency,
# location text, then coordinates.
# "agency" and "city" are spelled lower-case here although the frame's columns
# are "Agency" / "City" — presumably relying on case-insensitive column lookup.
sr311_final = df311.select(
    [
        "dt", "date", "hour", "weekday_num", "day_name", "is_weekend", "tod_bucket",
        "complaint", "descriptor", "complaint_clean", "complaint_group",
        "agency", "Agency Name",
        "location_type", "zip", "address", "street",
        "cross1", "cross2", "inter1", "inter2",
        "city", "borough_clean",
        "lat", "lon", "lat_4", "lon_4",
    ]
)

# First 20 rows, untruncated.
sr311_final.show(20, truncate=False)

+-------------------+----------+----+-----------+--------+----------+----------+--------------------------+--------------------------------------------+--------------------------+---------------+------+--------------------------------------------------+--------------------------+-----+------------------------------+------------------------+-----------------+-----------------------+-----------------+-----------------------+----------------+-------------+--------------+---------------+-------+--------+
|dt                 |date      |hour|weekday_num|day_name|is_weekend|tod_bucket|complaint                 |descriptor                                  |complaint_clean           |complaint_group|agency|Agency Name                                       |location_type             |zip  |address                       |street                  |cross1           |cross2                 |inter1           |inter2                 |city            |borough_clean|lat           |lon            |lat_4 

## Joining Data

In [13]:
from pyspark.sql import functions as F

# 4-decimal rounded coordinates on turnstile_final; the grid key below is built from them.
# NOTE(review): turnstile_model_df already has lat_4 / lon_4 — confirm that
# turnstile_final (without the later cleaning steps) is the intended input here.
turnstile_final = turnstile_final.withColumn("lat_4", F.round(F.col("latitude"), 4))
turnstile_final = turnstile_final.withColumn("lon_4", F.round(F.col("longitude"), 4))

In [14]:
# check the schema now that lat_4 / lon_4 have been added
turnstile_final.printSchema()

root
 |-- dt: timestamp (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- weekday_num: integer (nullable = true)
 |-- is_weekend: integer (nullable = false)
 |-- tod_bucket: string (nullable = false)
 |-- transit_mode: string (nullable = true)
 |-- station_complex_id: string (nullable = true)
 |-- station_complex: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- fare_class_category: string (nullable = true)
 |-- ridership: integer (nullable = true)
 |-- transfers: integer (nullable = true)
 |-- total_flow: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Georeference: string (nullable = true)
 |-- lat_4: double (nullable = true)
 |-- lon_4: double (nullable = true)



In [15]:
# Grid key = "<lat_4>_<lon_4>"; both frames get it from the same expression so
# the keys are formatted identically on each side of the join.
grid_key_expr = F.concat_ws("_", F.col("lat_4"), F.col("lon_4"))

turn_with_grid = turnstile_final.withColumn("grid_key", grid_key_expr)
sr311_with_grid = sr311_final.withColumn("grid_key", grid_key_expr)

In [16]:
from pyspark.sql import functions as F

# 311 complaints per (date, hour, grid cell): the overall count plus one
# conditional count for each complaint group of interest.
sr311_agg = sr311_with_grid.groupBy("date", "hour", "grid_key").agg(
    F.count("*").alias("complaints_hour"),
    *[
        F.sum(F.when(F.col("complaint_group") == group_name, 1).otherwise(0)).alias(out_name)
        for group_name, out_name in (
            ("noise", "noise_hour"),
            ("heat_hot_water", "heat_hour"),
            ("other", "other_hour"),
        )
    ],
)

In [17]:
# Left join: every turnstile row is kept, and rows with no matching 311
# aggregate for the same date / hour / grid cell get nulls in the count columns.
joined = turn_with_grid.join(sr311_agg, on=["date", "hour", "grid_key"], how="left")

In [18]:
# No matching 311 row means zero complaints, so null counts become 0.
joined = joined.fillna(
    0,
    subset=["complaints_hour", "noise_hour", "heat_hour", "other_hour"],
)

In [19]:
# confirm the joined frame carries the turnstile columns plus the 311 hourly counts
joined.printSchema()

root
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- grid_key: string (nullable = false)
 |-- dt: timestamp (nullable = true)
 |-- weekday_num: integer (nullable = true)
 |-- is_weekend: integer (nullable = false)
 |-- tod_bucket: string (nullable = false)
 |-- transit_mode: string (nullable = true)
 |-- station_complex_id: string (nullable = true)
 |-- station_complex: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- fare_class_category: string (nullable = true)
 |-- ridership: integer (nullable = true)
 |-- transfers: integer (nullable = true)
 |-- total_flow: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- Georeference: string (nullable = true)
 |-- lat_4: double (nullable = true)
 |-- lon_4: double (nullable = true)
 |-- complaints_hour: long (nullable = false)
 |-- noise_hour: long (nullable = false)
 |-- heat_hour: long (nul

## Exporting

In [20]:
from pyspark.sql import functions as F

# Ridership panel: one row per date / hour / grid cell / station, summing the
# flow metrics across payment methods and fare classes.
turn_panel = turn_with_grid.groupBy(
    "date", "hour", "weekday_num", "is_weekend", "tod_bucket",
    "grid_key", "borough", "station_complex",
).agg(
    *[
        F.sum(source_col).alias(sum_name)
        for source_col, sum_name in (
            ("total_flow", "flow_sum"),
            ("ridership", "ridership_sum"),
            ("transfers", "transfers_sum"),
        )
    ]
)

# Peek at the result and its schema.
turn_panel.show(5, truncate=False)
turn_panel.printSchema()



+----------+----+-----------+----------+----------+----------------+--------+----------------------------------+--------+-------------+-------------+
|date      |hour|weekday_num|is_weekend|tod_bucket|grid_key        |borough |station_complex                   |flow_sum|ridership_sum|transfers_sum|
+----------+----+-----------+----------+----------+----------------+--------+----------------------------------+--------+-------------+-------------+
|2024-02-05|0   |2          |0         |Off-Peak  |40.8572_-73.8676|Bronx   |Pelham Pkwy (2,5)                 |19      |18           |1            |
|2024-02-05|0   |2          |0         |Off-Peak  |40.8839_-73.8626|Bronx   |219 St (2,5)                      |6       |6            |0            |
|2024-02-05|1   |2          |0         |Off-Peak  |40.6227_-74.0284|Brooklyn|86 St (R)                         |10      |9            |1            |
|2024-02-05|1   |2          |0         |Off-Peak  |40.7114_-73.8896|Queens  |Middle Village-Metropol

                                                                                

In [21]:
# Complaint panel: one row per date / hour / grid cell / borough with the total
# complaint count and a conditional count per complaint group.
sr311_panel = sr311_with_grid.groupBy(
    "date", "hour", "weekday_num", "is_weekend", "tod_bucket",
    "grid_key", "borough_clean",
).agg(
    F.count("*").alias("complaints_total"),
    *[
        F.sum(F.when(F.col("complaint_group") == group_name, 1).otherwise(0)).alias(out_name)
        for group_name, out_name in (
            ("noise", "complaints_noise"),
            ("heat_hot_water", "complaints_heat"),
            ("other", "complaints_other"),
        )
    ],
)

# Peek at the result and its schema.
sr311_panel.show(5, truncate=False)
sr311_panel.printSchema()

[Stage 12:=====>                                                  (1 + 10) / 11]

+----------+----+-----------+----------+----------+----------------+-------------+----------------+----------------+---------------+----------------+
|date      |hour|weekday_num|is_weekend|tod_bucket|grid_key        |borough_clean|complaints_total|complaints_noise|complaints_heat|complaints_other|
+----------+----+-----------+----------+----------+----------------+-------------+----------------+----------------+---------------+----------------+
|2024-02-12|23  |2          |0         |Off-Peak  |40.844_-73.8195 |bronx        |1               |0               |0              |1               |
|2024-02-12|23  |2          |0         |Off-Peak  |40.8944_-73.8577|bronx        |1               |0               |1              |0               |
|2024-02-12|20  |2          |0         |Off-Peak  |40.6155_-73.9938|brooklyn     |1               |0               |0              |1               |
|2024-02-12|20  |2          |0         |Off-Peak  |40.7282_-73.982 |manhattan    |1               |1

                                                                                

In [22]:
# join on time + grid
# Left join from the ridership panel: every station-hour is kept, and
# station-hours without a matching 311 row get null complaint columns,
# which are zero-filled at the end of the chain.
hourly_panel = (
    turn_panel.alias("t")
        .join(
            sr311_panel.alias("c"),
            on=[
                "date",
                "hour",
                "weekday_num",
                "is_weekend",
                "tod_bucket",
                "grid_key"
            ],
            how="left"
        )
        # keep station + borough names from turnstile, 311 borough as backup.
        # borough_clean was lower-cased when the 311 frame was built, so it is
        # title-cased here ("bronx" -> "Bronx") to match the turnstile spelling;
        # otherwise borough_final could mix "Bronx" and "bronx" labels.
        .withColumn(
            "borough_final",
            F.coalesce(F.col("t.borough"), F.initcap(F.col("c.borough_clean")))
        )
        .fillna(
            {
                "complaints_total": 0,
                "complaints_noise": 0,
                "complaints_heat": 0,
                "complaints_other": 0,
            }
        )
)

# quick look at the combined panel and its schema
hourly_panel.show(10, truncate=False)
hourly_panel.printSchema()

                                                                                

+----------+----+-----------+----------+----------+----------------+---------+----------------------------------+--------+-------------+-------------+-------------+----------------+----------------+---------------+----------------+-------------+
|date      |hour|weekday_num|is_weekend|tod_bucket|grid_key        |borough  |station_complex                   |flow_sum|ridership_sum|transfers_sum|borough_clean|complaints_total|complaints_noise|complaints_heat|complaints_other|borough_final|
+----------+----+-----------+----------+----------+----------------+---------+----------------------------------+--------+-------------+-------------+-------------+----------------+----------------+---------------+----------------+-------------+
|2024-02-05|0   |2          |0         |Off-Peak  |40.8572_-73.8676|Bronx    |Pelham Pkwy (2,5)                 |19      |18           |1            |NULL         |0               |0               |0              |0               |Bronx        |
|2024-02-05|0   

In [23]:
# Destination folder for the cleaned hourly panel.
output_path = "/home/wlevine/Walrus/Processed/hourly_panel"

# Write the panel as parquet in 50 partitions (tune the count to the cluster),
# replacing whatever an earlier run left at output_path.
hourly_panel.repartition(50).write.parquet(output_path, mode="overwrite")

25/11/24 11:53:10 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/11/24 11:53:10 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/11/24 11:53:10 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
25/11/24 11:53:10 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
25/11/24 11:53:10 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
25/11/24 11:53:10 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 58.46% for 13 writers
25/11/24 11:53:10 WARN MemoryManager: Total allocation exceeds 95.

In [24]:
from pyspark.sql import SparkSession

# Get a session for the modeling part. getOrCreate() returns the session that is
# already running when there is one (the WARN in this cell's output says so), in
# which case the app name set here presumably does not take effect — confirm.
spark = SparkSession.builder.appName("Walrus_Modeling").getOrCreate()

# Read the hourly panel back from the parquet folder.
panel_path = "/home/wlevine/Walrus/Processed/hourly_panel"
panel = spark.read.parquet(panel_path)

# Quick look at a few rows and the schema as read from disk.
panel.show(5)
panel.printSchema()

25/11/24 11:53:21 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+----------+----+-----------+----------+----------+----------------+---------+--------------------+--------+-------------+-------------+-------------+----------------+----------------+---------------+----------------+-------------+
|      date|hour|weekday_num|is_weekend|tod_bucket|        grid_key|  borough|     station_complex|flow_sum|ridership_sum|transfers_sum|borough_clean|complaints_total|complaints_noise|complaints_heat|complaints_other|borough_final|
+----------+----+-----------+----------+----------+----------------+---------+--------------------+--------+-------------+-------------+-------------+----------------+----------------+---------------+----------------+-------------+
|2024-02-10|   3|          7|         1|  Off-Peak|  40.6932_-73.99| Brooklyn|Court St (R)/Boro...|      12|           12|            0|         NULL|               0|               0|              0|               0|     Brooklyn|
|2024-02-07|  14|          4|         0|  Off-Peak|  40.6814_-73.88| Bro