### Preprocessing of traffic data


In [3]:
import os
import sys
sys.path.append("../")
from scripts.traffic_volume_scrape import get_traffic_data
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, stddev, mean, col, unix_timestamp, abs, round, to_date, count, concat, sum, lpad, weekofyear, avg, when
from pyspark.sql import Window
from pyspark.sql.types import LongType, IntegerType, DoubleType, StringType
import geopandas as gpd
import folium
import numpy as np

Scrape the traffic volume data from the NYC Open Data portal

In [4]:
# Download the traffic-volume CSV using the project's scrape helper
# (imported from scripts.traffic_volume_scrape). The output logged by this cell
# lists the landing and raw traffic_data.csv paths that were written.
get_traffic_data()

https://data.cityofnewyork.us/resource/7ym2-wayt.csv?$limit=1673725
../data/landing/traffic_data/traffic_data.csv
Completed traffic data download
https://data.cityofnewyork.us/resource/7ym2-wayt.csv?$limit=1673725
../data/raw/traffic_data/traffic_data.csv
Completed traffic data download


In [5]:
# create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("Preprocess Traffic Data")
    .config("spark.sql.repl.eagerEval.enabled", True)  # eager DataFrame rendering in the REPL/notebook
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")  # pin the session time zone
    .getOrCreate()
)

# Read the scraped traffic CSV; the first row holds the column names.
# No schema is supplied or inferred, so every column is loaded as a string.
traffic_df = (
    spark.read.format("csv")
    .option("header", "true")
    .load("../data/raw/traffic_data/traffic_data.csv")
)

# Report rows x columns as a quick sanity check on the load
num_instances, num_features = traffic_df.count(), len(traffic_df.columns)
print(f"The shape of the traffic dataframe: {num_instances} x {num_features}")

your 131072x1 screen size is bogus. expect trouble
24/08/25 13:55:02 WARN Utils: Your hostname, DESKTOP-N0VCA8U resolves to a loopback address: 127.0.1.1; using 172.23.106.248 instead (on interface eth0)
24/08/25 13:55:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/25 13:55:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/25 13:55:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/08/25 13:55:06 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.

The shape of the traffic dataframe: 1673725 x 14


                                                                                

Engineer a `date` feature from the separate year (`yr`), month (`m`) and day (`d`) columns

In [6]:
# Assemble a "yyyy-MM-dd" string from the yr / m / d columns and parse it into
# a date column. Month and day are left-padded with '0' to two characters so
# the assembled string matches the parse pattern.
month_part = lpad(col("m").cast("string"), 2, '0')
day_part = lpad(col("d").cast("string"), 2, '0')
iso_date_string = concat(col("yr"), lit("-"), month_part, lit("-"), day_part)

traffic_df = traffic_df.withColumn("date", to_date(iso_date_string, "yyyy-MM-dd"))

# Shape check: one extra column ("date") is expected after this step
num_instances = traffic_df.count()
num_features = len(traffic_df.columns)
print(f"The shape of the traffic dataframe: {num_instances} x {num_features}")

[Stage 4:>                                                          (0 + 4) / 4]

The shape of the traffic dataframe: 1673725 x 15


                                                                                

In [7]:
# Analysis window; both endpoints are inclusive
start_date = "2022-10-01"
end_date = "2023-03-31"

# Keep only the rows dated inside the window, then sort them chronologically.
# Column.between(a, b) is the inclusive test (date >= a) & (date <= b).
inside_window = col("date").between(start_date, end_date)
traffic_df = traffic_df.filter(inside_window).orderBy("date")

# Shape check after restricting the date range
num_instances = traffic_df.count()
num_features = len(traffic_df.columns)
print(f"The shape of the traffic dataframe: {num_instances} x {num_features}")

[Stage 7:>                                                          (0 + 4) / 4]

The shape of the traffic dataframe: 32544 x 15


                                                                                

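Filter outliers: keep only the rows whose volume lies within $\sqrt{2\ln N}$ standard deviations of the mean volume, where $N$ is the number of rows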

In [8]:
# Outlier filter on traffic volume.
#
# Rows are kept when `vol` lies within `bound_sd` standard deviations of the
# mean, with bound_sd = sqrt(2 * ln(N)) and N the number of rows.
#
# `vol` is cast to double explicitly: the CSV was loaded without a schema, so
# the column is a string and would otherwise be numeric only through implicit
# coercion. The cast is applied to a column expression only, so the schema of
# traffic_df itself is left unchanged.
vol_numeric = col("vol").cast("double")

# Mean, standard deviation and row count are gathered in ONE aggregation pass
# instead of a separate count() job. count(lit(1)) counts every row, including
# rows whose vol is null, which matches DataFrame.count().
stats = traffic_df.agg(
    mean(vol_numeric).alias("mean"),
    stddev(vol_numeric).alias("stddev"),
    count(lit(1)).alias("n_rows"),
).collect()[0]

column_mean = stats["mean"]
column_stddev = stats["stddev"]

# mean / stddev come back as None when there are too few non-null volumes to
# compute them; in that case there is nothing meaningful to filter on, so the
# frame is left as-is rather than failing on None arithmetic.
if column_mean is not None and column_stddev is not None:
    bound_sd = float(np.sqrt(2 * np.log(stats["n_rows"])))
    lower_bound = column_mean - bound_sd * column_stddev
    upper_bound = column_mean + bound_sd * column_stddev

    # between() is inclusive on both ends: (vol >= lower) & (vol <= upper)
    traffic_df = traffic_df.filter(vol_numeric.between(lower_bound, upper_bound))

# Shape check after removing outliers
num_instances, num_features = traffic_df.count(), len(traffic_df.columns)
print(f"The shape of the traffic dataframe: {num_instances} x {num_features}")



The shape of the traffic dataframe: 32305 x 15


                                                                                

Aggregate the traffic volume into a daily total per borough

In [12]:
# Daily traffic volume per borough: one output row per (boro, date) pair.
#
# `sum` here is pyspark.sql.functions.sum (the import cell shadows the builtin).
# `vol` is cast to double explicitly because the CSV was loaded without a
# schema, so the column is a string; the resulting total_vol stays a double.
traffic_by_boro_df = (
    traffic_df
    .groupBy("boro", "date")  # group by borough and calendar date
    .agg(sum(col("vol").cast("double")).alias("total_vol"))  # total volume per group
    .orderBy("date")  # chronological order for display/export
)

# Shape check on the aggregated frame
num_instances, num_features = traffic_by_boro_df.count(), len(traffic_by_boro_df.columns)
print(f"The shape of the traffic dataframe: {num_instances} x {num_features}")

                                                                                

The shape of the traffic dataframe: 125 x 3


                                                                                

boro,date,total_vol
Brooklyn,2022-10-12,9706.0
Brooklyn,2022-10-13,7211.0
Brooklyn,2022-10-14,7663.0
Brooklyn,2022-10-15,7031.0
Brooklyn,2022-10-16,6456.0
Brooklyn,2022-10-17,31677.0
Manhattan,2022-10-18,126877.0
Brooklyn,2022-10-18,1881.0
Brooklyn,2022-10-19,1938.0
Manhattan,2022-10-19,118928.0


In [10]:
# Write the borough-level daily totals to the curated data folder as parquet,
# replacing any export left by a previous run
curated_output_path = '../data/curated/boro_traffic_data.parquet'
traffic_by_boro_df.write.parquet(curated_output_path, mode='overwrite')

                                                                                