In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, avg, max, min, count, sum as spark_sum,
    year, month, dayofweek, when, lit, round as spark_round,
    greatest, coalesce, dense_rank, row_number
)
from pyspark.sql.window import Window
from pyspark.sql.types import (
    StructType, StructField, 
    StringType, IntegerType, DoubleType, DateType
)
import time

# Cell 2: Initialize Spark
spark = SparkSession.builder \
    .appName("EPA Air Quality Analysis") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.adaptive.enabled", "true") \
    .getOrCreate()


In [None]:
# EPA data schema
epa_schema = StructType([
    StructField("State Code", StringType()),
    StructField("County Code", StringType()),
    StructField("Site Num", StringType()),
    StructField("Parameter Code", IntegerType()),
    StructField("POC", IntegerType()),
    StructField("Latitude", DoubleType()),
    StructField("Longitude", DoubleType()),
    StructField("Datum", StringType()),
    StructField("Parameter Name", StringType()),
    StructField("Date Local", DateType()),
    StructField("Units of Measure", StringType()),
    StructField("Event Type", StringType()),
    StructField("Observation Count", IntegerType()),
    StructField("Observation Percent", DoubleType()),
    StructField("Arithmetic Mean", DoubleType()),
    StructField("1st Max Value", DoubleType()),
    StructField("1st Max Hour", IntegerType()),
    StructField("AQI", IntegerType()),
    StructField("Method Code", StringType()),
    StructField("Method Name", StringType()),
    StructField("Local Site Name", StringType()),
    StructField("Address", StringType()),
    StructField("State Name", StringType()),
    StructField("County Name", StringType()),
    StructField("City Name", StringType()),
    StructField("CBSA Name", StringType()),
    StructField("Date of Last Change", DateType())
])

# Load PM2.5 data
pm25_df = spark.read \
    .option("header", "true") \
    .schema(epa_schema) \
    .csv("data/epa_raw/daily_88101_*.csv")

print(f"PM2.5 records loaded: {pm25_df.count():,}")

# Load Ozone data
ozone_df = spark.read \
    .option("header", "true") \
    .schema(epa_schema) \
    .csv("data/epa_raw/daily_44201_*.csv")

print(f"Ozone records loaded: {ozone_df.count():,}")

ozone_count = ozone_df.count()
print(f"✓ Ozone records loaded: {ozone_count:,}")

# Show sample data
print("\nSample PM2.5 data:")
pm25_df.select("Date Local", "State Name", "City Name", 
               "Arithmetic Mean", "AQI").show(5, truncate=False)

25/11/10 22:17:04 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: data/epa_raw/daily_88101_*.csv.
java.io.FileNotFoundException: File data/epa_raw/daily_88101_*.csv does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catalyst$analysis$ResolveDataSource$$loadV1BatchSource(ResolveDataSource.scala:143)
	at org.apache.spark.sql.catalyst.

PM2.5 records loaded: 2,500,821


25/11/10 22:17:05 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: data/epa_raw/daily_44201_*.csv.
java.io.FileNotFoundException: File data/epa_raw/daily_44201_*.csv does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catalyst$analysis$ResolveDataSource$$loadV1BatchSource(ResolveDataSource.scala:143)
	at org.apache.spark.sql.catalyst.

Ozone records loaded: 775,458
✓ Ozone records loaded: 775,458

Sample PM2.5 data:
+----------+-----------------+---------+---------------+----+
|Date Local|State Name       |City Name|Arithmetic Mean|AQI |
+----------+-----------------+---------+---------------+----+
|NULL      |FAIRHOPE, Alabama|Alabama  |1.0            |NULL|
|NULL      |FAIRHOPE, Alabama|Alabama  |1.0            |NULL|
|NULL      |FAIRHOPE, Alabama|Alabama  |1.0            |11  |
|NULL      |FAIRHOPE, Alabama|Alabama  |1.0            |NULL|
|NULL      |FAIRHOPE, Alabama|Alabama  |1.0            |NULL|
+----------+-----------------+---------+---------------+----+
only showing top 5 rows


25/11/10 22:17:05 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Sample Duration, Observation Count, 1st Max Value, Local Site Name, State Name
 Schema: Date Local, Arithmetic Mean, AQI, State Name, City Name
Expected: Date Local but found: Sample Duration
CSV file: file:///Users/yuqianwang/Documents/IDS706/ids706_pyspark_data_processing/data/epa_raw/daily_88101_2021.csv


In [None]:
# filtering PM2.5 data
pm25_filtered = pm25_df
pm25_filtered = pm25_filtered.filter(
    (col("Date Local") >= "2021-01-01") & # filter recent data 

    # remove invalid and outlier measurements
    (col("Arithmetic Mean").isNotNull()) &
    (col("Arithmetic Mean") >= 0) &
    (col("Arithmetic Mean") < 500) &  # Remove extreme outliers
    (col("AQI").isNotNull()) &
    (col("Observation Percent") >= 75) & # Require at least 75% data completeness

    # filter to focus on metropolitan areas
    (col("CBSA Name").isNotNull()) &  # Must be in a Core Based Statistical Area
    (col("City Name").isNotNull())  
)

# filtering Ozone data
ozone_filtered = ozone_df
ozone_filtered = ozone_df.filter(
    (col("Date Local") >= "2021-01-01") &
    (col("Arithmetic Mean").isNotNull()) &
    (col("Arithmetic Mean") >= 0) &
    (col("Arithmetic Mean") < 0.2) &  # Ozone in ppm
    (col("Observation Percent") >= 75)
)

# Prepare PM2.5 for join
pm25_join = pm25_filtered.select(
    col("State Code"),
    col("County Code"),
    col("Site Num"),
    col("Date Local"),
    col("Arithmetic Mean").alias("PM25_Mean"),
    col("AQI").alias("PM25_AQI"),
    col("State Name"),
    col("City Name"),
    col("Latitude"),
    col("Longitude")
)

# Prepare Ozone for join
ozone_join = ozone_filtered.select(
    col("State Code"),
    col("County Code"),
    col("Site Num"),
    col("Date Local"),
    col("Arithmetic Mean").alias("Ozone_Mean"),
    col("AQI").alias("Ozone_AQI")
)

# Join operation
combined_df = pm25_join.join(ozone_join, ["State Code", "County Code", "Site Num", "Date Local"], "left")
print(f"Joined records: {combined_df.count()}")

In [14]:
# add year, month, and season, and quality category
enriched_df = combined_df \
    .withColumn("Year", year(col("Date Local"))) \
    .withColumn("Month", month(col("Date Local"))) \
    .withColumn("Season",
        when(col("Month").isin([12, 1, 2]), "Winter")
        .when(col("Month").isin([3, 4, 5]), "Spring")
        .when(col("Month").isin([6, 7, 8]), "Summer")
        .otherwise("Fall")
    ) \
    .withColumn("PM25_Category",
        when(col("PM25_AQI") <= 50, "Good")
        .when(col("PM25_AQI") <= 100, "Moderate")
        .when(col("PM25_AQI") <= 150, "Unhealthy for Sensitive")
        .when(col("PM25_AQI") <= 200, "Unhealthy")
        .otherwise("Very Unhealthy")
    ) \
    .withColumn("Combined_AQI",
        greatest(col("PM25_AQI"), coalesce(col("Ozone_AQI"), col("PM25_AQI")))
    )



Sample Combined PM2.5 and Ozone data:


25/11/10 22:48:37 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: State Code, County Code, Site Num, Sample Duration, Event Type, Observation Count, Observation Percent, 1st Max Value, Local Site Name, State Name, County Name
 Schema: State Code, County Code, Site Num, Date Local, Observation Percent, Arithmetic Mean, 1st Max Value, AQI, State Name, City Name, CBSA Name
Expected: Date Local but found: Sample Duration
CSV file: file:///Users/yuqianwang/Documents/IDS706/ids706_pyspark_data_processing/data/epa_raw/daily_88101_2022.csv
25/11/10 22:48:37 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: State Code, County Code, Site Num, Sample Duration, Event Type, Observation Count, Observation Percent, 1st Max Value, Local Site Name, State Name, County Name
 Schema: State Code, County Code, Site Num, Date Local, Observation Percent, Arithmetic Mean, 1st Max Value, AQI, State Name, City Name, CBSA Name
Expected: Date Local but found: Sam

+----------+----------+---------+---------+--------+--------+----------+---------+---------+
|Date Local|State Name|City Name|PM25_Mean|PM25_Max|PM25_AQI|Ozone_Mean|Ozone_Max|Ozone_AQI|
+----------+----------+---------+---------+--------+--------+----------+---------+---------+
+----------+----------+---------+---------+--------+--------+----------+---------+---------+



25/11/11 01:00:38 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 929176 ms exceeds timeout 120000 ms
25/11/11 01:00:38 WARN SparkContext: Killing executors is not supported by current scheduler.
25/11/11 01:00:41 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$