# PM2.5 Dataset from 2021 to 2023 analysis

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, avg, max, min, count, sum as spark_sum,
    year, month, dayofweek, when, lit, round as spark_round,
    greatest, coalesce, dense_rank, row_number
)
from pyspark.sql.window import Window
from pyspark.sql.types import (
    StructType, StructField, 
    StringType, IntegerType, DoubleType, DateType
)
import time

# Start Spark
# Most basic Spark session
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PM25").getOrCreate()


## Load data

In [None]:
# EPA data schema
epa_schema = StructType([
    StructField("State Code", StringType()),
    StructField("County Code", StringType()),
    StructField("Site Num", StringType()),
    StructField("Parameter Code", IntegerType()),
    StructField("POC", IntegerType()),
    StructField("Latitude", DoubleType()),
    StructField("Longitude", DoubleType()),
    StructField("Datum", StringType()),
    StructField("Parameter Name", StringType()),
    StructField("Date Local", DateType()),
    StructField("Units of Measure", StringType()),
    StructField("Event Type", StringType()),
    StructField("Observation Count", IntegerType()),
    StructField("Observation Percent", DoubleType()),
    StructField("Arithmetic Mean", DoubleType()),
    StructField("1st Max Value", DoubleType()),
    StructField("1st Max Hour", IntegerType()),
    StructField("AQI", IntegerType()),
    StructField("Method Code", StringType()),
    StructField("Method Name", StringType()),
    StructField("Local Site Name", StringType()),
    StructField("Address", StringType()),
    StructField("State Name", StringType()),
    StructField("County Name", StringType()),
    StructField("City Name", StringType()),
    StructField("CBSA Name", StringType()),
    StructField("Date of Last Change", DateType())
])

# Load PM2.5 data
pm25_df = spark.read \
    .option("header", "true") \
    .schema(epa_schema) \
    .csv("data/epa_raw/daily_88101_*.csv")

25/11/11 22:58:57 WARN FileStreamSink: Assume no metadata directory. Error while looking for metadata directory in the path: data/epa_raw/daily_88101_*.csv.
java.io.FileNotFoundException: File data/epa_raw/daily_88101_*.csv does not exist
	at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:917)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:1238)
	at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:907)
	at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:462)
	at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:56)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:381)
	at org.apache.spark.sql.catalyst.analysis.ResolveDataSource.org$apache$spark$sql$catalyst$analysis$ResolveDataSource$$loadV1BatchSource(ResolveDataSource.scala:143)
	at org.apache.spark.sql.catalyst.

## Filter, Join and group by operation 

In [6]:
# filtering PM2.5 data
pm25_filtered = pm25_df
pm25_filtered = pm25_filtered.filter(
    (col("Date Local") >= "2021-01-01") & # filter recent data 

    # remove invalid and outlier measurements
    (col("Arithmetic Mean").isNotNull()) &
    (col("Arithmetic Mean") >= 0) &
    (col("Arithmetic Mean") < 500) &  # Remove extreme outliers
    (col("AQI").isNotNull()) &
    (col("Observation Percent") >= 75) & # Require at least 75% data completeness

    # filter to focus on metropolitan areas
    (col("CBSA Name").isNotNull()) &  # Must be in a Core Based Statistical Area
    (col("City Name").isNotNull())  
)

# join to find stations in the sma city
city_stations = pm25_filtered.groupBy("State Name", "City Name").agg(
    count("*").alias("measurement_count"),
    avg("Arithmetic Mean").alias("city_avg")
)

result_df = pm25_filtered.join(city_stations, ["State Name", "City Name"], "left")

## Column transformation

In [10]:
# Transformations to enrich data
enriched_df = result_df \
    .withColumn("Year", year(col("Date Local"))) \
    .withColumn("Month", month(col("Date Local"))) \
    .withColumn("Season",
        when(col("Month").isin([12, 1, 2]), "Winter")
        .when(col("Month").isin([3, 4, 5]), "Spring")
        .when(col("Month").isin([6, 7, 8]), "Summer")
        .otherwise("Fall")
    ) \
    .withColumn("AQI_Category",
        when(col("AQI") <= 50, "Good")
        .when(col("AQI") <= 100, "Moderate")
        .when(col("AQI") <= 150, "Unhealthy for Sensitive")
        .otherwise("Unhealthy")
    ) \
    .withColumn("PM25_Rounded", spark_round(col("Arithmetic Mean"), 2))

## SQL queries

In [19]:
# 1. Top polluted cities
# Fixed SQL query
query1 = spark.sql("""
    SELECT 
        `State Name`,
        `City Name`,
        ROUND(AVG(`Arithmetic Mean`), 2) as Avg_PM25,
        COUNT(*) as Days
    FROM pm25_data
    GROUP BY `State Name`, `City Name`
    HAVING COUNT(*) >= 100
    ORDER BY Avg_PM25 DESC
    LIMIT 10
""")
query1.show()

25/11/11 23:09:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Sample Duration, Event Type, Observation Count, 1st Max Value, Local Site Name, State Name, County Name
 Schema: Date Local, Observation Percent, Arithmetic Mean, AQI, State Name, City Name, CBSA Name
Expected: Date Local but found: Sample Duration
CSV file: file:///Users/yuqianwang/Documents/IDS706/ids706_pyspark_data_processing/data/epa_raw/daily_88101_2021.csv
25/11/11 23:09:07 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: Sample Duration, Event Type, Observation Count, 1st Max Value, Local Site Name, State Name, County Name
 Schema: Date Local, Observation Percent, Arithmetic Mean, AQI, State Name, City Name, CBSA Name
Expected: Date Local but found: Sample Duration
CSV file: file:///Users/yuqianwang/Documents/IDS706/ids706_pyspark_data_processing/data/epa_raw/daily_88101_2023.csv
25/11/11 23:09:07 WARN CSVHeaderChecker: CSV header does not conform to the schema

+----------+---------+--------+----+
|State Name|City Name|Avg_PM25|Days|
+----------+---------+--------+----+
+----------+---------+--------+----+

