# Load and Combine CBSA Annual AQI Data (2020-2024) into a Single DataFrame
Read and merge AQI data for each year from 2020 to 2024 into one consolidated DataFrame for analysis.

In [0]:
#df for C - Sahil
# Load one annual AQI-by-CBSA CSV per year (2020-2024). Each file is read with
# its header row as column names and with Spark inferring the column types.
aqi_cbsa_df_2020 = spark.read.csv("/Volumes/workspace/2235_team2/aqi/annual_aqi_by_cbsa_2020.csv", header=True, inferSchema=True)
aqi_cbsa_df_2021 = spark.read.csv("/Volumes/workspace/2235_team2/aqi/annual_aqi_by_cbsa_2021.csv", header=True, inferSchema=True)
aqi_cbsa_df_2022 = spark.read.csv("/Volumes/workspace/2235_team2/aqi/annual_aqi_by_cbsa_2022.csv", header=True, inferSchema=True)
aqi_cbsa_df_2023 = spark.read.csv("/Volumes/workspace/2235_team2/aqi/annual_aqi_by_cbsa_2023.csv", header=True, inferSchema=True)
aqi_cbsa_df_2024 = spark.read.csv("/Volumes/workspace/2235_team2/aqi/annual_aqi_by_cbsa_2024.csv", header=True, inferSchema=True)

# Stack the yearly frames into one DataFrame. unionByName matches columns by
# header name; plain union() matches by position, so a file whose columns were
# ordered differently would silently put values under the wrong header.
# A file with a missing or renamed column now raises instead of misaligning.
aqi_cbsa_df = (
    aqi_cbsa_df_2020
    .unionByName(aqi_cbsa_df_2021)
    .unionByName(aqi_cbsa_df_2022)
    .unionByName(aqi_cbsa_df_2023)
    .unionByName(aqi_cbsa_df_2024)
)
#aqi_cbsa_df.show(2)
display(aqi_cbsa_df)
# Trailing expression: total number of rows across all five years.
aqi_cbsa_df.count()

# Identify Top 10 CBSAs with Most "Unhealthy for Sensitive Groups Days"
This section groups the consolidated AQI data by CBSA, calculates the total number of "Unhealthy for Sensitive Groups Days" for each CBSA, and displays the top 10 CBSAs with the highest totals. The results are saved as a table for further analysis.

In [0]:
from pyspark.sql.functions import col, sum as _sum

# C1: Top 10 CBSAs with most "Unhealthy for Sensitive Groups Days"
# Sum the day counts per CBSA, then keep the ten largest totals.
ufsg_total = _sum("Unhealthy for Sensitive Groups Days").alias(
    "Total_Unhealthy_for_Sensitive_Groups_Days"
)
ufsg_by_cbsa = aqi_cbsa_df.groupBy("CBSA").agg(ufsg_total)
ufsg_ranked = ufsg_by_cbsa.orderBy(
    col("Total_Unhealthy_for_Sensitive_Groups_Days").desc()
)
top10_ufsg_df = ufsg_ranked.limit(10)

display(top10_ufsg_df)
# "overwrite" replaces any table left behind by an earlier run of this cell.
ufsg_writer = top10_ufsg_df.write.mode("overwrite")
ufsg_writer.saveAsTable('workspace.2235_team2.top10_ufsg_df_C1')

# Identify Top 10 CBSAs with Most "Unhealthy Days"
This section groups the AQI data by CBSA, calculates the total number of "Unhealthy Days" for each CBSA, and selects the top 10 CBSAs with the highest totals. The results are saved as a table for further analysis.

In [0]:
# C1: Top 10 CBSAs with most "Unhealthy Days"
# Sum "Unhealthy Days" per CBSA, sort the totals from largest to smallest,
# and keep the first ten rows.
unhealthy_total = _sum("Unhealthy Days").alias("Total_Unhealthy_Days")
unhealthy_by_cbsa = aqi_cbsa_df.groupBy("CBSA").agg(unhealthy_total)
top10_unhealthy_df = (
    unhealthy_by_cbsa
    .orderBy(col("Total_Unhealthy_Days").desc())
    .limit(10)
)
#display(top10_unhealthy_df)
# "overwrite" replaces any table left behind by an earlier run of this cell.
unhealthy_writer = top10_unhealthy_df.write.mode("overwrite")
unhealthy_writer.saveAsTable('workspace.2235_team2.top10_unhealthy_df_C1')

# Identify Top 10 CBSAs with Most "Hazardous Days"
This section groups the AQI data by CBSA, calculates the total number of "Hazardous Days" for each CBSA, and selects the top 10 CBSAs with the highest totals. The results are saved as a table for further analysis.

In [0]:
# C1.Top 10 CBSAs with most "Hazardous Days"
# Sum "Hazardous Days" per CBSA, sort the totals from largest to smallest,
# and keep the first ten rows.
hazardous_total = _sum("Hazardous Days").alias("Total_Hazardous_Days")
hazardous_ranked = (
    aqi_cbsa_df
    .groupBy("CBSA")
    .agg(hazardous_total)
    .orderBy(col("Total_Hazardous_Days").desc())
)
top10_hazardous_df = hazardous_ranked.limit(10)
#display(top10_hazardous_df)
# "overwrite" replaces any table left behind by an earlier run of this cell.
hazardous_writer = top10_hazardous_df.write.mode("overwrite")
hazardous_writer.saveAsTable('workspace.2235_team2.top10_hazardous_df_C1')

# Analyze Yearly Trends of Unhealthy and Hazardous Days by CBSA
This section groups the AQI data by CBSA and year, calculating annual totals for "Unhealthy for Sensitive Groups Days," "Unhealthy Days," and "Hazardous Days." The results are ordered by CBSA and year, and saved as a table for trend analysis.

In [0]:
#C2
# Per-(CBSA, Year) totals for the three severity day counts.
# Maps each source column to the alias used in the saved table; dict order
# fixes the order of the output columns.
trend_totals = {
    "Unhealthy for Sensitive Groups Days": "Total_Unhealthy_for_Sensitive_Groups_Days",
    "Unhealthy Days": "Total_Unhealthy_Days",
    "Hazardous Days": "Total_Hazardous_Days",
}
trend_aggregates = [
    _sum(source_column).alias(total_alias)
    for source_column, total_alias in trend_totals.items()
]

cbsa_year_groups = aqi_cbsa_df.groupBy("CBSA", "Year")
trend_df_cbsa = cbsa_year_groups.agg(*trend_aggregates).orderBy("CBSA", "Year")
#display(trend_df_cbsa)
# "overwrite" replaces any table left behind by an earlier run of this cell.
trend_writer = trend_df_cbsa.write.mode("overwrite")
trend_writer.saveAsTable('workspace.2235_team2.trend_df_cbsa_C2')

# Join AQI Data with CBSA Type Information
This section reads CBSA type information from a separate file and performs a left join with the AQI data, enriching each record with its corresponding CBSA type (Metro vs. Micro). The resulting DataFrame is used for further analysis and is not written to a table.

In [0]:
#C3#Do not write this section to table
# Read the CBSA reference file, using the header row for column names and
# letting Spark infer the column types.
cbsas_path = "/Volumes/workspace/2235_team2/aqi/cbsas.csv"
cbsas_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(cbsas_path)
)
#display(cbsas_df)

# Keep only the join key and the type column from the reference data. The key
# is renamed to "CBSA_Code" so it can be told apart from the AQI frame's own
# "CBSA Code" column in the join condition.
cbsa_type_lookup = cbsas_df.select(
    col("CBSA Code").alias("CBSA_Code"),
    "CBSA Type",
)
code_matches = aqi_cbsa_df["CBSA Code"] == col("CBSA_Code")

# Left join: every AQI row is kept; rows with no matching code get nulls in
# the lookup columns.
aqi__cbsa_with_type_df = aqi_cbsa_df.join(
    cbsa_type_lookup,
    on=code_matches,
    how="left",
)
#display(aqi__cbsa_with_type_df)

# Analyze Yearly AQI Trends by CBSA Type
This section groups the AQI data by year and CBSA type, calculating both the sum and average of "Unhealthy for Sensitive Groups Days," "Unhealthy Days," and "Hazardous Days" for each group. The results are ordered by year and CBSA type, and saved as a table for further analysis.

In [0]:
#C3
from pyspark.sql.functions import sum as _sum, avg

# (source column, alias suffix) for each severity day count. The output
# columns are "Sum_<suffix>" followed by "Avg_<suffix>" for each entry, in
# this order.
severity_columns = [
    ("Unhealthy for Sensitive Groups Days", "Unhealthy_for_Sensitive_Groups_Days"),
    ("Unhealthy Days", "Unhealthy_Days"),
    ("Hazardous Days", "Hazardous_Days"),
]
severity_aggregates = []
for source_column, alias_suffix in severity_columns:
    severity_aggregates.append(_sum(source_column).alias(f"Sum_{alias_suffix}"))
    severity_aggregates.append(avg(source_column).alias(f"Avg_{alias_suffix}"))

# Rename "CBSA Type" to "CBSA_Type" before it becomes a grouping column of
# the saved table.
typed_aqi_df = aqi__cbsa_with_type_df.withColumnRenamed("CBSA Type", "CBSA_Type")
year_type_groups = typed_aqi_df.groupBy("Year", "CBSA_Type")
aqi__cbsa_metromicro_trend_df = (
    year_type_groups
    .agg(*severity_aggregates)
    .orderBy("Year", "CBSA_Type")
)
#display(aqi__cbsa_metromicro_trend_df)
# "overwrite" replaces any table left behind by an earlier run of this cell.
metromicro_writer = aqi__cbsa_metromicro_trend_df.write.mode("overwrite")
metromicro_writer.saveAsTable('workspace.2235_team2.aqi__cbsa_metromicro_trend_df_C3')

# Summarize Pollutant Days Distribution by CBSA Type
This section groups the AQI data by CBSA type and calculates both the sum and average number of days for each major pollutant (CO, NO₂, Ozone, PM2.5, and PM10). The results provide a distribution overview by CBSA type and are saved as a table for further analysis.

In [0]:
#C4

from pyspark.sql.functions import sum as _sum, avg

# (column reference, alias suffix) per pollutant. "Days PM2.5" is wrapped in
# backticks because the column name contains a dot.
pollutant_columns = [
    ("Days CO", "Days_CO"),
    ("Days NO2", "Days_NO2"),
    ("Days Ozone", "Days_Ozone"),
    ("`Days PM2.5`", "Days_PM2_5"),
    ("Days PM10", "Days_PM10"),
]
# For each pollutant emit the sum first, then the average.
pollutant_aggregates = [
    aggregate(column_ref).alias(f"{prefix}_{alias_suffix}")
    for column_ref, alias_suffix in pollutant_columns
    for prefix, aggregate in (("Sum", _sum), ("Avg", avg))
]

typed_aqi_df = aqi__cbsa_with_type_df.withColumnRenamed("CBSA Type", "CBSA_Type")
aqi__cbsa_pollutant_dist_df = typed_aqi_df.groupBy("CBSA_Type").agg(
    *pollutant_aggregates
)
#display(aqi__cbsa_pollutant_dist_df)
# "overwrite" replaces any table left behind by an earlier run of this cell.
pollutant_dist_writer = aqi__cbsa_pollutant_dist_df.write.mode("overwrite")
pollutant_dist_writer.saveAsTable('workspace.2235_team2.aqi__cbsa_pollutant_dist_df_C4')

# Analyze Yearly Pollutant Days Trends by CBSA Type
This section groups the AQI data by year and CBSA type, calculating both the sum and average number of days for each major pollutant (CO, NO₂, Ozone, PM2.5, and PM10). The results show yearly trends in pollutant days distribution by CBSA type and are saved as a table for further analysis.

In [0]:
#C4
# Per-(Year, CBSA type) sum and average of days attributed to each pollutant.
# Maps each column reference to its alias suffix; "Days PM2.5" is wrapped in
# backticks because the column name contains a dot.
pollutant_trend_columns = {
    "Days CO": "Days_CO",
    "Days NO2": "Days_NO2",
    "Days Ozone": "Days_Ozone",
    "`Days PM2.5`": "Days_PM2_5",
    "Days PM10": "Days_PM10",
}
pollutant_trend_aggregates = []
for column_ref, alias_suffix in pollutant_trend_columns.items():
    # Sum first, then average, for each pollutant.
    pollutant_trend_aggregates.append(_sum(column_ref).alias(f"Sum_{alias_suffix}"))
    pollutant_trend_aggregates.append(avg(column_ref).alias(f"Avg_{alias_suffix}"))

typed_trend_source_df = aqi__cbsa_with_type_df.withColumnRenamed("CBSA Type", "CBSA_Type")
year_type_pollutant_groups = typed_trend_source_df.groupBy("Year", "CBSA_Type")
aqi__cbsa_pollutant_trend_dist_df = year_type_pollutant_groups.agg(
    *pollutant_trend_aggregates
)
#display(aqi__cbsa_pollutant_trend_dist_df)
# "overwrite" replaces any table left behind by an earlier run of this cell.
pollutant_trend_writer = aqi__cbsa_pollutant_trend_dist_df.write.mode("overwrite")
pollutant_trend_writer.saveAsTable('workspace.2235_team2.aqi__cbsa_pollutant_trend_dist_df_C4')