# Loading Annual AQI Data by County (2020–2024) in PySpark
- spark.read.csv: Reads a CSV file into a Spark DataFrame.
- header=True: Indicates that the first row of the file contains column names.
- inferSchema=True: Automatically infers the data types of each column.
Each line loads the annual AQI data by county for a different year (2020–2024) into a separate DataFrame.

In [0]:
# Load the county-level annual AQI CSVs, one DataFrame per year (2020-2024).
# Every file is read with the same options: the first row holds the column
# names and Spark infers each column's data type.
county_csv_options = {"header": True, "inferSchema": True}

aqi_county_df_2020 = spark.read.csv(
    "/Volumes/workspace/2235_team2/aqi/annual_aqi_by_county_2020.csv", **county_csv_options
)
aqi_county_df_2021 = spark.read.csv(
    "/Volumes/workspace/2235_team2/aqi/annual_aqi_by_county_2021.csv", **county_csv_options
)
aqi_county_df_2022 = spark.read.csv(
    "/Volumes/workspace/2235_team2/aqi/annual_aqi_by_county_2022.csv", **county_csv_options
)
aqi_county_df_2023 = spark.read.csv(
    "/Volumes/workspace/2235_team2/aqi/annual_aqi_by_county_2023.csv", **county_csv_options
)
aqi_county_df_2024 = spark.read.csv(
    "/Volumes/workspace/2235_team2/aqi/annual_aqi_by_county_2024.csv", **county_csv_options
)


# Combining Annual AQI DataFrames Across Years
- unionByName: Combines DataFrames by matching column names.
- allowMissingColumns=True: Allows for missing columns in some DataFrames; missing columns are filled with null values.
This code merges the AQI data from 2020 to 2024 into a single DataFrame, aqi_county_df, ensuring all available columns are included even if some are missing in certain years.

In [0]:
# Stack the yearly county frames into one DataFrame. Columns are matched by
# name; a column absent from one year's file is filled with nulls for that
# year's rows.
aqi_county_df = aqi_county_df_2020
for later_year_df in (
    aqi_county_df_2021,
    aqi_county_df_2022,
    aqi_county_df_2023,
    aqi_county_df_2024,
):
    aqi_county_df = aqi_county_df.unionByName(later_year_df, allowMissingColumns=True)

# Displaying the Combined AQI DataFrame
- display(aqi_county_df):
This command displays the contents of the combined AQI DataFrame (aqi_county_df) in a tabular format.

In [0]:
# Render the combined county AQI DataFrame as a table in the notebook.
display(aqi_county_df)

# Displaying the Schema of the Combined AQI DataFrame
- aqi_county_df.printSchema():
Prints the schema of the aqi_county_df DataFrame, showing each column's name and data type.

In [0]:
# Print each column's name and inferred data type for the combined county frame.
aqi_county_df.printSchema()

# Loading Annual AQI Data by CBSA (2020–2024) in PySpark
- spark.read.csv: Loads a CSV file into a Spark DataFrame.
- header=True: Uses the first row as column headers.
- inferSchema=True: Automatically infers the data types of each column.

In [0]:
# Load the CBSA-level annual AQI CSVs, one DataFrame per year (2020-2024).
# Every file is read with the same options: header row as column names and
# inferred column types.
cbsa_csv_options = {"header": True, "inferSchema": True}

aqi_cbsa_df_2020 = spark.read.csv(
    "/Volumes/workspace/2235_team2/aqi/annual_aqi_by_cbsa_2020.csv", **cbsa_csv_options
)
aqi_cbsa_df_2021 = spark.read.csv(
    "/Volumes/workspace/2235_team2/aqi/annual_aqi_by_cbsa_2021.csv", **cbsa_csv_options
)
aqi_cbsa_df_2022 = spark.read.csv(
    "/Volumes/workspace/2235_team2/aqi/annual_aqi_by_cbsa_2022.csv", **cbsa_csv_options
)
aqi_cbsa_df_2023 = spark.read.csv(
    "/Volumes/workspace/2235_team2/aqi/annual_aqi_by_cbsa_2023.csv", **cbsa_csv_options
)
aqi_cbsa_df_2024 = spark.read.csv(
    "/Volumes/workspace/2235_team2/aqi/annual_aqi_by_cbsa_2024.csv", **cbsa_csv_options
)

# Combining Annual AQI CBSA DataFrames Across Years
- unionByName: Merges DataFrames by matching column names.
- allowMissingColumns=True: Allows for missing columns in some DataFrames; missing columns are filled with null values.

In [0]:
# Stack the yearly CBSA frames into one DataFrame. Columns are matched by
# name; a column absent from one year's file is filled with nulls for that
# year's rows.
aqi_cbsa_df = aqi_cbsa_df_2020
for later_year_df in (
    aqi_cbsa_df_2021,
    aqi_cbsa_df_2022,
    aqi_cbsa_df_2023,
    aqi_cbsa_df_2024,
):
    aqi_cbsa_df = aqi_cbsa_df.unionByName(later_year_df, allowMissingColumns=True)

# Displaying the Combined AQI CBSA DataFrame
- display(aqi_cbsa_df):
Shows the contents of the combined AQI CBSA DataFrame (aqi_cbsa_df) in a tabular format.

In [0]:
# Render the combined CBSA AQI DataFrame as a table in the notebook.
display(aqi_cbsa_df)

Show Schema for aqi_cbsa_df

In [0]:
# Print each column's name and inferred data type for the combined CBSA frame.
aqi_cbsa_df.printSchema()

Adding Urban_Rural Column to aqi_cbsa_df

In [0]:
from pyspark.sql.functions import lit
from pyspark.sql.types import StringType

# Add an Urban_Rural column holding an empty string on every row, explicitly
# typed as a string; a later cell overwrites it with the real labels.
empty_label = lit("").cast(StringType())
aqi_cbsa_df = aqi_cbsa_df.withColumn("Urban_Rural", empty_label)

In [0]:
# Show the CBSA frame with the new (still empty) Urban_Rural column.
display(aqi_cbsa_df)

Load SourceData into a Spark DataFrame

In [0]:
# Read SourceData.csv with its first row as column names and inferred column
# types. Its 'CBSA Title' column is used later to label CBSAs as Urban.
sourcedata_df = spark.read.csv(
    "/Volumes/workspace/2235_team2/aqi/SourceData.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# Render the SourceData DataFrame as a table in the notebook.
display(sourcedata_df)

Assigning Urban_Rural values to aqi_cbsa_df

In [0]:
from pyspark.sql import functions as F

# Collect the distinct 'CBSA Title' values from the source data onto the
# driver as a plain Python list.
distinct_title_rows = sourcedata_df.select('CBSA Title').distinct().collect()
cbsa_title_list = [title_row['CBSA Title'] for title_row in distinct_title_rows]

# Label a row 'Urban' when its CBSA value appears in that list and 'Rural'
# in every other case, replacing the existing Urban_Rural column.
is_listed_cbsa = F.col('CBSA').isin(cbsa_title_list)
urban_rural_label = F.when(is_listed_cbsa, F.lit('Urban')).otherwise(F.lit('Rural'))
aqi_cbsa_df = aqi_cbsa_df.withColumn('Urban_Rural', urban_rural_label)

In [0]:
# Show the CBSA frame with Urban_Rural populated.
display(aqi_cbsa_df)

Downloading existing aqi_cbsa_df

In [0]:
# Bring the whole Spark DataFrame onto the driver as a pandas DataFrame, then
# write it to the volume as a CSV without the pandas index column.
pdf = aqi_cbsa_df.toPandas()
export_path = '/Volumes/workspace/2235_team2/aqi/Test_Download.csv'
pdf.to_csv(export_path, index=False)

Selecting relevant columns

In [0]:
from pyspark.sql.functions import col

# Keep only the grouping keys (Year, Urban_Rural) and the three AQI measures
# that are averaged in the next step.
aqi_cols = ["Year", "Urban_Rural", "Max AQI", "Median AQI", "90th Percentile AQI"]
selected_columns = [col(column_name) for column_name in aqi_cols]
df_aqi = aqi_cbsa_df.select(selected_columns)

Aggregate by Year and Urban/Rural

In [0]:
from pyspark.sql.functions import avg

# Average each AQI measure per (Year, Urban_Rural) group and sort the result
# by the same two keys.
avg_max_aqi = avg("Max AQI").alias("Avg_Max_AQI")
avg_median_aqi = avg("Median AQI").alias("Avg_Median_AQI")
avg_p90_aqi = avg("90th Percentile AQI").alias("Avg_90th_Percentile_AQI")

by_year_and_area = df_aqi.groupBy("Year", "Urban_Rural")
df_grouped = by_year_and_area.agg(avg_max_aqi, avg_median_aqi, avg_p90_aqi).orderBy(
    "Year", "Urban_Rural"
)

Display and Visualize

In [0]:
# Render the per-year Urban/Rural averages as a table in the notebook.
display(df_grouped)

Databricks visualization. Run in Databricks to view.

In [0]:
%sql
-- Make `workspace` the current catalog and `2235_team2` the current schema.
USE CATALOG workspace;
USE SCHEMA `2235_team2`;

In [0]:
# Persist the aggregated averages as a table; mode("overwrite") replaces any
# existing contents of workspace.2235_team2.urban_rural.
# NOTE(review): the schema name starts with a digit and is backtick-quoted in
# the %sql cell -- confirm the unquoted form used here resolves as intended.
df_grouped.write.mode("overwrite").saveAsTable('workspace.2235_team2.urban_rural')


In [0]:
# Preview the first two rows of each FIPS lookup frame.
# NOTE(review): fips_code_county and fips_code_state are not created in any
# visible cell of this notebook -- confirm they are loaded before this runs.
fips_code_county.show(2)
fips_code_state.show(2)

In [0]:
# Imported in this cell because no visible cell brings `trim` into scope.
from pyspark.sql.functions import trim

# Strip leading/trailing whitespace from the county name so it can be used
# as a join key, then preview the result and its schema.
fips_code_county = fips_code_county.withColumn('County', trim(fips_code_county['County']))
fips_code_county.show(2)
fips_code_county.printSchema()

In [0]:
# Imported in this cell because no visible cell brings `lower`/`trim` into scope.
from pyspark.sql.functions import lower, trim

# Normalize the state name used as a join key: strip surrounding whitespace,
# then lower-case it.
fips_code_state = fips_code_state.withColumn('State', lower(trim(fips_code_state['State'])))


In [0]:
# Imported in this cell because no visible cell brings `trim` into scope.
from pyspark.sql.functions import trim

# Strip leading/trailing whitespace from the 'State Code' column.
fips_code_state = fips_code_state.withColumn('State Code', trim(fips_code_state['State Code']))

In [0]:
# Preview the first five rows of the cleaned state FIPS lookup.
fips_code_state.show(5)

In [0]:
# Yash (Start D from here)
# Left-join the CBSA AQI rows to the county FIPS lookup on County and State.
# The keys are passed by name instead of as
#   aqi_cbsa_df.County == fips_code_county.County & aqi_cbsa_df.State == ...
# because `&` binds tighter than `==` in Python: that expression is parsed as
# a chained comparison and fails when a Column is converted to bool. Joining
# on names also keeps a single County/State column in the result rather than
# one copy from each side.
# NOTE(review): no visible cell shows that aqi_cbsa_df has County and State
# columns -- confirm against its schema.
aqi_cbsa_merged_df = aqi_cbsa_df.join(fips_code_county, on=['County', 'State'], how='left')
aqi_cbsa_merged_df.show(5)

In [0]:
# Row count of the CBSA frame after the county FIPS join.
aqi_cbsa_merged_df.count()

In [0]:
from pyspark.sql.functions import col

# Count the rows of the CBSA/county join whose 'FIPS County Code' is null.
rows_missing_county_code = aqi_cbsa_merged_df.where(col('FIPS County Code').isNull())
null_count = rows_missing_county_code.count()
print(f"Number of nulls: {null_count}")

In [0]:
# Re-check the row count of the CBSA/county join.
aqi_cbsa_merged_df.count()

In [0]:
# Inner-join the CBSA/county result to the state FIPS lookup, matching the
# frame's State against the lookup's 'State Code'.
# NOTE(review): the county-level path joins on the lookup's 'State' column
# instead of 'State Code' -- confirm which key is intended here.
state_code_match = aqi_cbsa_merged_df.State == fips_code_state['State Code']
aqi_cbsa_merged_final_df = aqi_cbsa_merged_df.join(
    fips_code_state, on=state_code_match, how='inner'
)
aqi_cbsa_merged_final_df.show(5)

In [0]:
# Count the rows of the final CBSA frame whose 'FIPS State Code' is null.
rows_missing_state_code = aqi_cbsa_merged_final_df.where(col('FIPS State Code').isNull())
null_count = rows_missing_state_code.count()
print(f"Number of nulls: {null_count}")

In [0]:
# Print the county frame's schema before normalizing its join keys.
aqi_county_df.printSchema()

In [0]:

# Imported in this cell because no visible cell brings `lower`/`trim` into scope.
from pyspark.sql.functions import lower, trim

# Normalize the join keys: strip whitespace from County, and strip plus
# lower-case State. Both columns are rewritten in one chain so the State step
# builds on the County-trimmed frame; restarting from aqi_county_df for the
# second column would silently discard the County trim.
aqi_county_df_trimmed = (
    aqi_county_df
    .withColumn('County', trim(aqi_county_df['County']))
    .withColumn('State', lower(trim(aqi_county_df['State'])))
)
aqi_county_df_trimmed.show(5)

In [0]:
# Rebind aqi_county_df to the normalized frame so later cells use the
# cleaned join keys, then preview two rows.
aqi_county_df = aqi_county_df_trimmed
aqi_county_df.show(2)

In [0]:
# Inner-join the county AQI rows to the county FIPS lookup on county name.
# NOTE(review): the match uses County only; if the same county name occurs
# in more than one state this pairs rows across states -- confirm whether
# State should also be part of the key.
county_name_match = aqi_county_df.County == fips_code_county.County
aqi_county_merged_df = aqi_county_df.join(
    fips_code_county, on=county_name_match, how='inner'
)
aqi_county_merged_df.show(2)

In [0]:
# Row count of the county frame after the county FIPS join.
aqi_county_merged_df.count()

In [0]:
# Count the rows of the county/FIPS join whose 'FIPS County Code' is null.
rows_missing_county_code = aqi_county_merged_df.where(col('FIPS County Code').isNull())
null_count = rows_missing_county_code.count()
print(f"Number of nulls: {null_count}")

In [0]:
# Inner-join the county result to the state FIPS lookup on the lower-cased
# state name.
# The left key is taken from aqi_county_df (the frame aqi_county_merged_df was
# built from): fips_code_county also exposes a State column (it is used as a
# join key for the CBSA frame), so after the county join
# `aqi_county_merged_df.State` matches two columns and Spark rejects it as an
# ambiguous reference.
# NOTE(review): this relies on aqi_county_df still being the pre-join frame,
# i.e. on this cell running before aqi_county_df is rebound to the final result.
state_name_match = aqi_county_df['State'] == fips_code_state['State']
aqi_county_merged_final_df = aqi_county_merged_df.join(
    fips_code_state, state_name_match, how='inner'
)
aqi_county_merged_final_df.show(4)

In [0]:
# Row count of the county frame after the state FIPS join.
aqi_county_merged_final_df.count()

In [0]:
# Count the rows of the final county frame whose 'FIPS State Code' is null.
rows_missing_state_code = aqi_county_merged_final_df.where(col('FIPS State Code').isNull())
null_count = rows_missing_state_code.count()
print(f"Number of nulls: {null_count}")

In [0]:
# Rebind the working names to the fully merged frames so later cells operate
# on the FIPS-enriched data.
aqi_cbsa_df, aqi_county_df = aqi_cbsa_merged_final_df, aqi_county_merged_final_df

In [0]:
# Preview the first five rows of the merged CBSA frame.
aqi_cbsa_df.show(5)

In [0]:
# Preview the first five rows of the merged county frame.
aqi_county_df.show(5)

In [0]:
# Final row count of the merged CBSA frame.
aqi_cbsa_df.count()

In [0]:
# Final row count of the merged county frame.
aqi_county_df.count()