#AQ

In [6]:

# Import Data
import pandas as pd
import arcpy
import os

# 1. Specify File Paths
file_paths = [r"F:\\Research and Analysis\\Air Quality\\Annual Reports DRI\\AQ data 2023.xlsx",
              r"F:\\Research and Analysis\\Air Quality\\Annual Reports DRI\\AQ data 2022.xlsx",
              r"F:\\Research and Analysis\\Air Quality\\Annual Reports DRI\\AQ data 2021.xlsx",
              r"F:\\Research and Analysis\\Air Quality\\Annual Reports DRI\\AQ data 2020.xlsx",
              r"F:\\Research and Analysis\\Air Quality\\Annual Reports DRI\\AQ data 2019.xlsx"
              ]

# 2. Read Data from Each File Daily Data
dfs = []  # List to store DataFrames from each file
sheet_name = 'daily'  # Name of the sheet to read

for file_path in file_paths:
    df = pd.read_excel(file_path, sheet_name=sheet_name, header=[0, 1])
    dfs.append(df)

# 3. Concatenate DAirrames
DailyAir_df= pd.concat(dfs, ignore_index=True)

# Melt the DataFrame to long format
df_long = DailyAir_df.melt(id_vars=[('SITE', 'date')], var_name=['id', 'variable'], value_name='value')

# Rename columns for clarity
df_long = df_long.rename(columns={('SITE', 'date'): 'date'})

# 4. Clean and Transform Data
df_long['date'] = df_long['date'].dt.strftime('%Y-%m-%d')  # Convert date to string format

df_long['date'] = df_long['date'].astype(str)
#rename variable values
# These are the values currenlty in the variable column in sde 
# CO - 8 hr max(ppm)
# NO2 - 1 hr max(ppm)
# O3 - 8 hr max(ppm)
# PM10 - 24 hr max (mg/m3)
# PM 2.5 - 24 hr max(mg/m3)


# drop rows with variable name RH,BP, RWD, RWD.1, RWS Tmp
df_long = df_long[~df_long['variable'].isin(['RH', 'BP', 'RWD', 'RWD.1', 'RWS', 'Tmp'])]
#Rename Variables in variable column
df_long['variable'] = df_long['variable'].replace({
    'COmax': 'CO - 1 hr max (ppm)', 
    'max8hrCO': 'CO - 8 hr max (ppm)', 
    'NO2_avg': 'N02 - annual mean (ppm)', 
    'NO2max': 'N02 - 1 hr max (ppm)', 
    'O3max': 'O3 - 1 hr max (ppm)', 
    'max8hrO3': 'O3 - 8 hr max (ppm)', 
    #'PM10max': 'PM 10 - 24 hr max (mg/m3)',
    'PM10avg': 'PM 10 - 24 hr max (mg/m3)',
    #'PM2.5max': 'PM 2.5 - 24 hr max (mg/m3)',
    #'PM2.5avg': 'PM 2.5 - annual mean (mg/m3)',
    'PM2.5avg.1': 'PM 2.5 - 24 hr max (mg/m3)',
    'PM2.5avg': 'PM 2.5 - 24 hr max (mg/m3)',
    #'PM2.5avg.1': 'PM 2.5 - annual mean (mg/m3)'
})


# Remove or replace non-numeric values in 'value' column
# Drop rows with null values in the 'value' column
df_long = df_long.dropna(subset=['value'])
df_long['value'] = pd.to_numeric(df_long['value'], errors='coerce')

# Convert 'value' column to float64
df_long['value'] = df_long['value'].astype('float64')
#FORMAT DATAFRAMES
# Define the path to the scratch geodatabase





In [7]:
df_long.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21677 entries, 0 to 38198
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      21677 non-null  object 
 1   id        21677 non-null  object 
 2   variable  21677 non-null  object 
 3   value     21228 non-null  float64
dtypes: float64(1), object(3)
memory usage: 846.8+ KB


In [14]:
type_mapping = {
    #'int64': 'LONG',
    'float64': 'DOUBLE',
    'object': 'TEXT',
    'string': 'TEXT',
    #'datetime64[ns]': 'DATE'
}
 
# Set up geodatabase and output table name
gdb_path = r"F:\Research and Analysis\Workspace\Sarah\Scratch.gdb"
output_table = "AQ_temp"
output_path = f"{gdb_path}\\{output_table}"

#Delete existing table if it exists
if arcpy.Exists(output_path):
    arcpy.management.Delete(output_path)
    print(f"Deleted existing table: {output_table}")

# Create the table in the geodatabase
arcpy.management.CreateTable(gdb_path, output_table)

# # Identify date columns (assume they are 'object' type but contain dates)
# date_columns = [col for col in df_long.columns if pd.api.types.is_datetime64_any_dtype(df_long[col])]


# Add fields based on DataFrame dtypes
for col_name, dtype in df_long.dtypes.items():
    arcgis_type = type_mapping.get(str(dtype), 'TEXT')  # Default to TEXT if dtype is unknown
    if arcgis_type == 'TEXT':
        arcpy.management.AddField(output_path, col_name, arcgis_type, field_length=255)
    else:
        arcpy.management.AddField(output_path, col_name, arcgis_type)
 
# Insert data into the table
with arcpy.da.InsertCursor(output_path, df_long.columns.tolist()) as cursor:
    for _, row in df_long.iterrows():
        cursor.insertRow(row.tolist())
 
print(f"Table '{output_table}' created and populated in {gdb_path}")

Table 'AQ_temp' created and populated in F:\Research and Analysis\Workspace\Sarah\Scratch.gdb


In [24]:
#OLD# 4. Save DataFrame to CSV (Temporary File)
temp_csv = r"C:\Users\snewsome\Documents\Monitoring data updates\temp_air_quality.csv"
df_long.to_csv(temp_csv, index=False)

# 5. Convert CSV to Table in Geodatabase
gdb_path = r"F:\Research and Analysis\Workspace\Sarah\Scratch.gdb"
table_name = "AirQuality_Temp"
output_table = os.path.join(gdb_path, table_name)

if arcpy.Exists(output_table):
    arcpy.Delete_management(output_table)  # Ensure old data is removed

arcpy.TableToTable_conversion(temp_csv, gdb_path, table_name)

# 6. Append to Final Feature Class
final_fc = os.path.join(gdb_path, "AirQuality_Final")

if not arcpy.Exists(final_fc):
    # Create the feature class if it does not exist
    arcpy.CreateFeatureclass_management(gdb_path, "AirQuality_Final", "POINT")

# Append new data
arcpy.Append_management(output_table, final_fc, "NO_TEST")

print("Data successfully appended to", final_fc)

Data successfully appended to F:\Research and Analysis\Workspace\Sarah\Scratch.gdb\AirQuality_Final
