## Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from zkyhaxpy import io_tools, gis_tools, pd_tools
import rasterio
import os
import shutil
import numpy as np
from tqdm.notebook import tqdm
import aqi

## Define constants

In [None]:
# Output folders for the predicted PM2.5 tables: one sub-folder for daily
# results and one for monthly results, both under the same root.
dir_predicted_pm25_root = r'../data/predicted_pm25_chiangmai'
dir_predicted_pm25_daily, dir_predicted_pm25_monthly = (
    os.path.join(dir_predicted_pm25_root, sub_dir) for sub_dir in ('daily', 'monthly')
)
io_tools.create_folders(dir_predicted_pm25_daily, dir_predicted_pm25_monthly)


In [None]:

# Display color for each AQI level (0-5). The extra key -9 maps to "Grey";
# presumably it is a no-data sentinel -- TODO confirm against the code that
# produces it.
aqi_color_codes = {-9: "Grey"}
aqi_color_codes.update(
    enumerate(("Green", "Yellow", "Orange", "Red", "Purple", "Maroon"))
)


## Define functions

### pm25_to_aqi_level

In [None]:


def pm25_to_aqi_level(pm25_concentration: float) -> int:
    """
    Map a PM2.5 concentration to an integer AQI level.

    The concentration is first converted to an AQI value with
    ``aqi.to_aqi`` and then bucketed by the AQI upper bounds
    50 / 100 / 150 / 200 / 300.

    Args:
        pm25_concentration (float): PM2.5 concentration in µg/m³.

    Returns:
        int: AQI level, from 0 (Good) up to 5 (Hazardous).
    """
    aqi_value = aqi.to_aqi([(aqi.POLLUTANT_PM25, str(pm25_concentration))])

    # Inclusive upper AQI bound of levels 0..4, in order:
    # Good, Moderate, Unhealthy for Sensitive Groups, Unhealthy, Very Unhealthy.
    level_upper_bounds = (50, 100, 150, 200, 300)
    for level, upper_bound in enumerate(level_upper_bounds):
        if aqi_value <= upper_bound:
            return level

    # Anything above the last bound is Hazardous.
    return len(level_upper_bounds)


### aqi_level_to_color

In [None]:

def aqi_level_to_color(aqi_class: int) -> str:
    """
    Look up the color name that represents an AQI level.

    Args:
        aqi_class (int): AQI level (0 to 5).

    Returns:
        str: Color name for the level, or "Unknown" when the level is not
        one of 0 to 5.
    """
    # Position in the tuple is the AQI level the color belongs to.
    level_colors = dict(
        enumerate(("Green", "Yellow", "Orange", "Red", "Purple", "Maroon"))
    )
    return level_colors.get(aqi_class, "Unknown")


## Load data for training model

In [None]:
df_chiangmai_grid = pd.read_parquet(r'../data/df_chiangmai_grid.parquet')

# Bounding box of the grid; used further down to keep only the
# measurements that fall inside it.
lat_min, lat_max = df_chiangmai_grid['lat'].min(), df_chiangmai_grid['lat'].max()
lon_min, lon_max = df_chiangmai_grid['lon'].min(), df_chiangmai_grid['lon'].max()

In [None]:
# Extracted AOD and DEM samples; the raster bookkeeping columns are not
# needed after loading.
df_extracted_aod055 = pd.read_csv(r'../data/df_extracted_aod055.csv').drop(columns=['row', 'col', 'tile_id'])
df_extracted_dem = pd.read_csv(r'../data/df_extracted_dem.csv').drop(columns=['row', 'col'])


# OpenAQ measurements: use the parquet cache when it exists, otherwise build
# it from the GeoPackage (geometry column removed) and save it for next time.
path_df_openaq = r'../data/df_openaq.parquet'
if not os.path.exists(path_df_openaq):
    gdf_openaq = gpd.read_file('../data/gdf_openaq.gpkg')
    print('gdf_openaq has been loaded.')
    if gdf_openaq.index.name is None:
        gdf_openaq = gdf_openaq.set_index('measurement_id')

    df_openaq = gdf_openaq.drop(columns=['geometry']).copy()
    df_openaq.to_parquet(path_df_openaq)
    print(f'{path_df_openaq} has been saved')
else:
    df_openaq = pd.read_parquet(path_df_openaq)
    print(f'{path_df_openaq} has been loaded')

In [None]:
# Discard readings whose value is the -999 placeholder.
is_real_reading = df_openaq['value'].ne(-999)
df_openaq = df_openaq.loc[is_real_reading].copy()
df_openaq

In [None]:
# Keep non-negative AOD samples only and key them by measurement id so they
# can be joined to the OpenAQ table on the index.
has_valid_aod = df_extracted_aod055['aod_055'].ge(0)
df_extracted_aod055 = df_extracted_aod055.loc[has_valid_aod].set_index('measurement_id')
df_extracted_aod055

In [None]:
# Key the DEM samples by measurement id (set_index returns a new frame).
df_extracted_dem = df_extracted_dem.set_index('measurement_id')
df_extracted_dem

In [None]:
# Inner-join measurements with their AOD and DEM samples on the shared
# index, then expose the measured value under the name 'pm25'.
df_joined = (
    df_openaq
    .merge(df_extracted_aod055, how='inner', left_index=True, right_index=True)
    .merge(df_extracted_dem, how='inner', left_index=True, right_index=True)
    .rename(columns={'value': 'pm25'})
)


df_joined

In [None]:
# Keep the measurements located inside the grid's bounding box and only the
# columns used for plotting and modelling.
inside_lat_range = df_joined['lat'].between(lat_min, lat_max)
inside_lon_range = df_joined['long'].between(lon_min, lon_max)
df_pm25 = (
    df_joined.loc[inside_lat_range & inside_lon_range]
    .reindex(columns=['pm25', 'aod_055', 'dem', 'sensorType'])
    .copy()
)
df_pm25

In [None]:
from scipy.stats import pearsonr
# Pearson correlation between observed PM2.5 and AOD; the result is kept in
# `corr` and is not printed or displayed by this cell.
corr = pearsonr(df_pm25['pm25'], df_pm25['aod_055'])

In [None]:
import plotly.express as px


# Interactive scatter of PM2.5 against AOD, colored by sensor type.
scatter_title = "Scatterplot - OpenAQ's PM2.5 and MODIS' Aerosol Optical Depth (AOD) in Chiangmai (2021 - Present)"
fig = px.scatter(df_pm25, x="aod_055", y="pm25", color="sensorType", title=scatter_title)
fig.update_layout(xaxis_title='AOD band 550 nm', yaxis_title='PM2.5 (µg/m3)')

fig.show()
# NOTE(review): with include_plotlyjs=False and full_html=False the output is
# expected to be an embeddable fragment that relies on the host page loading
# plotly.js -- confirm against the page that embeds it.
fig.write_html(r'./scatter-pm25-aod.html', include_plotlyjs=False, full_html=False, div_id='scatter-pm25-aod')

In [None]:
# Static scatter of PM2.5 against AOD; the hue is driven by the same PM2.5 column.
sns.scatterplot(data=df_pm25, x='aod_055', y='pm25', hue='pm25')

## Model Training

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold

# The training data is the `df_pm25` DataFrame built above
# (columns 'pm25', 'aod_055', 'dem' and 'sensorType').




### Fitting OLS

In [None]:


import statsmodels.api as sm

# Single-feature regression: PM2.5 (target) explained by AOD only.
X = df_pm25[['aod_055']]
y = df_pm25['pm25']

# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# OLS needs an explicit intercept column, so add it before fitting.
X_train_ols = sm.add_constant(X_train)
ols_model = sm.OLS(y_train, X_train_ols).fit()

print(ols_model.summary())


In [None]:

# Add constant term to test data
X_test_ols = sm.add_constant(X_test)

# Predict pm25 values
y_pred = ols_model.predict(X_test_ols)

# Evaluate the model on the held-out rows.
# RMSE is computed as sqrt(MSE) rather than via `squared=False`, which newer
# scikit-learn releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
# The reported value is the *root* mean squared error, so label it as such.
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")

## Predict grid in Chiangmai 

### Load Chiangmai grid & DEM

In [None]:

# DEM value per grid cell, joined to the grid on the shared index.
# Forward slashes are used (as for every other data path in this notebook) so
# the relative path resolves on POSIX systems as well as on Windows.
df_chiangmai_dem = pd.read_parquet(r'../data/df_chiangmai_dem.parquet')
df_chiangmai_joined = df_chiangmai_grid.merge(df_chiangmai_dem, how='inner', left_index=True, right_index=True)
df_chiangmai_joined



## Predict

In [None]:
dir_chiangmai_aod_daily = r'../data/chiangmai_aod_daily'
list_file_aod_daily = io_tools.get_list_files(dir_chiangmai_aod_daily, '.parquet$')
pbar_aod_daily = tqdm(list_file_aod_daily)
# Set `rerun` to anything other than 'Y' (or read it with
# input('Rerun? (Y/N)')) to keep outputs that already exist.
rerun = 'Y'
rerun_f = rerun.upper() == 'Y'

for path_aod_daily in pbar_aod_daily:

    # year_month and tile_id are read from fixed character positions of the
    # input file name.
    # NOTE(review): this relies on the AOD file-name layout, which is not
    # visible here -- confirm the offsets if the naming ever changes.
    file_nm_aod_daily = os.path.basename(path_aod_daily)
    year_month = file_nm_aod_daily[23:30]
    tile_id = file_nm_aod_daily[31:37]

    path_out_daily = os.path.join(dir_predicted_pm25_daily, f'df_predict_pm25-{year_month}-{tile_id}.parquet')

    if os.path.exists(path_out_daily) and not rerun_f:
        continue

    df_aod_daily = pd.read_parquet(path_aod_daily)
    df_aod_daily = df_aod_daily.drop(columns=['year_month', 'tile_id'])

    # Wide (one column per date) -> long (one row per grid cell and date),
    # keeping the grid index so the DEM table can be joined on it.
    df_predict_pm25 = pd.melt(df_aod_daily, ignore_index=False)
    df_predict_pm25 = df_predict_pm25.dropna().copy()
    df_predict_pm25 = df_predict_pm25.rename(columns={'variable': 'date', 'value': 'aod_055'})
    df_predict_pm25 = df_chiangmai_dem.merge(df_predict_pm25, how='inner', left_index=True, right_index=True)

    # Nothing to predict when no AOD value survived dropna / the join.
    if df_predict_pm25.empty:
        continue

    # OLS (the fitted model uses AOD as its only feature).
    X_predict = df_predict_pm25[['aod_055']]
    # has_constant='add' forces the intercept column. The default ('skip')
    # omits it when aod_055 happens to be constant -- e.g. a single-row
    # frame -- and the design matrix would then not match the fitted model.
    X_predict_ols = sm.add_constant(X_predict, has_constant='add')
    y_pred = ols_model.predict(X_predict_ols)

    if len(y_pred) != len(df_predict_pm25):
        raise ValueError(
            f'Got {len(y_pred)} predictions for {len(df_predict_pm25)} rows in {path_aod_daily}'
        )
    df_predict_pm25['pm25_pred'] = y_pred

    df_predict_pm25 = df_predict_pm25.merge(df_chiangmai_grid, how='left', left_index=True, right_index=True).copy()

    df_predict_pm25.to_parquet(path_out_daily)
        

## Save into monthly image

In [None]:
df_list_files_predicted_daily = io_tools.get_list_files(dir_predicted_pm25_daily, 'df_predict_pm25-.*.parquet', return_df=True)
# File names follow 'df_predict_pm25-{year_month}-{tile_id}.parquet', so both
# parts can be cut out at fixed character positions.
predicted_file_names = df_list_files_predicted_daily['file_nm']
df_list_files_predicted_daily['year_month'] = predicted_file_names.str[16:23]
df_list_files_predicted_daily['tile_id'] = predicted_file_names.str[24:30]


# Aggregate the daily predictions of each month into one median and one mean
# value per grid cell; every month becomes a column named after its year_month.
list_df_predict_pm25_monthly_median = []
list_df_predict_pm25_monthly_mean = []
print('Aggregating into monthly mean & median')
for year_month, df_list_files_curr in tqdm(df_list_files_predicted_daily.groupby('year_month')):
    daily_frames = [pd.read_parquet(file_path) for file_path in df_list_files_curr['file_path']]
    df_predict_pm25 = pd.concat(daily_frames)
    pm25_pred_by_cell = df_predict_pm25.groupby(['lat', 'lon', 'dem'])['pm25_pred']
    list_df_predict_pm25_monthly_median.append(pm25_pred_by_cell.median().to_frame(name=year_month))
    list_df_predict_pm25_monthly_mean.append(pm25_pred_by_cell.mean().to_frame(name=year_month))
df_predict_pm25_monthly_median = pd.concat(list_df_predict_pm25_monthly_median, axis=1)
df_predict_pm25_monthly_mean = pd.concat(list_df_predict_pm25_monthly_mean, axis=1)

# Fill months without a value (cloud cover) by interpolating along the month
# columns of each grid cell.
print('Interpolating monthly median...')
df_predict_pm25_monthly_median = df_predict_pm25_monthly_median.interpolate(axis=1)
print('Interpolating monthly mean...')
df_predict_pm25_monthly_mean = df_predict_pm25_monthly_mean.interpolate(axis=1)

# Save
print('Saving...')
df_predict_pm25_monthly_median.to_parquet(r'../data/df_predict_pm25_monthly_median.parquet')
df_predict_pm25_monthly_mean.to_parquet(r'../data/df_predict_pm25_monthly_mean.parquet')
print('Done.')