## Import Lib

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from zkyhaxpy import io_tools, gis_tools, pd_tools
import rasterio
import os
import shutil
import numpy as np
from tqdm.notebook import tqdm
import aqi
import numpy as np
from scipy import stats
import plotly.express as px


## Define constants

In [2]:

aqi_color_codes = {
        -9:"Grey",
        0: "Green",
        1: "Yellow",
        2: "Orange",
        3: "Red",
        4: "Purple",
        5: "Maroon"
    }    


## Define functions

### pm25_to_aqi_level

In [3]:


def pm25_to_aqi_level(pm25_concentration: float) -> int:
    """
    Converts PM2.5 concentration to AQI class (as integer).

    Args:
        pm25_concentration (float): PM2.5 concentration in µg/m³.

    Returns:
        int: AQI level (0 to 5) based on EPA guidelines.
    """
    aqi_value = aqi.to_aqi([(aqi.POLLUTANT_PM25, str(pm25_concentration))])
    if aqi_value <= 50:
        return 0  # Good
    elif 50 < aqi_value <= 100:
        return 1  # Moderate
    elif 100 < aqi_value <= 150:
        return 2  # Unhealthy for Sensitive Groups
    elif 150 < aqi_value <= 200:
        return 3  # Unhealthy
    elif 200 < aqi_value <= 300:
        return 4  # Very Unhealthy
    else:
        return 5  # Hazardous


### aqi_level_to_color

In [4]:

def aqi_level_to_color(aqi_class: int) -> str:
    """
    Converts AQI level (as integer) to color code.

    Args:
        aqi_class (int): AQI level (0 to 5).

    Returns:
        str: Color code corresponding to the AQI level.
    """
    color_codes = {
        0: "Green",
        1: "Yellow",
        2: "Orange",
        3: "Red",
        4: "Purple",
        5: "Maroon"
    }
    return color_codes.get(aqi_class, "Unknown")


## Load data for training model

In [5]:
df_chiangmai_grid = pd.read_parquet(r'../data/df_chiangmai_grid.parquet')

lat_min = df_chiangmai_grid.lat.min()
lat_max = df_chiangmai_grid.lat.max()
lon_min = df_chiangmai_grid.lon.min()
lon_max = df_chiangmai_grid.lon.max()

In [6]:
df_extracted_aod055 = pd.read_csv(r'../data/df_extracted_openaq_aod055_v2.csv')
del(df_extracted_aod055['row'])
del(df_extracted_aod055['col'])
del(df_extracted_aod055['tile_id'])

df_extracted_dem = pd.read_csv(r'../data/df_extracted_openaq_dem_v2.csv')
del(df_extracted_dem['row'])
del(df_extracted_dem['col'])


path_df_openaq = r'../data/gdf_openaq_chiangmai_by_location.parquet'
if os.path.exists(path_df_openaq):
    df_openaq = pd.read_parquet(path_df_openaq)
    print(f'{path_df_openaq} has been loaded')
else:
    gdf_openaq = gpd.read_file('../data/gdf_openaq_chiangmai_by_location.gpkg')
    print('gdf_openaq has been loaded.')
    if gdf_openaq.index.name is None:
        gdf_openaq = gdf_openaq.set_index('measurement_id')
    
    df_openaq = gdf_openaq.drop(columns=['geometry']).copy()
    df_openaq.to_parquet(path_df_openaq)
    print(f'{path_df_openaq} has been saved')

../data/gdf_openaq_chiangmai_by_location.parquet has been loaded


In [7]:
df_openaq = df_openaq[df_openaq['value'] != -999].copy()
df_openaq

Unnamed: 0_level_0,locationId,location,parameter,value,date,unit,country,city,isMobile,isAnalysis,entity,sensorType,datetime,year,month,year_month,time,lat,lon
measurement_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,1235996,"Vieng Nua, Pai",pm25,2.191964,2023-10-31,µg/m³,TH,,False,,Governmental Organization,low-cost sensor,2023-10-31T00:00,2023,0-,2023-10-,00:00,19.374620,98.445611
1,1235996,"Vieng Nua, Pai",pm25,2.313988,2023-10-30,µg/m³,TH,,False,,Governmental Organization,low-cost sensor,2023-10-30T23:00,2023,0-,2023-10-,23:00,19.374620,98.445611
2,1235996,"Vieng Nua, Pai",pm25,2.580357,2023-10-30,µg/m³,TH,,False,,Governmental Organization,low-cost sensor,2023-10-30T22:00,2023,0-,2023-10-,22:00,19.374620,98.445611
3,1235996,"Vieng Nua, Pai",pm25,3.220238,2023-10-30,µg/m³,TH,,False,,Governmental Organization,low-cost sensor,2023-10-30T21:00,2023,0-,2023-10-,21:00,19.374620,98.445611
4,1235996,"Vieng Nua, Pai",pm25,3.644345,2023-10-30,µg/m³,TH,,False,,Governmental Organization,low-cost sensor,2023-10-30T20:00,2023,0-,2023-10-,20:00,19.374620,98.445611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253632,523937,Tha Kham Municipality Office,pm25,9.600000,2023-06-01,µg/m³,TH,,False,,Governmental Organization,reference grade,2023-06-01T06:00,2023,6-,2023-06-,06:00,18.202222,98.597222
253633,523937,Tha Kham Municipality Office,pm25,9.700000,2023-06-01,µg/m³,TH,,False,,Governmental Organization,reference grade,2023-06-01T05:00,2023,6-,2023-06-,05:00,18.202222,98.597222
253634,523937,Tha Kham Municipality Office,pm25,9.800000,2023-06-01,µg/m³,TH,,False,,Governmental Organization,reference grade,2023-06-01T04:00,2023,6-,2023-06-,04:00,18.202222,98.597222
253635,523937,Tha Kham Municipality Office,pm25,10.200000,2023-06-01,µg/m³,TH,,False,,Governmental Organization,reference grade,2023-06-01T02:00,2023,6-,2023-06-,02:00,18.202222,98.597222


In [8]:
df_openaq['locationId'].nunique()

73

In [9]:
df_extracted_aod055 = df_extracted_aod055[df_extracted_aod055['aod_055'] >= 0].copy()
df_extracted_aod055 = df_extracted_aod055.set_index('measurement_id')
df_extracted_aod055

Unnamed: 0_level_0,aod_055,aod_datetime
measurement_id,Unnamed: 1_level_1,Unnamed: 2_level_1
157249,632.0,2022-04-19 07:05:00+00:00
157250,632.0,2022-04-19 07:05:00+00:00
205094,357.0,2022-04-19 07:05:00+00:00
205095,357.0,2022-04-19 07:05:00+00:00
118040,177.0,2023-01-21 07:45:00+00:00
...,...,...
235082,248.0,2024-03-09 07:35:00+00:00
245072,534.0,2024-03-09 07:35:00+00:00
245073,534.0,2024-03-09 07:35:00+00:00
250183,290.0,2024-03-09 07:35:00+00:00


In [10]:
df_extracted_dem = df_extracted_dem.copy()
df_extracted_dem = df_extracted_dem.set_index('measurement_id')
df_extracted_dem

Unnamed: 0_level_0,dem
measurement_id,Unnamed: 1_level_1
0,524.0
1,524.0
2,524.0
3,524.0
4,524.0
...,...
253632,291.0
253633,291.0
253634,291.0
253635,291.0


In [11]:
df_joined = df_openaq.merge(df_extracted_aod055, how='inner', left_index=True, right_index=True).copy()
df_joined = df_joined.merge(df_extracted_dem, how='inner', left_index=True, right_index=True).copy()
df_joined = df_joined.rename(columns={'value':'pm25'})


df_joined['year_month'] = df_joined['year_month'].str.slice(0, 7)
df_joined['month'] = df_joined['year_month'].str.slice(5, 7)
df_joined

Unnamed: 0_level_0,locationId,location,parameter,pm25,date,unit,country,city,isMobile,isAnalysis,...,datetime,year,month,year_month,time,lat,lon,aod_055,aod_datetime,dem
measurement_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
108,1235996,"Vieng Nua, Pai",pm25,21.903274,2023-10-26,µg/m³,TH,,False,,...,2023-10-26T05:00,2023,10,2023-10,05:00,19.374620,98.445611,191.0,2023-10-26 04:15:00+00:00,524.0
109,1235996,"Vieng Nua, Pai",pm25,20.860119,2023-10-26,µg/m³,TH,,False,,...,2023-10-26T04:00,2023,10,2023-10,04:00,19.374620,98.445611,191.0,2023-10-26 04:15:00+00:00,524.0
127,1235996,"Vieng Nua, Pai",pm25,10.125000,2023-10-25,µg/m³,TH,,False,,...,2023-10-25T08:00,2023,10,2023-10,08:00,19.374620,98.445611,211.0,2023-10-25 07:05:00+00:00,524.0
128,1235996,"Vieng Nua, Pai",pm25,12.901786,2023-10-25,µg/m³,TH,,False,,...,2023-10-25T07:00,2023,10,2023-10,07:00,19.374620,98.445611,211.0,2023-10-25 07:05:00+00:00,524.0
131,1235996,"Vieng Nua, Pai",pm25,14.886905,2023-10-25,µg/m³,TH,,False,,...,2023-10-25T04:00,2023,10,2023-10,04:00,19.374620,98.445611,195.0,2023-10-25 03:30:00+00:00,524.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253493,523937,Tha Kham Municipality Office,pm25,34.000000,2023-05-06,µg/m³,TH,,False,,...,2023-05-06T04:00,2023,05,2023-05,04:00,18.202222,98.597222,348.0,2023-05-06 04:20:00+00:00,291.0
253516,523937,Tha Kham Municipality Office,pm25,36.000000,2023-05-05,µg/m³,TH,,False,,...,2023-05-05T04:00,2023,05,2023-05,04:00,18.202222,98.597222,318.0,2023-05-05 03:40:00+00:00,291.0
253517,523937,Tha Kham Municipality Office,pm25,36.000000,2023-05-05,µg/m³,TH,,False,,...,2023-05-05T03:00,2023,05,2023-05,03:00,18.202222,98.597222,318.0,2023-05-05 03:40:00+00:00,291.0
253584,523937,Tha Kham Municipality Office,pm25,25.000000,2023-05-02,µg/m³,TH,,False,,...,2023-05-02T04:00,2023,05,2023-05,04:00,18.202222,98.597222,356.0,2023-05-02 03:15:00+00:00,291.0


In [12]:
df_pm25_ori = df_joined[(df_joined['lat'].between(lat_min, lat_max)) & (df_joined['lon'].between(lon_min, lon_max))]
df_pm25_ori = df_pm25_ori.reindex(columns=['pm25', 'aod_055', 'dem', 'sensorType', 'locationId']).copy()
df_pm25_ori

Unnamed: 0_level_0,pm25,aod_055,dem,sensorType,locationId
measurement_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
108,21.903274,191.0,524.0,low-cost sensor,1235996
109,20.860119,191.0,524.0,low-cost sensor,1235996
127,10.125000,211.0,524.0,low-cost sensor,1235996
128,12.901786,211.0,524.0,low-cost sensor,1235996
131,14.886905,195.0,524.0,low-cost sensor,1235996
...,...,...,...,...,...
253493,34.000000,348.0,291.0,reference grade,523937
253516,36.000000,318.0,291.0,reference grade,523937
253517,36.000000,318.0,291.0,reference grade,523937
253584,25.000000,356.0,291.0,reference grade,523937


In [13]:
#Filter only Open AQ measurement within Chiangmai boundary
gdf_province = gpd.read_file(r'..\data\thailandWithName.json')
gdf_chiangmai = gdf_province[gdf_province['name'] == 'Chiang Mai']
df_openaq_location = df_openaq.copy()
df_openaq_location = df_openaq_location.reindex(columns=['locationId', 'location', 'sensorType', 'entity', 'lat', 'lon']).drop_duplicates()
df_openaq_location['geometry'] = 'POINT (' + df_openaq_location['lon'].astype(str) + ' ' + df_openaq_location['lat'].astype(str) + ')'
gdf_openaq_location = gis_tools.df_to_gdf(df_openaq_location, 'geometry')
gdf_openaq_location_chiangmai = gdf_openaq_location[gdf_openaq_location.geometry.within(gdf_chiangmai.geometry.iloc[0])]
len(gdf_openaq_location_chiangmai)

list_location_id = gdf_openaq_location_chiangmai['locationId'].to_list()
list_location_id

[1235999,
 1236014,
 1236022,
 1236024,
 1236040,
 1236050,
 1236066,
 1405439,
 1641552,
 1641558,
 1646752,
 1753562,
 1753563,
 1753564,
 1861095,
 1861445,
 1864570,
 1935353,
 2041043,
 2169936,
 2174592,
 225579,
 225583,
 225669,
 225693,
 2379288,
 2429011,
 2432814,
 2461311,
 2575375,
 270292,
 2807918,
 387660,
 388656,
 523937]

In [14]:
df_pm25 = df_pm25_ori.copy()
df_pm25 = df_pm25[df_pm25['locationId'].isin(list_location_id)].copy()
res = stats.pearsonr(df_pm25['aod_055'], df_pm25['pm25'])
res

PearsonRResult(statistic=0.7876213000276187, pvalue=0.0)

In [15]:
df_pm25['pm25'] = df_pm25['pm25'].round(1)

In [16]:
fig = px.scatter(df_pm25, x='aod_055', y='pm25', color='sensorType')
fig.update_layout(
    yaxis_title='PM2.5 (µg/m3)',
    xaxis_title='AOD (MODIS)',
    title='Scatter Plot - PM2.5 (OpenAQ) and AOD (MODIS) in Chiangmai (2021 to Present)')


fig.write_html(f'fig-scatter.html', div_id='scatter', full_html=False, include_plotlyjs=False)
fig

## For LazyPlotly (PIER-X)

In [27]:
df_pm25

Unnamed: 0_level_0,pm25,aod_055,dem,sensorType,locationId
measurement_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5888,12.1,110.0,338.0,low-cost sensor,1235999
5889,14.2,110.0,338.0,low-cost sensor,1235999
5934,15.1,247.0,338.0,low-cost sensor,1235999
5935,15.3,247.0,338.0,low-cost sensor,1235999
5957,18.0,275.0,338.0,low-cost sensor,1235999
...,...,...,...,...,...
253493,34.0,348.0,291.0,reference grade,523937
253516,36.0,318.0,291.0,reference grade,523937
253517,36.0,318.0,291.0,reference grade,523937
253584,25.0,356.0,291.0,reference grade,523937


In [26]:
df_pm25_lazyplotly = df_pm25.copy().drop(columns=['locationId', 'dem'])
df_pm25_lazyplotly.columns = ['y_pm25', 'x_aod', 'sensor_type']
df_pm25_lazyplotly.to_csv('df_scatter_full.csv', index=False)
df_pm25_lazyplotly.sample(1000, random_state=88).to_csv('df_scatter_sample1000.csv', index=False)

In [17]:
df_pm25_lazyplotly['aod_055'] = df_pm25_lazyplotly['aod_055'].astype(int)
df_pm25_lazyplotly

NameError: name 'df_pm25_lazyplotly' is not defined

In [None]:
df_pm25_lazyplotly_refgrade = df_pm25_lazyplotly[df_pm25_lazyplotly['sensorType']=='reference grade'].copy().drop(columns=['locationId', 'dem'])
df_pm25_lazyplotly_refgrade

Unnamed: 0_level_0,pm25,aod_055,sensorType
measurement_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
49636,13.4,50,reference grade
49637,14.7,50,reference grade
49652,33.0,212,reference grade
49653,34.8,212,reference grade
49679,39.1,89,reference grade
...,...,...,...
253493,34.0,348,reference grade
253516,36.0,318,reference grade
253517,36.0,318,reference grade
253584,25.0,356,reference grade


In [19]:
df_pm25_lazyplotly_lowcost = df_pm25_lazyplotly[df_pm25_lazyplotly['sensorType']=='low-cost sensor'].copy().drop(columns=['locationId', 'dem'])
df_pm25_lazyplotly_lowcost

NameError: name 'df_pm25_lazyplotly' is not defined

In [None]:
df_pm25_lazyplotly_lowcost

In [18]:
df_pm25_lazyplotly_tmp = df_pm25_lazyplotly_lowcost.copy().sample(1000, random_state=88)
df_pm25_lazyplotly_tmp.columns = ['y', 'x', '_']
str_x = 'x: ['
str_y = 'y: ['
for _, s_row in df_pm25_lazyplotly_tmp.iterrows():
    str_x = str_x + f"{s_row.x},"
    str_y = str_y + f"{s_row.y},"

str_x = str_x + '],'    
str_y = str_y + '],'

print(str_x)
print(str_y)

NameError: name 'df_pm25_lazyplotly_lowcost' is not defined

In [49]:
df_pm25_lazyplotly_tmp = df_pm25_lazyplotly_refgrade.copy().sample(1000, random_state=88)
df_pm25_lazyplotly_tmp.columns = ['y', 'x', '_']
str_x = 'x: ['
str_y = 'y: ['
for _, s_row in df_pm25_lazyplotly_tmp.iterrows():
    str_x = str_x + f"{s_row.x},"
    str_y = str_y + f"{s_row.y},"

str_x = str_x + '],'    
str_y = str_y + '],'

print(str_x)
print(str_y)

x: [151,158,196,304,87,185,414,501,1190,1365,1116,1082,419,892,117,56,98,208,228,130,937,502,43,113,100,11,75,188,944,115,306,195,11,351,421,1251,713,1410,851,129,172,203,154,187,477,186,43,193,838,212,1545,789,465,116,225,137,282,265,637,25,726,211,169,1137,219,38,511,123,256,671,419,969,75,177,119,164,85,443,219,317,142,555,477,190,204,778,374,461,79,148,190,561,130,1138,118,89,221,291,30,798,1325,957,306,104,287,189,1020,449,483,367,1121,0,190,204,582,337,73,177,172,875,184,939,242,187,962,218,207,360,63,467,61,466,284,109,391,105,272,105,316,366,202,186,130,187,56,1136,351,823,127,193,227,166,98,222,268,242,680,551,451,426,41,23,36,303,384,95,114,140,187,196,298,315,25,235,1077,177,62,100,116,23,377,718,327,183,969,749,112,196,76,185,651,512,138,190,49,37,233,114,87,399,182,378,870,733,466,109,752,45,309,345,455,608,1410,537,187,117,91,259,664,80,266,85,934,223,489,733,813,64,100,354,818,506,167,1042,233,220,212,1340,51,268,311,3,219,125,208,50,121,345,461,1018,72,114,59,391,162,18