In [1]:
import xgboost as xgb
import xarray as xr
import pandas as pd
import numpy as np
import gc
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import logging 
import pickle
# logging.disable(logging.CRITICAL)
import shap
from tqdm import tqdm

In [2]:
# Root locations. Both are empty here, so every path built from them is
# relative to the notebook's working directory.
model_path = ""
data_path_prefix = ""

# Spatial-scale code -> directory holding the pickled models for that scale.
p_dict = dict(
    a=model_path + "all_gridcell/",
    c=model_path + "clusters/",
    r=model_path + "regions/",
)

def load_model(spatial_scale,time_scale,gas_flag,loc):
    """Unpickle a saved AutoML object and report its best estimator.

    The file lives in ``p_dict[spatial_scale]`` and is named
    ``<time_scale>_aod_emission_met<gas_flag>.pkl``. For the regional scale
    (``spatial_scale == "r"``) the name additionally carries a ``_<loc>``
    suffix before the extension; for every other scale ``loc`` is unused.

    Returns
    -------
    tuple
        ``(automl, model_name)``, where ``model_name`` is the
        ``_best_estimator`` attribute of the unpickled object.

    NOTE(review): ``pickle.load`` can run arbitrary code while loading, so
    this should only ever be pointed at model files from a trusted source.
    """
    stem = p_dict[spatial_scale] + time_scale + "_aod_emission_met" + gas_flag
    region_suffix = ("_" + loc) if spatial_scale == "r" else ""
    pkl_path = stem + region_suffix + ".pkl"

    # pickle detects the protocol version by itself; nothing to configure.
    with open(pkl_path, 'rb') as fh:
        automl = pickle.load(fh)

    return automl, automl._best_estimator

def get_feature_list(time_scale,gas_flag):
    """Assemble the ordered predictor column names for one model setup.

    The result is always AOD columns, then emission columns, then
    meteorology columns, optionally followed by the trace-gas columns.

    Parameters
    ----------
    time_scale : str
        ``"monthly"`` selects the full emission inventory; any other value
        selects only the five-column reduced emission set.
    gas_flag : str
        The trace-gas columns are appended only when this equals ``"_gas"``.

    Returns
    -------
    list of str
        A freshly built list on every call.
    """
    aod_cols = ['AOT_C', 'AOT_DUST_C']
    met_cols = ['T2M', 'PBLH', 'U10M', 'V10M', 'PRECTOT', 'RH']
    gas_cols = ['CO_trop', 'SO2_trop', 'NO2_trop', 'CH2O_trop', 'NH3_trop']

    # Emission columns shared by every time scale.
    reduced_emission = ['EmisDST_Natural',
                        'EmisNO_Fert', 'EmisNO_Lightning', 'EmisNO_Ship', 'EmisNO_Soil']
    # Additional emission columns that only the "monthly" setup uses.
    monthly_only_emission = ['EmisBC_Anthro', 'EmisBC_BioBurn',
                             'EmisCH2O_Anthro', 'EmisCH2O_BioBurn',
                             'EmisCO_Anthro', 'EmisCO_BioBurn', 'EmisCO_Ship',
                             'EmisNH3_Anthro', 'EmisNH3_BioBurn', 'EmisNH3_Natural',
                             'EmisNO_Aircraft', 'EmisNO_Anthro', 'EmisNO_BioBurn',
                             'EmisOC_Anthro', 'EmisOC_BioBurn',
                             'EmisSO2_Aircraft', 'EmisSO2_Anthro', 'EmisSO2_BioBurn',
                             'EmisSO4_Anthro']

    if time_scale == "monthly":
        emission_cols = reduced_emission + monthly_only_emission
    else:
        emission_cols = reduced_emission

    features = aod_cols + emission_cols + met_cols
    if gas_flag == "_gas":
        features = features + gas_cols
    return features

def load_data(spatial_scale,time_scale,gas_flag,loc,data_type):
    """Read one parquet split, keeping only the feature and label columns.

    Parameters
    ----------
    spatial_scale : str
        ``"r"`` and ``"c"`` read the ``c_r_``-prefixed files; any other
        value reads the unprefixed files. For ``"r"`` the ``region`` column
        is also loaded and the rows are filtered to ``region == loc``.
    time_scale : str
        Forwarded to ``get_feature_list``. ``"monthly_le"`` reads the same
        files as ``"monthly"``; only its feature list differs.
    gas_flag : str
        Forwarded to ``get_feature_list``.
    loc : str
        Region value to keep; only used when ``spatial_scale == "r"``.
    data_type : str
        Split name embedded in the file name (the notebook passes ``"test"``).

    Returns
    -------
    tuple
        ``(df, feature_ls)``. ``df`` holds the feature columns followed by
        ``PM25`` (and ``region`` for the regional scale).
    """
    feature_ls = get_feature_list(time_scale,gas_flag)

    # "monthly_le" has no files of its own; it reuses the "monthly" ones.
    file_time_scale = "monthly" if time_scale == "monthly_le" else time_scale
    # Regional and cluster runs share the "c_r_" files.
    scale_prefix = "c_r_" if spatial_scale in ("r", "c") else ""
    data_path = data_path_prefix + scale_prefix + file_time_scale + "_" + data_type + ".gzip"

    is_regional = spatial_scale == "r"
    columns = feature_ls + ["PM25"]
    if is_regional:
        columns.append("region")

    # Ask the parquet reader for just the needed columns instead of loading
    # the whole file and discarding most of it; the trailing selection pins
    # the column order to exactly what was requested.
    df = pd.read_parquet(data_path, columns=columns)[columns]

    if is_regional:
        return df[df["region"] == loc], feature_ls
    return df, feature_ls
    
def model_performance(df, model, feature_ls, spatial_scale):
    """Score ``model`` on ``df`` and print and return R2, RMSE and MAE.

    Parameters
    ----------
    df : DataFrame
        Must contain the label column ``PM25`` and every name in
        ``feature_ls``.
    model : object
        Anything exposing ``predict(X)``.
    feature_ls : list of str
        Columns passed to ``model.predict``.
    spatial_scale : str
        Only used as the prefix of the three printed lines.

    Returns
    -------
    tuple
        ``(r2, rmse, mae)``, each rounded to 3 decimals.
    """
    y_true = df["PM25"]
    y_pred = model.predict(df[feature_ls])
    r2 = round(r2_score(y_true, y_pred),3)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6,
    # where it raises TypeError. Taking the root of the MSE ourselves works
    # on every version and is identical for this single-column target.
    rmse = round(np.sqrt(mean_squared_error(y_true, y_pred)),3)
    mae = round(mean_absolute_error(y_true, y_pred),3)
    print(spatial_scale, "r2_score:",
          r2)
    print(spatial_scale, "root mean_squared_error:",
          rmse)
    print(spatial_scale, "mean_absolute_error:",
          mae)
    return r2, rmse, mae

## Evaluate the models on the regional test sets and save the metrics

In [3]:
# Parallel result columns: every evaluated (region, time scale, gas flag,
# model spatial scale) combination appends exactly one entry to each list.
loc_ls, time_scale_ls, spatial_scale_ls, gas_ls = [], [], [], []
r2_ls, rmse_ls, mae_ls = [], [], []

for loc in ("E", "S", "W", "N"):
    print("============================")
    print("start location:",loc,"\n")
    for time_scale in ("daily", "monthly_le", "monthly"):
        print("===",time_scale,"===")
        print("#########")
        for gas_flag in ("_gas", ""):
            print("##gas_flag:",gas_flag)
            # The test rows always come from the regional ("r") split of this
            # location; each model scale below is scored on that same frame.
            df, feature_ls = load_data("r",time_scale,gas_flag,loc,"test")

            for spatial_scale in ("r", "a", "c"):
                print("**spatial scale:", spatial_scale)
                model, model_name = load_model(spatial_scale,time_scale,gas_flag,loc)
                r2, rmse, mae = model_performance(df, model, feature_ls, spatial_scale)

                # scores first, then the identifiers of this run
                r2_ls.append(r2)
                rmse_ls.append(rmse)
                mae_ls.append(mae)
                loc_ls.append(loc)
                time_scale_ls.append(time_scale)
                spatial_scale_ls.append(spatial_scale)
                gas_ls.append(gas_flag)

                # drop the model before the next one is unpickled
                del model, r2, rmse, mae
                gc.collect()

            # release the test frame before the next configuration is read
            del df
            gc.collect()
    print("============================")

start location: E 

=== daily ===
#########
##gas_flag: _gas
**spatial scale: r
r r2_score: 0.744
r root mean_squared_error: 4.837
r mean_absolute_error: 3.571
**spatial scale: a
a r2_score: 0.744
a root mean_squared_error: 4.841
a mean_absolute_error: 3.549
**spatial scale: c
c r2_score: 0.776
c root mean_squared_error: 4.521
c mean_absolute_error: 3.336
##gas_flag: 
**spatial scale: r
r r2_score: 0.592
r root mean_squared_error: 6.111
r mean_absolute_error: 4.295
**spatial scale: a
a r2_score: 0.649
a root mean_squared_error: 5.668
a mean_absolute_error: 4.137
**spatial scale: c
c r2_score: 0.641
c root mean_squared_error: 5.726
c mean_absolute_error: 4.124
=== monthly_le ===
#########
##gas_flag: _gas
**spatial scale: r
r r2_score: 0.848
r root mean_squared_error: 2.977
r mean_absolute_error: 2.186
**spatial scale: a
a r2_score: 0.863
a root mean_squared_error: 2.83
a mean_absolute_error: 2.064
**spatial scale: c
c r2_score: 0.918
c root mean_squared_error: 2.191
c mean_absolute_err

In [4]:
# Column name -> per-run values collected by the evaluation loop
# (one row per evaluated model/configuration).
dd = dict(
    loc=loc_ls,
    time_scale=time_scale_ls,
    spatial_scale=spatial_scale_ls,
    gas=gas_ls,
    r2=r2_ls,
    rmse=rmse_ls,
    mae=mae_ls,
)

# Tabulate the metrics and write them out without the row index.
df = pd.DataFrame(dd)
df.to_csv("./data/apply_model.csv",index=False)

## visualization

In [5]:
# Read the metrics table back from the same CSV path the save-data cell wrote
# to; from here on `df` is the on-disk copy, not the in-memory frame.
df = pd.read_csv("./data/apply_model.csv")