In [1]:
#import xgboost as xgb
import xarray as xr
import pandas as pd
import numpy as np
import gc
# from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
#import statsmodels.api as sm
import matplotlib.pyplot as plt
import logging 
import pickle
logging.disable(logging.CRITICAL)
#import shap
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
import warnings
warnings.simplefilter("ignore")
print(xgb.__version__)

1.4.0


In [2]:
# Root prefixes for model files and data files; empty strings make every
# path below relative to the current working directory.
model_path = ""
data_path_prefix = ""

# Spatial-scale code -> directory prefix used when building model file paths.
p_dict = {
    scale: model_path + folder
    for scale, folder in (
        ("a", "all_gridcell/"),
        ("c", "clusters/"),
        ("r", "regions/"),
    )
}

def get_model_path(spatial_scale,time_scale,gas_flag,loc):
    """Build the path of the pickled model for one configuration.

    Parameters
    ----------
    spatial_scale : str
        Key into ``p_dict`` ("a", "c" or "r") selecting the directory prefix.
    time_scale : str
        Leading part of the file name.
    gas_flag : str
        Text appended after "_aod_emission_met" (the notebook passes "_gas" or "").
    loc : str
        Location code; only used when ``spatial_scale`` is "r".

    Returns
    -------
    str
        The model file path, ending in ".pkl".
    """
    # Only the "r" scale carries the location in its file name.
    loc_suffix = "_" + loc if spatial_scale == "r" else ""
    parts = [p_dict[spatial_scale], time_scale, "_aod_emission_met",
             gas_flag, loc_suffix, ".pkl"]
    return "".join(parts)
        
def load_model(spatial_scale,time_scale,gas_flag,loc):
    """Load a pickled model object and report its best estimator.

    Parameters
    ----------
    spatial_scale, time_scale, gas_flag, loc : str
        Forwarded unchanged to ``get_model_path`` to locate the ``.pkl`` file.

    Returns
    -------
    tuple
        ``(automl, model_name)`` where ``automl`` is the unpickled object and
        ``model_name`` is its ``_best_estimator`` attribute.
        NOTE(review): presumably a FLAML ``AutoML`` instance - confirm.

    Raises
    ------
    FileNotFoundError
        If no file exists at the computed path.
    """
    # Reuse the shared path builder so the file-naming scheme lives in one place.
    p = get_model_path(spatial_scale,time_scale,gas_flag,loc)

    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load model files that come from a trusted source.
    with open(p, 'rb') as f:
        # The pickle protocol version is detected automatically.
        automl = pickle.load(f)

    model_name = automl._best_estimator
    return automl, model_name

def get_feature_list(time_scale,gas_flag):
    """Return the ordered feature column names for one configuration.

    Parameters
    ----------
    time_scale : str
        "monthly" selects the extended emission list; any other value
        (the notebook uses "daily" and "monthly_le") selects the short one.
    gas_flag : str
        "_gas" appends the trace-gas columns; any other value omits them.

    Returns
    -------
    list of str
        AOD columns, then emission columns, then meteorology columns, then
        (optionally) trace-gas columns. A new list is built on every call.
    """
    aod_cols = ['AOT_C', 'AOT_DUST_C']
    met_cols = ['T2M', 'PBLH', 'U10M', 'V10M', 'PRECTOT', 'RH']
    gas_cols = ['CO_trop', 'SO2_trop', 'NO2_trop', 'CH2O_trop', 'NH3_trop']

    # Emission columns shared by every time scale.
    emission_cols = ['EmisDST_Natural',
                     'EmisNO_Fert', 'EmisNO_Lightning', 'EmisNO_Ship', 'EmisNO_Soil']

    # Only the exact "monthly" time scale adds the remaining emission columns.
    if time_scale == "monthly":
        emission_cols = emission_cols + [
            'EmisBC_Anthro', 'EmisBC_BioBurn',
            'EmisCH2O_Anthro', 'EmisCH2O_BioBurn',
            'EmisCO_Anthro', 'EmisCO_BioBurn', 'EmisCO_Ship',
            'EmisNH3_Anthro', 'EmisNH3_BioBurn', 'EmisNH3_Natural',
            'EmisNO_Aircraft', 'EmisNO_Anthro', 'EmisNO_BioBurn',
            'EmisOC_Anthro', 'EmisOC_BioBurn',
            'EmisSO2_Aircraft', 'EmisSO2_Anthro', 'EmisSO2_BioBurn',
            'EmisSO4_Anthro',
        ]

    features = aod_cols + emission_cols + met_cols
    if gas_flag == "_gas":
        features = features + gas_cols
    return features

def load_data(spatial_scale,time_scale,gas_flag,loc,data_type):
    """Read one parquet split and keep only the model columns.

    Parameters
    ----------
    spatial_scale : str
        "r" and "c" read the "c_r_"-prefixed files; any other value reads the
        un-prefixed files. "r" additionally filters rows by ``loc``.
    time_scale : str
        Used in the file name; "monthly_le" reads the same files as "monthly".
    gas_flag : str
        Forwarded to ``get_feature_list``.
    loc : str
        Value matched against the "region" column when ``spatial_scale`` is "r";
        ignored otherwise.
    data_type : str
        Split name used in the file name (the notebook passes "train" or "test").

    Returns
    -------
    tuple
        ``(df, feature_ls)``: the selected DataFrame (features, "PM25" and, for
        the "r" scale, "region") and the list of feature column names.
    """
    feature_ls = get_feature_list(time_scale,gas_flag)

    is_regional = spatial_scale == "r"
    # Regional and cluster runs share the "c_r_" files.
    file_prefix = "c_r_" if spatial_scale in ("r", "c") else ""
    # "monthly_le" only changes the feature list, not the file that is read.
    period = "monthly" if time_scale == "monthly_le" else time_scale
    data_path = data_path_prefix + file_prefix + period + "_" + data_type + ".gzip"

    wanted_cols = feature_ls + ["PM25"]
    if is_regional:
        wanted_cols = wanted_cols + ["region"]

    df = pd.read_parquet(data_path)[wanted_cols]
    if is_regional:
        df = df[df["region"] == loc]
    return df, feature_ls
    
def model_performance(df, model, feature_ls, spatial_scale):
    """Print R2, RMSE and MAE of ``model`` on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the "PM25" column and every column in ``feature_ls``.
    model : object
        Fitted estimator exposing ``predict``.
    feature_ls : list of str
        Feature columns passed to ``model.predict``.
    spatial_scale : str
        Label printed in front of each metric line.

    Returns
    -------
    None
        The three metrics are printed, not returned.
    """
    y_true = df["PM25"]
    y_pred = model.predict(df[feature_ls])
    # The ``squared=False`` keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6; taking the square root of the MSE gives the same RMSE
    # for a single target and works on every scikit-learn version.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(spatial_scale, "r2_score:",
          r2_score(y_true, y_pred))
    print(spatial_scale, "root mean_squared_error:",
          rmse)
    print(spatial_scale, "mean_absolute_error:",
          mean_absolute_error(y_true, y_pred))

## Regional models: random forest (RF)

In [3]:
# Fit and score one default random forest per region, time scale and
# feature set (with / without the trace-gas columns).
for loc in ["E","S","W","N"]:
    print("============================")
    print("start location:",loc,"\n")
    for time_scale in ["daily","monthly_le","monthly"]:
        print("===",time_scale,"===")
        print("#########")
        for gas_flag in ["_gas",""]:
            print("##gas_flag:",gas_flag)
            for spatial_scale in ["r"]:
                print("**spatial scale:", spatial_scale)
                df_train, feature_ls = load_data(spatial_scale,time_scale,gas_flag,loc,"train")
                df_test, feature_ls = load_data(spatial_scale,time_scale,gas_flag,loc,"test")
                X_train = df_train[feature_ls]
                y_train = df_train["PM25"]
                X_test = df_test[feature_ls]
                y_true = df_test["PM25"]

                # train the model (fixed seed so reruns give the same forest)
                reg = RandomForestRegressor(n_jobs=-1,random_state=66)
                reg.fit(X_train, y_train)
                y_pred = reg.predict(X_test)

                # ``squared=False`` was removed in scikit-learn 1.6; the square
                # root of the MSE is the same RMSE on every version.
                rmse = np.sqrt(mean_squared_error(y_true, y_pred))
                print(spatial_scale, "r2_score:",
                      round(r2_score(y_true, y_pred),2))
                print(spatial_scale, "root mean_squared_error:",
                      round(rmse,2))
                print(spatial_scale, "mean_absolute_error:",
                      round(mean_absolute_error(y_true, y_pred),2))

                # Release every per-iteration object (including the
                # predictions) before the next configuration is loaded.
                del df_train, df_test, reg, X_train, y_train, X_test, y_true, y_pred
                gc.collect()
        print("============================")

start location: E 

=== daily ===
#########
##gas_flag: _gas
**spatial scale: r
r r2_score: 0.66
r root mean_squared_error: 5.58
r mean_absolute_error: 3.95
##gas_flag: 
**spatial scale: r
r r2_score: 0.56
r root mean_squared_error: 6.31
r mean_absolute_error: 4.29
=== monthly_le ===
#########
##gas_flag: _gas
**spatial scale: r
r r2_score: 0.51
r root mean_squared_error: 5.35
r mean_absolute_error: 3.27
##gas_flag: 
**spatial scale: r
r r2_score: 0.26
r root mean_squared_error: 6.57
r mean_absolute_error: 4.21
=== monthly ===
#########
##gas_flag: _gas
**spatial scale: r
r r2_score: 0.68
r root mean_squared_error: 4.32
r mean_absolute_error: 2.75
##gas_flag: 
**spatial scale: r
r r2_score: 0.65
r root mean_squared_error: 4.55
r mean_absolute_error: 3.01
start location: S 

=== daily ===
#########
##gas_flag: _gas
**spatial scale: r
r r2_score: 0.66
r root mean_squared_error: 4.08
r mean_absolute_error: 3.1
##gas_flag: 
**spatial scale: r
r r2_score: 0.59
r root mean_squared_error: 4.4

## All grid cells and clusters: random forest (RF)

In [4]:
# Fit and score one default random forest per spatial scale ("a" and "c"),
# time scale and feature set.
# RandomForestRegressor is already imported in the notebook's first cell, so
# it is not re-imported here.
for spatial_scale in ["a","c"]:
    print("============================")
    print("**spatial scale:", spatial_scale)
    for time_scale in ["daily","monthly_le","monthly"]:
        print("===",time_scale,"===")
        print("#########")
        for gas_flag in ["_gas",""]:
            print("##gas_flag:",gas_flag)
            # ``loc`` is ignored by load_data for these scales; "None" is a placeholder.
            df_train, feature_ls = load_data(spatial_scale,time_scale,gas_flag,"None","train")
            df_test, feature_ls = load_data(spatial_scale,time_scale,gas_flag,"None","test")
            X_train = df_train[feature_ls]
            y_train = df_train["PM25"]
            X_test = df_test[feature_ls]
            y_true = df_test["PM25"]

            # train the model (fixed seed so reruns give the same forest)
            reg = RandomForestRegressor(n_jobs=-1,random_state=66)
            reg.fit(X_train, y_train)
            y_pred = reg.predict(X_test)

            # ``squared=False`` was removed in scikit-learn 1.6; the square
            # root of the MSE is the same RMSE on every version.
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            print(spatial_scale, "r2_score:",
                  round(r2_score(y_true, y_pred),2))
            print(spatial_scale, "root mean_squared_error:",
                  round(rmse,2))
            print(spatial_scale, "mean_absolute_error:",
                  round(mean_absolute_error(y_true, y_pred),2))

            # Release every per-iteration object (including the predictions)
            # before the next configuration is loaded.
            del df_train, df_test, reg, X_train, y_train, X_test, y_true, y_pred
            gc.collect()
    print("============================")

**spatial scale: a
=== daily ===
#########
##gas_flag: _gas
a r2_score: 0.87
a root mean_squared_error: 5.08
a mean_absolute_error: 3.43
##gas_flag: 
a r2_score: 0.85
a root mean_squared_error: 5.58
a mean_absolute_error: 3.74
=== monthly_le ===
#########
##gas_flag: _gas
a r2_score: 0.92
a root mean_squared_error: 3.65
a mean_absolute_error: 2.44
##gas_flag: 
a r2_score: 0.88
a root mean_squared_error: 4.28
a mean_absolute_error: 3.04
=== monthly ===
#########
##gas_flag: _gas
a r2_score: 0.92
a root mean_squared_error: 3.61
a mean_absolute_error: 2.42
##gas_flag: 
a r2_score: 0.91
a root mean_squared_error: 3.85
a mean_absolute_error: 2.71
**spatial scale: c
=== daily ===
#########
##gas_flag: _gas
c r2_score: 0.79
c root mean_squared_error: 5.76
c mean_absolute_error: 4.09
##gas_flag: 
c r2_score: 0.73
c root mean_squared_error: 6.48
c mean_absolute_error: 4.55
=== monthly_le ===
#########
##gas_flag: _gas
c r2_score: 0.81
c root mean_squared_error: 4.3
c mean_absolute_error: 3.12
#