In [None]:
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from collections import Counter

In [None]:
import math
import scipy
import shapely
import datetime
import scipy
import numpy as np
import pandas as pd
import geopandas as gpd

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
data = pd.read_pickle(f"/content/drive/MyDrive/training_data_grids_final.pkl")
data_lags_nn_queen_less_rook = pd.read_pickle(f"/content/drive/MyDrive/training_data_grids_nn_queen_less_rook.pkl")
data_lags_nn_rook = pd.read_pickle(f"/content/drive/MyDrive/training_data_grids_nn_rook.pkl")

data = data.drop([x for x in data.columns if 'neighb' in x or 'edge' in x or 'geom' in x], axis=1)
data = pd.concat([data[[c for c in data.columns if "nn" not in c]], data_lags_nn_queen_less_rook, data_lags_nn_rook], axis=1)

In [None]:
data.shape

(8760, 318)

In [None]:
data['day'] = data['time'].apply(lambda x: x.day)
data['day_of_year'] = data['time'].apply(lambda x: x.timetuple().tm_yday)
data['month'] = data['time'].apply(lambda x: x.month)
data['week'] = data['time'].apply(lambda x: x.isocalendar()[1])
data['weekday'] = data['time'].apply(lambda x: x.weekday())

In [None]:
##  monitoring stations
grid_stations = gpd.read_file(f"/content/drive/MyDrive/training_data_grid_stations.shp")
##  outcome of SO2 data
out_so2 = pd.read_pickle(f"/content/drive/MyDrive/training_data_out_so2.pkl")

In [None]:
def dist(x, y):
    lat1 = math.radians(x[0])
    lon1 = math.radians(x[1])
    lat2 = math.radians(y[0])
    lon2 = math.radians(y[1])
    R = 6373.0
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    return round(distance, 4)

In [None]:
##  Create Lagged Features of Surface Monitored SO2  ##

weighted_so2 = []
var_list = ['min_so2','mean_so2','max_so2',
            'min_so2_4hr','mean_so2_4hr','max_so2_4hr',
            'min_so2_8hr','mean_so2_8hr','max_so2_8hr',
            'min_so2_12hr','mean_so2_12hr','max_so2_12hr',
            'min_so2_24hr','mean_so2_24hr','max_so2_24hr',
            'min_so2_48hr','mean_so2_48hr','max_so2_48hr']

decay_factor = 0.1      #  multiplier to control the rate of exponential decay by distance

for n in range(data[['id','lon','lat']].drop_duplicates().shape[0]):

    #  generate distance matrix between stations

    tem = data[['id','lon','lat']].drop_duplicates().iloc[n]
    tem_coords_ref = tem[['lat','lon']].values.tolist()
    distances = [dist(tem_coords_ref, z) for z in grid_stations[['latitude','longitude']].values.tolist()]
    tem_dist = pd.DataFrame({"id": grid_stations["id_right"].values.tolist(), 
                             "station_name": grid_stations["station_name"].values.tolist(), 
                             "distances": distances})

    #  include influence from nearby stations within a 10 km buffer,
    #  exclude the in-situ / local-grid station to prevent data leakage in modelling

    tem_dist = tem_dist[tem_dist['distances'] <= 10]
    tem_leak = tem_dist[tem_dist['distances'] <= np.sqrt(0.275**2 + 0.275**2)]

    if tem_dist.shape[0] > 0:

        #  compute inverse-distance weighting for the influence (SO2 monitored observations) from nearby stations (spatial lags)

        tem_dist['weighting'] = 1 / (tem_dist['distances'] / tem_dist['distances'].sum())
        tem_dist['weighting'] = tem_dist['weighting'] / tem_dist['weighting'].sum()
        tem_out_so2 = out_so2.merge(tem_dist, left_on = ['station_name','id_right'], right_on = ['station_name','id'], how="inner")

        if tem_leak.shape[0] > 0:
            tem_out_so2['min_so2'] = tem_out_so2.apply(lambda x: x['min_so2'] if x['station_name'] != tem_leak['station_name'].values[0] else np.nan, axis=1)
            tem_out_so2['mean_so2'] = tem_out_so2.apply(lambda x: x['mean_so2'] if x['station_name'] != tem_leak['station_name'].values[0] else np.nan, axis=1)
            tem_out_so2['max_so2'] = tem_out_so2.apply(lambda x: x['max_so2'] if x['station_name'] != tem_leak['station_name'].values[0] else np.nan, axis=1)

        for var in var_list:
            tem_out_so2['weighted_' + var] = tem_out_so2[var] * tem_out_so2['weighting']

        tem_out_so2_agg = tem_out_so2.groupby(["day_time"]).agg({k: [np.nansum] for k in ['weighted_' + x for x in var_list]}).reset_index()
        tem_out_so2_agg.columns = ["time"] + ['weighted_' + x for x in var_list]

        #  apply log-transformation to the SO2 data
        #  apply exponential decay function by distance to the SO2 data, hence influence from stations farther away will diminish

        for var in var_list:
            tem_out_so2_agg['weighted_' + var] = np.log(tem_out_so2_agg['weighted_' + var] + 0.001)
            tem_out_so2_agg['weighted_' + var] = tem_out_so2_agg['weighted_' + var] * np.exp(-1 * decay_factor * tem_dist['distances'].min())

        tem_out_so2_agg['id'] = data['id'].drop_duplicates().iloc[n]

        #  create temporal lags for monitored SO2 at local-grid station up to past 6 days

        for lags in range(6):
            tem_out_so2_agg['weighted_min_so2_lag_' + str(lags+1)] = tem_out_so2_agg['weighted_min_so2'].shift(lags+1)
            tem_out_so2_agg['weighted_mean_so2_lag_' + str(lags+1)] = tem_out_so2_agg['weighted_mean_so2'].shift(lags+1)
            tem_out_so2_agg['weighted_max_so2_lag_' + str(lags+1)] = tem_out_so2_agg['weighted_max_so2'].shift(lags+1)

        tem_out_so2_agg = tem_out_so2_agg[tem_out_so2_agg['time'] >= datetime.datetime.date(datetime.datetime.strptime('2022-01-01', "%Y-%m-%d"))]

        weighted_so2.append(tem_out_so2_agg)

weighted_so2 = pd.concat(weighted_so2)

data = data.merge(weighted_so2, on=['id','time'], how='left')

In [None]:
##  binary outcome variable for SO2-clean vs SO2-polluted days
data['SO2_above_0'] = data['mean_so2'].apply(lambda x: 0 if x == 0 else 1)

In [None]:
data.shape

(8760, 360)

In [None]:
def cross_validate_fit_binary_classification(model, X_train, y_train):

    accuracy = []
    precision = []
    recall = []
    f1score = []

    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        X_train_cv, y_train_cv = X_train[train_index], y_train.values[train_index]
        X_test_cv, y_test_cv = X_train[test_index], y_train.values[test_index]

        model.fit(X_train, y_train)

        binary_preds_train = model.predict(X_train_cv)
        binary_preds_test = model.predict(X_test_cv)

        m1 = (accuracy_score(y_test_cv, binary_preds_test), accuracy_score(y_train_cv, binary_preds_train))
        m2 = (precision_score(y_test_cv, binary_preds_test, average="weighted"), precision_score(y_train_cv, binary_preds_train, average="weighted"))
        m3 = (recall_score(y_test_cv, binary_preds_test, average="weighted"), recall_score(y_train_cv, binary_preds_train, average="weighted"))
        m4 = (f1_score(y_test_cv, binary_preds_test, average="weighted"), f1_score(y_train_cv, binary_preds_train, average="weighted"))

        accuracy.append(m1)
        precision.append(m2)
        recall.append(m3)
        f1score.append(m4)

    return accuracy, precision, recall, f1score

In [None]:
def cross_validate_fit_regression(model, X_train, y_train):

    m1a, m2a, m3a, m4a, m5a, m6a, m7a = [], [], [], [], [], [], []
    m1b, m2b, m3b, m4b, m5b, m6b, m7b = [], [], [], [], [], [], []

    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    for i, (train_index, test_index) in enumerate(kf.split(X_train)):
        X_train_cv, y_train_cv = X_train[train_index], y_train.values[train_index]
        X_test_cv, y_test_cv = X_train[test_index], y_train.values[test_index]

        model.fit(X_train, y_train)

        gbmlogmean_preds_train = model.predict(X_train_cv)
        gbmlogmean_preds_test = model.predict(X_test_cv)

        print("Train R2: " + str(r2_score(y_train_cv, gbmlogmean_preds_train)))
        print("Train R: " + str(scipy.stats.pearsonr(np.array(y_train_cv).flatten(), gbmlogmean_preds_train)[0]))
        print("Train MAE: " + str(mean_absolute_error(y_train_cv, gbmlogmean_preds_train)))
        print("Train MedAE: " + str(median_absolute_error(y_train_cv, gbmlogmean_preds_train)))
        print("Train MAE (original): " + str(mean_absolute_error(np.exp(y_train_cv) - 0.001, np.exp(gbmlogmean_preds_train) - 0.001)))
        print("Train MedAE (original): " + str(median_absolute_error(np.exp(y_train_cv) - 0.001, np.exp(gbmlogmean_preds_train) - 0.001)))
        print("Train upper quartile (original): " + str(np.quantile(abs(np.exp(y_train_cv) - np.exp(gbmlogmean_preds_train)), 0.75)))
        print("\n")
        print("Test R2: " + str(r2_score(y_test_cv, gbmlogmean_preds_test)))
        print("Test R: " + str(scipy.stats.pearsonr(np.array(y_test_cv).flatten(), gbmlogmean_preds_test)[0]))
        print("Test MAE: " + str(mean_absolute_error(y_test_cv, gbmlogmean_preds_test)))
        print("Test MedAE: " + str(median_absolute_error(y_test_cv, gbmlogmean_preds_test)))
        print("Test MAE (original): " + str(mean_absolute_error(np.exp(y_test_cv) - 0.001, np.exp(gbmlogmean_preds_test) - 0.001)))
        print("Test MedAE (original): " + str(median_absolute_error(np.exp(y_test_cv) - 0.001, np.exp(gbmlogmean_preds_test) - 0.001)))
        print("Test upper quartile (original): " + str(np.quantile(abs(np.exp(y_test_cv) - np.exp(gbmlogmean_preds_test)), 0.75)))
        print("\n")

        m1a.append(r2_score(y_test_cv, gbmlogmean_preds_test))
        m2a.append(scipy.stats.pearsonr(np.array(y_test_cv).flatten(), gbmlogmean_preds_test)[0])
        m3a.append(mean_absolute_error(y_test_cv, gbmlogmean_preds_test))
        m4a.append(median_absolute_error(y_test_cv, gbmlogmean_preds_test))
        m5a.append(mean_absolute_error(np.exp(y_test_cv) - 0.001, np.exp(gbmlogmean_preds_test) - 0.001))
        m6a.append(median_absolute_error(np.exp(y_test_cv) - 0.001, np.exp(gbmlogmean_preds_test) - 0.001))
        m7a.append(np.quantile(abs(np.exp(y_test_cv) - np.exp(gbmlogmean_preds_test)), 0.75))

        m1b.append(r2_score(y_train_cv, gbmlogmean_preds_train))
        m2b.append(scipy.stats.pearsonr(np.array(y_train_cv).flatten(), gbmlogmean_preds_train)[0])
        m3b.append(mean_absolute_error(y_train_cv, gbmlogmean_preds_train))
        m4b.append(median_absolute_error(y_train_cv, gbmlogmean_preds_train))
        m5b.append(mean_absolute_error(np.exp(y_train_cv) - 0.001, np.exp(gbmlogmean_preds_train) - 0.001))
        m6b.append(median_absolute_error(np.exp(y_train_cv) - 0.001, np.exp(gbmlogmean_preds_train) - 0.001))
        m7b.append(np.quantile(abs(np.exp(y_train_cv) - np.exp(gbmlogmean_preds_train)), 0.75))

    return m1a, m2a, m3a, m4a, m5a, m6a, m7a, \
            m1b, m2b, m3b, m4b, m5b, m6b, m7b

####  **CONTROL 1) Lasso Regression： only Meteorological and Land-Use Land-Cover features**

#####  **Binary Classification (Cross-Validation):**

In [None]:
y = data['SO2_above_0']
X = data.iloc[:,6:323]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
lasso_logistic = Pipeline([("imputer", KNNImputer(n_neighbors=4, weights='uniform')),
                           ("regressor", LogisticRegression(penalty='l1', solver='liblinear'))])

accuracy, precision, recall, f1score = cross_validate_fit_binary_classification(lasso_logistic , X_train, y_train)

In [None]:
print(np.mean([accuracy[n][0] for n in range(10)]))
print(np.mean([precision[n][0] for n in range(10)]))
print(np.mean([recall[n][0] for n in range(10)]))
print(np.mean([f1score[n][0] for n in range(10)]))

0.8296254330548198
0.7984864869667627
0.8296254330548198
0.7980414422186178


#####  **Binary Classification (Training & Testing):**

In [None]:
lasso_logistic = Pipeline([("imputer", KNNImputer(n_neighbors=4, weights='uniform')),
                           ("regressor", LogisticRegression(penalty='l1', solver='liblinear'))])
lasso_logistic.fit(X_train, y_train)

In [None]:
lasso_logistic_preds_train = lasso_logistic.predict(X_train)
lasso_logistic_preds_test = lasso_logistic.predict(X_test)

In [None]:
print(classification_report(y_train, lasso_logistic_preds_train))

              precision    recall  f1-score   support

           0       0.56      0.22      0.32      1252
           1       0.85      0.96      0.90      5756

    accuracy                           0.83      7008
   macro avg       0.71      0.59      0.61      7008
weighted avg       0.80      0.83      0.80      7008



In [None]:
confusion_matrix(y_train, lasso_logistic_preds_train)

array([[ 278,  974],
       [ 217, 5539]])

In [None]:
print(classification_report(y_test, lasso_logistic_preds_test))

              precision    recall  f1-score   support

           0       0.47      0.18      0.26       313
           1       0.84      0.96      0.90      1439

    accuracy                           0.82      1752
   macro avg       0.66      0.57      0.58      1752
weighted avg       0.78      0.82      0.78      1752



In [None]:
confusion_matrix(y_test, lasso_logistic_preds_test)

array([[  56,  257],
       [  63, 1376]])

#####  **Daily Mean SO2 Regression (Cross-Validation):**

In [None]:
endog = np.log(data[data['SO2_above_0'] == 1]['mean_so2'] + 0.001)
scaler = StandardScaler().fit(data[data['SO2_above_0'] == 1].iloc[:,6:318])
exog = np.concatenate((
        scaler.transform(data[data['SO2_above_0'] == 1].iloc[:,6:318]),
        data[data['SO2_above_0'] == 1].iloc[:,318:323],
), axis = 1)
group = data[data['SO2_above_0'] == 1]['id']

X_train, X_test, y_train, y_test = train_test_split(exog, endog, test_size=0.2, random_state=42, stratify=group)

In [None]:
lasso_log_mean = Pipeline([("imputer", KNNImputer(n_neighbors=8, weights='uniform')),
                           ("regressor", Lasso(alpha=0.1, max_iter=6000, tol=0.00001))])

m1a, m2a, m3a, m4a, m5a, m6a, m7a, \
  m1b, m2b, m3b, m4b, m5b, m6b, m7b = cross_validate_fit_regression(lasso_log_mean , X_train, y_train)

In [None]:
print("R2: " + str(np.mean(m1a)))
print("r: " + str(np.mean(m2a)))
print("MAE: " + str(np.mean(m3a)))
print("MedAE: " + str(np.mean(m4a)))
print("MAE (original): " + str(np.mean(m5a)))
print("MedAE (original): " + str(np.mean(m6a)))
print("upper quartile (original): " + str(np.mean(m7a)))

R2: 0.1507482451810397
r: 0.4087495253880659
MAE: 1.1163779408479935
MedAE: 0.8300251010593171
MAE (original): 0.8148962207142929
MedAE (original): 0.19921558668536882
upper quartile (original): 0.44838283068481777


#####  **Daily Mean SO2 Regression (Training & Testing):**

In [None]:
lasso_log_mean = Pipeline([("imputer", KNNImputer(n_neighbors=8, weights='uniform')),
                           ("regressor", Lasso(alpha=0.1, max_iter=6000, tol=0.00001))])
lasso_log_mean.fit(X_train, y_train)

In [None]:
lasso_log_mean_preds_train = lasso_log_mean.predict(X_train)
lasso_log_mean_preds_test = lasso_log_mean.predict(X_test)

In [None]:
print("Train R2: " + str(r2_score(y_train, lasso_log_mean_preds_train)))
print("Train R: " + str(scipy.stats.pearsonr(np.array(y_train).flatten(), lasso_log_mean_preds_train)[0]))
print("Train MSE: " + str(mean_squared_error(y_train, lasso_log_mean_preds_train)))
print("Train MAE: " + str(mean_absolute_error(y_train, lasso_log_mean_preds_train)))
print("Train MedAE: " + str(median_absolute_error(y_train, lasso_log_mean_preds_train)))
print("Train MSE (original): " + str(mean_squared_error(np.exp(y_train) - 0.001, np.exp(lasso_log_mean_preds_train) - 0.001)))
print("Train MAE (original): " + str(mean_absolute_error(np.exp(y_train) - 0.001, np.exp(lasso_log_mean_preds_train) - 0.001)))
print("Train MedAE (original): " + str(median_absolute_error(np.exp(y_train) - 0.001, np.exp(lasso_log_mean_preds_train) - 0.001)))
print("Train upper quartile (original): " + str(np.quantile(abs(np.exp(y_train) - np.exp(lasso_log_mean_preds_train)), 0.75)))
print("\n")
print("Test R2: " + str(r2_score(y_test, lasso_log_mean_preds_test)))
print("Test R: " + str(scipy.stats.pearsonr(np.array(y_test).flatten(), lasso_log_mean_preds_test)[0]))
print("Test MSE: " + str(mean_squared_error(y_test, lasso_log_mean_preds_test)))
print("Test MAE: " + str(mean_absolute_error(y_test, lasso_log_mean_preds_test)))
print("Test MedAE: " + str(median_absolute_error(y_test, lasso_log_mean_preds_test)))
print("Test MSE (original): " + str(mean_squared_error(np.exp(y_test) - 0.001, np.exp(lasso_log_mean_preds_test) - 0.001)))
print("Test MAE (original): " + str(mean_absolute_error(np.exp(y_test) - 0.001, np.exp(lasso_log_mean_preds_test) - 0.001)))
print("Test MedAE (original): " + str(median_absolute_error(np.exp(y_test) - 0.001, np.exp(lasso_log_mean_preds_test) - 0.001)))
print("Test upper quartile (original): " + str(np.quantile(abs(np.exp(y_test) - np.exp(lasso_log_mean_preds_test)), 0.75)))

Train R2: 0.15285957147606732
Train R: 0.40821595010835376
Train MSE: 2.2033576509364736
Train MAE: 1.1163793658078027
Train MedAE: 0.8253715525696268
Train MSE (original): 5.557786966911258
Train MAE (original): 0.8148577026673258
Train MedAE (original): 0.1991389565607337
Train upper quartile (original): 0.44102624987144695


Test R2: 0.1435344016766299
Test R: 0.3958562021987716
Test MSE: 2.319862451906422
Test MAE: 1.152668721484833
Test MedAE: 0.8358685160418733
Test MSE (original): 5.566116659841568
Test MAE (original): 0.8057801445627757
Test MedAE (original): 0.19110333030343502
Test upper quartile (original): 0.42547402169892057


#####  **Daily Max SO2 Regression (Cross-Validation):**

In [None]:
endog = np.log(data[data['SO2_above_0'] == 1]['max_so2'] + 0.001)
scaler = StandardScaler().fit(data[data['SO2_above_0'] == 1].iloc[:,6:318])
exog = np.concatenate((
        scaler.transform(data[data['SO2_above_0'] == 1].iloc[:,6:318]),
        data[data['SO2_above_0'] == 1].iloc[:,318:323],
), axis = 1)
group = data[data['SO2_above_0'] == 1]['id']

X_train, X_test, y_train, y_test = train_test_split(exog, endog, test_size=0.2, random_state=42, stratify=group)

In [None]:
lasso_log_max = Pipeline([("imputer", KNNImputer(n_neighbors=8, weights='uniform')),
                           ("regressor", Lasso(alpha=0.1, max_iter=6000, tol=0.00001))])

m1a, m2a, m3a, m4a, m5a, m6a, m7a, \
  m1b, m2b, m3b, m4b, m5b, m6b, m7b = cross_validate_fit_regression(lasso_log_max , X_train, y_train)

In [None]:
print("R2: " + str(np.mean(m1a)))
print("r: " + str(np.mean(m2a)))
print("MAE: " + str(np.mean(m3a)))
print("MedAE: " + str(np.mean(m4a)))
print("MAE (original): " + str(np.mean(m5a)))
print("MedAE (original): " + str(np.mean(m6a)))
print("upper quartile (original): " + str(np.mean(m7a)))

R2: 0.13863163485599114
r: 0.39582351829566825
MAE: 1.137750330270834
MedAE: 0.9855915915487634
MAE (original): 3.6180353457139276
MedAE (original): 0.8361312318641596
upper quartile (original): 2.1003414300845376


#####  **Daily Max SO2 Regression (Training & Testing):**

In [None]:
lasso_log_max = Pipeline([("imputer", KNNImputer(n_neighbors=8, weights='uniform')),
                          ("regressor", Lasso(alpha=0.1, max_iter=6000, tol=0.00001))])
lasso_log_max.fit(X_train, y_train)

In [None]:
lasso_log_max_preds_train = lasso_log_max.predict(X_train)
lasso_log_max_preds_test = lasso_log_max.predict(X_test)

In [None]:
print("Train R2: " + str(r2_score(y_train, lasso_log_max_preds_train)))
print("Train R: " + str(scipy.stats.pearsonr(np.array(y_train).flatten(), lasso_log_max_preds_train)[0]))
print("Train MSE: " + str(mean_squared_error(y_train, lasso_log_max_preds_train)))
print("Train MAE: " + str(mean_absolute_error(y_train, lasso_log_max_preds_train)))
print("Train MedAE: " + str(median_absolute_error(y_train, lasso_log_max_preds_train)))
print("Train MSE (original): " + str(mean_squared_error(np.exp(y_train) - 0.001, np.exp(lasso_log_max_preds_train) - 0.001)))
print("Train MAE (original): " + str(mean_absolute_error(np.exp(y_train) - 0.001, np.exp(lasso_log_max_preds_train) - 0.001)))
print("Train MedAE (original): " + str(median_absolute_error(np.exp(y_train) - 0.001, np.exp(lasso_log_max_preds_train) - 0.001)))
print("Train upper quartile (original): " + str(np.quantile(abs(np.exp(y_train) - np.exp(lasso_log_max_preds_train)), 0.75)))
print("\n")
print("Test R2: " + str(r2_score(y_test, lasso_log_max_preds_test)))
print("Test R: " + str(scipy.stats.pearsonr(np.array(y_test).flatten(), lasso_log_max_preds_test)[0]))
print("Test MSE: " + str(mean_squared_error(y_test, lasso_log_max_preds_test)))
print("Test MAE: " + str(mean_absolute_error(y_test, lasso_log_max_preds_test)))
print("Test MedAE: " + str(median_absolute_error(y_test, lasso_log_max_preds_test)))
print("Test MSE (original): " + str(mean_squared_error(np.exp(y_test) - 0.001, np.exp(lasso_log_max_preds_test) - 0.001)))
print("Test MAE (original): " + str(mean_absolute_error(np.exp(y_test) - 0.001, np.exp(lasso_log_max_preds_test) - 0.001)))
print("Test MedAE (original): " + str(median_absolute_error(np.exp(y_test) - 0.001, np.exp(lasso_log_max_preds_test) - 0.001)))
print("Test upper quartile (original): " + str(np.quantile(abs(np.exp(y_test) - np.exp(lasso_log_max_preds_test)), 0.75)))

Train R2: 0.1408098710316954
Train R: 0.3954425259889201
Train MSE: 1.9780005248698946
Train MAE: 1.1377317584525082
Train MedAE: 0.98617199234898
Train MSE (original): 121.38983519885576
Train MAE (original): 3.6178429434019392
Train MedAE (original): 0.8338481120932044
Train upper quartile (original): 2.092433919909164


Test R2: 0.13985262352114858
Test R: 0.39663758122390663
Test MSE: 2.003068493604263
Test MAE: 1.1526481659198766
Test MedAE: 1.0089424450882516
Test MSE (original): 141.9848451077848
Test MAE (original): 3.558993142787851
Test MedAE (original): 0.8226652434857926
Test upper quartile (original): 1.8648098918976004


####  **CONTROL 2) HGBM： only Meteorological and Land-Use Land-Cover features**

#####  **Binary Classification (Cross-Validation):**

In [None]:
y = data['SO2_above_0']
X = data.iloc[:,6:317]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
binary_gbm = HistGradientBoostingClassifier(max_iter = 5000, learning_rate = 0.001,
                                            max_depth = 5, max_leaf_nodes = 2 ** 5 - 1,
                                            random_state=42,
                                            class_weight={0: 1 / (Counter(data['SO2_above_0'])[0] / (Counter(data['SO2_above_0'])[1] + Counter(data['SO2_above_0'])[0])),
                                                          1: 1 / (Counter(data['SO2_above_0'])[1] / (Counter(data['SO2_above_0'])[1] + Counter(data['SO2_above_0'])[0]))})

accuracy, precision, recall, f1score = cross_validate_fit_binary_classification(binary_gbm , X_train, y_train)

In [None]:
print(np.mean([accuracy[n][0] for n in range(10)]))
print(np.mean([precision[n][0] for n in range(10)]))
print(np.mean([recall[n][0] for n in range(10)]))
print(np.mean([f1score[n][0] for n in range(10)]))

0.7615602200937437
0.8324582873206413
0.7615602200937437
0.7836441838439161


#####  **Binary Classification (Training & Testing):**

In [None]:
binary_gbm_control = HistGradientBoostingClassifier(max_iter = 5000, learning_rate = 0.001,
                                                    max_depth = 5, max_leaf_nodes = 2 ** 5 - 1,
                                                    random_state=42,
                                                    class_weight={0: 1 / (Counter(data['SO2_above_0'])[0] / (Counter(data['SO2_above_0'])[1] + Counter(data['SO2_above_0'])[0])),
                                                                  1: 1 / (Counter(data['SO2_above_0'])[1] / (Counter(data['SO2_above_0'])[1] + Counter(data['SO2_above_0'])[0]))})
binary_gbm_control.fit(X_train, y_train)

In [None]:
binary_control_preds_train = binary_gbm_control.predict(X_train)
binary_control_preds_test = binary_gbm_control.predict(X_test)

In [None]:
print(classification_report(y_train, binary_control_preds_train))

              precision    recall  f1-score   support

           0       0.51      0.92      0.65      1252
           1       0.98      0.81      0.88      5756

    accuracy                           0.83      7008
   macro avg       0.74      0.86      0.77      7008
weighted avg       0.89      0.83      0.84      7008



In [None]:
confusion_matrix(y_train, binary_control_preds_train)

array([[1147,  105],
       [1113, 4643]])

In [None]:
print(classification_report(y_test, binary_control_preds_test))

              precision    recall  f1-score   support

           0       0.38      0.74      0.51       313
           1       0.93      0.74      0.82      1439

    accuracy                           0.74      1752
   macro avg       0.66      0.74      0.67      1752
weighted avg       0.83      0.74      0.77      1752



In [None]:
confusion_matrix(y_test, binary_control_preds_test)

array([[ 233,   80],
       [ 374, 1065]])

#####  **Daily Mean SO2 Regression (Cross-Validation):**

In [None]:
endog = np.log(data[data['SO2_above_0'] == 1]['mean_so2'] + 0.001)
scaler = StandardScaler().fit(data[data['SO2_above_0'] == 1].iloc[:,6:318])
exog = np.concatenate((
        scaler.transform(data[data['SO2_above_0'] == 1].iloc[:,6:318]),
        data[data['SO2_above_0'] == 1].iloc[:,318:323],
), axis = 1)
group = data[data['SO2_above_0'] == 1]['id']

X_train, X_test, y_train, y_test = train_test_split(exog, endog, test_size=0.2, random_state=42, stratify=group)

In [None]:
gbmlogmean = HistGradientBoostingRegressor(max_iter = 5000,
                                           learning_rate = 0.001,
                                           max_depth = 5,
                                           max_leaf_nodes = 2 ** 5 - 1,
                                           random_state=42)

m1a, m2a, m3a, m4a, m5a, m6a, m7a, \
  m1b, m2b, m3b, m4b, m5b, m6b, m7b = cross_validate_fit_regression(gbmlogmean , X_train, y_train)

In [None]:
print("R2: " + str(np.mean(m1a)))
print("r: " + str(np.mean(m2a)))
print("MAE: " + str(np.mean(m3a)))
print("MedAE: " + str(np.mean(m4a)))
print("MAE (original): " + str(np.mean(m5a)))
print("MedAE (original): " + str(np.mean(m6a)))
print("upper quartile (original): " + str(np.mean(m7a)))

R2: 0.3623530924707542
r: 0.6145094487560907
MAE: 0.9678119050368759
MedAE: 0.7352449737947714
MAE (original): 0.7314343492577107
MedAE (original): 0.18060846515939846
upper quartile (original): 0.44234374813507643


#####  **Daily Mean SO2 Regression (Training & Testing):**

In [None]:
gbmlogmean_control = HistGradientBoostingRegressor(max_iter = 5000, learning_rate = 0.001,
                                                  max_depth = 5, max_leaf_nodes = 2 ** 5 - 1,
                                                  random_state=42)
gbmlogmean_control.fit(X_train, y_train)

In [None]:
gbmlogmean_control_preds_train = gbmlogmean_control.predict(X_train)
gbmlogmean_control_preds_test = gbmlogmean_control.predict(X_test)

In [None]:
print("Train R2: " + str(r2_score(y_train, gbmlogmean_control_preds_train)))
print("Train R: " + str(scipy.stats.pearsonr(np.array(y_train).flatten(), gbmlogmean_control_preds_train)[0]))
print("Train MSE: " + str(mean_squared_error(y_train, gbmlogmean_control_preds_train)))
print("Train MAE: " + str(mean_absolute_error(y_train, gbmlogmean_control_preds_train)))
print("Train MedAE: " + str(median_absolute_error(y_train, gbmlogmean_control_preds_train)))
print("Train MAE (original): " + str(mean_squared_error(np.exp(y_train) - 0.001, np.exp(gbmlogmean_control_preds_train) - 0.001)))
print("Train MAE (original): " + str(mean_absolute_error(np.exp(y_train) - 0.001, np.exp(gbmlogmean_control_preds_train) - 0.001)))
print("Train MedAE (original): " + str(median_absolute_error(np.exp(y_train) - 0.001, np.exp(gbmlogmean_control_preds_train) - 0.001)))
print("Train upper quartile (original): " + str(np.quantile(abs(np.exp(y_train) - np.exp(gbmlogmean_control_preds_train)), 0.75)))
print("\n")
print("Test R2: " + str(r2_score(y_test, gbmlogmean_control_preds_test)))
print("Test R: " + str(scipy.stats.pearsonr(np.array(y_test).flatten(), gbmlogmean_control_preds_test)[0]))
print("Test MSE: " + str(mean_squared_error(y_test, gbmlogmean_control_preds_test)))
print("Test MAE: " + str(mean_absolute_error(y_test, gbmlogmean_control_preds_test)))
print("Test MedAE: " + str(median_absolute_error(y_test, gbmlogmean_control_preds_test)))
print("Test MAE (original): " + str(mean_squared_error(np.exp(y_test) - 0.001, np.exp(gbmlogmean_control_preds_test) - 0.001)))
print("Test MAE (original): " + str(mean_absolute_error(np.exp(y_test) - 0.001, np.exp(gbmlogmean_control_preds_test) - 0.001)))
print("Test MedAE (original): " + str(median_absolute_error(np.exp(y_test) - 0.001, np.exp(gbmlogmean_control_preds_test) - 0.001)))
print("Test upper quartile (original): " + str(np.quantile(abs(np.exp(y_test) - np.exp(gbmlogmean_control_preds_test)), 0.75)))

Train R2: 0.5322119681617024
Train R: 0.7667331708051086
Train MSE: 1.2166865188612694
Train MAE: 0.8316340356507674
Train MedAE: 0.6354516778842864
Train MAE (original): 4.365517573522495
Train MAE (original): 0.6761531242386526
Train MedAE (original): 0.1498870220075325
Train upper quartile (original): 0.378153543249021


Test R2: 0.3676424551359253
Test R: 0.619193062197306
Test MSE: 1.7128329817119154
Test MAE: 0.9856529511686465
Test MedAE: 0.7387912271165589
Test MAE (original): 4.700790719158652
Test MAE (original): 0.7207986461896725
Test MedAE (original): 0.17143467099870585
Test upper quartile (original): 0.4075431838473958


#####  **Daily Max SO2 Regression (Cross-Validation):**

In [None]:
endog = np.log(data[data['SO2_above_0'] == 1]['max_so2'] + 0.001)
scaler = StandardScaler().fit(data[data['SO2_above_0'] == 1].iloc[:,6:318])
exog = np.concatenate((
        scaler.transform(data[data['SO2_above_0'] == 1].iloc[:,6:318]),
        data[data['SO2_above_0'] == 1].iloc[:,318:323],
), axis = 1)
group = data[data['SO2_above_0'] == 1]['id']

X_train, X_test, y_train, y_test = train_test_split(exog, endog, test_size=0.2, random_state=42, stratify=group)

In [None]:
gbmlogmean = HistGradientBoostingRegressor(max_iter = 5000,
                                           learning_rate = 0.001,
                                           max_depth = 5,
                                           max_leaf_nodes = 2 ** 5 - 1,
                                           random_state=42)

m1a, m2a, m3a, m4a, m5a, m6a, m7a, \
  m1b, m2b, m3b, m4b, m5b, m6b, m7b = cross_validate_fit_regression(gbmlogmean , X_train, y_train)

In [None]:
print("R2: " + str(np.mean(m1a)))
print("r: " + str(np.mean(m2a)))
print("MAE: " + str(np.mean(m3a)))
print("MedAE: " + str(np.mean(m4a)))
print("MAE (original): " + str(np.mean(m5a)))
print("MedAE (original): " + str(np.mean(m6a)))
print("upper quartile (original): " + str(np.mean(m7a)))

R2: 0.39470496481786266
r: 0.6416712689622848
MAE: 0.923676755044237
MedAE: 0.7534499728978206
MAE (original): 3.241800960974367
MedAE (original): 0.7152887557910284
upper quartile (original): 2.0144860134703064


#####  **Daily Max SO2 Regression (Training & Testing):**

In [None]:
gbmlogmax_control = HistGradientBoostingRegressor(max_iter = 5000, learning_rate = 0.001,
                                                  max_depth = 5, max_leaf_nodes = 2 ** 5 - 1,
                                                  random_state=42)
gbmlogmax_control.fit(X_train, y_train)

In [None]:
gbmlogmax_control_preds_train = gbmlogmax_control.predict(X_train)
gbmlogmax_control_preds_test = gbmlogmax_control.predict(X_test)

In [None]:
print("Train R2: " + str(r2_score(y_train, gbmlogmax_control_preds_train)))
print("Train R: " + str(scipy.stats.pearsonr(np.array(y_train).flatten(), gbmlogmax_control_preds_train)[0]))
print("Train MSE: " + str(mean_squared_error(y_train, gbmlogmax_control_preds_train)))
print("Train MAE: " + str(mean_absolute_error(y_train, gbmlogmax_control_preds_train)))
print("Train MedAE: " + str(median_absolute_error(y_train, gbmlogmax_control_preds_train)))
print("Train MSE (original): " + str(mean_squared_error(np.exp(y_train) - 0.001, np.exp(gbmlogmax_control_preds_train) - 0.001)))
print("Train MAE (original): " + str(mean_absolute_error(np.exp(y_train) - 0.001, np.exp(gbmlogmax_control_preds_train) - 0.001)))
print("Train MedAE (original): " + str(median_absolute_error(np.exp(y_train) - 0.001, np.exp(gbmlogmax_control_preds_train) - 0.001)))
print("Train upper quartile (original): " + str(np.quantile(abs(np.exp(y_train) - np.exp(gbmlogmax_control_preds_train)), 0.75)))
print("\n")
print("Test R2: " + str(r2_score(y_test, gbmlogmax_control_preds_test)))
print("Test R: " + str(scipy.stats.pearsonr(np.array(y_test).flatten(), gbmlogmax_control_preds_test)[0]))
print("Test MSE: " + str(mean_squared_error(y_test, gbmlogmax_control_preds_test)))
print("Test MAE: " + str(mean_absolute_error(y_test, gbmlogmax_control_preds_test)))
print("Test MedAE: " + str(median_absolute_error(y_test, gbmlogmax_control_preds_test)))
print("Test MSE (original): " + str(mean_squared_error(np.exp(y_test) - 0.001, np.exp(gbmlogmax_control_preds_test) - 0.001)))
print("Test MAE (original): " + str(mean_absolute_error(np.exp(y_test) - 0.001, np.exp(gbmlogmax_control_preds_test) - 0.001)))
print("Test MedAE (original): " + str(median_absolute_error(np.exp(y_test) - 0.001, np.exp(gbmlogmax_control_preds_test) - 0.001)))
print("Test upper quartile (original): " + str(np.quantile(abs(np.exp(y_test) - np.exp(gbmlogmax_control_preds_test)), 0.75)))

Train R2: 0.5630121619834944
Train R: 0.7839886251065744
Train MSE: 1.0060196734293427
Train MAE: 0.7852594017849734
Train MedAE: 0.6432782234248068
Train MSE (original): 104.12534505279909
Train MAE (original): 2.999215945748895
Train MedAE (original): 0.5761387535688619
Train upper quartile (original): 1.6677278227683348


Test R2: 0.4019843113562235
Test R: 0.6479161578908418
Test MSE: 1.3926292369885027
Test MAE: 0.9356295965017764
Test MedAE: 0.7684654204643037
Test MSE (original): 132.18242593857715
Test MAE (original): 3.223446006803147
Test MedAE (original): 0.6968297673032124
Test upper quartile (original): 1.865206781020686


####  **FINAL 3) HGBM： Full features with Spatiotemporally-lagged Surface Monitored SO2**

#####  **Binary Classification (Cross-Validation):**

In [None]:
y = data['SO2_above_0']
X = np.array(data.iloc[:,6:-1])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
binary_gbm = HistGradientBoostingClassifier(max_iter = 5000, learning_rate = 0.001,
                                            max_depth = 5, max_leaf_nodes = 2 ** 5 - 1,
                                            random_state=42,
                                            class_weight={0: 1 / (Counter(data['SO2_above_0'])[0] / (Counter(data['SO2_above_0'])[1] + Counter(data['SO2_above_0'])[0])),
                                                          1: 1 / (Counter(data['SO2_above_0'])[1] / (Counter(data['SO2_above_0'])[1] + Counter(data['SO2_above_0'])[0]))})

accuracy, precision, recall, f1score = cross_validate_fit_binary_classification(binary_gbm , X_train, y_train)

In [None]:
print(np.mean([accuracy[n][0] for n in range(10)]))
print(np.mean([precision[n][0] for n in range(10)]))
print(np.mean([recall[n][0] for n in range(10)]))
print(np.mean([f1score[n][0] for n in range(10)]))

0.8323336050540044
0.8642892493149613
0.8323336050540044
0.8429701228542724


#####  **Binary Classification (Training & Testing):**

In [None]:
binary_gbm = HistGradientBoostingClassifier(max_iter = 5000, learning_rate = 0.001,
                                            max_depth = 5, max_leaf_nodes = 2 ** 5 - 1,
                                            random_state=42,
                                            class_weight={0: 1 / (Counter(data['SO2_above_0'])[0] / (Counter(data['SO2_above_0'])[1] + Counter(data['SO2_above_0'])[0])),
                                                          1: 1 / (Counter(data['SO2_above_0'])[1] / (Counter(data['SO2_above_0'])[1] + Counter(data['SO2_above_0'])[0]))})
binary_gbm.fit(X_train, y_train)

In [None]:
binary_preds_train = binary_gbm.predict(X_train)
binary_preds_test = binary_gbm.predict(X_test)

In [None]:
print(classification_report(y_train, binary_preds_train))

              precision    recall  f1-score   support

           0       0.65      0.97      0.78      1252
           1       0.99      0.89      0.94      5756

    accuracy                           0.90      7008
   macro avg       0.82      0.93      0.86      7008
weighted avg       0.93      0.90      0.91      7008



In [None]:
confusion_matrix(y_train, binary_preds_train)

array([[1211,   41],
       [ 650, 5106]])

In [None]:
print(classification_report(y_test, binary_preds_test))

              precision    recall  f1-score   support

           0       0.51      0.78      0.61       313
           1       0.95      0.84      0.89      1439

    accuracy                           0.83      1752
   macro avg       0.73      0.81      0.75      1752
weighted avg       0.87      0.83      0.84      1752



In [None]:
confusion_matrix(y_test, binary_preds_test)

array([[ 244,   69],
       [ 237, 1202]])

#####  **Daily Mean SO2 Regression (Cross-Validation):**

In [None]:
endog = np.log(data[data['SO2_above_0'] == 1]['mean_so2'] + 0.001)
scaler = StandardScaler().fit(data[data['SO2_above_0'] == 1].iloc[:,6:318])
exog = np.concatenate((
        scaler.transform(data[data['SO2_above_0'] == 1].iloc[:,6:318]),
        data[data['SO2_above_0'] == 1].iloc[:,318:359],
), axis = 1)
group = data[data['SO2_above_0'] == 1]['id']

X_train, X_test, y_train, y_test = train_test_split(exog, endog, test_size=0.2, random_state=42, stratify=group)

In [None]:
gbmlogmean = HistGradientBoostingRegressor(max_iter = 5000,
                                           learning_rate = 0.001,
                                           max_depth = 5,
                                           max_leaf_nodes = 2 ** 5 - 1,
                                           random_state=42)

m1a, m2a, m3a, m4a, m5a, m6a, m7a, \
  m1b, m2b, m3b, m4b, m5b, m6b, m7b = cross_validate_fit_regression(gbmlogmean , X_train, y_train)

In [None]:
print("R2: " + str(np.mean(m1a)))
print("r: " + str(np.mean(m2a)))
print("MAE: " + str(np.mean(m3a)))
print("MedAE: " + str(np.mean(m4a)))
print("MAE (original): " + str(np.mean(m5a)))
print("MedAE (original): " + str(np.mean(m6a)))
print("upper quartile (original): " + str(np.mean(m7a)))

R2: 0.45128175050607977
r: 0.6777348841536721
MAE: 0.8748860605515034
MedAE: 0.6159551290053347
MAE (original): 0.6344249071627328
MedAE (original): 0.15549580620025627
upper quartile (original): 0.40072953864965344


#####  **Daily Mean SO2 Regression (Training & Testing):**

In [None]:
gbmlogmean = HistGradientBoostingRegressor(max_iter = 5000, learning_rate = 0.001,
                                           max_depth = 5, max_leaf_nodes = 2 ** 5 - 1,
                                           random_state=42)
gbmlogmean.fit(X_train, y_train)

In [None]:
gbmlogmean_preds_train = gbmlogmean.predict(X_train)
gbmlogmean_preds_test = gbmlogmean.predict(X_test)

In [None]:
print("Train R2: " + str(r2_score(y_train, gbmlogmean_preds_train)))
print("Train R: " + str(scipy.stats.pearsonr(np.array(y_train).flatten(), gbmlogmean_preds_train)[0]))
print("Train MSE: " + str(mean_squared_error(y_train, gbmlogmean_preds_train)))
print("Train MAE: " + str(mean_absolute_error(y_train, gbmlogmean_preds_train)))
print("Train MedAE: " + str(median_absolute_error(y_train, gbmlogmean_preds_train)))
print("Train MSE (original): " + str(mean_squared_error(np.exp(y_train) - 0.001, np.exp(gbmlogmean_preds_train) - 0.001)))
print("Train MAE (original): " + str(mean_absolute_error(np.exp(y_train) - 0.001, np.exp(gbmlogmean_preds_train) - 0.001)))
print("Train MedAE (original): " + str(median_absolute_error(np.exp(y_train) - 0.001, np.exp(gbmlogmean_preds_train) - 0.001)))
print("Train upper quartile (original): " + str(np.quantile(abs(np.exp(y_train) - np.exp(gbmlogmean_preds_train)), 0.75)))
print("\n")
print("Test R2: " + str(r2_score(y_test, gbmlogmean_preds_test)))
print("Test R: " + str(scipy.stats.pearsonr(np.array(y_test).flatten(), gbmlogmean_preds_test)[0]))
print("Test MSE: " + str(mean_squared_error(y_test, gbmlogmean_preds_test)))
print("Test MAE: " + str(mean_absolute_error(y_test, gbmlogmean_preds_test)))
print("Test MedAE: " + str(median_absolute_error(y_test, gbmlogmean_preds_test)))
print("Test MSE (original): " + str(mean_squared_error(np.exp(y_test) - 0.001, np.exp(gbmlogmean_preds_test) - 0.001)))
print("Test MAE (original): " + str(mean_absolute_error(np.exp(y_test) - 0.001, np.exp(gbmlogmean_preds_test) - 0.001)))
print("Test MedAE (original): " + str(median_absolute_error(np.exp(y_test) - 0.001, np.exp(gbmlogmean_preds_test) - 0.001)))
print("Test upper quartile (original): " + str(np.quantile(abs(np.exp(y_test) - np.exp(gbmlogmean_preds_test)), 0.75)))

Train R2: 0.6295469533165862
Train R: 0.812082792624095
Train MSE: 0.963524496339826
Train MAE: 0.7205678517501773
Train MedAE: 0.5092074118412466
Train MSE (original): 3.052470116366343
Train MAE (original): 0.5508581774386284
Train MedAE (original): 0.12268311537476262
Train upper quartile (original): 0.3212563715786113


Test R2: 0.45898711252159374
Test R: 0.6812661844751344
Test MSE: 1.4654126051479281
Test MAE: 0.8849909509900246
Test MedAE: 0.6300311211122835
Test MSE (original): 3.4043310105598588
Test MAE (original): 0.6212939734905499
Test MedAE (original): 0.14288496160380298
Test upper quartile (original): 0.3974519757707491


#####  **Daily Max SO2 Regression (Cross-Validation):**

In [None]:
endog = np.log(data[data['SO2_above_0'] == 1]['max_so2'] + 0.001)
scaler = StandardScaler().fit(data[data['SO2_above_0'] == 1].iloc[:,6:318])
exog = np.concatenate((
        scaler.transform(data[data['SO2_above_0'] == 1].iloc[:,6:318]),
        data[data['SO2_above_0'] == 1].iloc[:,318:359],
), axis = 1)
group = data[data['SO2_above_0'] == 1]['id']

X_train, X_test, y_train, y_test = train_test_split(exog, endog, test_size=0.2, random_state=42, stratify=group)

In [None]:
gbmlogmax = HistGradientBoostingRegressor(max_iter = 5000,
                                          learning_rate = 0.001,
                                          max_depth = 5,
                                          max_leaf_nodes = 2 ** 5 - 1,
                                          random_state=42)

m1a, m2a, m3a, m4a, m5a, m6a, m7a, \
  m1b, m2b, m3b, m4b, m5b, m6b, m7b = cross_validate_fit_regression(gbmlogmean , X_train, y_train)

In [None]:
print("R2: " + str(np.mean(m1a)))
print("r: " + str(np.mean(m2a)))
print("MAE: " + str(np.mean(m3a)))
print("MedAE: " + str(np.mean(m4a)))
print("MAE (original): " + str(np.mean(m5a)))
print("MedAE (original): " + str(np.mean(m6a)))
print("upper quartile (original): " + str(np.mean(m7a)))

R2: 0.4424022210311069
r: 0.6727928463114373
MAE: 0.8722778094855291
MedAE: 0.7035495289828263
MAE (original): 3.041886535165683
MedAE (original): 0.664642210316139
upper quartile (original): 2.0568198067102132


#####  **Daily Max SO2 Regression (Training & Testing):**

In [None]:
gbmlogmax = HistGradientBoostingRegressor(max_iter = 5000, learning_rate = 0.001,
                                          max_depth = 5, max_leaf_nodes = 2 ** 5 - 1,
                                          random_state=42)
gbmlogmax.fit(X_train, y_train)

In [None]:
gbmlogmax_preds_train = gbmlogmax.predict(X_train)
gbmlogmax_preds_test = gbmlogmax.predict(X_test)

In [None]:
print("Train R2: " + str(r2_score(y_train, gbmlogmax_preds_train)))
print("Train R: " + str(scipy.stats.pearsonr(np.array(y_train).flatten(), gbmlogmax_preds_train)[0]))
print("Train MSE: " + str(mean_squared_error(y_train, gbmlogmax_preds_train)))
print("Train MAE: " + str(mean_absolute_error(y_train, gbmlogmax_preds_train)))
print("Train MedAE: " + str(median_absolute_error(y_train, gbmlogmax_preds_train)))
print("Train MSE (original): " + str(mean_squared_error(np.exp(y_train) - 0.001, np.exp(gbmlogmax_preds_train) - 0.001)))
print("Train MAE (original): " + str(mean_absolute_error(np.exp(y_train) - 0.001, np.exp(gbmlogmax_preds_train) - 0.001)))
print("Train MedAE (original): " + str(median_absolute_error(np.exp(y_train) - 0.001, np.exp(gbmlogmax_preds_train) - 0.001)))
print("Train upper quartile (original): " + str(np.quantile(abs(np.exp(y_train) - np.exp(gbmlogmax_preds_train)), 0.75)))
print("\n")
print("Test R2: " + str(r2_score(y_test, gbmlogmax_preds_test)))
print("Test R: " + str(scipy.stats.pearsonr(np.array(y_test).flatten(), gbmlogmax_preds_test)[0]))
print("Test MSE: " + str(mean_squared_error(y_test, gbmlogmax_preds_test)))
print("Test MAE: " + str(mean_absolute_error(y_test, gbmlogmax_preds_test)))
print("Test MedAE: " + str(median_absolute_error(y_test, gbmlogmax_preds_test)))
print("Test MSE (original): " + str(mean_squared_error(np.exp(y_test) - 0.001, np.exp(gbmlogmax_preds_test) - 0.001)))
print("Test MAE (original): " + str(mean_absolute_error(np.exp(y_test) - 0.001, np.exp(gbmlogmax_preds_test) - 0.001)))
print("Test MedAE (original): " + str(median_absolute_error(np.exp(y_test) - 0.001, np.exp(gbmlogmax_preds_test) - 0.001)))
print("Test upper quartile (original): " + str(np.quantile(abs(np.exp(y_test) - np.exp(gbmlogmax_preds_test)), 0.75)))

Train R2: 0.6214034620469683
Train R: 0.8095507871095845
Train MSE: 0.8715930566896064
Train MAE: 0.7206934175938562
Train MedAE: 0.5756101093992164
Train MSE (original): 92.08018861769848
Train MAE (original): 2.6798933323335588
Train MedAE (original): 0.5202511198855675
Train upper quartile (original): 1.5996077478844821


Test R2: 0.4431854085157316
Test R: 0.6704217838596339
Test MSE: 1.2966821680571499
Test MAE: 0.885910569740831
Test MedAE: 0.7034364936615795
Test MSE (original): 124.8812656227077
Test MAE (original): 3.0144548101393327
Test MedAE (original): 0.6659236938216544
Test upper quartile (original): 1.7447238494640906
