Description

Run the XGBoost algorithm for LDEO-HPD Reconstruction 

Uses best parameters from Luke's work

GCB Models 2021 version - through 2020

Inputs

In [1]:
# -----------------------------------------
# Directory layout
# -----------------------------------------
# Everything this notebook writes goes beneath root_dir.
root_dir = "/data/artemis/workspace/vbennington/LDEO_HPD/models/XGB/GCB_2021"

# Input: directory with features and pCO2.
data_output_dir = "/data/artemis/workspace/vbennington/LDEO_HPD/2021models/data"

# Outputs: one sub-directory of root_dir per kind of product.
model_output_dir = root_dir + "/trained"                # trained models
recon_output_dir = root_dir + "/reconstructions"        # reconstructions
other_output_dir = root_dir + "/performance_metrics"    # performance metrics

# Short tag identifying the ML approach.
approach = 'xg'

# -----------------------------------------
# Number of cores available for model training
# -----------------------------------------
jobs = 30

In [2]:
import pandas as pd
import xarray as xr
import numpy as np
import numpy.ma as ma
%matplotlib inline
%config InlineBackend.figure_format = 'jpg'
%config InlineBackend.print_figure_kwargs = {'dpi':300, 'bbox_inches': 'tight'}
import matplotlib as mpl
from matplotlib.ticker import AutoMinorLocator
import matplotlib.pyplot as plt
import scipy
import sklearn.linear_model 
import pickle

Modules

In [4]:
# standard imports
import os
import datetime
from pathlib import Path
from collections import defaultdict
import scipy
import random
import numpy as np
import xarray as xr
import pandas as pd
import joblib
import pickle

# machine learning libraries
import sklearn            # machine-learning libary with many algorithms implemented
import xgboost as xgb     # extreme gradient boosting (XGB
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
import sklearn.model_selection as mselect

# Python file with supporting functions
import pre_HPD

In [5]:
# modeling
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error

Predefined Values

In [10]:
# Loading references

In [6]:
# =========================================
# Inputs for the modeling process
# =========================================

# Fractions of the data set aside for testing and for validation (LET runs)
test_prop = .2
val_prop = .2

# Hyperparameters taken from Luke's work, kept unchanged for consistency
param_best = dict(
    random_state=42,
    max_depth=9,
    min_child_weight=1,
    gamma=0,
    subsample=0.85,
    colsample_bytree=0.95,
    reg_alpha=0.09,
    reg_lambda=1,
    n_estimators=1500,
    learning_rate=0.05,
)

# Columns fed to the ML model ...
features_sel = [
    'sst', 'sst_anom', 'SSS', 'sss_anom',
    'chl_log', 'chl_anom', 'mld_log', 'XCO2',
    'A', 'B', 'C', 'T0', 'T1',
]
# ... and the column the ML model is trained to reconstruct
target_sel = ['error']

In [7]:
# Names of the model columns whose difference from observed pCO2 is reconstructed.
# Not included in this run: 'csiro_spco2_1x1_A'
models = [
    'cesm_sfco2_1x1_A',
    'fesom2_sfco2_1x1_A',
    'mpi_sfco2_1x1_A',
    'cnrm_sfco2_1x1_A',
    'ipsl_sfco2_1x1_A',
    'planktom_sfco2_1x1_A',
    'noresm_sfco2_1x1_A',
    'princeton_sfco2_1x1_A',
]

In [8]:
# Reconstruction period: first and last month, given as month-start timestamps
date_range_start = '1982-01-01T00:00:00.000000000'
date_range_end = '2020-12-01T00:00:00.000000000'

# One timestamp per month, shifted 14 days past the first of the month
dates = pd.date_range(start=date_range_start, end=date_range_end, freq='MS')
dates = dates + np.timedelta64(14, 'D')

In [9]:
# Seeds handed to pre_HPD.train_val_test_split for the 2021 models.
# The original cell also assigned [97, 62, 12] (labelled "LDEO HPD") and then
# immediately overwrote it; that dead assignment is kept here only as a note.
random_seeds = [22, 15, 84]

In [10]:
# For every model column listed in `models`: train an XGBoost regressor on the
# difference between observed pCO2 and that model (at SOCAT-sampled rows),
# score it on the held-out test split, predict the difference on every row
# where the features are available, and save the trained model and the
# reconstruction through the pre_HPD helpers.
best_params = param_best  # Luke's parameters
test_performance = defaultdict(dict)  # test metrics, keyed by model name
K_folds = 3  # not referenced anywhere else in this cell
target_sel = ['error']

print(datetime.datetime.now())

# Data file path
data_dir = f"{data_output_dir}"
fname = "data_clean_2D_mon_1x1_198201-202012.pkl"
file_path = f"{data_dir}/{fname}"

# Read the data once; the same dataframe is reused for every model.
# NOTE(security): unpickling can execute arbitrary code - only load this file
# from a trusted location.
df = pd.read_pickle(file_path)
print("Done opening data file")

# Get rid of features we never use:
df = df.drop(columns=['chl', 'mld'])
print("Dropped unused columns")

# Rows where every feature and net_mask are present. pCO2 is not required
# here: non-SOCAT locations have features but no pCO2.
# Neither the features nor net_mask are modified inside the loop below, so
# this mask and the matching feature matrix are built once instead of once
# per model.
recon_sel = (~df[features_sel + ['net_mask']].isna().any(axis=1))
X_recon = df.loc[recon_sel, features_sel].to_numpy()   # where we want pCO2 reconstructed

# Now, loop on the models to reconstruct the model-obs difference:
for mod in models:
    print(mod)

    # Target for this model: observed pCO2 minus the model value
    df['error'] = df['pCO2'] - df[f"{mod}"]

    # Training rows: reconstructable, inside the SOCAT sampling mask, and with
    # a defined target (the target is NaN wherever pCO2 or the model is NaN).
    sel = (recon_sel & (df['socat_mask'] == 1) & (~np.isnan(df['error']))).to_numpy().ravel()
    # ndarray.sum() is vectorized; the builtin sum() would walk the boolean
    # array one element at a time.
    print("Obs. length =", sel.sum())

    # Convert dataframe to numpy arrays
    X = df.loc[sel, features_sel].to_numpy()
    y = df.loc[sel, target_sel].to_numpy().ravel()

    ###################################################################################################################################
    # Separate the data sets
    ###################################################################################################################################
    # pre_HPD.train_val_test_split returns the index sets and
    # pre_HPD.apply_splits slices X and y with them (helper internals are not
    # part of this notebook).
    N = X.shape[0]
    train_val_idx, train_idx, val_idx, test_idx = pre_HPD.train_val_test_split(N, test_prop, val_prop, random_seeds)
    X_train_val, X_train, X_val, X_test, y_train_val, y_train, y_val, y_test = pre_HPD.apply_splits(X, y, train_val_idx, train_idx, val_idx, test_idx)

    # Fit the model on the combined train + validation data
    model = XGBRegressor(**best_params, n_jobs=jobs)
    model.fit(X_train_val, y_train_val)

    # Save the model
    pre_HPD.save_model(model, mod, model_output_dir, approach)

    # Calculate some test error metrics (for the pCO2-model difference) and
    # store them in the dictionary
    y_pred_test = model.predict(X_test)
    test_performance[mod] = pre_HPD.evaluate_test(y_test, y_pred_test)
    print(test_performance[mod])

    ######################################################################################
    # Reconstruct everywhere (SOCAT and non-SOCAT rows) with the trained model
    ######################################################################################
    y_recon = model.predict(X_recon)

    # Full reconstruction: NaN where features are missing, prediction elsewhere.
    # Note that each model adds its own 'error_<model>' column to df.
    df[f'error_{mod}'] = np.nan
    df.loc[recon_sel, [f'error_{mod}']] = y_recon

    DS_recon = df[['net_mask', 'socat_mask', 'pCO2', f'{mod}', f'error_{mod}']].to_xarray()

    ########## SAVE ####################################################################################################
    pre_HPD.save_recon(DS_recon, mod, recon_output_dir, approach)

    # Release the per-model arrays before the next iteration
    del y_test, y_recon, y, DS_recon

print(datetime.datetime.now())

2022-03-02 16:32:26.212680
Done opening data file
Dropped unused columns
cesm_sfco2_1x1_A
Obs. length = 278448
Starting model saving process
Save complete
{'mse': 162.85638017640937, 'mae': 8.329405159635293, 'medae': 5.521587149289104, 'max_error': 202.13710386769168, 'bias': -0.10451801717700437, 'r2': 0.8344498646908731, 'corr': 0.9141307235905182, 'cent_rmse': 12.761091500740879, 'stdev': 27.596539, 'amp_ratio': 0.6835614382290343, 'stdev_ref': 31.364445844246983, 'range_ref': 484.7129558148982, 'iqr_ref': 31.21303327745595}
Starting reconstruction saving process
Save complete
fesom2_sfco2_1x1_A
Obs. length = 280844
Starting model saving process
Save complete
{'mse': 179.58448322811398, 'mae': 8.827463033152462, 'medae': 6.036677794576008, 'max_error': 189.7687765860457, 'bias': 0.06690025441022573, 'r2': 0.8657027504240838, 'corr': 0.9306248462569876, 'cent_rmse': 13.400746512417669, 'stdev': 33.340607, 'amp_ratio': 0.8407536255814106, 'stdev_ref': 36.567969450416086, 'range_ref':

In [11]:
# Write the hyperparameters and the per-model test metrics to disk
approach_output_dir = f"{other_output_dir}"

# Make sure the target directory exists before anything is written into it
Path(approach_output_dir).mkdir(parents=True, exist_ok=True)

# Hyperparameters used for every model
param_fname = f"{approach_output_dir}/best_params_dict.pickle"
with open(param_fname, 'wb') as params_file:
    pickle.dump(best_params, params_file, protocol=pickle.HIGHEST_PROTOCOL)

# Raw dictionary of test metrics, keyed by model name (default pickle protocol)
test_perform_fname = f"{approach_output_dir}/test_performance_dict.pickle"
with open(test_perform_fname, 'wb') as metrics_file:
    pickle.dump(test_performance, metrics_file)

# Same metrics as a dataframe with one row per model, saved alongside
test_df = pd.DataFrame.from_dict(test_performance, orient='index')
test_df_fname = f"{approach_output_dir}/test_performance_df.pickle"
test_df.to_pickle(test_df_fname)