Description

Gradient-boosted regression (scikit-learn GradientBoostingRegressor with quantile loss) to find Uncertainty Bounds

Inputs

In [2]:
# =========================================
# For accessing directories
# =========================================
# Single project root; every output location below is derived from it so that
# relocating the project only requires editing this one line.
root_dir = "/data/artemis/workspace/vbennington/SOCAT_ML/pCO2_DIC"

reference_output_dir = f"{root_dir}/references"
# Previously this was immediately re-assigned to a hard-coded absolute copy of
# the same path, which would silently ignore any change to root_dir.
data_output_dir = f"{root_dir}/data/processed"
model_output_dir = f"{root_dir}/models/trained"
recon_output_dir = f"{root_dir}/models/reconstructions"
other_output_dir = f"{root_dir}/models/performance_metrics"

# Short tag used in output file names and as the metrics sub-directory
approach = 'xgr'
approach_output_dir = f"{other_output_dir}/{approach}"
# =========================================
# Number of cores you have access to for model training
# =========================================
jobs = 30

In [3]:
import pandas as pd
import xarray as xr
import numpy as np
import numpy.ma as ma
%matplotlib inline
%config InlineBackend.figure_format = 'jpg'
%config InlineBackend.print_figure_kwargs = {'dpi':150, 'bbox_inches': 'tight'}
import matplotlib as mpl
from matplotlib.ticker import AutoMinorLocator
import matplotlib.pyplot as plt
import scipy
import sklearn.linear_model 
import pickle

Modules

In [5]:
# standard imports
import os
import datetime
from pathlib import Path
from collections import defaultdict
import scipy
import random
import numpy as np
import xarray as xr
import pandas as pd
import joblib
import pickle

# machine learning libraries
import sklearn            # machine-learning libary with many algorithms implemented
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb     # extreme gradient boosting (XGB
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold
import sklearn.model_selection as mselect

# Python file with supporting functions
import pre_SOCAT

Predefined Values

In [10]:
# Loading references

In [6]:
# =========================================
# Modeling inputs: hyper-parameter grid, predictors and targets
# =========================================

# Grid handed to the cross-validated search (3 x 6 = 18 candidate combinations)
xg_param_grid = dict(
    n_estimators=[1000, 2000, 3000],
    max_depth=list(range(4, 10)),
)

# Predictor columns fed to the regressor (pCO2_DIC configuration)
features_sel = [
    'sst', 'sst_anom',
    'sss', 'sss_anom',
    'chl_log', 'chl_anom',
    'mld_log',
    'xco2',
    'A', 'B', 'C',
    'T0', 'T1',
]

# Column the model is trained to reconstruct
target_sel = ['pCO2_DIC']
# Column the final error metrics are reported for (pCO2 = pCO2_DIC + pCO2_T)
final_sel = ['pCO2']

In [7]:
# Train on four out of every five months and hold out the fifth for testing
date_range_start = '1982-01-01T00:00:00.000000000'
date_range_end = '2019-12-01T00:00:00.000000000'

# Monthly time stamps, shifted 14 days from the first of each month
month_starts = pd.date_range(start=date_range_start, end=date_range_end, freq='MS')
dates = month_starts + np.timedelta64(14, 'D')

# Every fifth month (positions 0, 5, 10, ...) is held out; the rest are train/validation
test_dates = [stamp for position, stamp in enumerate(dates) if position % 5 == 0]
select_dates = [stamp for position, stamp in enumerate(dates) if position % 5 != 0]

# "YYYY-M" labels (month not zero-padded) used to match rows of the data frame
year_mon = [f"{stamp.year}-{stamp.month}" for stamp in select_dates]
test_year_mon = [f"{stamp.year}-{stamp.month}" for stamp in test_dates]

In [36]:
# Run switch: True -> grid-search the hyper-parameters, False -> reload saved ones
first_run = True

# Seeds: element 0 seeds the grid-search estimator, element 1 the final fits
random_seeds = [1,71] # pCO2_DIC
#random_seeds = [68, 23] # pCO2_DIC_train2000s
#random_seeds = [50, 17] # pCO2_DIC_noCHL

# Tag for this experiment and the directory its metrics are written to
approach = 'xgr'
approach_output_dir = f"{other_output_dir}/{approach}"

In [37]:
# Hyper-parameters: searched from scratch on a first run, otherwise reloaded
# from the pickle written by the "Saving best parameters" cell.
if first_run:
    best_params = {}  # replaced by the grid-search result further down
else:
    # Built from approach_output_dir so it always matches the location the
    # saving cell writes to (the old hard-coded path only worked for 'xgr').
    param_fname = f"{approach_output_dir}/{approach}_best_params_dict.pickle"
    # NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
    with open(param_fname, 'rb') as handle:
        best_params = pickle.load(handle)
    print(best_params)
    
test_performance = defaultdict(dict)  # placeholder; overwritten by the evaluation step below

K_folds = 3  # number of cross-validation folds used by the grid search

print(datetime.datetime.now())

# Data file path
data_dir = f"{data_output_dir}"
fname = f"data_clean_2D_mon_1x1_198201-201912.pkl"
file_path = f"{data_dir}/{fname}"

# Read in data, create some selection filters, produce a reduced dataframe
df = pd.read_pickle(file_path)

# Get rid of features we never use:
df = df.drop(columns=['chl','mld'])

# Test on every fifth month to reduce autocorrelation along cruise tracks, but still sample many climates...
# Build a "YYYY-M" label per row from the 'time' index level so rows can be
# matched against year_mon / test_year_mon.
df['year'] = df.index.get_level_values('time').year
df['mon'] = df.index.get_level_values('time').month
df['year_month'] = df['year'].astype(str) + "-" + df['mon'].astype(str)

# Get rid of NaNs: keep rows where every feature and net_mask are present.
# The target may still be missing here (features exist at non-SOCAT locations).
recon_sel = (~df[features_sel+['net_mask']].isna().any(axis=1))

# Keep only targets strictly inside (-300, 300); rows with a NaN target compare False and drop out.
################################################################################################################################
target_in_range = ((df[target_sel] < 300) & (df[target_sel] > -300)).to_numpy().ravel()
# Rows with complete features AND in SOCAT sampling AND with an in-range target
sel = (recon_sel & (df['socat_mask'] == 1)) & target_in_range

# Vectorised .sum() instead of builtin sum(): avoids a Python-level loop over every row.
print("SOCAT length =",sel.sum())

###################################################################################################################################
# Separate the data sets
###################################################################################################################################

train_val_sel = ((sel) & (pd.Series(df['year_month']).isin(year_mon))).to_numpy().ravel()
print("Train/Val length = ",train_val_sel.sum())

test_sel = ((sel) & (pd.Series(df['year_month']).isin(test_year_mon))).to_numpy().ravel()   # Should be along SOCAT track
print("Test length =",test_sel.sum())

################################################################################################################################

# All usable SOCAT rows (train/val and test together) as numpy arrays
X = df.loc[sel,features_sel].to_numpy()
y = df.loc[sel,target_sel].to_numpy().ravel()

# Where we want pCO2 reconstructed
X_recon = df.loc[recon_sel,features_sel].to_numpy()

# Train/validation subset handed to the grid search and the final fits
X_train_val = df.loc[train_val_sel,features_sel].to_numpy()
y_train_val = df.loc[train_val_sel,target_sel].to_numpy().ravel()


print("Have X")

if first_run:

    # Cross-validated search over xg_param_grid on the train/validation rows.
    model = GradientBoostingRegressor(random_state=random_seeds[0])
    param_grid = xg_param_grid
    # n_jobs=jobs spreads the candidate fits over the cores declared in the config cell
    grid = GridSearchCV(model, param_grid, cv=K_folds, return_train_score=False, refit=True, n_jobs=jobs)
    grid.fit(X_train_val,y_train_val)
    best_params = grid.best_params_
    print(best_params)

# Quantile level of the upper bound; the lower bound uses its complement
upper_quantile = 0.95

# Fit the model on train/validation data for upper bound:
model = GradientBoostingRegressor(loss='quantile',random_state=random_seeds[1], **best_params, alpha=upper_quantile)
model.fit(X_train_val, y_train_val)

print("Doing y_upper")
# Now, apply everywhere with this upper bound estimate:
y_upper = model.predict(X_recon)

# Now get lower bounds (same estimator, refit at the complementary quantile):
model.set_params(alpha=1-upper_quantile)
model.fit(X_train_val, y_train_val)
print("Doing y_lower")
y_lower = model.predict(X_recon)

# Now get the central prediction with the least-squares loss.
# NOTE(review): least squares estimates the conditional mean, not the median;
# a true median would be loss='quantile' with alpha=0.5 -- confirm which is intended.
# scikit-learn renamed 'ls' to 'squared_error' in 1.0 and removed 'ls' in 1.2,
# so pick whichever name the installed version accepts.
squared_loss = 'squared_error' if int(sklearn.__version__.split('.')[0]) >= 1 else 'ls'
model.set_params(loss=squared_loss)
model.fit(X_train_val, y_train_val)

# Save this model (the least-squares fit; the quantile fits are not kept):
pre_SOCAT.save_model(model, model_output_dir, approach)   #Uncomment when actually running

################ TEST ####################################################################################################
# Held-out SOCAT rows: predictors, the modeled target, observed pCO2 and the pCO2_T component
X_test = df.loc[test_sel,features_sel].to_numpy()
y_test = df.loc[test_sel,target_sel].to_numpy().ravel()
y_final = df.loc[test_sel,final_sel].to_numpy().ravel()  # Real pCO2 from SOCAT
y_pco2t = df.loc[test_sel,'pCO2_T'].to_numpy().ravel() # pCO2_T

# Predict the target on the held-out rows with the most recently fitted model
print("Doing Y Test")
y_pred_test = model.predict(X_test)

# Metrics for pCO2_DIC (the quantity the model was trained on)
test_performance = pre_SOCAT.evaluate_test(y_test, y_pred_test)
print(test_performance)

# Metrics for pCO2: add pCO2_T back onto the prediction before comparing with observations
y_pred_final = y_pred_test + y_pco2t
pco2_performance = pre_SOCAT.evaluate_test(y_final, y_pred_final)
print(pco2_performance)
######################################################################################


2021-06-21 13:04:40.273121
SOCAT length = 262714
Train/Val length =  209964
Test length = 52750
Have X
{'max_depth': 9, 'n_estimators': 1000}
Doing y_upper
Doing y_lower
Starting model saving process
Save complete
Doing Y Test
{'mse': 274.5420439730508, 'mae': 11.148414845294308, 'medae': 7.606292267503843, 'max_error': 186.197911194561, 'bias': -0.0230382592201539, 'r2': 0.835793306657876, 'corr': 0.9142995182242403, 'cent_rmse': 16.569294288281046, 'stdev': 37.885773063576075, 'amp_ratio': 0.9912416235423145, 'stdev_ref': 40.88923670469988, 'range_ref': 399.4094490875581, 'iqr_ref': 44.52784536195463}
{'mse': 274.5420439730508, 'mae': 11.14841484529431, 'medae': 7.606292267503818, 'max_error': 186.19791119456096, 'bias': -0.02303825922018632, 'r2': 0.78988683767585, 'corr': 0.8891469467108202, 'cent_rmse': 16.569294288281046, 'stdev': 33.09338863555198, 'amp_ratio': 1.0146784805167066, 'stdev_ref': 36.14746132635529, 'range_ref': 376.92213983805937, 'iqr_ref': 43.41449354886481}


In [38]:
def _scatter_column(frame, column, mask, values):
    """Create `column` in `frame` filled with NaN, then write `values` into the rows selected by `mask`."""
    frame[column] = np.nan
    frame.loc[mask, column] = values


# Make room for our reconstruction.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=['sss','sst','mld_log','chl_log','sss_anom','sst_anom','A','B','C','T0','T1'],
             errors='ignore')

# Everywhere ####################################################
# Central prediction at every row with complete features
y_recon = model.predict(X_recon)

# Full reconstruction of pCO2_DIC with its lower / upper quantile bounds
_scatter_column(df, 'pCO2_DIC_recon', recon_sel, y_recon)
_scatter_column(df, 'pCO2_DIC_lower_recon', recon_sel, y_lower)
_scatter_column(df, 'pCO2_DIC_upper_recon', recon_sel, y_upper)

# Full reconstruction of pCO2: pCO2_DIC + pCO2_T
_scatter_column(df, 'pCO2_recon', recon_sel, y_recon + df.loc[recon_sel,'pCO2_T'].to_numpy())

# Held-out rows only: observed and predicted values, for pCO2 and for pCO2_DIC
_scatter_column(df, 'pCO2_test_recon', test_sel, y_pred_final)
_scatter_column(df, 'pCO2_test', test_sel, y_final)
_scatter_column(df, 'pCO2_DIC_test_recon', test_sel, y_pred_test)
_scatter_column(df, 'pCO2_DIC_test', test_sel, y_test)


DS_recon = df[['net_mask','socat_mask','pCO2_DIC','pCO2_DIC_upper_recon','pCO2_DIC_lower_recon','pCO2_DIC_recon','pCO2','pCO2_recon','pCO2_T','pCO2_DIC_test','pCO2_DIC_test_recon','pCO2_test','pCO2_test_recon']].to_xarray()

########## SAVE ####################################################################################################
pre_SOCAT.save_recon(DS_recon, recon_output_dir, approach)   # Uncomment when actually running

print(datetime.datetime.now())

Starting reconstruction saving process
Save complete
2021-06-22 11:20:06.559532


In [39]:
# Persist the tuned hyper-parameters and the test-set metrics
approach_output_dir = f"{other_output_dir}/{approach}"
Path(approach_output_dir).mkdir(parents=True, exist_ok=True)

# Output file names, all inside the approach-specific metrics directory
param_fname = f"{approach_output_dir}/{approach}_best_params_dict.pickle"
test_perform_fname = f"{approach_output_dir}/{approach}_test_performance_dict.pickle"
test_df_fname = f"{approach_output_dir}/{approach}_test_performance_df.pickle"

# Raw dictionaries: best parameters use the highest pickle protocol, metrics the default one
for out_fname, payload, dump_options in (
    (param_fname, best_params, {"protocol": pickle.HIGHEST_PROTOCOL}),
    (test_perform_fname, test_performance, {}),
):
    with open(out_fname, 'wb') as handle:
        pickle.dump(payload, handle, **dump_options)

# Same metrics as a data frame (one row per dictionary key), pickled alongside
test_df = pd.DataFrame.from_dict(test_performance, orient='index')
test_df.to_pickle(test_df_fname)