This notebook tunes the XGBoost hyperparameters. The tuned hyperparameters are then used to train the emulators.

```bash
# Request a compute session: 1 chunk with 36 CPUs (the same count that is passed
# as n_jobs=36 to XGBRegressor below) and a 3-hour walltime limit.
# NOTE(review): -I / -X are presumably the interactive and X11-forwarding
# switches of the scheduler -- confirm against the site documentation.
# Replace <NCAR_project_id> with your own project/account code.
qsub -X -I -l select=1:ncpus=36 -l walltime=03:00:00 -q regular -A <NCAR_project_id>
# Activate the Python environment that provides the packages imported below.
source /glade/work/$USER/personal_clone_name/bin/activate
# Launch the notebook server from inside the job.
start-notebook
```

In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import xgboost as xgb
import skopt
import sklearn
from xgboost import XGBRegressor
from sklearn import model_selection, metrics
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate
from sklearn.metrics import mean_squared_error
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import time
import matplotlib.pyplot as plt
import gc

# Record the versions of the libraries that drive the search, so the tuned
# hyperparameters can be traced back to the environment that produced them.
for label, module in (("XGBoost", xgb), ("skopt", skopt), ("sklearn", sklearn)):
    print(f"{label} version {module.__version__}")

XGBoost version 1.3.3
skopt version 0.8.1
sklearn version 0.22


## load the data

In [2]:
%%time
# Where the per-period training CSVs live and how they are sub-sampled.
DATA_DIR = "/glade/scratch/zhonghua/ensem_training_data/"
START_DATES = ["2006", "2061"]   # one CSV per start date: <DATA_DIR><start_date>.csv
SAMPLE_FRAC = 0.001              # keep 0.1% of the rows of every file
SAMPLE_SEED = 66                 # fixed seed so the sub-sample is reproducible

df_ls = []
for start_date in START_DATES:
    # Sample in the same expression as the read so the full-size frame is never
    # bound to a name and can be freed as soon as the sample has been taken.
    df_ls.append(
        pd.read_csv(DATA_DIR + start_date + ".csv")
        .sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
    )

# Each sample keeps the row labels of its own file, so the same label can occur
# in both pieces. ignore_index=True gives the combined frame a fresh, unique
# 0..n-1 index; row order and values are unchanged.
df = pd.concat(df_ls, ignore_index=True)
del df_ls
gc.collect()

CPU times: user 2min 17s, sys: 18.1 s, total: 2min 35s
Wall time: 2min 40s


## define the features

In [3]:
# Parse the timestamps so that the calendar month can be extracted.
df["time"] = pd.to_datetime(df["time"])

months = ["Jan", "Feb", "Mar", "Apr", "May", "June",
          "July", "Aug", "Sept", "Oct", "Nov", "Dec"]
month_to_months = dict(zip(range(1, 13), months))  # 1 -> "Jan", ..., 12 -> "Dec"

# One-hot encode the month. Pinning the categories to the full month list means
# all twelve columns are produced even if a month is absent from the sample;
# otherwise the feature selection below would fail with a KeyError.
month_labels = (
    df["time"].dt.month
    .map(month_to_months)
    .astype("category")
    .cat.set_categories(months)
)
month_dummies = pd.get_dummies(month_labels)

# Drop month columns left over from a previous run of this cell before adding
# the new ones, so re-running the cell never duplicates them. concat (not join)
# is used because both pieces share the identical index, which keeps rows
# aligned one-to-one even when index labels repeat.
df = pd.concat([df.drop(columns=months, errors="ignore"), month_dummies], axis=1)

# Predictor columns taken from the CSV, followed by the month indicators.
features_ls = ["QBOT", "UBOT", "VBOT",
               "TREFHT",
               "FLNS", "FSNS",
               "PRECT", "PRSN"] + months

# Target column; kept as a list so that df[pred] is a one-column DataFrame.
pred = ["TREFMXAV_U"]

X_train = df[features_ls]
y_train = df[pred]

## BayesSearchCV

In [4]:
def hyper_tune(n_iter, X=None, y=None, n_jobs=36):
    """Tune XGBRegressor hyperparameters with Bayesian optimisation.

    Runs ``BayesSearchCV`` over ``learning_rate``, ``n_estimators`` and
    ``max_depth`` with 5-fold cross-validation, scoring by negative mean
    squared error, and prints the best value found for each hyperparameter.

    Parameters
    ----------
    n_iter : int
        Number of search iterations passed to ``BayesSearchCV``.
    X : DataFrame or array-like, optional
        Feature matrix. Defaults to the notebook-level ``df[features_ls]``.
    y : DataFrame or array-like, optional
        Target values. Defaults to the notebook-level ``df[pred]``.
    n_jobs : int, default 36
        Number of threads given to ``XGBRegressor``.

    Returns
    -------
    BayesSearchCV
        The fitted search object, so that ``best_params_``, ``best_score_``
        and ``cv_results_`` remain available after the (expensive) run.
    """
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html
    # https://xgboost.readthedocs.io/en/latest/parameter.html

    # hyperparameters from https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost-tuning.html
    # changes (kept in sync with the search space below):
    # n_estimators from [1,4000] -> [10,600]
    # max_depth from [0,10] -> [2,7]

    # Fall back to the notebook-level training data when none is supplied.
    if X is None:
        X = df[features_ls]
    if y is None:
        y = df[pred]

    # NOTE(review): cv=5 is an integer, which presumably means contiguous,
    # unshuffled folds; the rows are ordered by source file, so confirm that
    # this fold layout is intended.
    opt = BayesSearchCV(
        XGBRegressor(objective='reg:squarederror', n_jobs=n_jobs),
        {
            'learning_rate': Real(0.01, 1),
            'n_estimators': Integer(10, 600),
            'max_depth': Integer(2, 7),
        },
        scoring='neg_mean_squared_error',
        n_iter=n_iter,
        cv=5,
        random_state=66,
    )
    opt.fit(X, y)

    for name, value in opt.best_params_.items():
        print(f"{name}: {value}")

    return opt

In [5]:
%%time
# Run the Bayesian search for 128 iterations; the best hyperparameters found
# are printed by hyper_tune. This is the expensive cell (see the timing below).
hyper_tune(n_iter=128)



learning_rate: 0.08796346103242554
max_depth: 6
n_estimators: 576
CPU times: user 1d 4h 47min 19s, sys: 1h 5min 47s, total: 1d 5h 53min 6s
Wall time: 52min 6s


## print number of samples

In [6]:
# (rows, columns) of the sub-sampled frame used for tuning; the column count
# includes the one-hot month columns added in the feature cell.
df.shape

(103696, 25)