In [1]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans

#others
from xgboost import XGBRegressor
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker
import time
import xarray as xr
import sherpa

In [2]:
# Models {RandomForestRegressor, GradientBoostingRegressor, XGBRegressor}
# Datasets {original, +elev, +seasonality, 6grids, Linear-Interpolation}

# 1. Hyperparameter Tuning on Original Dataset

In [3]:

# Variables from config file
BASE_DIR = "/home/yusukemh/github/yusukemh/StatisticalDownscaling/dataset"

# Load the dataset
df_metadata = pd.read_excel(f"{BASE_DIR}/FilledDataset2012.xlsx", sheet_name="Header")
df_data_original = pd.read_csv(f"{BASE_DIR}/dataset.csv")

# make sure there is no NaN value
assert df_data_original.isnull().values.any() == False
print(f"There are {df_data_original.shape[0]} samples.")
print(
    "Each sample is associated with lat and lon coordinates.\n" + 
    "Use only the closest observation to represent each field, from 16 different NetCDF files.", )

df_combined = df_data_original.merge(right=df_metadata[["SKN", "ElevFT"]], left_on="skn", right_on="SKN")
df_clean = (
    df_combined.drop(
        labels=["lat", "lon", "year", "month", "SKN", "skn", "Lon_DD_updated"],
        axis=1
    ).rename(
        columns={"Lat_DD": "lat", "Lon_DD": "lon", "ElevFT": "elev"}
    )
)

# split the dataset without "elev"
X = np.array(df_clean.drop(labels=["data_in", "elev"], axis=1))
Y = np.array(df_clean["data_in"])

Xtemp, Xtest, Ytemp, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
Xtrain, Xvalid, Ytrain, Yvalid = train_test_split(Xtemp, Ytemp, test_size=0.25, random_state=42)

There are 865561 samples.
Each sample is associated with lat and lon coordinates.
Use only the closest observation to represent each field, from 16 different NetCDF files.


## 1.1 RadomForestRegressor

In [None]:
file_name = './hyperparametertuning/RFR_original.txt'

parameters = [
    sherpa.Choice('n_estimators', list(range(50, 310, 10))),
    sherpa.Discrete('min_samples_split', [2, 10])
]
alg = sherpa.algorithms.RandomSearch(max_num_trials=50)
study = sherpa.Study(parameters=parameters,
                     algorithm=alg,
                     lower_is_better=True)

for trial in study:
    start = time.time()
    line = '===============================================\n'
    params = {
        "n_estimators": trial.parameters['n_estimators'],
        "max_depth": None,
        "min_samples_split": trial.parameters["min_samples_split"],
        "verbose": True,
        "n_jobs": -1
    }
    print(params)
    line += str(params) + '\n'
    model = RandomForestRegressor(**params)
    model.fit(Xtrain, Ytrain)
    training_error = mean_squared_error(Ytrain, model.predict(Xtrain))
    validation_error = mean_squared_error(Yvalid, model.predict(Xvalid))
    study.add_observation(
        trial=trial,
        iteration=1,
        objective=validation_error,
        context={'training_error': training_error}
    )
    end = time.time()
    line += "MSE on training set  : {:.6f}".format(training_error) + '\n'
    line += "MSE on validation set: {:.6f}".format(validation_error) + '\n'
    line += "elapsed time         : {:.3f}".format(end - start) + '\n'
    
    with open(file_name, 'a') as f:
        f.write(line)

    study.finalize(trial)

print(study.get_best_result())

INFO:sherpa.core:
-------------------------------------------------------
SHERPA Dashboard running. Access via
http://10.100.11.206:8880 if on a cluster or
http://localhost:8880 if running locally.
-------------------------------------------------------


 * Serving Flask app 'sherpa.app.app' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


{'n_estimators': 160, 'max_depth': None, 'min_samples_split': 5, 'verbose': True, 'n_jobs': -1}


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  2.0min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Done 160 out of 160 | elapsed:    7.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 160 out of 160 | elapsed:    2.4s finished


{'n_estimators': 280, 'max_depth': None, 'min_samples_split': 8, 'verbose': True, 'n_jobs': -1}


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:  3.4min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    8.1s
[Parallel(n_jobs=8)]: Done 280 out of 280 | elapsed:   11.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 280 out of 280 | elapsed:    4.1s finished


{'n_estimators': 290, 'max_depth': None, 'min_samples_split': 4, 'verbose': True, 'n_jobs': -1}


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 290 out of 290 | elapsed:  3.7min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    9.7s
[Parallel(n_jobs=8)]: Done 290 out of 290 | elapsed:   14.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    3.3s
[Parallel(n_jobs=8)]: Done 290 out of 290 | elapsed:    4.9s finished


{'n_estimators': 70, 'max_depth': None, 'min_samples_split': 5, 'verbose': True, 'n_jobs': -1}


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:   55.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done  70 out of  70 | elapsed:    3.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done  70 out of  70 | elapsed:    1.6s finished


{'n_estimators': 150, 'max_depth': None, 'min_samples_split': 6, 'verbose': True, 'n_jobs': -1}


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.9min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    6.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    2.0s finished


{'n_estimators': 300, 'max_depth': None, 'min_samples_split': 2, 'verbose': True, 'n_jobs': -1}


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   31.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  4.0min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.7s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   12.5s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:   19.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    4.3s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    6.7s finished


{'n_estimators': 280, 'max_depth': None, 'min_samples_split': 3, 'verbose': True, 'n_jobs': -1}


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 280 out of 280 | elapsed:  3.7min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    9.6s
[Parallel(n_jobs=8)]: Done 280 out of 280 | elapsed:   14.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    3.6s
[Parallel(n_jobs=8)]: Done 280 out of 280 | elapsed:    5.3s finished


{'n_estimators': 150, 'max_depth': None, 'min_samples_split': 3, 'verbose': True, 'n_jobs': -1}


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.9min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:   10.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    3.6s finished


{'n_estimators': 250, 'max_depth': None, 'min_samples_split': 9, 'verbose': True, 'n_jobs': -1}


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
