In [3]:
# standard library
import os
import time

# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans

#others
from xgboost import XGBRegressor
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker
import xarray as xr
import sherpa

In [2]:
# Models {RandomForestRegressor, GradientBoostingRegressor, XGBRegressor}
# Datasets {original, +elev, +seasonality, 6grids, Linear-Interpolation}

# 1. Hyperparameter Tuning on Original Dataset

In [4]:

# Variables from config file
BASE_DIR = "/home/yusukemh/github/yusukemh/StatisticalDownscaling/dataset"

# Load the dataset: the "Header" sheet of the Excel workbook (metadata keyed by
# SKN) and the CSV holding the samples.
df_metadata = pd.read_excel(f"{BASE_DIR}/FilledDataset2012.xlsx", sheet_name="Header")
df_data_original = pd.read_csv(f"{BASE_DIR}/dataset.csv")

# make sure there is no NaN value
assert not df_data_original.isnull().values.any(), "dataset.csv contains NaN values"
print(f"There are {df_data_original.shape[0]} samples.")
print(
    "Each sample is associated with lat and lon coordinates.\n"
    "Use only the closest observation to represent each field, from 16 different NetCDF files."
)

# Attach ElevFT from the metadata to every sample via its SKN key.
# validate="many_to_one" makes pandas raise MergeError if an SKN appears more
# than once in the metadata, which would otherwise silently duplicate samples.
df_combined = df_data_original.merge(
    right=df_metadata[["SKN", "ElevFT"]],
    left_on="skn",
    right_on="SKN",
    validate="many_to_one",
)

# Drop the columns that are not used as features and give the remaining
# coordinate/elevation columns short names.
df_clean = (
    df_combined.drop(
        labels=["lat", "lon", "year", "month", "SKN", "skn", "Lon_DD_updated"],
        axis=1
    ).rename(
        columns={"Lat_DD": "lat", "Lon_DD": "lon", "ElevFT": "elev"}
    )
)

# split the dataset without "elev"
X = np.array(df_clean.drop(labels=["data_in", "elev"], axis=1))
Y = np.array(df_clean["data_in"])

# 20% is held out for testing; 25% of the remaining 80% becomes the validation
# set, i.e. a 60/20/20 train/valid/test split. The fixed seed keeps it reproducible.
Xtemp, Xtest, Ytemp, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
Xtrain, Xvalid, Ytrain, Yvalid = train_test_split(Xtemp, Ytemp, test_size=0.25, random_state=42)

There are 865561 samples.
Each sample is associated with lat and lon coordinates.
Use only the closest observation to represent each field, from 16 different NetCDF files.


## 1.1 RandomForestRegressor

In [None]:
# Random search over RandomForestRegressor hyperparameters on the original
# dataset. Every trial is fitted on the training split, scored by MSE on the
# validation split, and appended to a plain-text log file.
file_name = './hyperparametertuning/RFR_original.txt'

# Create the log directory up front: without it, the open() inside the loop
# fails with FileNotFoundError only after the first (expensive) fit has finished.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

# Search space: n_estimators in {50, 60, ..., 300}; min_samples_split is an
# integer drawn from the range given by [2, 10].
parameters = [
    sherpa.Choice('n_estimators', list(range(50, 310, 10))),
    sherpa.Discrete('min_samples_split', [2, 10])
]
alg = sherpa.algorithms.RandomSearch(max_num_trials=2)
study = sherpa.Study(parameters=parameters,
                     algorithm=alg,
                     lower_is_better=True)

for trial in study:
    start = time.time()
    params = {
        "n_estimators": trial.parameters['n_estimators'],
        "max_depth": None,
        "min_samples_split": trial.parameters["min_samples_split"],
        "verbose": True,
        "n_jobs": -1,
        # Fixed seed so that differences between trials come from the
        # hyperparameters rather than from the forest's own randomness.
        "random_state": 42,
    }
    print(params)

    model = RandomForestRegressor(**params)
    model.fit(Xtrain, Ytrain)
    training_error = mean_squared_error(Ytrain, model.predict(Xtrain))
    validation_error = mean_squared_error(Yvalid, model.predict(Xvalid))

    # The validation MSE is the objective being minimised (lower_is_better=True);
    # the training MSE is stored alongside it as context.
    study.add_observation(
        trial=trial,
        iteration=1,
        objective=validation_error,
        context={'training_error': training_error}
    )
    end = time.time()

    # One log record per trial: separator, parameters, both errors, wall time.
    line = '===============================================\n'
    line += str(params) + '\n'
    line += "MSE on training set  : {:.6f}".format(training_error) + '\n'
    line += "MSE on validation set: {:.6f}".format(validation_error) + '\n'
    line += "elapsed time         : {:.3f}".format(end - start) + '\n'

    with open(file_name, 'a') as f:
        f.write(line)

    study.finalize(trial)

print(study.get_best_result())

In [5]:
# Fit a RandomForestRegressor with the selected hyperparameters on the training
# split and report its MSE on the held-out test split.
# NOTE(review): these values are presumably the best trial of the random search
# log — confirm against the tuning output.
best_params = {
    'n_estimators': 280,
    'max_depth': None,
    'min_samples_split': 3,
    'verbose': True,
    'n_jobs': -1,
    'random_state': 42,  # fixed seed so the reported test MSE is reproducible
}
model = RandomForestRegressor(**best_params)
model.fit(Xtrain, Ytrain)
yhat = model.predict(Xtest)
print("MSE on test set: {:.4f}".format(mean_squared_error(Ytest, yhat)))

[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

## 1.2 GradientBoostingRegressor

In [6]:
# # NOTE: this search was run as a batch job; the code below is kept commented out for reference only.
# file_name = './hyperparametertuning/GBR_original.txt'

# parameters = [
#     sherpa.Choice('n_estimators', list(range(100, 310, 10))),
#     sherpa.Choice('learning_rate', [0.05, 0.1, 0.5, 1.0, 1.25, 1.5, 2]),
#     sherpa.Discrete('max_depth', [1, 8]),
#     sherpa.Discrete('min_samples_split', [2, 10])
# ]

# alg = sherpa.algorithms.RandomSearch(max_num_trials=50)
# study = sherpa.Study(parameters=parameters,
#                      algorithm=alg,
#                      lower_is_better=True)

# for trial in study:
#     start = time.time()
#     line = '===============================================\n'
#     params = {
#         "n_estimators": trial.parameters['n_estimators'],
#         "learning_rate": trial.parameters['learning_rate'],
#         "max_depth": trial.parameters['max_depth'],
#         "min_samples_split": trial.parameters["min_samples_split"],
#         "verbose": True
#     }
#     print(params)
#     line += str(params) + '\n'
#     model = GradientBoostingRegressor(**params)
#     model.fit(Xtrain, Ytrain)
#     training_error = mean_squared_error(Ytrain, model.predict(Xtrain))
#     validation_error = mean_squared_error(Yvalid, model.predict(Xvalid))
#     study.add_observation(
#         trial=trial,
#         iteration=1,
#         objective=validation_error,
#         context={'training_error': training_error}
#     )
#     end = time.time()
#     line += "MSE on training set  : {:.6f}".format(training_error) + '\n'
#     line += "MSE on validation set: {:.6f}".format(validation_error) + '\n'
#     line += "elapsed time         : {:.3f}".format(end - start) + '\n'
    
#     with open(file_name, 'a') as f:
#         f.write(line)

#     study.finalize(trial)

# print(study.get_best_result())

INFO:sherpa.core:
-------------------------------------------------------
SHERPA Dashboard running. Access via
http://10.100.11.206:8880 if on a cluster or
http://localhost:8880 if running locally.
-------------------------------------------------------


 * Serving Flask app 'sherpa.app.app' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on




{'n_estimators': 190, 'learning_rate': 0.1, 'max_depth': 2, 'min_samples_split': 8, 'verbose': True}
      Iter       Train Loss   Remaining Time 
         1          39.7233            3.78m
         2          38.7459            3.69m
         3          37.9059            3.67m
         4          37.2039            3.62m


KeyboardInterrupt: 

In [None]:
# Fit a GradientBoostingRegressor with the selected hyperparameters on the
# training split and report its MSE on the held-out test split.
# NOTE(review): these values are presumably the best trial of the batch-job
# random search — confirm against the tuning log.
best_params = {
    'n_estimators': 280,
    'learning_rate': 0.5,
    'max_depth': 5,
    'min_samples_split': 2,
    'verbose': True,
    'random_state': 42,  # fixed seed so the reported test MSE is reproducible
}
model = GradientBoostingRegressor(**best_params)
model.fit(Xtrain, Ytrain)
yhat = model.predict(Xtest)
print("MSE on test set: {:.4f}".format(mean_squared_error(Ytest, yhat)))

In [4]:
# # NOTE: this search was run as a batch job; the code below is kept commented out for reference only.
# file_name = './hyperparametertuning/XGB_original.txt'

# parameters = [
#     sherpa.Choice('n_estimators', list(range(100, 310, 10))),
#     sherpa.Choice('learning_rate', [0.05, 0.1, 0.5, 1.0, 1.25, 1.5, 2]),
#     sherpa.Discrete('max_depth', [1, 10]),
# ]

# alg = sherpa.algorithms.RandomSearch(max_num_trials=50)
# study = sherpa.Study(parameters=parameters,
#                      algorithm=alg,
#                      lower_is_better=True)

# for trial in study:
#     start = time.time()
#     line = '===============================================\n'
#     params = {
#         "n_estimators": trial.parameters['n_estimators'],
#         "learning_rate": trial.parameters['learning_rate'],
#         "max_depth": trial.parameters['max_depth'],
#         "verbosity": 1
#     }
#     print(params)
#     line += str(params) + '\n'
#     model = XGBRegressor(**params)
#     model.fit(Xtrain, Ytrain)
#     training_error = mean_squared_error(Ytrain, model.predict(Xtrain))
#     validation_error = mean_squared_error(Yvalid, model.predict(Xvalid))
#     study.add_observation(
#         trial=trial,
#         iteration=1,
#         objective=validation_error,
#         context={'training_error': training_error}
#     )
#     end = time.time()
#     line += "MSE on training set  : {:.6f}".format(training_error) + '\n'
#     line += "MSE on validation set: {:.6f}".format(validation_error) + '\n'
#     line += "elapsed time         : {:.3f}".format(end - start) + '\n'
    
#     with open(file_name, 'a') as f:
#         f.write(line)

#     study.finalize(trial)

# print(study.get_best_result())

INFO:sherpa.core:
-------------------------------------------------------
SHERPA Dashboard running. Access via
http://10.100.11.206:8880 if on a cluster or
http://localhost:8880 if running locally.
-------------------------------------------------------


 * Serving Flask app 'sherpa.app.app' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on




{'n_estimators': 170, 'learning_rate': 2, 'max_depth': 1, 'verbosity': 1}


KeyboardInterrupt: 

# 2. Hyperparameter Tuning with Elevation Data

In [7]:
# Reload the data and rebuild the train/valid/test splits, this time keeping
# "elev" as an input feature. BASE_DIR comes from the earlier data-loading cell.
df_metadata = pd.read_excel(f"{BASE_DIR}/FilledDataset2012.xlsx", sheet_name="Header")
df_data_original = pd.read_csv(f"{BASE_DIR}/dataset.csv")

# make sure there is no NaN value
assert not df_data_original.isnull().values.any(), "dataset.csv contains NaN values"
print(f"There are {df_data_original.shape[0]} samples.")
print(
    "Each sample is associated with lat and lon coordinates.\n"
    "Use only the closest observation to represent each field, from 16 different NetCDF files."
)

# Attach ElevFT from the metadata to every sample via its SKN key.
# validate="many_to_one" makes pandas raise MergeError if an SKN appears more
# than once in the metadata, which would otherwise silently duplicate samples.
df_combined = df_data_original.merge(
    right=df_metadata[["SKN", "ElevFT"]],
    left_on="skn",
    right_on="SKN",
    validate="many_to_one",
)

# Drop the columns that are not used as features and give the remaining
# coordinate/elevation columns short names.
df_clean = (
    df_combined.drop(
        labels=["lat", "lon", "year", "month", "SKN", "skn", "Lon_DD_updated"],
        axis=1
    ).rename(
        columns={"Lat_DD": "lat", "Lon_DD": "lon", "ElevFT": "elev"}
    )
)

# split the dataset, keeping "elev" among the features
X = np.array(df_clean.drop(labels=["data_in"], axis=1))
Y = np.array(df_clean["data_in"])

# 20% is held out for testing; 25% of the remaining 80% becomes the validation
# set, i.e. a 60/20/20 train/valid/test split. The fixed seed keeps it reproducible.
Xtemp, Xtest, Ytemp, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)
Xtrain, Xvalid, Ytrain, Yvalid = train_test_split(Xtemp, Ytemp, test_size=0.25, random_state=42)

There are 865561 samples.
Each sample is associated with lat and lon coordinates.
Use only the closest observation to represent each field, from 16 different NetCDF files.


## 2.1 RandomForestRegressor

In [9]:
# # NOTE: this search was run as a batch job; the code below is kept commented out for reference only.
# file_name = './RFR_elev.txt'

# parameters = [
#     sherpa.Choice('n_estimators', list(range(50, 310, 10))),
#     sherpa.Discrete('min_samples_split', [2, 10])
# ]
# alg = sherpa.algorithms.RandomSearch(max_num_trials=50)
# study = sherpa.Study(parameters=parameters,
#                      algorithm=alg,
#                      lower_is_better=True)

# for trial in study:
#     start = time.time()
#     line = '===============================================\n'
#     params = {
#         "n_estimators": trial.parameters['n_estimators'],
#         "max_depth": None,
#         "min_samples_split": trial.parameters["min_samples_split"],
#         "verbose": True,
#         "n_jobs": -1
#     }
#     print(params)
#     line += str(params) + '\n'
#     model = RandomForestRegressor(**params)
#     model.fit(Xtrain, Ytrain)
#     training_error = mean_squared_error(Ytrain, model.predict(Xtrain))
#     validation_error = mean_squared_error(Yvalid, model.predict(Xvalid))
#     study.add_observation(
#         trial=trial,
#         iteration=1,
#         objective=validation_error,
#         context={'training_error': training_error}
#     )
#     end = time.time()
#     line += "MSE on training set  : {:.6f}".format(training_error) + '\n'
#     line += "MSE on validation set: {:.6f}".format(validation_error) + '\n'
#     line += "elapsed time         : {:.3f}".format(end - start) + '\n'

#     with open(file_name, 'a') as f:
#         f.write(line)

#     study.finalize(trial)