In [None]:
import importlib
import copy
import pickle
# --------------------
from sklearn.utils import gen_batches, check_array
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import root_mean_squared_error as RMSE
from sklearn.utils.validation import FLOAT_DTYPES
import numpy as np
from numpy.linalg import norm,inv,matrix_rank, pinv
# --------------------
from skopt.space import Integer
from skopt.utils import use_named_args
from skopt import gp_minimize
# --------------------
import h5py
import datetime
import Code.SVDPLS

# import warnings
# warnings.filterwarnings('ignore')

from pytictoc import TicToc
# Two independent stopwatches: `tim` is restarted for intermediate progress
# reports, `tim_tot` measures the wall time of a whole cell.
tim, tim_tot = TicToc(), TicToc()

In [None]:
def RollingCV(tscv, X):
    """Turn the splits produced by `tscv.split(X)` into rolling-window splits.

    The first split is passed through unchanged. For every later split the
    training indices are truncated to their last `w` entries, where `w` is
    the number of test indices in the first split.

    Parameters
    ----------
    tscv : object
        Splitter whose `split(X)` returns an iterator of
        `(train_index, test_index)` pairs.
    X : object
        Data forwarded to `tscv.split`.

    Yields
    ------
    tuple
        `(train_index, test_index)` for each fold.
    """
    fold_iter = tscv.split(X)

    # The first fold is emitted as-is and fixes the window width.
    head_train, head_test = next(fold_iter)
    yield head_train, head_test
    window = len(head_test)

    # Remaining folds: keep only the trailing `window` training indices.
    for later_train, later_test in fold_iter:
        yield later_train[-window:], later_test

In [None]:
# Load the training arrays stored in ./data/merra2_t.h5.
# NOTE: a later cell reads ./data/TW_PM25.h5 into the same names
# (X_train, Y_train, n_train, n_fold, test_size, tscv); when both cells are
# executed in order, that later cell's values are the ones used downstream.
with h5py.File('./data/merra2_t.h5', 'r') as f:
    x_dataset = f['X_train']
    y_dataset = f['Y_train']
    n_train = x_dataset.shape[0]
    print(n_train)

    # Slice while the file is still open so the data is read out of the
    # HDF5 datasets before the handle is closed.
    X_train = x_dataset[0:n_train]
    Y_train = y_dataset[0:n_train]

# Cross-validation layout for this dataset: 74 folds of 30 test samples each.
n_fold, test_size = 74, 30
tscv = TimeSeriesSplit(n_splits=n_fold, test_size=test_size)

In [None]:
# Load the training arrays stored in ./data/TW_PM25.h5.
# NOTE: this rebinds X_train, Y_train, n_train, n_fold, test_size and tscv,
# replacing whatever an earlier data-loading cell assigned to them.
with h5py.File('./data/TW_PM25.h5', 'r') as f:
    x_dataset = f['X_train']
    y_dataset = f['Y_train']
    n_train = x_dataset.shape[0]
    print(n_train)

    # Slice while the file is still open so the data is read out of the
    # HDF5 datasets before the handle is closed.
    X_train = x_dataset[0:n_train]
    Y_train = y_dataset[0:n_train]

# Cross-validation layout for this dataset: 17 folds of 30 test samples each.
n_fold, test_size = 17, 30
tscv = TimeSeriesSplit(n_splits=n_fold, test_size=test_size)

In [None]:
def dump_PLS(PLS, fn):
    """Write the parts of a fitted PLS model that `predict` needs to an .npz archive.

    Parameters
    ----------
    PLS : object
        Fitted model exposing `n_components`, `x_weights_`, `y_weights_`,
        `_x_mean` and `_y_mean`.
    fn : str or file-like
        Destination handed straight to `np.savez`.

    Notes
    -----
    The archive keys are `n_comp`, `x_weights`, `y_weights`, `x_mean`
    and `y_mean`.
    """
    payload = {
        'n_comp': PLS.n_components,
        'x_weights': PLS.x_weights_,
        'y_weights': PLS.y_weights_,
        # Underscore-prefixed attributes: these are read from the model's
        # private state, so they depend on its implementation.
        'x_mean': PLS._x_mean,
        'y_mean': PLS._y_mean,
    }
    np.savez(fn, **payload)

def predict(dat, X, n_comp, copy=True):
    """Predict targets from stored PLS weights using the first `n_comp` components.

    Parameters
    ----------
    dat : mapping
        Provides the arrays `'x_weights'`, `'y_weights'`, `'x_mean'` and
        `'y_mean'` (the keys written by `dump_PLS`).
    X : array-like
        Samples to predict for; validated and converted to a float dtype by
        `check_array`.
    n_comp : int
        Number of leading weight columns to use.
    copy : bool, default=True
        Whether `check_array` should copy `X`. With the default the caller's
        array is never modified by the in-place centering below. Pass
        `False` to skip the copy when `X` is a throw-away array.

    Returns
    -------
    ndarray
        `(X - x_mean) @ (x_weights @ y_weights.T) + y_mean`.
    """
    # `copy` is an explicit argument here. Without it the name would resolve
    # to the imported `copy` module, which only works because a module object
    # is truthy.
    X = check_array(X, copy=copy, dtype=FLOAT_DTYPES)
    x_weights = dat['x_weights'][:, :n_comp]
    y_weights = dat['y_weights'][:, :n_comp]

    # Regression coefficients restricted to the selected components.
    coef = np.dot(x_weights, y_weights.T)

    # Center in place (this is why the copy above matters), project, un-center.
    X -= dat['x_mean']
    ypred = np.dot(X, coef)
    ypred += dat['y_mean']

    return ypred


In [None]:
# Sanity check: array shapes, then the sizes of the first four folds.
print(X_train.shape)
print(Y_train.shape)

for i, (train_index, test_index) in enumerate(tscv.split(X_train)):
    print(f"Fold {i}:")
    print(f"  Train: len={len(train_index)}")
    print(f"  Test:  len={len(test_index)}")
    if i <= 2:
        continue
    # On the fourth fold (i == 3) also show the index ranges, then stop.
    print(f"  Train: from {train_index[0]} to {train_index[-1]}")
    print(f"  Test:  from {test_index[0]} to {test_index[-1]}")
    break


In [None]:
# Reload the module, then re-bind SVDPLS from the reloaded module.
importlib.reload(Code.SVDPLS)
from Code.SVDPLS import SVDPLS

# Fit once per fold with the maximum number of components and store the
# result on disk; `predict` later slices the stored weights down to fewer
# components.
n_comp_max = 100
PLS = SVDPLS(n_components=n_comp_max)

tim_tot.tic()
tim.tic()
for i, (train_index, _) in enumerate(tscv.split(X_train)):
    PLS.fit(X_train[train_index], Y_train[train_index])
    fold_file = f'./data/wrk/SVDPLS_fold{i:02d}.npz'
    dump_PLS(PLS, fold_file)

    # Progress report on folds 0, 5, 10, ...; the lap timer restarts after each.
    if i % 5 != 0:
        continue
    print(f"fold: {i+1:02d}, elapsed time: {tim.tocvalue():.1f}s")
    tim.tic()

print(f"total time={tim_tot.tocvalue():.1f}s")

In [None]:
# One archive per fold, in fold order. `np.load` on an .npz file returns a
# lazily-read archive object, so the arrays themselves are read on access.
PLS_list = [
    np.load(f'./data/wrk/SVDPLS_fold{fold:02d}.npz')
    for fold in range(n_fold)
]


In [None]:
# Search space: a single integer, the number of PLS components to keep.
space = [Integer(1, n_comp_max, name='n_components')]


@use_named_args(space)
def Comp_Model_Score(n_components):
    """Return the mean RMSE over all folds when using `n_components` components.

    For each fold of `tscv.split(X_train)` the stored model `PLS_list[k]`
    predicts the fold's test rows with its first `n_components` components,
    and the RMSE against `Y_train` on those rows is recorded.

    Parameters
    ----------
    n_components : int
        Number of leading components passed to `predict`.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.
    """
    fold_rmse = np.zeros(n_fold)
    for k, (_, held_out) in enumerate(tscv.split(X_train)):
        prediction = predict(PLS_list[k], X_train[held_out], n_components)
        fold_rmse[k] = RMSE(Y_train[held_out], prediction)
    return fold_rmse.mean()


In [None]:
# Evaluation budget: (floor(ln(n_comp_max)) + 1), doubled, but never below 10.
# NOTE(review): the value printed here is the count *before* doubling; the
# optimizer actually receives max(2 * that, 10).
base_calls = int(np.log(n_comp_max)) + 1
print(f"n_calls = {base_calls:d}")
n_calls = 2 * base_calls

# NOTE(review): if the installed skopt's default number of initial random
# points is 10, a budget of exactly 10 leaves no model-guided evaluations;
# confirm against the skopt version in use.
call_budget = max(n_calls, 10)

tim_tot.tic()
res_gp = gp_minimize(
    Comp_Model_Score,
    space,
    n_calls=call_budget,
    random_state=0,
    verbose=True,
)

separator = "-" * 40
print(separator)
print(f"Best param = {res_gp.x[0]:02d}")
print(f"Best score = {res_gp.fun:.4f}")
print(f"total time = {tim_tot.tocvalue():.1f}s")
