In [None]:
import sys
sys.path.append('../')

import numpy as np
import random
import torch
import gpytorch
from gp_mjo.gp_mjo_model import gp_mjo
from gp_mjo.utils.dat_ops import dics_divide

from matplotlib import pyplot as plt
import matplotlib.colors as mcolors

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Load every array stored in the .npz archive into `new_datas`, in the same
# order as `data_names`.
#
# The archive is opened with a context manager so the underlying file handle
# is closed as soon as the arrays have been read (indexing an NpzFile loads
# the array into memory, so the data stays valid after the file is closed).
#
# NOTE(review): allow_pickle=True permits unpickling object arrays, which can
# execute arbitrary code if the file is not trusted. It is kept here to avoid
# changing behaviour; drop it if the archive only holds plain numeric arrays.
with np.load('../data/mjo_new_data.npz', allow_pickle=True) as npzfile:
    data_names = npzfile.files
    n_files = len(data_names)
    new_datas = [npzfile[name] for name in data_names]

print(data_names)
print(new_datas)
print(new_datas[0].shape)
print(new_datas[3].shape)

['year', 'month', 'day', 'RMM1', 'RMM2', 'phase', 'amplitude']
[array([1979, 1979, 1979, ..., 2022, 2022, 2022]), array([ 1,  1,  1, ..., 11, 11, 11]), array([ 1,  2,  3, ..., 14, 15, 16]), array([ 0.142507  , -0.2042    , -0.15861   , ...,  0.63895518,
        0.71995491,  0.67032343]), array([ 1.05047   ,  1.37361   ,  1.53943   , ..., -0.39565107,
       -0.2206952 , -0.1839911 ]), array([6., 7., 7., ..., 4., 4., 4.]), array([1.0600899 , 1.3887    , 1.54758   , ..., 0.7515341 , 0.75302154,
       0.69511598])]
(16026,)
(16026,)


In [3]:
## Reproducibility
# The split point(s) in `n1s` below are drawn at random. Seed every RNG the
# notebook imports so that Restart & Run All yields the same splits each time.
# NOTE: outputs stored from earlier, unseeded runs will differ from a re-run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

## Set initial values
# These four values are passed straight to `gp_mjo(...)` in the evaluation cell.
widths = [40, 60, 80]  # candidate `width` values, one model evaluation per value
n_iter = 200  # presumably the number of training iterations -- confirm in gp_mjo
sigma_eps = 0.01  # presumably the observation-noise level -- confirm in gp_mjo
fixed_noise = True  # presumably keeps the noise fixed during training -- confirm

N = len(new_datas[0])  # the total number of days in the new dataset
n = 10000  # the number of days for training
c = 365  # the number of days dropped as a buffer
m = N - n - c  # the number of days for testing
if m <= 0:
    raise ValueError(
        f"No test days left: N={N}, n={n}, c={c} gives m={m}; reduce n or c."
    )

n_cv = 1  # the number of cross-validation repetitions
# Each n1 is the size of the first part of the training set; randint includes
# both endpoints, so n1 ranges over 0..n.
n1s = [random.randint(0, n) for _ in range(n_cv)]

palette_colors = list(mcolors.TABLEAU_COLORS.keys())  # list of Tableau Palette colors
plot_colors = palette_colors[:len(widths)]  # one colour per width

## Set the kernel of GP
nu = 0.5  # smoothness parameter of the Matern kernel (alternatives: 1.5, 2.5)
d = 1  # d = width or d = 1
kernel = gpytorch.kernels.MaternKernel(nu=nu, ard_num_dims=d)

In [4]:
# Sanity check: show the number of test days (m = N - n - c).
print(f"{m}")

5661


In [5]:
# Forecast horizon settings used by the evaluation cell.
# NOTE(review): `max_lead_time` is not referenced by the other visible cells,
# and with n_pred = 1 the constraint below gives m - max(widths), one less than
# this value -- confirm which is intended.
max_lead_time = m + 1 - max(widths)
lead_times = np.arange(1, 101)  # lead times to evaluate: 1..100
n_pred = 1000  # 14*365

# Constraint from the original author's note, now enforced instead of being
# left as a comment:  width + lead_time + n_pred - 1 <= m.
# Checked for the worst case (largest width, largest lead time).
_required_days = max(widths) + int(lead_times.max()) + n_pred - 1
if _required_days > m:
    raise ValueError(
        f"width + lead_time + n_pred - 1 = {_required_days} exceeds the "
        f"number of test days m = {m}; reduce widths, lead_times or n_pred."
    )

In [6]:
# Independent
# For every split point n1 and every width, build a gp_mjo model per lead time,
# train and predict on RMM1 and RMM2, then record the model's `cor_leadtime`
# and `rmse_leadtime`. Results are stored as {n1: {width: array over lead_times}}.
dics_total, cor_total, rmse_total = {}, {}, {}
target_names = ('RMM1', 'RMM2')
n_leads = len(lead_times)

for n1 in n1s:
    dics, dics_ids = dics_divide(new_datas, data_names, n1, m, n, c)
    dics_total[n1] = dics

    cor_width, rmse_width = {}, {}
    for width in widths:
        cor_lead = np.zeros(n_leads)
        rmse_lead = np.zeros(n_leads)

        for lead_idx, lead_time in enumerate(lead_times):
            # NOTE(review): the model is rebuilt and retrained for every lead
            # time although train_mjo() is not given the lead time. If training
            # is independent of it, this could be hoisted out of the loop --
            # confirm against gp_mjo before changing.
            # NOTE(review): the same `kernel` object is handed to every model;
            # verify that gp_mjo does not share learned hyperparameters across
            # trainings through it.
            mjo_model = gp_mjo(dics, dics_ids, kernel, width, n_iter, sigma_eps, fixed_noise)
            for data_name in target_names:
                mjo_model.train_mjo(data_name)
                mjo_model.pred_mjo(data_name, lead_time, n_pred)

            # cor() / rmse() are called before the *_leadtime attributes are
            # read; presumably they populate them.
            mjo_model.cor()
            mjo_model.rmse()
            cor_lead[lead_idx] = mjo_model.cor_leadtime
            rmse_lead[lead_idx] = mjo_model.rmse_leadtime

        cor_width[width] = cor_lead
        rmse_width[width] = rmse_lead

    cor_total[n1] = cor_width
    rmse_total[n1] = rmse_width



In [None]:
# Plot
# One figure per (n1, width): COR and RMSE as functions of forecast lead time.
for n1 in n1s:
    for width in widths:
        _cor = cor_total[n1][width]
        _rmse = rmse_total[n1][width]

        fig, ax = plt.subplots(1, 1, figsize=(14, 8))
        # Distinct colours so the two curves can be told apart; labels are
        # attached to the lines so the legend cannot drift out of order.
        ax.plot(lead_times, _cor, color='tab:red', linewidth=2.5, marker='+', label='COR')
        ax.plot(lead_times, _rmse, color='tab:blue', linewidth=2.5, marker='D', label='RMSE')

        ax.set_xlabel('Forecast lead time (days)', fontsize=14)
        ax.set_ylabel('COR / RMSE', fontsize=14)
        ax.legend(fontsize=14)
        ax.set_title(f'COR & RMSE for width = {width} and n1 = {n1}', fontsize=14)

        # Render each figure as it is produced.
        plt.show()


In [20]:
# Inspect the train split produced for the first split point.
n1 = n1s[0]
print(n1)

dics, dics_ids = dics_divide(
    new_datas, data_names, n1, m, n, c,
    fixed_start=False, start_index=None, width=None,
)

# Show the RMM1 values and the years covered by the two training parts.
for field, part in (('RMM1', 'train1'), ('year', 'train1'), ('year', 'train2')):
    print(dics[field][part])

1525
[ 0.142507   -0.2042     -0.15861    ... -0.76100999 -0.35888001
 -0.1133    ]
[1979 1979 1979 ... 1983 1983 1983]
[1999 1999 1999 ... 2022 2022 2022]
