# In [1]:
#!/usr/bin/python

import sys
import os
import numpy as np
import random
from random import randint
from random import shuffle
import time
import json
import pickle

import tensorflow as tf
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
from tensorflow.contrib import rnn

# local packages 
from utils_libs import *
from utils_training import *
from utils_inference import *
from mixture_models import *

# ------ GPU set-up in multi-GPU environment
# Order devices by PCI bus id and expose only device "0" to this process.
os.environ.update({"CUDA_DEVICE_ORDER": "PCI_BUS_ID",
                   "CUDA_VISIBLE_DEVICES": "0"})

# ----- data and log paths
# arg_py: tag appended to the prediction dumps and to the error-log file name
arg_py = "market1_tar10_len10"
# directory holding the pickled train / validation / test data
path_data = "../datasets/bitcoin/market1_tar10_len10/"
# directory prefix used for the model snapshots
path_model = "../results/m1_t10_1/"
# prefix of the pickled prediction files (written next to the snapshots)
path_py = "{}py_{}".format(path_model, arg_py)
# text log that training, validation and test results are appended to
path_log_error = "../results/log_error_mix_{}_1.txt".format(arg_py)

# ----- set-up

# -- model

# name of the target distribution, forwarded to the model as model_distr_type
para_distr_type = "log_normal_logOpt_linearComb"
# extra distribution parameters, forwarded as model_distr_para
para_distr_para = []
para_var_type = "exp" # square, exp
para_share_type_gate = "no_share"
# no_share, share, mix
para_model_type = 'linear'
# rnn, linear

# -- data

# Data-preparation switches depend on the model type:
#   para_x_src_padding      : pad all sources to a common shape before training
#   para_add_common_factor  : append one extra source built from all sources
#   para_common_factor_type : how that extra source is built ("" when disabled)
if para_model_type == 'rnn':
    para_x_src_padding = False
    para_add_common_factor = False
    para_common_factor_type = "pool" if para_add_common_factor else ""
    
elif para_model_type == 'linear':
    para_x_src_padding = True
    para_add_common_factor = False
    para_common_factor_type = "factor" if para_add_common_factor else ""
    
else:
    # fail here with a clear message instead of a NameError further down,
    # where the flags above would be read without ever being assigned
    raise ValueError("unsupported para_model_type: %r (expected 'rnn' or 'linear')" % (para_model_type,))

para_bool_target_seperate = False # [Note] if yes, the last source corresponds to the auto-regressive target variable
# axis names of the source data: [source, sample, time step, feature]
para_x_shape_acronym = ["src", "N", "T", "D"]

# -- training

# [Note] if the best epoch is close to "para_n_epoch", consider increasing "para_n_epoch".
# [Note] if the best epoch is around the middle of the training trajectory, the ensemble is expected to take effect.
para_n_epoch = 90
# forwarded to the model as optimization_burn_in_step
para_burn_in_epoch = 85
# 5% of the epochs, but never fewer than one
para_vali_snapshot_num = max(int(para_n_epoch*0.05), 1)
# number of epochs left after the burn-in
para_test_snapshot_num = para_n_epoch - para_burn_in_epoch
para_test_snapshot_sample_interval = 2

# hyper-parameter search strategy and budgets
para_hpara_search = "random" # random, grid
para_hpara_train_trial_num = 30
para_hpara_retrain_num = 10
para_hpara_ensemble_num = 3

# optimization
para_loss_type = "heter_lk_inv"
# options: "heter_lk_inv"
para_optimizer = "adam"
# options: RMSprop, adam, sgd, adamW,
#          sg_mcmc_RMSprop, sg_mcmc_adam
# [Note] for the sg_mcmc family, "para_n_epoch" could be set to higher values

# [Note] training heuristic: re-set the following when training on new data
# [Note] if lr_decay is on, "lr" and "para_n_epoch" can be set to higher values
para_optimizer_lr_decay = True
para_optimizer_lr_decay_epoch = 10 # after the warm-up
# [Note] when sg_mcmc is on, turn off the learning rate warm-up
# 10% of the epochs, but never fewer than one
para_optimizer_lr_warmup_epoch = max(int(para_n_epoch*0.1), 1)

# early stopping
para_early_stop_bool = False
para_early_stop_window = 0

# metric used to rank snapshots, and the position of every metric inside
# the metric lists [rmse, mae, mape, nnllk]
para_validation_metric = 'nnllk'
para_metric_map = {name: idx for idx, name in enumerate(('rmse', 'mae', 'mape', 'nnllk'))}

# regularization switches
para_regu_mean = True
para_regu_var = True
para_regu_gate = False
para_regu_mean_positive = False

# bias-term switches
para_bool_bias_in_mean = True
para_bool_bias_in_var = True
para_bool_bias_in_gate = True

# -- hpara: hyper parameter

# Search ranges, indexed as para_hpara_range[search type][model type][hyper-para name].
# Every range is a two-element list (presumably [lower, upper] bounds -- confirm
# against hyper_para_random_search). Only the 'random' search type is populated here.
para_hpara_range = {'random': {'linear': {}, 'rnn': {}}}

# - linear
# factor_size is only searched when the common-factor source is enabled
if para_add_common_factor == True:
    para_hpara_range['random']['linear']['factor_size'] = [10, 10]

# The l2 ranges are source-wise; the active values are the "tar10" setting.
# Other settings kept for reference:
#   tar1: l2_mean = [1e-1, 5e+1], l2_var = [1e-1, 5e+1]
#   tar5: l2_mean = [1e-1, 1e+1], l2_var = [1e-1, 1e+1]
para_hpara_range['random']['linear'].update([('lr', [1e-4, 5e-4]),
                                             ('batch_size', [10, 300]),
                                             ('l2_mean', [1e-1, 5e+0]),
                                             ('l2_var', [1e-1, 5e+0])])

# # - rnn (currently disabled)
# # source-wise
# para_hpara_range['random']['rnn']['rnn_size'] =  [16, 16]
# para_hpara_range['random']['rnn']['dense_num'] = [0, 3] # improper value leads to non-convergence in training

# para_hpara_range['random']['rnn']['lr'] = [0.001, 0.001]
# para_hpara_range['random']['rnn']['batch_size'] = [100, 140]

# # source-wise
# para_hpara_range['random']['rnn']['l2_mean'] = [1e-7, 1e-3]
# para_hpara_range['random']['rnn']['l2_var'] = [1e-7, 1e-3]
# if para_regu_gate == True:
#     para_hpara_range['random']['linear']['l2_gate'] = [1e-7, 1e-3]
    
# para_hpara_range['random']['rnn']['dropout_keep_prob'] = [0.7, 1.0]
# para_hpara_range['random']['rnn']['max_norm_cons'] = [0.0, 0.0]

# -- log
def log_train(path):
    '''
    Append a description of the current run set-up to a text log.
    
    Argu.:
      path: string, log file opened in append mode
      
    The values come from module-level names: the para_* settings, path_data,
    path_py, and para_steps_x / para_dim_x / src_ts_x, which are assigned in
    the main process before this function is called.
    '''
    with open(path, "a") as log_file:
        
        def put(template, value):
            # one formatted entry; each template carries its own trailing " \n"
            log_file.write(template % value)
            
        def gap():
            # blank line between groups of entries
            log_file.write("\n")
            
        log_file.write("\n\n ------ Bayesian mixture : \n")
        
        # data
        put("data source padding : %s \n", para_x_src_padding)
        put("data path : %s \n", path_data)
        put("data source timesteps : %s \n", para_steps_x)
        put("data source feature dimensionality : %s \n", para_dim_x)
        put("data source number : %d \n", len(src_ts_x))
        put("data common factor : %s \n", para_add_common_factor)
        put("data common factor type : %s \n", para_common_factor_type)
        put("prediction path : %s \n", path_py)
        gap()
        
        # model
        put("model type : %s \n", para_model_type)
        put("target distribution type : %s \n", para_distr_type)
        put("target distribution para. : %s \n", str(para_distr_para))
        put("target variable as a seperated data source : %s \n", para_bool_target_seperate)
        put("variance calculation type : %s \n", para_var_type)
        put("para. sharing in gate logit : %s \n", para_share_type_gate)
        gap()
        
        # regularization switches
        put("regularization on mean : %s \n", para_regu_mean)
        put("regularization on variance : %s \n", para_regu_var)
        put("regularization on mixture gates : %s \n", para_regu_gate)
        put("regularization on positive means : %s \n", para_regu_mean_positive)
        gap()
        
        # bias switches
        put("adding bias terms in mean : %s \n", para_bool_bias_in_mean)
        put("adding bias terms in variance : %s \n", para_bool_bias_in_var)
        put("adding bias terms in gates : %s \n", para_bool_bias_in_gate)
        gap()
        
        # optimization
        put("optimizer : %s \n", para_optimizer)
        put("loss type : %s \n", para_loss_type)
        put("learning rate decay : %s \n", str(para_optimizer_lr_decay))
        put("learning rate decay epoch : %s \n", str(para_optimizer_lr_decay_epoch))
        put("learning rate warm-up epoch : %s \n", str(para_optimizer_lr_warmup_epoch))
        gap()
        
        # hyper-parameter search
        put("hyper-para search : %s \n", para_hpara_search)
        put("hyper-para range : %s \n", str(para_hpara_range[para_hpara_search][para_model_type]))
        put("hyper-para training trial num : %s \n", str(para_hpara_train_trial_num))
        put("hyper-para retraining num.: %s \n", str(para_hpara_retrain_num))
        put("random seed ensemble num.: %s \n", str(para_hpara_ensemble_num))
        gap()
        
        # epochs, snapshots and early stopping
        put("epochs in total : %s \n", para_n_epoch)
        put("burn_in_epoch : %s \n", para_burn_in_epoch)
        put("num. snapshots in validating : %s \n", para_vali_snapshot_num)
        put("num. snapshots in testing : %s \n", para_test_snapshot_num)
        put("validation metric : %s \n", para_validation_metric)
        put("early-stoping : %s \n", para_early_stop_bool)
        put("early-stoping look-back window : %s \n", para_early_stop_window)
        
        log_file.write("\n\n")

# ----- training and evaluation
    
def training_validating(xtr,
                        ytr,
                        xval,
                        yval,
                        dim_x,
                        steps_x,
                        hyper_para_dict,
                        training_dict,
                        retrain_top_steps, 
                        retrain_bayes_steps,
                        retrain_bool,
                        retrain_idx,
                        random_seed):
    '''
    Build a fresh model, train it for up to "para_n_epoch" epochs and record
    the training / validation metrics together with the epoch they belong to.
    
    Argu.:
      xtr: [num_src, N, T, D]
         S: num_src
         N: number of data samples
         T: number of steps
         D: dimension at each time step
      ytr: [N 1]
      xval, yval: validation data, same layout as xtr and ytr
        
      dim_x: corresponding to D, forwarded to the model as x_dim
      steps_x: corresponding to T, forwarded to the model as x_steps
      
      hyper_para_dict: 
       "batch_size": read here (cast to int);
       the whole dict is forwarded to the model, which reads the other entries
       
      training_dict:
       "batch_per_epoch": int, used for the warm-up step count
       "tr_num_ins": number of training instances given to the batch loader
       
      retrain_top_steps, retrain_bayes_steps: forwarded to model.model_saver
      retrain_bool: if True, report the snapshots saved by model.model_saver
      retrain_idx: integer, part of the snapshot file name
      random_seed: seed for python, numpy and tensorflow random generators
      
    Return:
      sort_step_error: list of [epoch, tr_metric, val_metric, epoch], sorted in
        ascending order of the validation metric chosen by "para_validation_metric"
      average wall-clock training time per epoch, in seconds
    '''
    # Clear the default graph BEFORE entering the device scope: the scope is
    # attached to the graph that is default at that moment, so resetting the
    # graph inside the scope would leave the new graph without it.
    tf.reset_default_graph()
    
    print('----- test ', np.shape(ytr))
    
    with tf.device('/device:GPU:0'):
        
        # fix the random seed to stabilize the network
        os.environ['PYTHONHASHSEED'] = str(random_seed)
        random.seed(random_seed)  # `python` built-in pseudo-random generator
        np.random.seed(random_seed)
        tf.set_random_seed(random_seed)
        
        # session set-up
        config = tf.ConfigProto()
        config.allow_soft_placement = True
        config.gpu_options.allow_growth = True
        sess = tf.Session(config = config)
        
        # this function runs once per trial and per re-train; close the
        # session on every exit path so sessions do not pile up
        try:
            model = mixture_statistic(session = sess, 
                                      loss_type = para_loss_type,
                                      num_src = len(xtr),
                                      hyper_para_dict = hyper_para_dict, 
                                      model_type = para_model_type)
            
            # -- initialize the network
            model.network_ini(hyper_para_dict,
                              x_dim = dim_x,
                              x_steps = steps_x, 
                              x_bool_common_factor = para_add_common_factor,
                              y_dim = np.shape(ytr)[1],
                              model_type = para_model_type, 
                              model_distr_type = para_distr_type,
                              model_distr_para = para_distr_para,
                              model_var_type = para_var_type,
                              model_para_share_type = para_share_type_gate,
                              bool_regu_mean = para_regu_mean,
                              bool_regu_var = para_regu_var,
                              bool_regu_gate = para_regu_gate,
                              bool_regu_positive_mean = para_regu_mean_positive,
                              bool_bias_mean = para_bool_bias_in_mean,
                              bool_bias_var = para_bool_bias_in_var,
                              bool_bias_gate = para_bool_bias_in_gate,
                              optimization_method = para_optimizer,
                              optimization_lr_decay = para_optimizer_lr_decay,
                              optimization_lr_decay_steps = para_optimizer_lr_decay_epoch*int(len(xtr[0])/int(hyper_para_dict["batch_size"])),
                              optimization_burn_in_step = para_burn_in_epoch,
                              optimization_warmup_step = para_optimizer_lr_warmup_epoch*training_dict["batch_per_epoch"] - 1)
            
            # !! the order of Saver
            # The saver is created after network_ini() and before train_ini() /
            # inference_ini(); presumably so that it only covers the variables
            # that exist at this point. Keep this order.
            saver = tf.train.Saver(max_to_keep = None)
            
            model.train_ini()
            model.inference_ini()
            #tf.get_default_graph().finalize()
            
            # -- set up training batch parameters
            batch_gen = data_loader(x = xtr,
                                    y = ytr,
                                    batch_size = int(hyper_para_dict["batch_size"]), 
                                    num_ins = training_dict["tr_num_ins"],  
                                    num_src = len(xtr))
            # -- begin training
            
            # training and validation error log
            step_error = []
            global_step = 0
            # defined up-front so the epoch-wise report also works when an
            # epoch yields no batch
            monitor_metric = None
            epochs_run = 0
            
            # training time counter
            st_time = time.time()
            
            for epoch in range(para_n_epoch):
                epochs_run = epoch + 1
                
                # shuffle training instances each epoch
                batch_gen.re_shuffle()
                batch_x, batch_y, bool_last = batch_gen.one_batch()
                
                # loop over all batches; identity test, because "!= None"
                # on array-like batches compares element-wise
                while batch_x is not None:
                    
                    # one-step training on a batch of training data
                    # NOTE(review): the epoch index, not the batch counter, is
                    # passed as global_step -- confirm this is what train_batch expects
                    model.train_batch(batch_x, 
                                      batch_y,
                                      global_step = epoch)
                    
                    # - batch-wise validation
                    # val_metric: [val_rmse, val_mae, val_mape, val_nnllk]
                    # nnllk: normalized negative log likelihood
                    val_metric, monitor_metric = model.validation(xval,
                                                                  yval,
                                                                  step = global_step,
                                                                  bool_end_of_epoch = bool_last)
                    if val_metric:
                        # tr_metric [tr_rmse, tr_mae, tr_mape, tr_nnllk]
                        tr_metric, _ = model.inference(xtr,
                                                       ytr, 
                                                       bool_py_eval = False)
                        #step_error.append([global_step, tr_metric, val_metric, epoch])
                        step_error.append([epoch, tr_metric, val_metric, epoch])
                        
                    # - next batch
                    batch_x, batch_y, bool_last = batch_gen.one_batch()
                    global_step += 1
                    
                # -- model saver 
                snapshot_path = path_model + para_model_type + '_' + str(retrain_idx) + '_' + str(epoch)
                model_saver_flag = model.model_saver(path = snapshot_path,
                                                     epoch = epoch,
                                                     step = global_step,
                                                     top_snapshots = retrain_top_steps,
                                                     bayes_snapshots = retrain_bayes_steps,
                                                     early_stop_bool = para_early_stop_bool,
                                                     early_stop_window = para_early_stop_window, 
                                                     tf_saver = saver)
                # epoch-wise
                latest_error = step_error[-1] if step_error else None
                print("\n --- At epoch %d : \n  %s "%(epoch, str(latest_error)))
                print("\n   loss and regualization : \n", monitor_metric)
                
                # NAN value exception 
                if monitor_metric is not None and np.isnan(monitor_metric[0]):
                    print("\n --- NAN loss !! \n" )
                    break
                    
                if retrain_bool and model_saver_flag is not None:
                    print("\n    [MODEL SAVED] " + model_saver_flag + " \n " + snapshot_path)
                    
            ed_time = time.time()
            
        finally:
            sess.close()
        
    # step_error: [epoch, tr_metric, val_metric, epoch]
    # sort step_error based on para_validation_metric
    metric_idx = para_metric_map[para_validation_metric]
    sort_step_error = sorted(step_error, key = lambda x:x[2][metric_idx])
    
    # average over the number of epochs actually run (an early NAN break
    # counts the interrupted epoch); max() guards the zero-epoch case
    return sort_step_error, 1.0*(ed_time - st_time)/max(epochs_run, 1)

# ----- main process  

if __name__ == '__main__':
    
    # ------ data
    
    import pickle
    tr_dta = pickle.load(open(path_data + 'train_dese.p', "rb"), encoding = 'latin1')
    val_dta = pickle.load(open(path_data + 'val_dese.p', "rb"), encoding = 'latin1')
    ts_dta = pickle.load(open(path_data + 'test_dese.p', "rb"), encoding = 'latin1')
    print(len(tr_dta), len(val_dta), len(ts_dta))
    
    # if para_bool_target_seperate = yes, the last source corresponds to the auto-regressive target variable
    tr_x, tr_y = data_reshape(tr_dta, 
                              bool_target_seperate = para_bool_target_seperate)
    val_x, val_y = data_reshape(val_dta,
                                bool_target_seperate = para_bool_target_seperate)
    ts_x, ts_y = data_reshape(ts_dta,
                              bool_target_seperate = para_bool_target_seperate)
    
    # --- log transformation of y
        
    # output from the reshape
    # y [N 1], x [S [N T D]]
    print("training: ", len(tr_x[0]), len(tr_y))
    print("validation: ", len(val_x[0]), len(val_y))
    print("testing: ", len(ts_x[0]), len(ts_y))
    
    # --- source-wise data preparation 

    if para_x_src_padding == True:
        # T and D different across data sources
        # padding to same T and D
        # y: [N 1], x: [S [N T D]]
        src_tr_x = data_padding_x(tr_x,
                                  num_src = len(tr_x))
        src_val_x = data_padding_x(val_x,
                                   num_src = len(tr_x))
        src_ts_x = data_padding_x(ts_x,
                                  num_src = len(tr_x))
        print("Shapes after padding: ", np.shape(src_tr_x), np.shape(src_val_x), np.shape(src_ts_x))
        
    else:
        src_tr_x = tr_x
        src_val_x = val_x
        src_ts_x = ts_x
        
    if para_add_common_factor == True:
        # x: [S [N T D]]
        # assume T is same across data sources
        
        # [N T sum(D)]
        tr_x_concat = np.concatenate(tr_x, -1)
        val_x_concat = np.concatenate(val_x, -1)
        ts_x_concat = np.concatenate(ts_x, -1)
        
        if para_common_factor_type == "pool":
            tr_x_factor = tr_x_concat
            val_x_factor = val_x_concat
            ts_x_factor = ts_x_concat
            
        elif para_common_factor_type == "factor":
            tmp_dim = np.shape(tr_x_concat)[-1]
            tmp_step = np.shape(tr_x_concat)[1]
            
            from sklearn.decomposition import FactorAnalysis
            transformer = FactorAnalysis(n_components = 10, 
                                         random_state = 0)
            # [N T d]
            tr_x_factor = []
            for tmp_x in tr_x_concat:
                # tmp_x: [T sum(D)] -> [T d]
                tr_x_factor.append(transformer.fit_transform(tmp_x))
                
            val_x_factor = []
            for tmp_x in val_x_concat:
                # tmp_x: [T sum(D)] -> [T d]
                val_x_factor.append(transformer.fit_transform(tmp_x))
            
            ts_x_factor = []
            for tmp_x in ts_x_concat:
                # tmp_x: [T sum(D)] -> [T d]
                ts_x_factor.append(transformer.fit_transform(tmp_x))
        
        # [S+1 [N T d]]
        src_tr_x.append(np.asarray(tr_x_factor))
        src_val_x.append(np.asarray(val_x_factor))
        src_ts_x.append(np.asarray(ts_x_factor))
    
    # steps and dimensionality of each source
    para_steps_x = []
    para_dim_x = []
    for tmp_src in range(len(src_tr_x)):
        tmp_shape = np.shape(src_tr_x[tmp_src][0])
        para_steps_x.append(tmp_shape[0])
        para_dim_x.append(tmp_shape[1])
        print("src " + str(tmp_src) + " shape: ", tmp_shape)
    
    shape_tr_x_dict = dict({"N": len(tr_x[0])})
    
    # ------ training and validation
    
    log_train(path_log_error)
    
    # -- hyper-para generator 
    if para_hpara_search == "random":        
        hpara_generator = hyper_para_random_search(para_hpara_range[para_hpara_search][para_model_type], 
                                                   para_hpara_train_trial_num)
    elif para_hpara_search == "grid":
        hpara_generator = hyper_para_grid_search(para_hpara_range[para_hpara_search][para_model_type])
            
    # -- begin hyper-para search
    hpara_log = []
    
    # sample one set-up of hyper-para
    hpara_dict = hpara_generator.one_trial()
                                                 
    while hpara_dict != None:
        
        tr_dict = training_para_gen(shape_x_dict = shape_tr_x_dict, 
                                    hpara_dict = hpara_dict)
        # hp_: hyper-parameter
        # hp_step_error: [[step, train_metric, val_metric, epoch]]
        hp_step_error, hp_epoch_time = training_validating(src_tr_x,
                                                           tr_y,
                                                           src_val_x,
                                                           val_y,
                                                           dim_x = para_dim_x,
                                                           steps_x = para_steps_x,
                                                           hyper_para_dict = hpara_dict,
                                                           training_dict = tr_dict,
                                                           retrain_bool = False,
                                                           retrain_top_steps = [],
                                                           retrain_bayes_steps = [],
                                                           retrain_idx = 0,
                                                           random_seed = 1)
        
        #[ dict{lr, batch, l2, ..., burn_in_steps}, [[step, tr_metric, val_metric, epoch]] ]
        hpara_dict["burn_in_steps"] = para_burn_in_epoch # tr_dict["batch_per_epoch"] - 1
        hpara_log.append([hpara_dict, hp_step_error])
        
        # -- prepare for the next trial
        
        # sample the next hyper-para
        hpara_dict = hpara_generator.one_trial()
        
        # -- logging
        log_train_val_performance(path_log_error,
                                  hpara = hpara_log[-1][0],
                                  hpara_error = hpara_log[-1][1][0],
                                  train_time = hp_epoch_time)
        # NAN loss exception
        log_null_loss_exception(hp_step_error, 
                                path_log_error)
        
        print('\n Validation performance under the hyper-parameters: \n', hpara_log[-1][0], hpara_log[-1][1][0])
        print('\n Training time: \n', hp_epoch_time, '\n')
        
    # ------ re-train
    #save all epoches in re-training, then select snapshots
    
    # best hyper-para
    best_hpara, _, _, _, _ = hyper_para_selection(hpara_log, 
                                                  val_snapshot_num = para_vali_snapshot_num, 
                                                  test_snapshot_num = para_test_snapshot_num,
                                                  metric_idx = para_metric_map[para_validation_metric])
    retrain_hpara_steps = []
    retrain_hpara_step_error = []
    retrain_random_seeds = [1] + [randint(0, 1000) for _ in range(para_hpara_retrain_num-1)]
    
    for tmp_retrain_id in range(para_hpara_retrain_num):
        
        tr_dict = training_para_gen(shape_x_dict = shape_tr_x_dict,
                                    hpara_dict = best_hpara)
        
        step_error, _ = training_validating(src_tr_x,
                                            tr_y,
                                            src_val_x,
                                            val_y,
                                            dim_x = para_dim_x,
                                            steps_x = para_steps_x,
                                            hyper_para_dict = best_hpara,
                                            training_dict = tr_dict,
                                            retrain_bool = True,
                                            retrain_top_steps = list(range(para_n_epoch)), #top_steps,
                                            retrain_bayes_steps = list(range(para_n_epoch)), #bayes_steps,
                                            retrain_idx = tmp_retrain_id,
                                            random_seed = retrain_random_seeds[tmp_retrain_id])
        
        top_steps, bayes_steps, top_steps_features, bayes_steps_features, val_error, step_error_pairs = snapshot_selection(train_log = step_error,
                                                                                                                           snapshot_num = para_test_snapshot_num,
                                                                                                                           total_step_num = para_n_epoch,
                                                                                                                           metric_idx = para_metric_map[para_validation_metric],
                                                                                                                           val_snapshot_num = para_vali_snapshot_num)
        if len(top_steps) != 0:
            retrain_hpara_steps.append([top_steps, bayes_steps, top_steps_features, bayes_steps_features, tmp_retrain_id, val_error])
            retrain_hpara_step_error.append([step_error_pairs, tmp_retrain_id])
        
        log_val_hyper_para(path = path_log_error,
                           hpara_tuple = [best_hpara, top_steps],
                           error_tuple = step_error[0], 
                           log_string = "-- " + str(tmp_retrain_id))
    
        print('\n----- Retrain hyper-parameters: ', best_hpara, top_steps, '\n')
        print('\n----- Retrain validation performance: ', step_error[0], '\n')
    
    sort_retrain_hpara_steps = sorted(retrain_hpara_steps, 
                                      key = lambda x:x[-1])
    
    log_test_performance(path = path_log_error, 
                         error_tuple = [i[-2:] for i in sort_retrain_hpara_steps], 
                         ensemble_str = "Retrain Ids and Vali. Errors: ")
    
    log_test_performance(path = path_log_error, 
                         error_tuple = [i[-2:] for i in sort_retrain_hpara_steps[:para_hpara_ensemble_num]], 
                         ensemble_str = "Retrain Ids for ensemble: ")
    
    # ------ testing
    # error tuple: [rmse, mae, mape, nnllk]
    # py_tuple
    
    # -- one snapshot from one retrain
    error_tuple, py_tuple = testing(retrain_snapshots = [sort_retrain_hpara_steps[0][0][:1]],
                                    retrain_ids = [ sort_retrain_hpara_steps[0][-2] ],
                                    xts = src_ts_x, 
                                    yts = ts_y, 
                                    file_path = path_model, 
                                    bool_instance_eval = True,
                                    loss_type = para_loss_type,
                                    num_src = len(src_val_x),
                                    snapshot_features = [],
                                    hpara_dict = best_hpara, 
                                    para_model_type = para_model_type, 
                                    para_loss_type = para_loss_type)
    log_test_performance(path = path_log_error, 
                         error_tuple = [error_tuple], 
                         ensemble_str = "One-shot-one-retrain")
    # dump predictions
    pickle.dump(py_tuple, open(path_py + "_one_one" + ".p", "wb"))
    
    # -- one snapshot from multi retrain
    error_tuple, py_tuple = testing(retrain_snapshots = [tmp_steps[0][:1] for tmp_steps in sort_retrain_hpara_steps], 
                                    retrain_ids = [i[-2] for i in sort_retrain_hpara_steps[:para_hpara_ensemble_num]],
                                    xts = src_ts_x,
                                    yts = ts_y, 
                                    file_path = path_model,
                                    bool_instance_eval = True,
                                    loss_type = para_loss_type,
                                    num_src = len(src_ts_x), 
                                    snapshot_features = [], 
                                    hpara_dict = best_hpara, 
                                    para_model_type = para_model_type, 
                                    para_loss_type = para_loss_type)
    log_test_performance(path = path_log_error, 
                         error_tuple = [error_tuple], 
                         ensemble_str = "One-shot-multi-retrain")
    # dump predictions
    pickle.dump(py_tuple, open(path_py + "_one_multi" + ".p", "wb"))
    
    # -- top snapshots from one retrain
    error_tuple, py_tuple = testing(retrain_snapshots = [sort_retrain_hpara_steps[0][0]], 
                                    retrain_ids = [ sort_retrain_hpara_steps[0][-2] ], 
                                    xts = src_ts_x, 
                                    yts = ts_y, 
                                    file_path = path_model,
                                    bool_instance_eval = True, 
                                    loss_type = para_loss_type, 
                                    num_src = len(src_ts_x), 
                                    snapshot_features = [], 
                                    hpara_dict = best_hpara, 
                                    para_model_type = para_model_type, 
                                    para_loss_type = para_loss_type)
    log_test_performance(path = path_log_error,
                         error_tuple = [error_tuple],
                         ensemble_str = "Top-shots-one-retrain")
    # dump predictions
    pickle.dump(py_tuple, open(path_py + "_top_one" + ".p", "wb"))
    
    # -- top snapshots multi retrain
    error_tuple, py_tuple = testing(retrain_snapshots = [tmp_steps[0] for tmp_steps in sort_retrain_hpara_steps], 
                                    retrain_ids = [i[-2] for i in sort_retrain_hpara_steps[:para_hpara_ensemble_num]], 
                                    xts = src_ts_x,
                                    yts = ts_y,
                                    file_path = path_model,
                                    bool_instance_eval = True,
                                    loss_type = para_loss_type,
                                    num_src = len(src_ts_x), 
                                    snapshot_features = [], 
                                    hpara_dict = best_hpara, 
                                    para_model_type = para_model_type, 
                                    para_loss_type = para_loss_type)
    log_test_performance(path = path_log_error, 
                         error_tuple = [error_tuple], 
                         ensemble_str = "Top-shots-multi-retrain")
    # dump predictions
    pickle.dump(py_tuple, open(path_py + "_top_multi" + ".p", "wb"))
    
    # -- bayesian snapshots one retrain
    # Only the first entry of sort_retrain_hpara_steps is evaluated: its element [1]
    # is passed as the snapshot list and its element [-2] as the retrain id.
    error_tuple, py_tuple = testing(retrain_snapshots = [sort_retrain_hpara_steps[0][1]],
                                    retrain_ids = [sort_retrain_hpara_steps[0][-2]],
                                    xts = src_ts_x,
                                    yts = ts_y,
                                    file_path = path_model,
                                    bool_instance_eval = True,
                                    loss_type = para_loss_type,
                                    num_src = len(src_ts_x),
                                    snapshot_features = [],
                                    hpara_dict = best_hpara,
                                    para_model_type = para_model_type,
                                    para_loss_type = para_loss_type)
    log_test_performance(path = path_log_error,
                         error_tuple = [error_tuple],
                         ensemble_str = "Bayesian-one-retrain")
    # dump predictions
    # The context manager closes the file deterministically, even if pickling raises.
    with open(path_py + "_bayes_one" + ".p", "wb") as py_file:
        pickle.dump(py_tuple, py_file)
    
    # -- bayesian snapshots multi retrain
    # Element [1] of every entry in sort_retrain_hpara_steps is passed as that
    # retrain's snapshot list; element [-2] of the first para_hpara_ensemble_num
    # entries is passed as the retrain ids.
    # NOTE(review): retrain_snapshots covers the whole list while retrain_ids is
    # sliced to para_hpara_ensemble_num, so the two can differ in length if the
    # list is longer -- confirm how testing() pairs them.
    error_tuple, py_tuple = testing(retrain_snapshots = [tmp_steps[1] for tmp_steps in sort_retrain_hpara_steps],
                                    retrain_ids = [i[-2] for i in sort_retrain_hpara_steps[:para_hpara_ensemble_num]],
                                    xts = src_ts_x,
                                    yts = ts_y,
                                    file_path = path_model,
                                    bool_instance_eval = True,
                                    loss_type = para_loss_type,
                                    num_src = len(src_ts_x),
                                    snapshot_features = [],
                                    hpara_dict = best_hpara,
                                    para_model_type = para_model_type,
                                    para_loss_type = para_loss_type)
    log_test_performance(path = path_log_error,
                         error_tuple = [error_tuple],
                         ensemble_str = "Bayesian-multi-retrain")
    # dump predictions
    # The context manager closes the file deterministically, even if pickling raises.
    with open(path_py + "_bayes_multi" + ".p", "wb") as py_file:
        pickle.dump(py_tuple, py_file)
    
    # -- global top1 and topK steps

    # Ask for para_test_snapshot_num * para_hpara_ensemble_num steps in total,
    # selected from retrain_hpara_step_error by global_top_steps_multi_retrain.
    retrain_ids, retrain_id_steps = global_top_steps_multi_retrain(retrain_step_error = retrain_hpara_step_error,
                                                                   num_step = int(para_test_snapshot_num*para_hpara_ensemble_num))
    # Log which retrain ids and steps were selected before evaluating them.
    log_test_performance(path = path_log_error,
                         error_tuple = [retrain_ids, retrain_id_steps],
                         ensemble_str = "Global-top-steps: ")

    error_tuple, py_tuple = testing(retrain_snapshots = retrain_id_steps,
                                    retrain_ids = retrain_ids,
                                    xts = src_ts_x,
                                    yts = ts_y,
                                    file_path = path_model,
                                    bool_instance_eval = True,
                                    loss_type = para_loss_type,
                                    num_src = len(src_ts_x),
                                    snapshot_features = [],
                                    hpara_dict = best_hpara,
                                    para_model_type = para_model_type,
                                    para_loss_type = para_loss_type)
    log_test_performance(path = path_log_error,
                         error_tuple = [error_tuple],
                         ensemble_str = "Global-top-steps-multi-retrain ")
    # dump predictions
    # The context manager closes the file deterministically, even if pickling raises.
    with open(path_py + "_global" + ".p", "wb") as py_file:
        pickle.dump(py_tuple, py_file)
    

  np.random.seed(1)




12021 1716 3434
src 0 :  (12021, 9, 6)
src 1 :  (12021, 9, 6)
src 2 :  (12021, 9, 13)
src 3 :  (12021, 9, 13)
src 0 :  (1716, 9, 6)
src 1 :  (1716, 9, 6)
src 2 :  (1716, 9, 13)
src 3 :  (1716, 9, 13)
src 0 :  (3434, 9, 6)
src 1 :  (3434, 9, 6)
src 2 :  (3434, 9, 13)
src 3 :  (3434, 9, 13)
training:  12021

  tmpx.append(np.asarray([tmp[2][src_idx] for tmp in data]))
  print("src " + str(src_idx) + " : ", np.shape(tmpx[-1]))
  tmpy = np.asarray([tmp[0] for tmp in data])
  if len(np.shape(tmpy)) == 1:
  max_dim_t =  max([np.shape(x[i][0])[0] for i in range(num_src)])
  max_dim_d =  max([np.shape(x[i][0])[1] for i in range(num_src)])
  zero_mask = np.zeros(target_shape)
  tmp_t = np.shape(x[tmp_src][0])[0]
  tmp_d = np.shape(x[tmp_src][0])[1]


 12021
validation:  1716 1716
testing:  3434 3434
Shapes after padding:  (4, 12021, 9, 13) (4, 1716, 9, 13) (4, 3434, 9, 13)
src 0 shape:  (9, 13)
src 1 shape:  (9, 13)
src 2 shape:  (9, 13)
src 3 shape:  (9, 13)
----- test  (12021, 3)


  np.random.seed(100)
  tmp_hpara = tmp_hpara + (i[0] + (i[1] - i[0])*np.random.random(), )
  tr_dict["batch_per_epoch"] = int(np.ceil(1.0*shape_x_dict["N"]/int(hpara_dict["batch_size"])))





The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.






  lk_src = tf.exp(-0.5*tf.square(self.log_y - mean_stack)*inv_var_stack)*tf.sqrt(0.5/np.pi*inv_var_stack)
  tmp_lk_src = tf.exp(-0.5*tf.square(self.log_y - log_py_mean_src)*log_py_var_src_inv)*tf.sqrt(0.5/np.pi*log_py_var_src_inv)/(1.0*self.y+1e-5)




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



  np.random.seed(1)
  self.num_batch = int(np.ceil(1.0*num_ins/int(batch_size)))
  np.random.shuffle(self.ids)
  np.random.shuffle(self.ids)




 --- At epoch 0 : 
  [0, [inf, inf, inf, 1.1162723], [inf, inf, inf, 1.1020833], 0] 

   loss and regualization : 
 [1.5575769, 4.2385535, 7.7926545, array([[0.29959998, 0.24651441, 0.20214471, 0.2517409 ],
       [0.2640678 , 0.26146352, 0.25563344, 0.2188352 ],
       [0.21293578, 0.17082335, 0.40057388, 0.21566696]], dtype=float32)]





 --- At epoch 1 : 
  [1, [inf, inf, inf, 1.0931559], [inf, inf, inf, 1.0839709], 1] 

   loss and regualization : 
 [1.5394647, 3.528108, 6.4443808, array([[0.3050458 , 0.25058392, 0.21207379, 0.23229648],
       [0.26399258, 0.2673511 , 0.2655235 , 0.20313281],
       [0.21248761, 0.17432451, 0.4126369 , 0.20055102]], dtype=float32)]

 --- At epoch 2 : 
  [2, [inf, inf, inf, 1.0629119], [inf, inf, inf, 1.0600338], 2] 

   loss and regualization : 
 [1.5155182, 2.5917635, 4.6826406, array([[0.30856806, 0.25250974, 0.22533025, 0.213592  ],
       [0.26146752, 0.2736724 , 0.27880806, 0.18605202],
       [0.21094747, 0.17785428, 0.42262033, 0.1885779 ]], dtype=float32)]

 --- At epoch 3 : 
  [3, [inf, inf, inf, 1.0337036], [inf, inf, inf, 1.0369582], 3] 

   loss and regualization : 
 [1.49243, 1.6686708, 2.9663005, array([[0.31002483, 0.24989961, 0.24125364, 0.19882196],
       [0.25727877, 0.27723575, 0.296546  , 0.16893947],
       [0.2082895 , 0.17815395, 0.43402538, 0.17953116]], dt

  if np.isnan(i[1][0]) == True:



 --- At epoch 0 : 
  [0, [inf, inf, inf, 1.0978049], [inf, inf, inf, 1.0859071], 0] 

   loss and regualization : 
 [1.5414149, 0.6601222, 0.4126157, array([[0.3009503 , 0.24665223, 0.20524228, 0.24715522],
       [0.26403186, 0.26195216, 0.25862607, 0.21538985],
       [0.21251683, 0.17082106, 0.40482998, 0.21183217]], dtype=float32)]

 --- At epoch 1 : 
  [1, [inf, inf, inf, 1.037639], [inf, inf, inf, 1.0385056], 1] 

   loss and regualization : 
 [1.4940296, 0.5049215, 0.31328523, array([[0.31491318, 0.2459139 , 0.22048616, 0.21868674],
       [0.26581797, 0.26591817, 0.27550787, 0.19275598],
       [0.21196677, 0.17235078, 0.4246926 , 0.1909899 ]], dtype=float32)]

 --- At epoch 2 : 
  [2, [inf, inf, inf, 0.9763019], [inf, inf, inf, 0.9936899], 2] 

   loss and regualization : 
 [1.4491948, 0.32350382, 0.19615994, array([[0.3406417 , 0.22601517, 0.24192528, 0.19141787],
       [0.26858166, 0.25716555, 0.30321464, 0.17103814],
       [0.21056287, 0.16490519, 0.44794047, 0.17659155]

  hp_err.append([hp_epoch_err[0], hp_epoch_err[1], np.mean([k[2][metric_idx] for k in hp_epoch_err[1][:val_snapshot_num]])])


INFO:tensorflow:../results/m1_t10_1/linear_0_0 is not in all_model_checkpoint_paths. Manually adding it.

 --- At epoch 0 : 
  [0, [inf, inf, inf, 1.1112221], [inf, inf, inf, 1.0980334], 0] 

   loss and regualization : 
 [1.5535271, 4.272368, 8.544281, array([[0.30061316, 0.24687995, 0.20490853, 0.24759838],
       [0.26387104, 0.2621609 , 0.25826064, 0.21570742],
       [0.21251132, 0.1710502 , 0.40428343, 0.21215507]], dtype=float32)]

    [MODEL SAVED] best_snapshots 
 ../results/m1_t10_1/linear_0_0
INFO:tensorflow:../results/m1_t10_1/linear_0_1 is not in all_model_checkpoint_paths. Manually adding it.

 --- At epoch 1 : 
  [1, [inf, inf, inf, 1.0783597], [inf, inf, inf, 1.0723407], 1] 

   loss and regualization : 
 [1.5278312, 3.1608121, 6.254539, array([[0.30768397, 0.25213563, 0.21859354, 0.22158687],
       [0.26323414, 0.27037665, 0.27201456, 0.19437468],
       [0.21170452, 0.17591022, 0.42011085, 0.19227442]], dtype=float32)]

    [MODEL SAVED] best_snapshots 
 ../results/m

  val_error = np.mean([tmp_step[2][metric_idx] for tmp_step in train_log][:val_snapshot_num])


INFO:tensorflow:../results/m1_t10_1/linear_1_0 is not in all_model_checkpoint_paths. Manually adding it.

 --- At epoch 0 : 
  [0, [inf, inf, inf, 1.1654652], [3.9402116e+16, 951176200000000.0, 677822660000.0, 1.2355559], 0] 

   loss and regualization : 
 [1.6911153, 4.1361837, 8.874229, array([[0.24867454, 0.32556757, 0.16998473, 0.2557732 ],
       [0.30574968, 0.30484667, 0.23789275, 0.15151091],
       [0.24045463, 0.3357109 , 0.14494316, 0.27889132]], dtype=float32)]

    [MODEL SAVED] best_snapshots 
 ../results/m1_t10_1/linear_1_0
INFO:tensorflow:../results/m1_t10_1/linear_1_1 is not in all_model_checkpoint_paths. Manually adding it.

 --- At epoch 1 : 
  [1, [inf, inf, inf, 1.1274269], [263251700.0, 6363876.5, 4555.5923, 1.1877048], 1] 

   loss and regualization : 
 [1.6432158, 3.0519514, 6.5505633, array([[0.24863535, 0.31699982, 0.19723715, 0.23712772],
       [0.29593816, 0.29940724, 0.2626983 , 0.14195628],
       [0.23788635, 0.33205608, 0.16092008, 0.2691374 ]], dtype=f

  m_src_sample = np.asarray(self.py_mean_src_samples)
  v_src_sample = np.asarray(self.py_var_src_samples)
  g_src_sample = np.asarray(self.py_gate_src_samples)
  m_sample = np.asarray(self.py_mean_samples)
  v_sample = np.asarray(self.py_var_samples)
  lk_sample = np.asarray(self.py_lk_samples)
  y_ori = np.asarray([tmp[0] for tmp in y])
  y_z   = np.asarray([tmp[1] for tmp in y])
  y_dese = np.asarray([tmp[2] for tmp in y])
  bayes_mean = np.mean(np.squeeze(m_sample, -1), axis = 0)
  var_plus_sq_mean = np.squeeze(v_sample + m_sample**2, -1)
  bayes_var_total = np.mean(var_plus_sq_mean, 0) - sq_mean
  bayes_var_data = np.mean(np.squeeze(v_sample, -1), 0)
  bayes_var_model = np.mean(np.squeeze(m_sample**2, -1), 0) - sq_mean
  nnllk = np.mean(-1.0*np.log(np.mean(lk_sample, 0) + 1e-5))
  bayes_gate_src = np.mean(g_src_sample, axis = 0)
  bayes_gate_src_var = np.var(g_src_sample, axis = 0)
  std_total_mean = np.mean(np.sqrt(bayes_var_total))
  std_total_mean = np.mean(np.sqrt(bayes_var_to

INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_6_66
INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_0_66
INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_3_66
INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_6_66
INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_6_43
INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_6_71
INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_6_60
INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_6_50
INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_6_66
INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_6_43
INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_6_71
INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_6_60
INFO:tensorflow:Restoring parameters from ../results/m1_t10_1/linear_6_50
INFO:tensorflow:Restoring parameters f