In [70]:
import gc, os
# import cudf
# import talib as ta
import numpy as np
import pandas as pd
# import jpx_tokyo_market_prediction

from lightgbm import LGBMRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import RobustScaler

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

import warnings
warnings.filterwarnings("ignore")


In [71]:
import numpy.polynomial.hermite as Herm
import math
from tensorflow.python.ops import math_ops
from scipy import stats
import tensorflow_probability as tfp

In [72]:
from random import choices
import random
import keras_tuner as kt

In [73]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [74]:
#@title GroupTimeSeriesSplit { display-mode: "form" }
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class GroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_size : int, default=None
        Maximum size for a single training set.
    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import GroupTimeSeriesSplit
    >>> groups = np.array(['a', 'a', 'a', 'a', 'a', 'a',\
                           'b', 'b', 'b', 'b', 'b',\
                           'c', 'c', 'c', 'c',\
                           'd', 'd', 'd'])
    >>> gtss = GroupTimeSeriesSplit(n_splits=3)
    >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
    ...     print("TRAIN:", train_idx, "TEST:", test_idx)
    ...     print("TRAIN GROUP:", groups[train_idx],\
                  "TEST GROUP:", groups[test_idx])
    TRAIN: [0, 1, 2, 3, 4, 5] TEST: [6, 7, 8, 9, 10]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a']\
    TEST GROUP: ['b' 'b' 'b' 'b' 'b']
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [11, 12, 13, 14]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b']\
    TEST GROUP: ['c' 'c' 'c' 'c']
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]\
    TEST: [15, 16, 17]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b' 'c' 'c' 'c' 'c']\
    TEST GROUP: ['d' 'd' 'd']
    """
    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_size=None
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
        group_test_size = n_groups // n_folds
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []
            for train_group_idx in unique_groups[:group_test_start]:
                train_array_tmp = group_dict[train_group_idx]
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)
            train_end = train_array.size
            if self.max_train_size and self.max_train_size < train_end:
                train_array = train_array[train_end -
                                          self.max_train_size:train_end]
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)
            yield [int(i) for i in train_array], [int(i) for i in test_array]
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum group size for a single training set.
    group_gap : int, default=None
        Gap between train and test
    max_test_group_size : int, default=Inf
        We discard this number of groups from the end of each train split
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]

                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)

            train_end = train_array.size

            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)

            test_array  = test_array[group_gap:]


            if self.verbose > 0:
                    pass

            yield [int(i) for i in train_array], [int(i) for i in test_array]
            
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10):
    cmap_cv = plt.cm.coolwarm
    jet     = plt.cm.get_cmap('jet', 256)
    seq     = np.linspace(0, 1, 256)
    _       = np.random.shuffle(seq)   # inplace
    cmap_data = ListedColormap(jet(seq))    
    for ii, (tr, tt) in enumerate(list(cv.split(X=X, y=y, groups=group))):
        indices = np.array([np.nan] * len(X))
        indices[tt] = 1
        indices[tr] = 0        
        ax.scatter(range(len(indices)), [ii + .5] * len(indices), c=indices, marker='_', lw=lw, cmap=cmap_cv, vmin=-.2, vmax=1.2)
    ax.scatter(range(len(X)), [ii + 1.5] * len(X), c=y, marker='_', lw=lw, cmap=plt.cm.Set3)
    ax.scatter(range(len(X)), [ii + 2.5] * len(X), c=group, marker='_', lw=lw, cmap=cmap_data)
    yticklabels = list(range(n_splits)) + ['target', 'day']
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels, xlabel='Sample index', ylabel="CV iteration", ylim=[n_splits+2.2, -.2], xlim=[0, len(y)])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [75]:
device = "CPU"

In [76]:
if device == "TPU":
    print("connecting to TPU...")
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        tpu = None
    if tpu:
        try:
            print("initializing  TPU ...")
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.TPUStrategy(tpu)
            print("TPU initialized")
        except: print("failed to initialize TPU")
    else: device = "GPU"

if device != "TPU": strategy = tf.distribute.get_strategy()
if device == "GPU": print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
AUTO     = tf.data.experimental.AUTOTUNE
REPLICAS = strategy.num_replicas_in_sync

In [77]:
def set_all_seeds(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    tf.keras.utils.set_random_seed(seed)

In [78]:
# SEED = 2025
# set_all_seeds(SEED)

In [116]:
col_use  = [
    # 'day', 'Volume',
    'ScaledAdjustedClose',
    # 'close_arccos_deg',
    'trend_psa_indicator',
    'trend_aroon_ind_diff1',
    # 'volume_pct_change_ror_1',
    'sma_5_25', 'sma_25_30',
    'd_atr',
    'ror_1', 'ror_5', 'ror_10',
    'TradedAmount_1', 'TradedAmount_5',
    # 'ror1_ror2', 'ror1_ror3', 'ror1_ror4', 'ror1_ror5',
    'ror_1_shift1', 'ror_1_shift2', 'ror_1_shift3', 'ror_1_shift4', 'ror_1_shift5',
    'ror_1_shift6', 'ror_1_shift7', 'ror_1_shift8', 'ror_1_shift9',
    'd_Amount',
    'range_1', 'range_5',
    'gap_range_1', 'gap_range_5',
    'day_range_1', 'day_range_5',
    'hig_range_1', 'hig_range_5',
    'mi_1', 'mi_5',
    'vola_10',
    'hl_5', 'hl_10',
]

In [117]:
read_cols = col_use + ['Date', 'SecuritiesCode', 'Target']

In [118]:
X = pd.read_parquet('../Output/train_scaling.parquet', columns=read_cols)
groups = pd.factorize(pd.to_datetime(X['Date']).dt.strftime('%d').astype(str) + '_' + pd.to_datetime(X['Date']).dt.strftime('%m').astype(str) + '_' +pd.to_datetime(X['Date']).dt.strftime('%Y').astype(str))
y = X.Target
# X = X.drop(['RowId','Target','AdjustmentFactor','ExpectedDividend','SupervisionFlag','Date'],axis=1)
X = X[col_use]
# valid = pd.read_parquet('../input/scaling/valid_scaling.parquet')
# test = pd.read_parquet('../input/scaling/test_scaling.parquet')

In [119]:
# CV PARAMS
FOLDS                = 3
GROUP_GAP            = 14
# MAX_TEST_GROUP_SIZE  = 180  
# MAX_TRAIN_GROUP_SIZE = 485

# USE VERBOSE=0 for silent, VERBOSE=1 for interactive, VERBOSE=2 for commit
VERBOSE = 2

In [120]:
X.shape, y.shape

((2286531, 34), (2286531,))

In [121]:
from tensorflow.python.keras import backend as K
def e_swish(beta=0.25):
    def beta_swish(x): return x*K.sigmoid(x)*(1+beta)
    return beta_swish

In [122]:
def correlationLoss(x,y, axis=-2):
    
    """Loss function that maximizes the pearson correlation coefficient between the predicted values and the labels,
    while trying to have the same mean and variance"""
    x = tf.convert_to_tensor(x)
    y = math_ops.cast(y, x.dtype)
    n = tf.cast(tf.shape(x)[axis], x.dtype)
    xsum = tf.reduce_sum(x, axis=axis)
    ysum = tf.reduce_sum(y, axis=axis)
    xmean = xsum / n
    ymean = ysum / n
    xsqsum = tf.reduce_sum( tf.math.squared_difference(x, xmean), axis=axis)
    ysqsum = tf.reduce_sum( tf.math.squared_difference(y, ymean), axis=axis)
    cov = tf.reduce_sum( (x - xmean) * (y - ymean), axis=axis)
    corr = cov / tf.sqrt(xsqsum * ysqsum)
    return tf.convert_to_tensor( K.mean(tf.constant(1.0, dtype=x.dtype) - corr ) , dtype=tf.float32 )

In [123]:
def correlation(x, y, axis=-2):
    """Metric returning the Pearson correlation coefficient of two tensors over some axis, default -2."""
    x = tf.convert_to_tensor(x)
    y = math_ops.cast(y, x.dtype)
    n = tf.cast(tf.shape(x)[axis], x.dtype)
    xsum = tf.reduce_sum(x, axis=axis)
    ysum = tf.reduce_sum(y, axis=axis)
    xmean = xsum / n
    ymean = ysum / n
    xvar = tf.reduce_sum( tf.math.squared_difference(x, xmean), axis=axis)
    yvar = tf.reduce_sum( tf.math.squared_difference(y, ymean), axis=axis)
    cov = tf.reduce_sum( (x - xmean) * (y - ymean), axis=axis)
    corr = cov / tf.sqrt(xvar * yvar)
    return tf.constant(1.0, dtype=x.dtype) - corr

In [124]:
def sharpe_loss(X_train,y_pred):
    y_pred = tf.Variable(y_pred,dtype=tf.float64)
    port_ret = tf.reduce_sum(tf.multiply(_,y_pred),axis=1)
    s_ratio = K.mean(port_ret)/K.std(port_ret)
    
    return tf.math.exp(-s_ratio,  name='sharpe_loss')

In [171]:
def build_model(hp, dim = 128, fold=0):
    # SEED = 2025
    SEED = 1
    set_all_seeds(SEED)
    
    features_inputs = tf.keras.layers.Input(shape = (dim, ))
    x0      =  tf.keras.layers.BatchNormalization()(features_inputs)
    
    weight = tf.Variable(tf.keras.backend.random_normal((dim, 1), stddev=hp.Float(f'weight_{fold}',1e-10, 0.09), dtype=tf.float32))
    var    = tf.Variable(tf.zeros((1,1), dtype=tf.float32))
   
    encoder = tf.keras.layers.GaussianNoise(0.4)(x0)
    encoder = tf.keras.layers.Dense(hp.Int(f'layers{fold}_en0',32, 1024))(encoder)
    encoder = tf.keras.layers.Dense(hp.Int(f'layers{fold}_en1',32, 1024))(encoder)
    encoder = tf.keras.layers.Dense(hp.Int(f'layers{fold}_en2',32, 1024))(encoder)
    encoder = tf.keras.layers.Dense(hp.Int(f'layers{fold}_en2',16, 256))(encoder)
    encoder = tf.keras.layers.Dense(hp.Int(f'layers{fold}_en2',16, 256))(encoder)
    encoder = tf.keras.layers.Dense(hp.Int(f'layers{fold}_en2',4, 64))(encoder)
    encoder = tf.keras.layers.Dense(hp.Int(f'layers{fold}_en2',4, 64))(encoder)
    encoder = tf.keras.layers.BatchNormalization()(encoder)
    encoder = tf.keras.layers.Activation(e_swish(beta=hp.Float(f'e{fold}_en0',0.001, 1 )))(encoder)
    
    decoder = tf.keras.layers.Dense(hp.Int(f'layers{fold}_de0',4, 64))(encoder)
    decoder = tf.keras.layers.Dense(hp.Int(f'layers{fold}_de1',32, 1024), name='decoder')(decoder)
#     decoder = tf.keras.layers.Dropout(hp.Float(f'dropout{fold}_de0',0.001, 0.8))(encoder)
#     decoder = tf.keras.layers.Dense(hp.Int(f'layers{fold}_de0',32, 1024), name='decoder')(decoder)
    
    x_ae = tf.keras.layers.Dense(hp.Int(f'layers{fold}_ae0',32, 1024))(decoder)
    x_ae = tf.keras.layers.BatchNormalization()(x_ae)
    x_ae = tf.keras.layers.Activation(e_swish(beta=hp.Float(f'e{fold}_ae0',0.001, 1 )))(x_ae)
#     x_ae = tf.keras.layers.Dropout(hp.Float(f'dropout{fold}_ae0',0.001, 0.8))(x_ae) 
    
    feature_x = tf.keras.layers.Concatenate()([x0, encoder])
    feature_x = tf.keras.layers.BatchNormalization()(feature_x)
    feature_x = tf.keras.layers.Dense(hp.Int(f'layers{fold}_fx0',32, 1024))(feature_x)
    feature_x = tf.keras.layers.Activation(e_swish(beta=hp.Float(f'e_fx0',0.001, 1 )))(feature_x)
#     feature_x = tf.keras.layers.Dropout(hp.Float(f'dropout{fold}_fx0',0.001, 0.8))(feature_x)

    x = layers.Dense(hp.Int(f'layers{fold}_x0',32, 1024), activation= e_swish(beta=hp.Float(f'e{fold}_x0',0.001, 1 )), kernel_regularizer="l2")(feature_x)
    x = layers.Dense(hp.Int(f'layers{fold}_x1',32, 1024), activation= e_swish(beta=hp.Float(f'e{fold}_x1',0.001, 1 )), kernel_regularizer="l2")(x)
    x = layers.Dense(hp.Int(f'layers{fold}_x2',32, 1024), activation= e_swish(beta=hp.Float(f'e{fold}_x2',0.001, 1 )), kernel_regularizer="l2")(x)
    x = layers.Dense(hp.Int(f'layers{fold}_x3',32, 1024), activation= e_swish(beta=hp.Float(f'e{fold}_x3',0.001, 1 )), kernel_regularizer="l2")(x)
    x = layers.Dense(hp.Int(f'layers{fold}_x3',16, 256), activation= e_swish(beta=hp.Float(f'e{fold}_x4',0.001, 1 )), kernel_regularizer="l2")(x)
    x = layers.Dense(hp.Int(f'layers{fold}_x3',16, 256), activation= e_swish(beta=hp.Float(f'e{fold}_x5',0.001, 1 )), kernel_regularizer="l2")(x)
#     x = tf.keras.layers.Dropout(hp.Float(f'dropout{fold}_x0',0.001, 0.8))(x)

    mlp_out = layers.Dense(1, name ='mlp_out')(x)

    model  = tf.keras.Model(inputs=[features_inputs], outputs=[decoder, mlp_out])
    
    loss_out = tf.add(tf.matmul(features_inputs,weight), tf.math.reduce_sum(weight*var))
    tf.compat.v1.losses.add_loss(loss_out)
  
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=hp.Float(f'lr_adam{fold}',1e-3, 1e-5)),
                  loss = {'decoder': [tf.keras.losses.CosineSimilarity(axis=-2), 
                                      tf.keras.losses.MeanSquaredError(), 
                                      correlationLoss],         
                          
                          'mlp_out' : [sharpe_loss],
                         },
                  metrics = {'decoder': [tf.keras.metrics.CosineSimilarity(name='cosine'),
                                         tf.keras.metrics.MeanAbsoluteError(name="mae"), 
                                         correlation, 
                                         tf.keras.metrics.RootMeanSquaredError(name='rmse')], 
                             
                             'mlp_out' : [tf.keras.metrics.CosineSimilarity(name='cosine'),
                                          tf.keras.metrics.MeanAbsoluteError(name="mae"), 
                                          correlation, 
                                          tf.keras.metrics.RootMeanSquaredError(name='rmse')],
                            },
                 ) 
    return model

In [172]:
hp = pd.read_pickle(f'../Output/hp-jpx-aemlp/best_hp_ae_jpx_3gkf.pkl')

In [173]:
# tf.keras.utils.plot_model(build_model(hp, fold=0), show_shapes=True, expand_nested=True, show_dtype=True)

In [174]:
batch_size = [4096*4,4096,4096*8]

In [175]:
gkf = PurgedGroupTimeSeriesSplit(n_splits = FOLDS, 
                                 group_gap = GROUP_GAP, 
                                #  max_train_group_size = MAX_TRAIN_GROUP_SIZE, 
                                #  max_test_group_size  = MAX_TEST_GROUP_SIZE
                                 ).split(X, y, groups[0])
models = []
for fold, (train_idx, val_idx) in enumerate(list(gkf)):
    x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    print(f'>>> AEMLP_FOLD:{fold}')
    K.clear_session()
    with strategy.scope(): model = build_model(hp, dim = x_train.shape[1], fold=fold)
    model_save = tf.keras.callbacks.ModelCheckpoint('./fold-%i.hdf5' %(fold), 
                                                         monitor = 'val_mlp_out_rmse', verbose = 0, 
                                                         save_best_only = True, save_weights_only = True,
                                                         mode = 'min', save_freq = 'epoch')
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_mlp_out_rmse', patience=15, mode='min', restore_best_weights=True)
    history = model.fit(x_train, y_train ,
                        epochs          = 200, 
                        callbacks       = [model_save, early_stop], 
                        validation_data = (x_val, y_val), 
                        batch_size      = batch_size[fold],
                        verbose         = 2) 
    print('='*96)
    models.append(model)
    gc.collect()


>>> AEMLP_FOLD:0
Epoch 1/200
33/33 - 38s - loss: 7.4191 - decoder_loss: -1.6052e-02 - mlp_out_loss: 0.0131 - decoder_cosine: 0.0493 - decoder_mae: 0.4479 - decoder_correlation: 0.9900 - decoder_rmse: 0.7990 - mlp_out_cosine: 0.0078 - mlp_out_mae: 0.0333 - mlp_out_correlation: 0.9956 - mlp_out_rmse: 0.1145 - val_loss: 3.8747 - val_decoder_loss: -1.2320e-02 - val_mlp_out_loss: 0.0054 - val_decoder_cosine: 0.0558 - val_decoder_mae: 0.6109 - val_decoder_correlation: 0.9912 - val_decoder_rmse: 1.5068 - val_mlp_out_cosine: -2.8031e-02 - val_mlp_out_mae: 0.0229 - val_mlp_out_correlation: 1.0018 - val_mlp_out_rmse: 0.0736 - 38s/epoch - 1s/step
Epoch 2/200
33/33 - 32s - loss: 2.4388 - decoder_loss: -5.9293e-02 - mlp_out_loss: 5.2857e-04 - decoder_cosine: 0.1754 - decoder_mae: 0.5166 - decoder_correlation: 0.9536 - decoder_rmse: 0.8015 - mlp_out_cosine: 0.0128 - mlp_out_mae: 0.0142 - mlp_out_correlation: 0.9948 - mlp_out_rmse: 0.0230 - val_loss: 1.5077 - val_decoder_loss: -3.1444e-02 - val_mlp_o

In [131]:
valid = pd.read_parquet('../Output/valid_scaling.parquet')
test = pd.read_parquet('../Output/test_scaling.parquet')

In [132]:
def set_rank(df):
    """
    Args:
        df (pd.DataFrame): including predict column
    Returns:
        df (pd.DataFrame): df with Rank
    """
    # sort records to set Rank
    df = df.sort_values("predict", ascending=False)
    # set Rank starting from 0
    df.loc[:, "Rank"] = np.arange(len(df["predict"]))
    return df

In [133]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2, rank_col='Rank') -> float:
    """
    Args:
        df (pd.DataFrame): predicted results
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio
    """

    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        assert df[rank_col].min() == 0
        assert df[rank_col].max() == len(df[rank_col]) - 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        purchase = (df.sort_values(by=rank_col)['Target'][:portfolio_size] * weights).sum() / weights.mean()
        short = (df.sort_values(by=rank_col, ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
        return purchase - short

    buf = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio, buf

In [152]:
valid[col_use]

Unnamed: 0,ScaledAdjustedClose,trend_psa_indicator,trend_aroon_ind_diff1,sma_5_25,sma_25_30,d_atr,ror_1,ror_5,ror_10,TradedAmount_1,TradedAmount_5,ror_1_shift1,ror_1_shift2,ror_1_shift3,ror_1_shift4,ror_1_shift5,ror_1_shift6,ror_1_shift7,ror_1_shift8,ror_1_shift9,d_Amount,range_1,range_5,gap_range_1,gap_range_5,day_range_1,day_range_5,hig_range_1,hig_range_5,mi_1,mi_5,vola_10,hl_5,hl_10
2288402,-0.969892,0.0,0.0,0.076375,0.090829,4.706147,-0.076190,-0.116397,-0.121730,7.289201e+09,3.646480e+09,-0.011506,-0.017472,-0.017172,0.002024,0.000000,-0.005035,-0.003012,0.001005,0.001006,3.865548,0.087831,0.039321,0.052910,0.013879,0.034921,0.027917,0.011640,0.011974,1.204943e-11,1.143334e-11,0.007177,16.0,30.0
2288523,0.245653,0.0,0.0,-0.040451,-0.040275,0.923833,0.026768,-0.009081,0.014588,8.768386e+10,1.504537e+11,0.008508,-0.026907,-0.033692,0.017692,-0.033737,0.005476,-0.020560,0.044019,0.030619,0.763711,0.033095,0.034460,0.017034,0.011593,0.018170,0.026869,0.008436,0.013331,3.774399e-13,2.919821e-13,0.011041,31.0,65.0
2287372,-0.519343,0.0,-64.0,0.051674,0.053797,5.053454,-0.068919,-0.071429,-0.096986,3.008863e+08,1.058135e+08,-0.012016,0.013532,-0.015979,0.012129,-0.006693,-0.007968,-0.009211,-0.011704,0.007864,4.300527,0.139189,0.045818,0.016216,0.006204,0.139189,0.045279,0.054054,0.019414,4.625973e-10,5.030325e-10,0.027868,42.0,120.0
2288388,0.802228,0.0,4.0,0.121066,-0.155476,1.314994,0.016082,0.040514,0.038803,1.897406e+10,6.092362e+10,0.004523,0.002916,0.004884,0.011528,-0.008167,-0.000653,-0.001304,0.009214,-0.000658,0.688261,0.016404,0.015455,0.003538,0.005867,0.014796,0.013618,0.002252,0.007612,8.645482e-13,5.510298e-13,0.009210,24.0,54.0
2288181,-0.933359,1.0,0.0,-0.026280,0.038176,2.447600,0.038579,0.021072,0.023537,1.426336e+08,9.625044e+07,-0.006086,-0.022024,0.019417,-0.007827,-0.006579,0.004808,0.004225,0.001814,-0.001811,2.528974,0.043478,0.024599,0.022045,0.008837,0.026944,0.019626,0.010410,0.008703,3.048248e-10,2.504696e-10,0.021658,57.0,69.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2369992,0.061290,0.0,0.0,0.313069,-0.295134,1.450737,-0.075838,0.013540,0.459610,5.396728e+09,5.738070e+09,0.164271,-0.170358,0.087037,0.044487,0.109442,0.071264,0.058394,0.084433,0.055710,2.047311,0.156966,0.164847,0.022928,0.020691,0.156966,0.163615,0.058201,0.045203,2.908549e-11,2.925303e-11,0.011594,89.0,123.0
2369865,0.798726,0.0,4.0,0.197518,0.184281,1.272239,0.034179,0.227749,0.532680,2.999349e+09,1.786231e+09,0.129514,0.021628,-0.033210,0.064136,-0.015464,0.086835,0.060921,-0.054775,0.163399,2.907139,0.127894,0.124434,0.007718,0.011465,0.127894,0.124434,0.085998,0.062521,4.264064e-11,7.882136e-11,0.017156,66.0,159.0
2368917,0.507060,-1.0,0.0,-0.023935,-0.023789,2.670614,-0.112158,-0.036245,-0.042475,1.088228e+09,4.011016e+08,0.044723,0.056711,-0.031136,0.014870,-0.014652,0.025352,0.030978,-0.030047,-0.016620,3.986210,0.145548,0.064465,0.011986,0.007327,0.145548,0.062648,0.021404,0.008866,1.337477e-10,2.071032e-10,0.014817,360.0,550.0
2370104,0.644526,0.0,4.0,0.043922,0.041250,3.676971,0.086057,0.152601,0.147296,2.909146e+09,1.060751e+09,0.037288,0.005682,0.006865,0.010405,0.002317,0.029833,-0.008284,-0.036488,0.009206,4.536994,0.087146,0.033348,0.023965,0.006631,0.068627,0.029644,0.006536,0.007016,2.995586e-11,3.186361e-11,0.013241,50.0,87.0


In [151]:
len(models[i].predict(valid[col_use])[0])



84000

In [153]:
np.unique(models[0].predict(valid[col_use])[-1] * ap[0], return_counts=True)



(array([-0.00027696, -0.00022373, -0.00021226, ...,  0.00110479,
         0.00115732,  0.00122536], dtype=float32),
 array([1, 1, 1, ..., 1, 1, 1]))

In [134]:
ap = [0.10,0.10,0.80]
model_x = list()
for i in range(FOLDS):
    prediction_x = models[i].predict(valid[col_use])[-1] * ap[i]
    model_x.append(prediction_x)
model_x = np.mean(model_x, axis = 0)
valid['predict'] = model_x



In [135]:
from scipy import stats
pearson_score = stats.pearsonr(valid['predict'], valid.Target)[0]
print('Pearson:', pearson_score)

Pearson: 0.017824402718611


In [136]:
valid = valid.sort_values(["Date", "predict"], ascending=[True, False])
ranking = valid.groupby("Date").apply(set_rank).reset_index(drop=True)
sharp_ratio, _ = calc_spread_return_sharpe(ranking, portfolio_size=200)

In [137]:
sharp_ratio

-0.07574970389415503

In [162]:
ap = [0.50,0.70,0.00]
model_x = list()
for i in range(FOLDS):
    prediction_x = models[i].predict(test[col_use])[-1] * ap[i]
    model_x.append(prediction_x)
model_x = np.mean(model_x, axis = 0)
test['predict'] = model_x



In [163]:
from scipy import stats
pearson_score = stats.pearsonr(test['predict'], test.Target)[0]
print('Pearson:', pearson_score)

Pearson: 0.036311828734799385


In [164]:
test = test.sort_values(["Date", "predict"], ascending=[True, False])
ranking = test.groupby("Date").apply(set_rank).reset_index(drop=True)
sharp_ratio, _ = calc_spread_return_sharpe(ranking, portfolio_size=200)

In [165]:
sharp_ratio

0.1479013168850063

In [166]:
_

Date
2022-01-04    0.944416
2022-01-05    0.725218
2022-01-06    1.544686
2022-01-07   -0.058226
2022-01-11   -1.304273
2022-01-12   -0.395666
2022-01-13    0.725902
2022-01-14    2.188461
2022-01-17   -2.537617
2022-01-18    3.016354
2022-01-19    0.963744
2022-01-20   -0.678302
2022-01-21    1.363827
2022-01-24    0.066028
2022-01-25   -4.680783
2022-01-26   -1.008589
2022-01-27    1.886614
2022-01-28    2.388035
2022-01-31    3.193490
2022-02-01   -0.385363
2022-02-02   -0.510959
2022-02-03   -0.571943
2022-02-04   -0.791617
2022-02-07    0.129298
2022-02-08    1.672243
2022-02-09   -3.298243
2022-02-10    0.550891
2022-02-14   -0.029040
2022-02-15   -2.051278
2022-02-16    0.125233
2022-02-17   -1.531243
2022-02-18   -0.470706
2022-02-21   -2.637605
2022-02-22    7.368525
2022-02-24    1.464058
2022-02-25    6.983547
2022-02-28   -1.342039
dtype: float64

In [None]:
valid
0.02019443429209232

test
0.010806184456573602

In [None]:
valid
-0.21702511460140766

test
-0.1842915053496203