In [1]:
import pandas as pd
import sys
import mlflow

sys.path.append("..")
from common.constants import DATAFOLDER, ASSET_INFO
from preprocess.src.extract_data import splitted_data_timestamps, cpcv_split

In [2]:
def splitted_data_index(n_splits: int, asset_name: str, asset_id: int):
    data = pd.read_parquet(os.path.join("../data/", f"{asset_name}.parquet.gzip"), engine="pyarrow")
    
    _timestamps = data.index.values
    split_length = len(_timestamps) // n_splits

    splitted_timestamps = {}
    first_timestamps = 0
    for i in range(split_length, len(_timestamps), split_length):
        splitted_timestamps[i // split_length - 1] = _timestamps[first_timestamps:i]
        first_timestamps = i
    splitted_timestamps[len(_timestamps)//split_length - 1] = _timestamps[first_timestamps:]
    return {"asset_name": asset_name, "asset_id": asset_id, "splitted_indexes": splitted_timestamps}


res = splitted_data_index(n_splits=6, asset_name="Bitcoin", asset_id=1)

In [3]:
cpcv_fold = cpcv_split(n_test_folds=2, data_timestamps=res)
print(cpcv_fold)

{'asset_name': 'Bitcoin', 'asset_id': 1, 'cpcv_folds': [{0: {'train': {3: array([1573473660, 1573473720, 1573473780, ..., 1593043080, 1593043140,
       1593043200]), 4: array([1593043260, 1593043320, 1593043380, ..., 1612612680, 1612612740,
       1612612800]), 5: array([1612612860, 1612612920, 1612612980, ..., 1632182280, 1632182340,
       1632182400])}, 'valid': {0: array([1514764860, 1514764920, 1514764980, ..., 1534334280, 1534334340,
       1534334400]), 1: array([1534334460, 1534334520, 1534334580, ..., 1553903880, 1553903940,
       1553904000])}, 'test': {2: array([1553904060, 1553904120, 1553904180, ..., 1573473480, 1573473540,
       1573473600])}}, 1: {'train': {1: array([1534334460, 1534334520, 1534334580, ..., 1553903880, 1553903940,
       1553904000]), 4: array([1593043260, 1593043320, 1593043380, ..., 1612612680, 1612612740,
       1612612800]), 5: array([1612612860, 1612612920, 1612612980, ..., 1632182280, 1632182340,
       1632182400])}, 'valid': {2: array([1553904

In [19]:
import itertools
import numpy as np

folds = list(res["splitted_indexes"].keys())
n_folds = len(folds)
n_test_folds = 2

selected_fold_bounds = list(itertools.combinations(folds, n_test_folds))

num_senarios = int(len(selected_fold_bounds) / (n_folds/n_test_folds))

cpcv_folds = []
for i in range(num_senarios):
    _jump_folds = list(range(num_senarios-1, 0, -1))[:i]
    
    while len(_jump_folds) < num_senarios - 1:
        _jump_folds.append(1)

    test_folds = []
    test_folds.append(i)
    for jump_idx, jump_val in enumerate(_jump_folds):
        test_folds.append(test_folds[jump_idx]+jump_val)

    if i == num_senarios - 1:
        # flip left-right side
        _senario = {}
        for key in list(cpcv_folds[0].keys())[::-1]:
            _senario[num_senarios - 1 - key] = {"train": {}, "valid": {}, "test": {}}
            for _type in ["train", "valid", "test"]:
                for fold_idx in [n_folds - 1 - x for x in cpcv_folds[0][key][_type]]:
                    _senario[num_senarios - 1 - key][_type][fold_idx] = res["splitted_indexes"][fold_idx]   
        cpcv_folds.append(_senario)
    else:
        _senario = {}
        _selected_valid_folds = []
        _selected_test_folds = []
        for sub_senario_idx, test_fold in enumerate(test_folds):
            _senario[sub_senario_idx] = {"train": {}, "valid": {}, "test": {}}

            # select valid set.
            for valid_fold_idx in selected_fold_bounds[test_fold]:
                if valid_fold_idx not in _selected_valid_folds:
                    _senario[sub_senario_idx]["valid"][valid_fold_idx] = res["splitted_indexes"][valid_fold_idx]
                    _selected_valid_folds.append(valid_fold_idx)

            # select test set.
            top_valid_fold = sorted(selected_fold_bounds[test_fold])[::-1][0]
            _test_fold_idx = top_valid_fold + 1 if top_valid_fold < num_senarios else 0
            while True:
                if _test_fold_idx not in selected_fold_bounds[test_fold] and _test_fold_idx not in _selected_test_folds:
                    _senario[sub_senario_idx]["test"][_test_fold_idx] = res["splitted_indexes"][_test_fold_idx]
                    _selected_test_folds.append(_test_fold_idx)
                    break
                _test_fold_idx += 1
                if _test_fold_idx > num_senarios:
                    _test_fold_idx = 0

            # select train set.
            _selected_folds = [_test_fold_idx]
            _selected_folds += selected_fold_bounds[test_fold]
            for train_fold_idx in [v for v in range(n_folds) if v not in _selected_folds]:
                _senario[sub_senario_idx]["train"][train_fold_idx] = res["splitted_indexes"][train_fold_idx]
        cpcv_folds.append(_senario)
        
print(cpcv_folds[0])

{0: {'train': {3: array([1573473660, 1573473720, 1573473780, ..., 1593043080, 1593043140,
       1593043200]), 4: array([1593043260, 1593043320, 1593043380, ..., 1612612680, 1612612740,
       1612612800]), 5: array([1612612860, 1612612920, 1612612980, ..., 1632182280, 1632182340,
       1632182400])}, 'valid': {0: array([1514764860, 1514764920, 1514764980, ..., 1534334280, 1534334340,
       1534334400]), 1: array([1534334460, 1534334520, 1534334580, ..., 1553903880, 1553903940,
       1553904000])}, 'test': {2: array([1553904060, 1553904120, 1553904180, ..., 1573473480, 1573473540,
       1573473600])}}, 1: {'train': {1: array([1534334460, 1534334520, 1534334580, ..., 1553903880, 1553903940,
       1553904000]), 4: array([1593043260, 1593043320, 1593043380, ..., 1612612680, 1612612740,
       1612612800]), 5: array([1612612860, 1612612920, 1612612980, ..., 1632182280, 1632182340,
       1632182400])}, 'valid': {2: array([1553904060, 1553904120, 1553904180, ..., 1573473480, 1573473540

In [20]:
for cpcv_fold in cpcv_folds:
    print(cpcv_folds)
    print("="*10)
    # arrs = []
    # for key in cpcv_fold.keys():
    #     o_x = ["x" for _ in range(n_folds)]
    #     for train_idx in cpcv_fold[key]["train"]:
    #         o_x[train_idx] = "o"
    #     for valid_idx in cpcv_fold[key]["valid"]:
    #         o_x[valid_idx] = "x"

    #     for test_idx in cpcv_fold[key]["test"]:
    #         o_x[test_idx] = "v"

    #     arrs.append(o_x)
    
    # for i in range(6):
    #     l = [arrs[v][i] for v in range(5)]
    #     print(*l)

[{0: {'train': {3: array([1573473660, 1573473720, 1573473780, ..., 1593043080, 1593043140,
       1593043200]), 4: array([1593043260, 1593043320, 1593043380, ..., 1612612680, 1612612740,
       1612612800]), 5: array([1612612860, 1612612920, 1612612980, ..., 1632182280, 1632182340,
       1632182400])}, 'valid': {0: array([1514764860, 1514764920, 1514764980, ..., 1534334280, 1534334340,
       1534334400]), 1: array([1534334460, 1534334520, 1534334580, ..., 1553903880, 1553903940,
       1553904000])}, 'test': {2: array([1553904060, 1553904120, 1553904180, ..., 1573473480, 1573473540,
       1573473600])}}, 1: {'train': {1: array([1534334460, 1534334520, 1534334580, ..., 1553903880, 1553903940,
       1553904000]), 4: array([1593043260, 1593043320, 1593043380, ..., 1612612680, 1612612740,
       1612612800]), 5: array([1612612860, 1612612920, 1612612980, ..., 1632182280, 1632182340,
       1632182400])}, 'valid': {2: array([1553904060, 1553904120, 1553904180, ..., 1573473480, 157347354

In [21]:
cpcv_folds[-1]

{0: {'train': {3: array([1573473660, 1573473720, 1573473780, ..., 1593043080, 1593043140,
          1593043200]),
   2: array([1553904060, 1553904120, 1553904180, ..., 1573473480, 1573473540,
          1573473600]),
   1: array([1534334460, 1534334520, 1534334580, ..., 1553903880, 1553903940,
          1553904000])},
  'valid': {0: array([1514764860, 1514764920, 1514764980, ..., 1534334280, 1534334340,
          1534334400])},
  'test': {4: array([1593043260, 1593043320, 1593043380, ..., 1612612680, 1612612740,
          1612612800])}},
 1: {'train': {4: array([1593043260, 1593043320, 1593043380, ..., 1612612680, 1612612740,
          1612612800]),
   3: array([1573473660, 1573473720, 1573473780, ..., 1593043080, 1593043140,
          1593043200]),
   2: array([1553904060, 1553904120, 1553904180, ..., 1573473480, 1573473540,
          1573473600])},
  'valid': {1: array([1534334460, 1534334520, 1534334580, ..., 1553903880, 1553903940,
          1553904000])},
  'test': {0: array([15147