https://www.kaggle.com/code/konradb/ts-10-validation-methods-for-time-series

除了那些因为YOLO而在周五晚上部署到生产环境的人之外，我们都同意模型验证很重要：测量机器学习模型的性能（以及它的泛化能力）使我们能够评估鲁棒性，优化参数并估计看不见的数据的性能。如果有充分的理由相信底层数据生成过程是平稳的（没有概念漂移），那么你通常可以接受训练验证测试分割（尽管对验证集过拟合）。如果时间维度很重要，情况会变得稍微复杂一些：在这一集中，我们将在不打破时间箭头的情况下，介绍评估时间序列模型性能的不同方式。

In [32]:
import os
from IPython.display import Image
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc

from scipy.stats import pearsonr as p

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight') 

import warnings
warnings.simplefilter(action='ignore', category= FutureWarning)

# 随机拆分¶
我们将使用最近结束的Ubiquant市场预测竞赛的数据：

In [33]:
import os
os.chdir('E:\python code\量化交易数据')

xtrain = pd.read_parquet("train_2.parquet")
xtrain.head(10)

Unnamed: 0,row_id,time_id,investment_id,target,f_0,f_1,f_2,f_3,f_4,f_5,...,f_290,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299
652,0_1062,0,1062,-0.468671,-0.706653,-0.765238,0.62083,-0.581358,0.993605,0.408986,...,-0.170365,0.912726,-0.169476,-1.220772,0.941183,0.464294,-1.087009,0.540411,0.375438,0.003703
705,0_1144,0,1144,-0.107676,-0.74841,-2.271974,0.381967,-0.581879,-0.606941,0.703325,...,0.82156,-1.09562,-0.633994,-1.220772,0.941183,-0.669425,0.104928,-0.303525,-1.86127,-0.815836
1294,0_2140,0,2140,-0.82436,0.876086,-1.769729,-0.230321,-0.579754,-0.366735,0.240823,...,0.82156,-1.09562,-0.654613,-1.220772,0.941183,-0.503797,1.296864,-0.862824,-1.492674,-0.418889
1445,0_2385,0,2385,0.282452,0.217906,-1.141922,0.399086,-0.576741,2.127874,0.175264,...,-0.170365,0.912726,-1.11984,0.819155,0.941183,0.136213,0.104928,1.146075,-1.286455,0.538988
1659,0_2727,0,2727,-0.736424,-0.928419,3.001602,-1.925446,1.21515,1.979153,1.603922,...,-2.105698,-1.09562,-0.648119,0.819155,-1.060166,1.25145,1.296864,0.412373,1.431981,2.084112
2924,1_1062,1,1062,-0.223018,-0.80585,-0.814999,0.723359,-0.608408,0.982574,0.235933,...,0.693888,0.924511,0.08271,-1.721066,-1.189598,0.011736,0.027711,-0.838125,-0.76669,0.002693
2976,1_1144,1,1144,0.557137,-0.779184,-2.173314,0.353022,-0.610101,-0.597564,0.413387,...,0.167517,-1.081652,-1.077176,-1.721066,-1.189598,-0.698124,-1.173816,-0.468102,-1.912625,-0.83107
3568,1_2140,1,2140,-0.012454,0.951884,-1.802864,-0.211035,-0.608806,-0.372428,0.778097,...,1.746629,-1.081652,-0.181377,-1.721066,0.839101,-0.491055,0.027711,-0.94395,-0.992202,-0.432896
3717,1_2385,1,2385,0.864889,0.739182,-1.185449,0.10372,-0.605656,2.109318,-0.808031,...,-0.341965,0.924511,-1.039949,0.581035,0.839101,0.73425,0.027711,1.556703,-1.654856,0.739229
3930,1_2727,1,2727,0.2783,0.868855,2.889494,-1.982309,1.506996,2.033059,-0.112873,...,-0.341965,-1.081652,-0.050472,0.581035,0.839101,1.223184,1.229239,1.31094,1.565812,2.147083


这个问题有很多特点，使其比平时更具挑战性：
多项投资-并非所有投资都出现在每个时间戳
（可能）时间戳之间的间隔不同
试验期结束后未立即进行试验观察

In [34]:
xtrain['time_id'].min(), xtrain['time_id'].max()

(0, 1219)

PSA：为了使设置真正正确，我们应该围绕这个块进行循环，并更改测试集
→
否则，我们就有过度拟合测试集的风险
我们将保留从时间戳1100开始的观测值作为我们的坚持测试集，并尝试使用其余的观测值：

In [35]:
xtrain.index= range(xtrain.shape[0])

In [36]:
xtrain.head()

Unnamed: 0,row_id,time_id,investment_id,target,f_0,f_1,f_2,f_3,f_4,f_5,...,f_290,f_291,f_292,f_293,f_294,f_295,f_296,f_297,f_298,f_299
0,0_1062,0,1062,-0.468671,-0.706653,-0.765238,0.62083,-0.581358,0.993605,0.408986,...,-0.170365,0.912726,-0.169476,-1.220772,0.941183,0.464294,-1.087009,0.540411,0.375438,0.003703
1,0_1144,0,1144,-0.107676,-0.74841,-2.271974,0.381967,-0.581879,-0.606941,0.703325,...,0.82156,-1.09562,-0.633994,-1.220772,0.941183,-0.669425,0.104928,-0.303525,-1.86127,-0.815836
2,0_2140,0,2140,-0.82436,0.876086,-1.769729,-0.230321,-0.579754,-0.366735,0.240823,...,0.82156,-1.09562,-0.654613,-1.220772,0.941183,-0.503797,1.296864,-0.862824,-1.492674,-0.418889
3,0_2385,0,2385,0.282452,0.217906,-1.141922,0.399086,-0.576741,2.127874,0.175264,...,-0.170365,0.912726,-1.11984,0.819155,0.941183,0.136213,0.104928,1.146075,-1.286455,0.538988
4,0_2727,0,2727,-0.736424,-0.928419,3.001602,-1.925446,1.21515,1.979153,1.603922,...,-2.105698,-1.09562,-0.648119,0.819155,-1.060166,1.25145,1.296864,0.412373,1.431981,2.084112


In [37]:
# general settings
class CFG:
    data_folder = '../input/tsdata-1/'
    img_dim1 = 20
    img_dim2 = 10
    seed = 13
    nfolds = 5
    nof_trees = 150
    cutoff_point = 1100
    
# adjust the parameters for displayed figures    
plt.rcParams.update({'figure.figsize': (CFG.img_dim1,CFG.img_dim2)})   

# train / validation split
xtest = xtrain.loc[xtrain.time_id > CFG.cutoff_point].copy()
xtrain = xtrain.loc[xtrain.time_id <= CFG.cutoff_point].copy()
print(xtrain.shape, xtest.shape)

(5452, 304) (623, 304)


In [38]:
# normal cleanup and preparation

id_train, id_test = xtrain['row_id'].copy(), xtest['row_id'].copy()

In [39]:
xtrain.drop('row_id', axis = 1, inplace = True)
xtest.drop('row_id', axis = 1, inplace = True)

ytrain, ytest = xtrain['target'].copy(), xtest['target'].copy()
inv_train, inv_test = xtrain['investment_id'].copy(), xtest['investment_id'].copy()
time_train, time_test = xtrain['time_id'].copy(), xtest['time_id'].copy()

xtrain.drop(['time_id', 'investment_id', 'target'], axis = 1, inplace = True)
xtest.drop(['time_id', 'investment_id', 'target'], axis = 1, inplace = True)

xtrain = reduce_mem_usage(xtrain)
xtest = reduce_mem_usage(xtest)

gc.collect()

Memory usage of dataframe is 12.52 MB
Memory usage after optimization is: 3.16 MB
Decreased by 74.8%
Memory usage of dataframe is 1.43 MB
Memory usage after optimization is: 0.36 MB
Decreased by 74.8%


59

In [40]:
from sklearn.model_selection import train_test_split

x0, x1, y0, y1 = train_test_split(xtrain, ytrain, test_size=0.33, random_state=42)

In [41]:
lgb_parameters = {'objective': 'regression', 'metric': 'rmse', 'num_iterations': CFG.nof_trees, 
                          'num_leaves': 32, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.6}

model = lgb.LGBMRegressor(**lgb_parameters)


model.fit(x0, y0, eval_metric='rmse', eval_set=[(x0, y0), (x1, y1)], verbose= 250, early_stopping_rounds=100)
val_preds = model.predict(x1)



In [42]:
# validation score    
score = np.round(p(val_preds, y1)[0],4)
print("validation score: " + str(score))


# actual test performance
test_preds = model.predict(xtest)
score = np.round(p(test_preds, ytest)[0],4)
print("test score: " + str(score))


del x0, x1, y0, y1; gc.collect()

validation score: 0.33
test score: -0.0038


27

# KFold¶
A second most common approach to validation is KFold:

In [44]:
from sklearn.model_selection import KFold

kf = KFold(n_splits = CFG.nfolds,  shuffle = True, random_state = CFG.seed)
res_vec = np.zeros((CFG.nfolds, 1))

prv = np.zeros((xtest.shape[0],CFG.nfolds))

for (ii, (id0, id1)) in enumerate(kf.split(xtrain)):
    
    print('fold: ' + str(ii))
    x0, x1 = xtrain.loc[id0], xtrain.loc[id1]
    y0, y1 = ytrain.loc[id0], ytrain.loc[id1]
    
    model = lgb.LGBMRegressor(**lgb_parameters)

    model.fit(x0, y0, eval_metric='rmse', eval_set=[(x0, y0), (x1, y1)], verbose= 250, early_stopping_rounds=100)
    val_preds = model.predict(x1)
    prv[:,ii] += model.predict(xtest) / CFG.nfolds
    
    # validation score    
    score = np.round(p(val_preds, y1)[0],4)
    res_vec[ii] = score
    print("validation score: " + str(score))

    # actual test performance
    score = np.round(p(test_preds, ytest)[0],4)
    print("test score: " + str(score))


    del model, x0, x1, y0, y1

fold: 0




validation score: 0.3584
test score: -0.0038
fold: 1




validation score: 0.2983
test score: -0.0038
fold: 2




validation score: 0.2688
test score: -0.0038
fold: 3




validation score: 0.3195
test score: -0.0038
fold: 4




validation score: 0.278
test score: -0.0038


In [45]:
avg_score = np.round(np.mean(res_vec),4)
print("average score across folds: " + str(avg_score))
score = np.round(p(ytest, prv.mean(axis = 1))[0],4)

print("full test score: " + str(score))

average score across folds: 0.3046
full test score: 0.0108


# 时间序列

In [46]:
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits = CFG.nfolds , max_train_size = None)

res_vec = np.zeros((CFG.nfolds, 1))

`from sklearn.model_selection import TimeSeriesSplit` 是在Python的`scikit-learn`库中的一个导入语句，它的作用是导入`TimeSeriesSplit`类。`TimeSeriesSplit`是`scikit-learn`中用于时间序列数据交叉验证的一个分割器。 在机器学习中，交叉验证是一种评估模型泛化能力的技术，它通过将数据集分割成多个小子集来工作，通常包括训练集和验证集（或测试集）

In [47]:
for (ii, (id0, id1)) in enumerate(tscv.split(xtrain)):
    x0, x1 = xtrain.loc[id0], xtrain.loc[id1]
    y0, y1 = ytrain.loc[id0], ytrain.loc[id1]
    
    
    model = lgb.LGBMRegressor(**lgb_parameters)

    model.fit(x0, y0, eval_metric='rmse', eval_set=[(x0, y0), (x1, y1)], verbose= 250, early_stopping_rounds=100)
    val_preds = model.predict(x1)
    prv[:,ii] += model.predict(xtest) / CFG.nfolds
    
    # validation score    
    score = np.round(p(val_preds, y1)[0],4)
    res_vec[ii] = score
    print("validation score: " + str(score))

    # actual test performance
    test_preds = model.predict(xtest)
    score = np.round(p(test_preds, ytest)[0],4)
    print("test score: " + str(score))


    del model, x0, x1, y0, y1
    
gc.collect()



validation score: -0.0358
test score: 0.0486




validation score: 0.0004
test score: 0.036




validation score: 0.0317
test score: 0.0216




validation score: 0.1077
test score: 0.0492




validation score: 0.0936
test score: 0.0656


27

In [48]:
tscv = TimeSeriesSplit(n_splits = CFG.nfolds , max_train_size = 2 * xtest.shape[0])

res_vec = np.zeros((CFG.nfolds, 1))


for (ii, (id0, id1)) in enumerate(tscv.split(xtrain)):
    x0, x1 = xtrain.loc[id0], xtrain.loc[id1]
    y0, y1 = ytrain.loc[id0], ytrain.loc[id1]
    
    
    model = lgb.LGBMRegressor(**lgb_parameters)

    model.fit(x0, y0, eval_metric='rmse', eval_set=[(x0, y0), (x1, y1)], verbose= 250, early_stopping_rounds=100)
    val_preds = model.predict(x1)
    prv[:,ii] += model.predict(xtest) / CFG.nfolds
    
    # validation score    
    score = np.round(p(val_preds, y1)[0],4)
    res_vec[ii] = score
    print("validation score: " + str(score))

    # actual test performance
    test_preds = model.predict(xtest)
    score = np.round(p(test_preds, ytest)[0],4)
    print("test score: " + str(score))


    del model, x0, x1, y0, y1



validation score: -0.0358
test score: 0.0486




validation score: 0.0645
test score: -0.0521




validation score: 0.0393
test score: 0.0424




validation score: 0.0861
test score: 0.0063




validation score: 0.0531
test score: -0.0499


# WFV 

In [49]:
for train_idx, test_idx in TimeSeriesSplit().split(xtrain):
    id1 = time_train.loc[train_idx].unique()
    id2 = time_train.loc[test_idx].unique()
    
    print(np.intersect1d(id1,id2))

[182]
[365]
[]
[737]
[919]


# 团体时间序列
到目前为止的故事：

GroupKFold迭代器确实尊重分组：任何组都不会是两个折叠的一部分，但会打乱时间顺序

TimeSeriesSplit正好相反

In [50]:
from sklearn.model_selection import GroupKFold

for train_idx, test_idx in GroupKFold().split(xtrain, groups = pd.DataFrame(time_train)['time_id']):
    id1 = time_train.loc[train_idx].unique()
    id2 = time_train.loc[test_idx].unique()
    
    print(id1[0:10])
    print(id2[0:10])
    print(np.intersect1d(id1,id2))

    print('---')

[ 0  1  3  4  5  6  8  9 10 11]
[ 2  7 12 17 22 27 35 40 45 49]
[]
---
[ 0  2  3  4  5  7  8  9 10 12]
[ 1  6 11 15 16 21 26 34 39 44]
[]
---
[ 0  1  2  3  4  6  7  8  9 11]
[ 5 10 20 25 30 31 32 38 43 48]
[]
---
[ 1  2  3  5  6  7  8 10 11 12]
[ 0  4  9 14 19 24 29 33 37 42]
[]
---
[ 0  1  2  4  5  6  7  9 10 11]
[ 3  8 13 18 23 28 36 41 46 51]
[]
---


In [51]:
# Taken from the notebo
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

In [53]:
# Taken from the notebo
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class GroupTimeSeriesSplit(_BaseKFold):
    """具有非重叠组的时间序列交叉验证器变体。
    为分割时间序列数据样本提供训练/测试指标
    根据a以固定时间间隔观察到
    第三方提供的组。
    在每次拆分中，测试指数必须高于以前，因此会洗牌
    在交叉验证器中是不合适的。
    这个交叉验证对象是：class:`KFold`的变体。
    在第k次拆分中，它返回前k个折叠作为训练集
    第（k+1）倍作为测试集。
    同一组不会出现在两个不同的折叠中（
    不同的组必须至少等于折叠数）。
    请注意，与标准交叉验证方法不同，连续
    训练集是之前训练集的超集。
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_size : int, default=None
        Maximum size for a single training set.
    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import GroupTimeSeriesSplit
    >>> groups = np.array(['a', 'a', 'a', 'a', 'a', 'a',\
                           'b', 'b', 'b', 'b', 'b',\
                           'c', 'c', 'c', 'c',\
                           'd', 'd', 'd'])
    >>> gtss = GroupTimeSeriesSplit(n_splits=3)
    >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
    ...     print("TRAIN:", train_idx, "TEST:", test_idx)
    ...     print("TRAIN GROUP:", groups[train_idx],\
                  "TEST GROUP:", groups[test_idx])
    TRAIN: [0, 1, 2, 3, 4, 5] TEST: [6, 7, 8, 9, 10]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a']\
    TEST GROUP: ['b' 'b' 'b' 'b' 'b']
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [11, 12, 13, 14]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b']\
    TEST GROUP: ['c' 'c' 'c' 'c']
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]\
    TEST: [15, 16, 17]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b' 'c' 'c' 'c' 'c']\
    TEST GROUP: ['d' 'd' 'd']
    """
    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_size=None
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
        group_test_size = n_groups // n_folds
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []
            for train_group_idx in unique_groups[:group_test_start]:
                train_array_tmp = group_dict[train_group_idx]
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)
            train_end = train_array.size
            if self.max_train_size and self.max_train_size < train_end:
                train_array = train_array[train_end -
                                          self.max_train_size:train_end]
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)
            yield [int(i) for i in train_array], [int(i) for i in test_array]

In [54]:
# from sklearn.model_selection import GroupTimeSeriesSplit

In [55]:
# sanity check
for train_idx, test_idx in GroupTimeSeriesSplit().split(xtrain, groups = pd.DataFrame(time_train)['time_id']):
    id1 = time_train.loc[train_idx].unique()
    id2 = time_train.loc[test_idx].unique()
    
    print(id1[0:25])
    print(id2[0:25])
    print(np.intersect1d(id1,id2))    
    
    print('---')

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
[182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
 200 201 202 203 204 205 206]
[]
---
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
[365 366 367 373 374 375 376 377 378 379 380 381 383 384 385 386 387 388
 389 390 391 392 393 394 395]
[]
---
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
[555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572
 573 574 575 576 577 578 579]
[]
---
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
[737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754
 755 756 757 758 759 760 761]
[]
---
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
[919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936
 937 938 939 940 941 942 943]
[]
---


In [56]:
res_vec = np.zeros((CFG.nfolds, 1))

for (ii, (id0, id1)) in enumerate(GroupTimeSeriesSplit().split(xtrain, groups = pd.DataFrame(time_train)['time_id'])):
    
    x0, x1 = xtrain.loc[id0], xtrain.loc[id1]
    y0, y1 = ytrain.loc[id0], ytrain.loc[id1]
    
    
    model = lgb.LGBMRegressor(**lgb_parameters)

    model.fit(x0, y0, eval_metric='rmse', eval_set=[(x0, y0), (x1, y1)],
              verbose= 250, early_stopping_rounds=100)
    
    val_preds = model.predict(x1)
    prv[:,ii] += model.predict(xtest) / CFG.nfolds
    
    # validation score    
    score = np.round(p(val_preds, y1)[0],4)
    res_vec[ii] = score
    print("validation score: " + str(score))

    # actual test performance
    test_preds = model.predict(xtest)
    score = np.round(p(test_preds, ytest)[0],4)
    print("test score: " + str(score))


    del model, x0, x1, y0, y1



validation score: -0.0487
test score: 0.0235




validation score: 0.0788
test score: 0.0715




validation score: 0.0382
test score: 0.0286




validation score: 0.0998
test score: 0.0244




validation score: 0.0947
test score: 0.0655


In [59]:
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum group size for a single training set.
    group_gap : int, default=None
        Gap between train and test
    max_test_group_size : int, default=Inf
        We discard this number of groups from the end of each train split
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]
                
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)

            train_end = train_array.size
 
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)

            test_array  = test_array[group_gap:]
            
            
            if self.verbose > 0:
                    pass
                    
            yield [int(i) for i in train_array], [int(i) for i in test_array]

In [60]:
cv = PurgedGroupTimeSeriesSplit( n_splits=5,
    max_train_group_size=15, group_gap=5, max_test_group_size=5)

In [61]:
res_vec = np.zeros((CFG.nfolds, 1))

for (ii, (id0, id1)) in enumerate(cv.split(xtrain, groups = pd.DataFrame(time_train)['time_id'])):
    
    x0, x1 = xtrain.loc[id0], xtrain.loc[id1]
    y0, y1 = ytrain.loc[id0], ytrain.loc[id1]
    
    
    model = lgb.LGBMRegressor(**lgb_parameters)

    model.fit(x0, y0, eval_metric='rmse', eval_set=[(x0, y0), (x1, y1)],
              verbose= 250, early_stopping_rounds=100)
    
    val_preds = model.predict(x1)
    prv[:,ii] += model.predict(xtest) / CFG.nfolds
    
    # validation score    
    score = np.round(p(val_preds, y1)[0],4)
    res_vec[ii] = score
    print("validation score: " + str(score))

    # actual test performance
    test_preds = model.predict(xtest)
    score = np.round(p(test_preds, ytest)[0],4)
    print("test score: " + str(score))


    del model, x0, x1, y0, y1



validation score: 0.5917
test score: -0.0284
validation score: -0.345
test score: 0.0567




validation score: 0.3501
test score: 0.0465
validation score: 0.05
test score: 0.0204
validation score: 0.1763
test score: 0.0393


In [62]:
import numpy as np
from scipy.special import comb
from itertools import combinations

class CombinatorialPurgedGroupKFold():
    def __init__(self, n_splits = 6, n_test_splits = 2, purge = 1, pctEmbargo = 0.01, **kwargs):
        self.n_splits = n_splits
        self.n_test_splits = n_test_splits
        self.purge = purge
        self.pctEmbargo = pctEmbargo
        
    def split(self, X, y = None, groups = None):
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
            
        u, ind = np.unique(groups, return_index = True)
        unique_groups = u[np.argsort(ind)]
        n_groups = len(unique_groups)
        group_dict = {}
        for idx in range(len(X)):
            if groups[idx] in group_dict:
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
                
        n_folds = comb(self.n_splits, self.n_test_splits, exact = True)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
            
        mbrg = int(n_groups * self.pctEmbargo)
        if mbrg < 0:
            raise ValueError(
                "The number of 'embargoed' groups should not be negative")
        
        split_dict = {}
        group_test_size = n_groups // self.n_splits
        for split in range(self.n_splits):
            if split == self.n_splits - 1:
                split_dict[split] = unique_groups[int(split * group_test_size):].tolist()
            else:
                split_dict[split] = unique_groups[int(split * group_test_size):int((split + 1) * group_test_size)].tolist()
        
        for test_splits in combinations(range(self.n_splits), self.n_test_splits):
            test_groups = []
            banned_groups = []
            for split in test_splits:
                test_groups += split_dict[split]
                banned_groups += unique_groups[split_dict[split][0] - self.purge:split_dict[split][0]].tolist()
                banned_groups += unique_groups[split_dict[split][-1] + 1:split_dict[split][-1] + self.purge + mbrg + 1].tolist()
            train_groups = [i for i in unique_groups if (i not in banned_groups) and (i not in test_groups)]

            train_idx = []
            test_idx = []
            for train_group in train_groups:
                train_idx += group_dict[train_group]
            for test_group in test_groups:
                test_idx += group_dict[test_group]
            yield train_idx, test_idx

In [63]:
n_splits = 6
n_test_splits = 1
elements = list(range(10 * (n_splits + n_test_splits)))
groups = [element // n_splits for element in elements]
data = pd.DataFrame({"group": groups, "element": elements})
kfold = CombinatorialPurgedGroupKFold(n_splits, n_test_splits)
for index, (train_indices, test_indices) in enumerate(kfold.split(data, groups=data["group"])):
    print("=" * 100)
    print(f"Fold {index}")
    print("=" * 100)
    print("Train indices:", train_indices, "Length:", len(train_indices))
    print("Test Indices:", test_indices, "Length:", len(test_indices))

Fold 0
Train indices: [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69] Length: 52
Test Indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] Length: 12
Fold 1
Train indices: [0, 1, 2, 3, 4, 5, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69] Length: 46
Test Indices: [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23] Length: 12
Fold 2
Train indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69] Length: 46
Test Indices: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35] Length: 12
Fold 3
Train indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 2

In [64]:
n_splits = 5
n_test_splits = 1
kfold = CombinatorialPurgedGroupKFold(n_splits, n_test_splits)

for (ii, (id0, id1)) in enumerate(kfold.split(xtrain, groups = pd.DataFrame(time_train)['time_id'])):
    
    x0, x1 = xtrain.loc[id0], xtrain.loc[id1]
    y0, y1 = ytrain.loc[id0], ytrain.loc[id1]
    
    
    model = lgb.LGBMRegressor(**lgb_parameters)

    model.fit(x0, y0, eval_metric='rmse', eval_set=[(x0, y0), (x1, y1)],
              verbose= 250, early_stopping_rounds=100)
    
    val_preds = model.predict(x1)
    prv[:,ii] += model.predict(xtest) / CFG.nfolds
    
    # validation score    
    score = np.round(p(val_preds, y1)[0],4)
    res_vec[ii] = score
    print("validation score: " + str(score))

    # actual test performance
    test_preds = model.predict(xtest)
    score = np.round(p(test_preds, ytest)[0],4)
    print("test score: " + str(score))


    del model, x0, x1, y0, y1



validation score: 0.061
test score: -0.0703




validation score: 0.0984
test score: -0.0174




validation score: 0.0548
test score: 0.0358




validation score: 0.0959
test score: 0.0503




validation score: -0.0073
test score: -0.0249
