In [84]:
import numpy as np
import pandas as pd
from numpy.testing import assert_array_equal
from sklearn.model_selection import BaseCrossValidator
from sklearn.utils import shuffle
from IPython.display import display


In [196]:
def test_time_split_on_column(end_date, shuffle_data):

    date = pd.date_range(
        start='2020-01-01 00:00', end=end_date, freq='D'
    )
    n_samples = len(date)
    X = pd.DataFrame({'val': range(n_samples), 'date': date})
    y = pd.DataFrame(
        np.array([i % 2 for i in range(n_samples)])
    )

    if shuffle_data:
        X, y = shuffle(X, y, random_state=0)

    cv = MonthlySplit(time_col='date')

    # Test that train, test indices returned are integers and
    # data is correctly ordered
    n_splits = 0
    last_time = None
    print(X)
    for train, test in cv.split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        assert X_train['date'].max() < X_test['date'].min()
        assert X_train['date'].dt.month.nunique() == 1
        assert X_test['date'].dt.month.nunique() == 1
        assert X_train['date'].dt.year.nunique() == 1
        assert X_test['date'].dt.year.nunique() == 1
        if last_time is not None:
            assert X_test['date'].min() > last_time
        last_time = X_test['date'].max()
        n_splits += 1

    assert 'idx' not in X.columns

    assert n_splits == cv.get_n_splits(X, y)
    
def test_time_split(end_date, expected_splits, shuffle_data):

    date = pd.date_range(start='2020-01-01', end=end_date, freq='D')
    n_samples = len(date)
    X = pd.DataFrame(range(n_samples), index=date, columns=['val'])
    y = pd.DataFrame(
        np.array([i % 2 for i in range(n_samples)]),
        index=date
    )

    if shuffle_data:
        X, y = shuffle(X, y, random_state=0)

    X_1d = X['val']

    cv = MonthlySplit()
    cv_repr = "MonthlySplit(time_col='index')"

    # Test if the repr works without any errors
    assert cv_repr == repr(cv)

    # Test if get_n_splits works correctly
    assert cv.get_n_splits(X, y) == expected_splits

    # Test if the cross-validator works as expected even if
    # the data is 1d
    #print('2D')
    #print(list(cv.split(X, y)) == list(cv.split(X_1d, y) ))
    #print()
    #print('1D')
    #display(list(cv.split(X_1d, y)))
    np.testing.assert_equal(
        list(cv.split(X, y)), list(cv.split(X_1d, y))
    )

    # Test that train, test indices returned are integers and
    # data is correctly ordered
    for train, test in cv.split(X, y):
        assert np.asarray(train).dtype.kind == "i"
        assert np.asarray(test).dtype.kind == "i"

        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        assert X_train.index.max() < X_test.index.min()
        assert y_train.index.max() < y_test.index.min()
        assert X.index.equals(y.index)

    #with pytest.raises(ValueError, match='datetime'):
        cv = MonthlySplit(time_col='val')
        next(cv.split(X, y))


In [220]:
class MonthlySplit(BaseCrossValidator):
    """CrossValidator based on monthly split.

    Split data based on the given `time_col` (or default to index). Each split
    corresponds to one month of data for the training and the next month of
    data for the test.

    Parameters
    ----------
    time_col : str, defaults to 'index'
        Column of the input DataFrame that will be used to split the data. This
        column should be of type datetime. If split is called with a DataFrame
        for which this column is not a datetime, it will raise a ValueError.
        To use the index as column just set `time_col` to `'index'`.
    """

    def __init__(self, time_col='index'):  # noqa: D107
        self.time_col = time_col

    def get_n_splits(self, X, y=None, groups=None):
        """Return the number of splitting iterations in the cross-validator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            The number of splits.
        """
        # if self.time_col == 'index':
        #     if not isinstance(X.index, pd.DatetimeIndex):
        #         raise ValueError("The index should be of type datetime")
        #     time_col = X.index
        # else:
        #     if not isinstance(X[self.time_col].dtype, pd.DatetimeTZDtype):
        #         raise ValueError(f"The column {self.time_col} should be of type datetime")
        #     time_col = X[self.time_col]
        # return len(time_col.to_period("M").unique())-1
        X_grouped = X.copy()
        if self.time_col !='index' :
            X_grouped = X.set_index(self.time_col)
        else :
            X_grouped = X.copy()
        X_grouped = X_grouped.resample('M').mean()
        return X_grouped.shape[0]-1

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.

        Yields
        ------
        idx_train : ndarray
            The training set indices for that split.
        idx_test : ndarray
            The testing set indices for that split.
        """
        X_indiced = X.copy()
        print('X : ', type(X))
        if isinstance(X, pd.core.series.Series) :
            X_indiced = pd.DataFrame(X)
            #display(X_indiced)
        #
        if self.time_col !='index' :
            X_indiced= X_indiced.set_index(self.time_col)
            if not isinstance(X_indiced.index, pd.core.indexes.datetimes.DatetimeIndex) :
                raise ValueError('This column should be at the datetime format.')
        else : 
            X_indiced = X_indiced.copy()
        #display(X_indiced)
        print(self.time_col)
        display( X_indiced)
        display(X)
        X_grouped_dates = X_indiced.resample('M').mean().sort_index().index
        X_indiced['idx'] = np.arange(X.shape[0])
        n_splits = self.get_n_splits(X, y, groups)
       
        #print('type X : ', type(X))
        for i in range(n_splits):   
            #print(type(X_indiced.index))
            #if not isinstance(X_indiced.index, pd.core.indexes.datetimes.DatetimeIndex) :
            #    print( X_indiced.index)
            #    X_indiced['datetime'] =  pd.to_datetime(X_indiced.index)
            #    X_indiced.set_index('datetime')
                
            idx_train = X_indiced[(X_indiced.index.month==X_grouped_dates[i].month) & (X_indiced.index.year==X_grouped_dates[i].year)]['idx'].values
            idx_test = X_indiced[(X_indiced.index.month==X_grouped_dates[i+1].month) & (X_indiced.index.year==X_grouped_dates[i+1].year)]['idx'].values
            #print('idx_train : ', idx_train)
            #print('first train : ', X.iloc[idx_train])
            yield (
                idx_train, idx_test
            )

In [221]:
end_date = '2021-01-31'
expected_splits = 12
shuffle_data = True
test_time_split(end_date, expected_splits, shuffle_data)

X :  <class 'pandas.core.frame.DataFrame'>
index


Unnamed: 0,val
2020-03-06,65
2020-05-12,132
2020-03-15,74
2020-03-19,78
2020-02-07,37
...,...
2020-11-19,323
2020-07-11,192
2020-04-27,117
2020-02-17,47


Unnamed: 0,val
2020-03-06,65
2020-05-12,132
2020-03-15,74
2020-03-19,78
2020-02-07,37
...,...
2020-11-19,323
2020-07-11,192
2020-04-27,117
2020-02-17,47


X :  <class 'pandas.core.series.Series'>
index


Unnamed: 0,val
2020-03-06,65
2020-05-12,132
2020-03-15,74
2020-03-19,78
2020-02-07,37
...,...
2020-11-19,323
2020-07-11,192
2020-04-27,117
2020-02-17,47


2020-03-06     65
2020-05-12    132
2020-03-15     74
2020-03-19     78
2020-02-07     37
             ... 
2020-11-19    323
2020-07-11    192
2020-04-27    117
2020-02-17     47
2020-06-21    172
Name: val, Length: 397, dtype: int64

X :  <class 'pandas.core.frame.DataFrame'>
index


Unnamed: 0,val
2020-03-06,65
2020-05-12,132
2020-03-15,74
2020-03-19,78
2020-02-07,37
...,...
2020-11-19,323
2020-07-11,192
2020-04-27,117
2020-02-17,47


Unnamed: 0,val
2020-03-06,65
2020-05-12,132
2020-03-15,74
2020-03-19,78
2020-02-07,37
...,...
2020-11-19,323
2020-07-11,192
2020-04-27,117
2020-02-17,47


X :  <class 'pandas.core.frame.DataFrame'>


ValueError: This column should be at the datetime format.

In [219]:
end_date= '2020-12-31'
shuffle_data = False 
test_time_split_on_column(end_date, shuffle_data)

     val       date
0      0 2020-01-01
1      1 2020-01-02
2      2 2020-01-03
3      3 2020-01-04
4      4 2020-01-05
..   ...        ...
361  361 2020-12-27
362  362 2020-12-28
363  363 2020-12-29
364  364 2020-12-30
365  365 2020-12-31

[366 rows x 2 columns]
X :  <class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
date


Unnamed: 0_level_0,val
date,Unnamed: 1_level_1
2020-01-01,0
2020-01-02,1
2020-01-03,2
2020-01-04,3
2020-01-05,4
...,...
2020-12-27,361
2020-12-28,362
2020-12-29,363
2020-12-30,364


Unnamed: 0,val,date
0,0,2020-01-01
1,1,2020-01-02
2,2,2020-01-03
3,3,2020-01-04
4,4,2020-01-05
...,...,...
361,361,2020-12-27
362,362,2020-12-28
363,363,2020-12-29
364,364,2020-12-30


In [102]:
 if self.time_col !='index' :
                #print( X[(X[self.time_col].dt.month==X_grouped_dates[i].month) & (X[self.time_col].dt.year==X_grouped_dates[i].year)])
                idx_train = X[(X[self.time_col].dt.month==X_grouped_dates[i].month) & (X[self.time_col].dt.year==X_grouped_dates[i].year)]
                idx_test = X[(X[self.time_col].dt.month==X_grouped_dates[i+1].month) & (X[self.time_col].dt.year==X_grouped_dates[i+1].year)]
                print('idx_train : ', np.sort(idx_train))
                print('first train : ', X.iloc[idx_train])
            else : 
                idx_train = X[(X.index.dt.month==X_grouped_dates[i].month) & (X.index.dt.year==X_grouped_dates[i].year)].index
                idx_test = X[(X.index.dt.month==X_grouped_dates[i+1].month) & (X.index.dt.year==X_grouped_dates[i+1].year)].index

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 7)