# TimeSeriesSplitByDate Demo

This notebook demonstrates `TimeSeriesSplitByDate` using a small synthetic dataset.

For each fold, it shows:
- train/test date ranges
- train/test sizes
- whether every training date precedes every test date (`no_leakage`)
- a simple text-based visual summary


In [None]:
import sys
from pathlib import Path

# Use the current working directory as the project root when it contains
# ``src``; otherwise fall back to its parent directory.
ROOT = Path.cwd()
ROOT = ROOT if ROOT.joinpath('src').exists() else ROOT.parent

# Put ``<ROOT>/src`` first on the import path so the package imported in the
# following cell is looked up there before anywhere else.
sys.path.insert(0, str(ROOT.joinpath('src')))


In [None]:
import pandas as pd

from timeseriessplitbydate import TimeSeriesSplitByDate, make_synthetic_time_series_dataset


In [None]:
# Name of the timestamp column; the splitter configurations further down
# reuse this constant.
DATE_COL = 'event_timestamp'

# Build the demo dataset: 40 samples, fixed seed, shuffling requested.
data = make_synthetic_time_series_dataset(
    date_col=DATE_COL,
    n_samples=40,
    shuffle=True,
    seed=21,
)

# A date column that is not monotonically increasing means the rows are not
# stored in chronological order.
print('Rows are shuffled:', not data[DATE_COL].is_monotonic_increasing)
data.head(8)


In [None]:
def summarize_splits(splitter, X: pd.DataFrame, date_col: str) -> pd.DataFrame:
    """Summarize every fold produced by ``splitter`` as one table row.

    Parameters
    ----------
    splitter
        Object exposing ``split(X)`` that yields ``(train_idx, test_idx)``
        pairs. The indices are interpreted as integer row *positions*, the
        scikit-learn splitter convention.
        NOTE(review): confirm ``TimeSeriesSplitByDate`` follows it.
    X
        Frame handed to ``splitter.split``; must contain ``date_col``.
    date_col
        Name of the column in ``X`` holding the dates.

    Returns
    -------
    pd.DataFrame
        One row per fold with the columns ``fold`` (1-based), ``train_size``,
        ``test_size``, ``train_start``, ``train_end``, ``test_start``,
        ``test_end`` and ``no_leakage`` (``True`` when the latest training
        date is strictly earlier than the earliest test date). The frame has
        no rows and no columns when the splitter yields no folds.
    """
    rows = []
    for fold, (train_idx, test_idx) in enumerate(splitter.split(X), start=1):
        # Positional lookup: a label-based ``.loc`` would select the wrong
        # rows whenever the index is not 0..n-1 in order (for example rows
        # that were shuffled but kept their original labels).
        dates = X[date_col]
        train_dates = dates.iloc[train_idx]
        test_dates = dates.iloc[test_idx]

        # Compute the boundary dates once; they feed both the reported range
        # and the leakage check.
        train_end = train_dates.max()
        test_start = test_dates.min()

        rows.append(
            {
                'fold': fold,
                'train_size': len(train_idx),
                'test_size': len(test_idx),
                'train_start': train_dates.min(),
                'train_end': train_end,
                'test_start': test_start,
                'test_end': test_dates.max(),
                'no_leakage': train_end < test_start,
            }
        )
    return pd.DataFrame(rows)


def print_fold_bars(summary: pd.DataFrame) -> None:
    """Print one text bar chart line per fold of ``summary``.

    Parameters
    ----------
    summary
        Frame with ``fold``, ``train_size`` and ``test_size`` columns, one
        row per fold.

    Notes
    -----
    Each bar character stands for ``largest_size // 20`` samples (at least
    one), so the longest bar is roughly 20 characters wide. Every bar is at
    least one character long, even for a size of zero. An empty ``summary``
    prints a single notice instead.
    """
    if summary.empty:
        print('No folds produced.')
        return

    # Samples represented by one bar character, never less than one.
    largest_size = summary[['train_size', 'test_size']].to_numpy().max()
    samples_per_char = max(largest_size // 20, 1)

    for record in summary.itertuples(index=False):
        train_bar = '#' * max(record.train_size // samples_per_char, 1)
        test_bar = '=' * max(record.test_size // samples_per_char, 1)
        train_part = f'train {record.train_size:>3}: {train_bar}'
        test_part = f'test {record.test_size:>3}: {test_bar}'
        print(f'Fold {record.fold:>2} | {train_part} | {test_part}')


In [None]:
# Every configuration shares n_splits=4, the same date column and gap=1; only
# the split unit (and, for weekly splits, the week start) varies.
configs = [
    (name, TimeSeriesSplitByDate(n_splits=4, date_col=DATE_COL, gap=1, **unit_kwargs))
    for name, unit_kwargs in (
        ('days', {'split_by': 'days'}),
        ('weeks (monday start)', {'split_by': 'weeks', 'week_start': 'monday'}),
        ('weeks (sunday start)', {'split_by': 'weeks', 'week_start': 'sunday'}),
        ('months', {'split_by': 'months'}),
    )
]

# For each configuration: print a header, show the per-fold summary table,
# then print the text bar chart.
for label, splitter in configs:
    print(f'\n=== {label.upper()} ===')
    summary = summarize_splits(splitter, data, DATE_COL)
    display(summary)
    print_fold_bars(summary)
