In [16]:
from IPython.core.interactiveshell import InteractiveShell
# Notebook display setting: ask IPython to echo the value of every top-level
# expression in a cell instead of only the last one (the demo cells below rely
# on this to show both `history.head()` and `history.tail()`).
InteractiveShell.ast_node_interactivity = "all"

In [59]:
import logging
from copy import deepcopy
from functools import reduce

import numpy as np
import pandas as pd
import warnings
from sklearn.utils import indexable
from sklearn.model_selection._split import _BaseKFold,_num_samples

logger = logging.getLogger('backtestlogger')

"""
ToDo:
1. Add option to anchor splits to a specified day of week. For example, train every Sunday and predict for the next week 
"""


def generate_cutoffs(df, horizon, initial, period):
    """Generate the cutoff dates of a backtest, oldest first.

    The last cutoff is ``df['ds'].max() - horizon``. Earlier cutoffs are found
    by stepping backwards ``period`` at a time for as long as the cutoff stays
    at or after ``df['ds'].min() + initial``. When a step lands on a stretch
    with no observations in ``(cutoff, cutoff + horizon]``, the cutoff jumps to
    ``horizon`` before the closest earlier observation instead.

    Parameters
    ----------
    df: pd.DataFrame with historical data; the dates are read from column 'ds'.
    horizon: pd.Timedelta forecast horizon.
    initial: pd.Timedelta window of the initial forecast period.
    period: pd.Timedelta simulated forecasts are done with this period.
        Must be strictly positive.

    Returns
    -------
    list of pd.Timestamp, in ascending order.

    Raises
    ------
    ValueError
        If ``period`` is not positive, if the data span less than ``horizon``,
        or if no cutoff is left once ``initial`` is accounted for.
    """
    # A zero or negative step would never move the cutoff backwards and the
    # loop below would not terminate.
    if period <= pd.Timedelta(0):
        raise ValueError('period must be a positive Timedelta.')

    dates = df['ds']
    first_date = dates.min()
    last_date = dates.max()

    # Last cutoff is 'latest date in data - horizon' date
    cutoff = last_date - horizon
    if cutoff < first_date:
        raise ValueError('Less data than horizon.')

    earliest_allowed = first_date + initial
    result = [cutoff]
    while result[-1] >= earliest_allowed:
        cutoff -= period
        # If data does not exist in data range (cutoff, cutoff + horizon]
        has_data = ((dates > cutoff) & (dates <= cutoff + horizon)).any()
        if not has_data and cutoff > first_date:
            # Next cutoff point is 'last date before cutoff in data - horizon'
            closest_date = dates[dates <= cutoff].max()
            cutoff = closest_date - horizon
        # else no data left, leave cutoff as is, it will be dropped.
        result.append(cutoff)

    # The loop always overshoots by one cutoff that violates `initial`.
    result = result[:-1]
    if not result:
        raise ValueError(
            'Less data than horizon after initial window. '
            'Make horizon or initial shorter.'
        )

    # Return a real list (not a one-shot iterator) so callers can take its
    # length, index it and iterate over it more than once.
    cutoffs = result[::-1]
    logger.info(
        'Making %s forecasts with cutoffs between %s and %s',
        len(cutoffs), cutoffs[0], cutoffs[-1]
    )
    return cutoffs


class BacktestSplit(_BaseKFold):
    r"""Cross-Validation for time series.

    Computes forecasts from historical cutoff points. Beginning from
    (end - horizon), works backwards making cutoffs with a spacing of period
    until initial is reached.
    When period is equal to the time interval of the data, this is the
    technique described in https://robjhyndman.com/hyndsight/tscv/ .

    Parameters
    ----------
    n_splits : int, default=3
        Always ignored by ``split``. Just for compatibility reasons. A warning
        is emitted when it is left at its default.
    roll_or_expand: str, default='roll'
        'roll' trains on the dates in ``[cutoff - initial, cutoff]``;
        'expand' trains on every date up to and including the cutoff.
    horizon: string with pd.Timedelta compatible style, e.g., '5 days',
        '3 hours', '10 seconds'. Default '7 days'.
    period: string with pd.Timedelta compatible style. Simulated forecast will
        be done at every this period. Default '7 days'; if None is passed,
        0.5 * horizon is used.
    initial: string with pd.Timedelta compatible style. The first training
        period will begin here. Default '3 months'; if None is passed,
        3 * horizon is used.
    cutoffs: iterable of cutoff dates, default=None
        Explicit cutoffs to use instead of deriving them from the dates. The
        iterable is copied into a list so it can be reused across calls.

    Raises
    ------
    ValueError
        If ``roll_or_expand`` is neither 'roll' nor 'expand'.

    Examples
    --------
    >>> from itertools import product
    >>> dates=pd.date_range('2019-01-01','2019-03-31')
    >>> assets=['AAPL','MSFT']
    >>> history=pd.DataFrame(product(dates,assets))
    >>> print ('Original Data')
    >>> history.head()
    >>> history.tail()
    >>> print('CV splits')
    >>> bts=BacktestSplit(roll_or_expand='roll',horizon='7 days',period=' 7 days',initial='31 days')
    >>> for train_index, test_index in bts.split(history,groups=history[0]):
    ...     print("TRAIN head:\n", history.loc[train_index].head())
    ...     print("TRAIN tail:\n", history.loc[train_index].tail())
    ...     print("Train size:\n",history.loc[train_index].shape)
    ...     print("TEST:\n", history.loc[test_index])

    """
    def __init__(self, n_splits='warn', roll_or_expand='roll', horizon='7 days', period='7 days', initial='3 months', cutoffs=None):
        if n_splits == 'warn':  # this looks like a required parameter
            warnings.warn('n_splits will be ignored')
            n_splits = 3
        super().__init__(n_splits, shuffle=False, random_state=None)
        # Reject typos early: any value other than 'roll' used to fall through
        # silently to the expanding-window branch of split().
        if roll_or_expand not in ('roll', 'expand'):
            raise ValueError(
                "roll_or_expand must be 'roll' or 'expand', got {!r}".format(roll_or_expand)
            )
        self.roll_or_expand = roll_or_expand
        self.horizon = pd.Timedelta(horizon)
        self.period = 0.5 * self.horizon if period is None else pd.Timedelta(period)
        # NOTE(review): confirm that the installed pandas can parse the default
        # initial='3 months'; month units are ambiguous for a Timedelta.
        self.initial = 3 * self.horizon if initial is None else pd.Timedelta(initial)
        # Materialise so a one-shot iterator is not exhausted by the first
        # split()/get_n_splits() call.
        self.cutoffs = None if cutoffs is None else list(cutoffs)

    def _resolve_cutoffs(self, groups):
        """Return the cutoffs to use: the explicit ones, else derived from the dates in ``groups``."""
        if self.cutoffs is not None:
            return self.cutoffs
        dates = pd.DataFrame(np.array(groups), columns=['ds'])
        return list(generate_cutoffs(dates, self.horizon, self.initial, self.period))

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of (train, test) pairs that ``split`` yields.

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.
        y : object
            Always ignored, exists for compatibility.
        groups : array-like, with shape (n_samples,), default=None
            Dates for each sample; needed to derive the cutoffs when none were
            given at construction time.

        Returns
        -------
        n_splits : int
            The number of cutoffs. When neither explicit cutoffs nor ``groups``
            are available the count cannot be derived and the placeholder
            ``n_splits`` attribute is returned instead.
        """
        if self.cutoffs is None and groups is None:
            return self.n_splits
        return len(self._resolve_cutoffs(groups))

    # TODO: if ElasticNetCV doesn't pass groups to the CV, you'll need to pass
    # groups at init
    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like, with shape (n_samples,)
            Dates for each sample. Equivalent to 'ds' column in the Prophet
            time series.

        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.

        Raises
        ------
        RuntimeError
            If ``groups`` is None (raised when iteration starts).
        """
        if groups is None:
            raise RuntimeError('Groups cannot be none')

        X, y, groups = indexable(X, y, groups)
        indices = np.arange(_num_samples(X))

        for cutoff in self._resolve_cutoffs(groups):
            # Test window is the half-open interval (cutoff, cutoff + horizon].
            in_test = (groups > cutoff) & (groups <= cutoff + self.horizon)
            if self.roll_or_expand == 'roll':
                # Rolling window: both ends inclusive.
                in_train = (groups >= cutoff - self.initial) & (groups <= cutoff)
            else:
                in_train = groups <= cutoff
            # np.asarray turns a pandas boolean mask into a positional one.
            yield (indices[np.asarray(in_train)], indices[np.asarray(in_test)])


'\nToDo:\n1. Add option to ancor splits to a specified day of week. For example, train every sunday and predict for the next week \n'

In [56]:
from itertools import product

# Demo: rolling-window backtest over a daily calendar with two tickers
# (one row per (date, asset) pair; column 0 holds the dates).
dates = pd.date_range('2019-01-01', '2019-03-31')
assets = ['AAPL', 'MSFT']
history = pd.DataFrame(product(dates, assets))

print('Original Data')
history.head()
history.tail()

print('CV splits')
bts = BacktestSplit(
    roll_or_expand='roll',
    horizon='7 days',
    period=' 7 days',
    initial='31 days',
)
for train_index, test_index in bts.split(history, groups=history[0]):
    train_rows = history.loc[train_index]
    print("TRAIN head:\n", train_rows.head())
    print("TRAIN tail:\n", train_rows.tail())
    print("Train size:\n", train_rows.shape)
    print("TEST:\n", history.loc[test_index])

Original Data


Unnamed: 0,0,1
0,2019-01-01,AAPL
1,2019-01-01,MSFT
2,2019-01-02,AAPL
3,2019-01-02,MSFT
4,2019-01-03,AAPL


Unnamed: 0,0,1
175,2019-03-29,MSFT
176,2019-03-30,AAPL
177,2019-03-30,MSFT
178,2019-03-31,AAPL
179,2019-03-31,MSFT


CV splits
TRAIN head:
            0     1
4 2019-01-03  AAPL
5 2019-01-03  MSFT
6 2019-01-04  AAPL
7 2019-01-04  MSFT
8 2019-01-05  AAPL
TRAIN tail:
             0     1
63 2019-02-01  MSFT
64 2019-02-02  AAPL
65 2019-02-02  MSFT
66 2019-02-03  AAPL
67 2019-02-03  MSFT
Train size:
 (64, 2)
TEST:
             0     1
68 2019-02-04  AAPL
69 2019-02-04  MSFT
70 2019-02-05  AAPL
71 2019-02-05  MSFT
72 2019-02-06  AAPL
73 2019-02-06  MSFT
74 2019-02-07  AAPL
75 2019-02-07  MSFT
76 2019-02-08  AAPL
77 2019-02-08  MSFT
78 2019-02-09  AAPL
79 2019-02-09  MSFT
80 2019-02-10  AAPL
81 2019-02-10  MSFT
TRAIN head:
             0     1
18 2019-01-10  AAPL
19 2019-01-10  MSFT
20 2019-01-11  AAPL
21 2019-01-11  MSFT
22 2019-01-12  AAPL
TRAIN tail:
             0     1
77 2019-02-08  MSFT
78 2019-02-09  AAPL
79 2019-02-09  MSFT
80 2019-02-10  AAPL
81 2019-02-10  MSFT
Train size:
 (64, 2)
TEST:
             0     1
82 2019-02-11  AAPL
83 2019-02-11  MSFT
84 2019-02-12  AAPL
85 2019-02-12  MSFT
86 2019-



In [57]:
from itertools import product

# Demo: same rolling-window backtest, but on a business-day calendar
# (freq='B'), so the date column has gaps.
dates = pd.date_range('2019-01-01', '2019-03-31', freq='B')
assets = ['AAPL', 'MSFT']
history = pd.DataFrame(product(dates, assets))

print('Original Data')
history.head()
history.tail()

print('CV splits')
bts = BacktestSplit(
    roll_or_expand='roll',
    horizon='7 days',
    period=' 7 days',
    initial='31 days',
)
for train_index, test_index in bts.split(history, groups=history[0]):
    train_rows = history.loc[train_index]
    print("TRAIN head:\n", train_rows.head())
    print("TRAIN tail:\n", train_rows.tail())
    print("Train size:\n", train_rows.shape)
    print("TEST:\n", history.loc[test_index])

Original Data


Unnamed: 0,0,1
0,2019-01-01,AAPL
1,2019-01-01,MSFT
2,2019-01-02,AAPL
3,2019-01-02,MSFT
4,2019-01-03,AAPL


Unnamed: 0,0,1
123,2019-03-27,MSFT
124,2019-03-28,AAPL
125,2019-03-28,MSFT
126,2019-03-29,AAPL
127,2019-03-29,MSFT


CV splits
TRAIN head:
            0     1
0 2019-01-01  AAPL
1 2019-01-01  MSFT
2 2019-01-02  AAPL
3 2019-01-02  MSFT
4 2019-01-03  AAPL
TRAIN tail:
             0     1
43 2019-01-30  MSFT
44 2019-01-31  AAPL
45 2019-01-31  MSFT
46 2019-02-01  AAPL
47 2019-02-01  MSFT
Train size:
 (48, 2)
TEST:
             0     1
48 2019-02-04  AAPL
49 2019-02-04  MSFT
50 2019-02-05  AAPL
51 2019-02-05  MSFT
52 2019-02-06  AAPL
53 2019-02-06  MSFT
54 2019-02-07  AAPL
55 2019-02-07  MSFT
56 2019-02-08  AAPL
57 2019-02-08  MSFT
TRAIN head:
             0     1
10 2019-01-08  AAPL
11 2019-01-08  MSFT
12 2019-01-09  AAPL
13 2019-01-09  MSFT
14 2019-01-10  AAPL
TRAIN tail:
             0     1
53 2019-02-06  MSFT
54 2019-02-07  AAPL
55 2019-02-07  MSFT
56 2019-02-08  AAPL
57 2019-02-08  MSFT
Train size:
 (48, 2)
TEST:
             0     1
58 2019-02-11  AAPL
59 2019-02-11  MSFT
60 2019-02-12  AAPL
61 2019-02-12  MSFT
62 2019-02-13  AAPL
63 2019-02-13  MSFT
64 2019-02-14  AAPL
65 2019-02-14  MSFT
66 2019-



In [65]:
from itertools import product

# Demo: expanding-window backtest on a business-day calendar with a
# 60-day initial training span.
dates = pd.date_range('2019-01-01', '2019-03-31', freq='B')
assets = ['AAPL', 'MSFT']
history = pd.DataFrame(product(dates, assets))

print('Original Data')
history.head()
history.tail()

print('CV splits')
bts = BacktestSplit(
    roll_or_expand='expand',
    horizon='7 days',
    period=' 7 days',
    initial='60 days',
)
for train_index, test_index in bts.split(history, groups=history[0]):
    train_rows = history.loc[train_index]
    print("TRAIN head:\n", train_rows.head())
    print("TRAIN tail:\n", train_rows.tail())
    print("Train size:\n", train_rows.shape)
    print("TEST:\n", history.loc[test_index])

Original Data


Unnamed: 0,0,1
0,2019-01-01,AAPL
1,2019-01-01,MSFT
2,2019-01-02,AAPL
3,2019-01-02,MSFT
4,2019-01-03,AAPL


Unnamed: 0,0,1
123,2019-03-27,MSFT
124,2019-03-28,AAPL
125,2019-03-28,MSFT
126,2019-03-29,AAPL
127,2019-03-29,MSFT


CV splits
TRAIN head:
            0     1
0 2019-01-01  AAPL
1 2019-01-01  MSFT
2 2019-01-02  AAPL
3 2019-01-02  MSFT
4 2019-01-03  AAPL
TRAIN tail:
             0     1
93 2019-03-06  MSFT
94 2019-03-07  AAPL
95 2019-03-07  MSFT
96 2019-03-08  AAPL
97 2019-03-08  MSFT
Train size:
 (98, 2)
TEST:
              0     1
98  2019-03-11  AAPL
99  2019-03-11  MSFT
100 2019-03-12  AAPL
101 2019-03-12  MSFT
102 2019-03-13  AAPL
103 2019-03-13  MSFT
104 2019-03-14  AAPL
105 2019-03-14  MSFT
106 2019-03-15  AAPL
107 2019-03-15  MSFT
TRAIN head:
            0     1
0 2019-01-01  AAPL
1 2019-01-01  MSFT
2 2019-01-02  AAPL
3 2019-01-02  MSFT
4 2019-01-03  AAPL
TRAIN tail:
              0     1
103 2019-03-13  MSFT
104 2019-03-14  AAPL
105 2019-03-14  MSFT
106 2019-03-15  AAPL
107 2019-03-15  MSFT
Train size:
 (108, 2)
TEST:
              0     1
108 2019-03-18  AAPL
109 2019-03-18  MSFT
110 2019-03-19  AAPL
111 2019-03-19  MSFT
112 2019-03-20  AAPL
113 2019-03-20  MSFT
114 2019-03-21  AAPL
115 2019

