In [1]:
import os
import pandas as pd

from embargo_purge import get_embargo_table, embargo, purge

In [3]:
# Load the sample event table; the first CSV column is used as the index
# (it is displayed below under the name 't0').
raw_data = pd.read_csv(os.path.join('../_data/', 'AFML Ch7.csv'),
                       index_col=0)

# Keep only the 't1' column as a Series: index = 't0', values = 't1'.
# NOTE(review): the output shows dtype object, i.e. the dates were not parsed
# to datetimes here - confirm that downstream comparisons expect strings.
all_times = raw_data['t1']
all_times

t0
2017-01-02    2017-02-12
2017-02-03    2017-03-13
2017-03-04    2017-04-14
2017-04-05    2017-05-15
2017-05-06    2017-06-16
2017-06-07    2017-07-17
2017-07-08    2017-08-18
2017-08-09    2017-09-19
2017-09-10    2017-10-11
2017-10-11    2017-11-12
Name: t1, dtype: object

In [4]:
# Use the rows at positions 4, 5 and 6 of all_times as the test fold.
test_times = all_times.iloc[4:7]
test_times

t0
2017-05-06    2017-06-16
2017-06-07    2017-07-17
2017-07-08    2017-08-18
Name: t1, dtype: object

In [5]:
# Purge without embargo. Per the output below, this removes the three test
# rows plus 2017-04-05 and 2017-08-09, whose [t0, t1] spans overlap the
# test window (2017-05-06 .. 2017-08-18).
purged_train_times = purge(all_times, test_times)
purged_train_times

t0
2017-01-02    2017-02-12
2017-02-03    2017-03-13
2017-03-04    2017-04-14
2017-09-10    2017-10-11
2017-10-11    2017-11-12
Name: t1, dtype: object

In [6]:
# Build the embargo lookup from the event start times.
# In the output below each t0 maps to the t0 two rows later, and the last
# rows are clamped to the final t0 (2017-10-11) - presumably 2 = 20% of the
# 10 rows; verify against get_embargo_table.
embargo_table = get_embargo_table(all_times.index, embargo_pct=.2)
embargo_table

t0
2017-01-02    2017-03-04
2017-02-03    2017-04-05
2017-03-04    2017-05-06
2017-04-05    2017-06-07
2017-05-06    2017-07-08
2017-06-07    2017-08-09
2017-07-08    2017-09-10
2017-08-09    2017-10-11
2017-09-10    2017-10-11
2017-10-11    2017-10-11
dtype: object

In [7]:
# Apply the embargo only. Per the output below, the test rows and the two
# rows that follow them (2017-08-09, 2017-09-10) are removed, while
# 2017-04-05 (which overlaps the start of the test window) is still present.
embargoed_times = embargo(all_times, test_times, embargo_table)
embargoed_times

t0
2017-01-02    2017-02-12
2017-02-03    2017-03-13
2017-03-04    2017-04-14
2017-04-05    2017-05-15
2017-10-11    2017-11-12
Name: t1, dtype: object

In [8]:
# Purge the already-embargoed set; the output below additionally drops
# 2017-04-05 compared with embargoed_times.
# NOTE(review): this rebinds purged_train_times, which In [5] used for the
# purge-only result - that earlier value is no longer available after this cell.
purged_train_times = purge(embargoed_times, test_times)
purged_train_times

t0
2017-01-02    2017-02-12
2017-02-03    2017-03-13
2017-03-04    2017-04-14
2017-10-11    2017-11-12
Name: t1, dtype: object

In [None]:
"""
Hyperparameter tuning of a financial ML model with purged cross-validation (PCV).
Extends scikit-learn's k-fold cross-validation to "embargo" and "purge" the training set.
"""
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection._split import _BaseKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingClassifier


class PurgedKFold(_BaseKFold):
    """
    K-fold splitter for labels that span time intervals.

    For every contiguous test fold the training set is built from two parts:
      * left part: observations whose event end time is <= the start time of
        the test fold (anything still "open" when the test fold starts is purged);
      * right part: observations located after the last test event end time,
        shifted further right by the embargo (a number of bars).
    Folds are never shuffled, so each test set is one contiguous block.
    """
    def __init__(self, n_splits=3, event_times=None, embargo_pct=.01):
        """
        Args:
            n_splits(int): number of folds
            event_times(Series): end time of each event, indexed by its start time
            embargo_pct(float): fraction of the number of bars to embargo after each test fold

        Raises:
            AssertionError: if event_times is not a pandas Series (the default None is rejected)
        """
        assert isinstance(event_times, pd.Series), 'event_times must be a pandas Series!!!'
        super(PurgedKFold, self).__init__(n_splits, shuffle=False, random_state=None)
        self.event_times = event_times
        self.embargo_pct = embargo_pct

    def split(self, X, y=None, groups=None):
        """
        Generate training/test indices for PCV.
        Args:
            X(DataFrame): Features for ML; its index must equal event_times.index
            y(Series): Labels for ML (unused, kept for consistency with other objects in pipeline)
            groups: unused, kept for scikit-learn API compatibility

        Returns: generator of (train_indices, test_indices) arrays of integer positions

        Raises:
            ValueError: if X.index and event_times.index do not match element-wise
        """
        if (X.index == self.event_times.index).sum() != len(self.event_times):
            raise ValueError('X and ThruDateValues must have the same index.')

        n_samples = X.shape[0]
        indices = np.arange(n_samples)
        # Embargo width expressed in bars (truncated towards zero)
        mbrg = int(n_samples * self.embargo_pct)
        # Half-open [start, stop) positional range of every contiguous test fold
        test_ranges = [(fold[0], fold[-1] + 1)
                       for fold in np.array_split(indices, self.n_splits)]

        start_times = self.event_times.index
        # NOTE(review): searchsorted below assumes the index is sorted ascending - confirm
        for start, stop in test_ranges:
            t0 = start_times[start]  # start of test set
            test_indices = indices[start:stop]

            # test_indices are integer positions, so select with .iloc. Plain []
            # would be label-based on an integer index and relies on a
            # positional fallback on any other index type.
            max_t1 = self.event_times.iloc[test_indices].max()
            max_t1_idx = start_times.searchsorted(max_t1)

            # Left train set: events that have already ended when the test fold starts
            left_events = self.event_times[self.event_times <= t0]
            train_indices = start_times.searchsorted(left_events.index)

            # Right train set: everything after the last test end time plus the
            # embargo; the slice is simply empty when that runs past the data
            train_indices = np.concatenate((train_indices, indices[max_t1_idx + mbrg:]))
            yield train_indices, test_indices


def cv_score(clf, X, y, sample_weight, scoring='neg_log_loss',
             event_times=None, n_splits=None, cv_gen=None, embargo_pct=.01):
    """
    Score a classifier fold by fold, using PurgedKFold unless a splitter is given.
    Written as a replacement for scikit-learn's cross_val_score(), see
    https://github.com/scikit-learn/scikit-learn/issues/9144

    Args:
        clf(Pipeline): classifier; its fit() must accept a sample_weight keyword.
            The same object is refit on every fold.
        X(DataFrame): features
        y(Series): labels
        sample_weight(Series or array-like or None): one weight per row of X, in row
            order; used for fitting and for scoring. None means uniform weights.
        scoring(str): performance metric, 'neg_log_loss' or 'accuracy'
        event_times(Series): endtime of each event (only used when cv_gen is None)
        n_splits(int): number of folds (only used when cv_gen is None)
        cv_gen: k-fold cross-validation object; built from the arguments above when None
        embargo_pct(float): percentage of bars to embargo (only used when cv_gen is None)

    Returns:
        ndarray: one score per fold, in the order produced by cv_gen.split()

    Raises:
        ValueError: if scoring is not one of the supported metrics
    """
    if scoring not in ('neg_log_loss', 'accuracy'):
        # ValueError is a subclass of Exception, so existing handlers still catch it
        raise ValueError('wrong scoring method.')

    if cv_gen is None:
        cv_gen = PurgedKFold(n_splits=n_splits, event_times=event_times,
                             embargo_pct=embargo_pct)  # Also embargoed of course

    # Work on a plain positional array so Series, lists and ndarrays all behave the same
    if sample_weight is None:
        weights = np.ones(X.shape[0])
    else:
        weights = np.asarray(sample_weight)

    scores = []
    for train, test in cv_gen.split(X=X):
        fit = clf.fit(X=X.iloc[train, :], y=y.iloc[train],
                      sample_weight=weights[train])
        if scoring == 'neg_log_loss':
            prob = fit.predict_proba(X.iloc[test, :])
            # labels= keeps the probability columns aligned with the classes
            # learnt during fit, even if a test fold lacks one of them
            fold_score = -log_loss(y.iloc[test], prob,
                                   sample_weight=weights[test], labels=clf.classes_)
        else:
            pred = fit.predict(X.iloc[test, :])
            fold_score = accuracy_score(y.iloc[test], pred,
                                        sample_weight=weights[test])
        scores.append(fold_score)
    return np.array(scores)


class WeightedPipeline(Pipeline):
    """
    Pipeline whose fit() takes a top-level sample_weight argument.
    The weights are routed to the last step of the pipeline.
    Used in clf_hyper_fit() when bagging is done.
    """
    def fit(self, X, y, sample_weight=None, **fit_params):
        """
        Fit the pipeline, forwarding sample_weight (when given) to the final step.

        Args:
            X: features, passed through to Pipeline.fit
            y: labels, passed through to Pipeline.fit
            sample_weight: optional weights; sent on as '<last step name>__sample_weight'
            **fit_params: further step-prefixed fit parameters

        Returns: whatever Pipeline.fit returns
        """
        if sample_weight is None:
            # Nothing to route: behave exactly like a plain Pipeline
            return super(WeightedPipeline, self).fit(X, y, **fit_params)

        final_step_name = self.steps[-1][0]
        fit_params[final_step_name + '__sample_weight'] = sample_weight
        return super(WeightedPipeline, self).fit(X, y, **fit_params)


def clf_hyper_fit(X, y, event_times, pipe_clf, param_grid, n_rand_iter=0, is_meta=False,
                  embargo_pct=.01, bagging=[0, None, 1.], n_splits=3, n_jobs=-1, **fit_params):
    """
    Perform GridSearchCV or RandomizedSearchCV with purged k-fold cross-validation

    Args:
        X(DataFrame): features
        y(Series): labels
        event_times(Series): endtime of each event
        pipe_clf(Pipeline): scikit-learn's pipeline
        param_grid(): grid (or distributions) of hyperparameters to search for tuning
        n_rand_iter(int): RandomSearchCV iterations to perform. If 0, GridSearchCV
        is_meta(bool): if True, meta-labeling (scored with 'f1' instead of 'neg_log_loss')
        embargo_pct(float): percentage of bars to embargo
        bagging(list): [n_estimators, max_samples, max_features] for BaggingClassifier.
            Bagging is skipped when max_samples is None or not > 0.
            The default list is only read, never mutated.
        n_splits(int): number of folds
        n_jobs(int): number of concurrent jobs
        **fit_params(): passed to the search's fit(); '<last step>__sample_weight',
            when present, is also used as sample_weight for the bagging fit

    Returns:
        search(Pipeline): the best performing estimator found by tuning; when bagging
        is enabled, a Pipeline with a single 'bag' step holding the fitted BaggingClassifier
    """
    # 'f1' for balancing recall vs precision; 'neg_log_loss' to be symmetric towards all labels
    scoring = 'f1' if is_meta else 'neg_log_loss'

    # Hyperparameter search on training data
    # NOTE(review): the `iid` argument was removed from the search classes in
    # scikit-learn 0.24 - confirm the pinned scikit-learn version before upgrading.
    purged_cv = PurgedKFold(n_splits=n_splits, event_times=event_times, embargo_pct=embargo_pct)
    if n_rand_iter == 0:
        search = GridSearchCV(estimator=pipe_clf, param_grid=param_grid,
                              scoring=scoring, cv=purged_cv, n_jobs=n_jobs, iid=False)
    else:
        search = RandomizedSearchCV(estimator=pipe_clf, param_distributions=param_grid,
                                    scoring=scoring, cv=purged_cv, n_jobs=n_jobs, iid=False,
                                    n_iter=n_rand_iter)

    best_pipe = search.fit(X, y, **fit_params).best_estimator_  # Pipeline object

    # Bagging is optional. The explicit None check matters: with the default
    # bagging=[0, None, 1.] a bare `None > 0` raises TypeError on Python 3.
    max_samples = bagging[1]
    if max_samples is not None and max_samples > 0:
        # Name of the final pipeline step, used to look up its sample weights
        last_step = best_pipe.steps[-1][0]
        # Fall back to unweighted bagging when no sample weights were supplied
        bag_weights = fit_params.get(last_step + '__sample_weight')

        # Fit validated model on the entirety of the data
        # NOTE(review): newer scikit-learn releases rename `base_estimator` to
        # `estimator` - confirm against the pinned version.
        bag = BaggingClassifier(
            base_estimator=WeightedPipeline(best_pipe.steps), n_estimators=int(bagging[0]),
            max_samples=float(max_samples), max_features=float(bagging[2]), n_jobs=n_jobs)
        bag = bag.fit(X, y, sample_weight=bag_weights)
        return Pipeline([('bag', bag)])

    return best_pipe