In [1]:
import numpy as np
import pandas as pd

from finance_ml.model_selection import PurgedKFold

In [30]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import Pipeline

class MyPipeline(Pipeline):
    """Pipeline whose ``fit`` also accepts a top-level ``sample_weight``.

    The weight is re-keyed as ``'<last step name>__sample_weight'`` and
    handed to ``Pipeline.fit`` together with the other fit parameters.
    """

    def fit(self, X, y, sample_weight=None, **fit_params):
        """Fit the pipeline, routing ``sample_weight`` to the last step.

        Parameters
        ----------
        X, y : passed through unchanged to ``Pipeline.fit``.
        sample_weight : optional; when not None it is stored in
            ``fit_params`` under ``'<last step name>__sample_weight'``,
            replacing any value already present under that key.
        **fit_params : additional step-prefixed fit parameters.

        Returns
        -------
        Whatever ``Pipeline.fit`` returns.
        """
        if sample_weight is not None:
            final_step_name = self.steps[-1][0]
            weight_key = final_step_name + '__sample_weight'
            fit_params[weight_key] = sample_weight
        return super(MyPipeline, self).fit(X, y, **fit_params)


def clf_hyper_fit(feat, label, t1, pipe_clf, search_params, scoring=None,
                  n_splits=3, bagging=[0, None, 1.],
                  rnd_search_iter=0, n_jobs=-1, pct_embargo=0., **fit_params):
    """Tune ``pipe_clf`` with purged k-fold CV and optionally bag the winner.

    Parameters
    ----------
    feat : features, passed as X to the search and to the bagged ensemble.
    label : target; must expose ``.values`` (used to pick a default scorer).
    t1 : forwarded to ``PurgedKFold(t1=...)``.
    pipe_clf : estimator to tune; its best version must expose ``.steps``
        when bagging is enabled.
    search_params : parameter grid (grid search) or parameter
        distributions (randomized search).
    scoring : scorer name. When None, ``'f1'`` is used if the label values
        are exactly {0, 1}, otherwise ``'neg_log_loss'``.
    n_splits : forwarded to ``PurgedKFold(n_splits=...)``.
    bagging : sequence ``(n_estimators, max_samples, max_features)``.
        Bagging runs only when ``n_estimators > 0``. A ``max_samples`` of
        None falls back to 1.0. The default list is never mutated here.
    rnd_search_iter : 0 selects ``GridSearchCV``; any other value selects
        ``RandomizedSearchCV`` and is used as its ``n_iter``.
    n_jobs : forwarded to the search and to ``BaggingClassifier``.
    pct_embargo : forwarded to ``PurgedKFold(pct_embargo=...)``.
    **fit_params : forwarded to the search's ``fit``. The entry
        ``'<last step name>__sample_weight'``, if present, is also used as
        the sample weight when fitting the bagged ensemble.

    Returns
    -------
    The search's ``best_estimator_``, or, when bagging is enabled, a
    ``Pipeline`` with a single step ``'bag'`` holding the fitted
    ``BaggingClassifier``.
    """
    # Set default value for scoring
    if scoring is None:
        if set(label.values) == {0, 1}:
            scoring = 'f1'
        else:
            scoring = 'neg_log_loss'
    # Hyper-parameter search on training data
    # NOTE(review): `iid=` (here) and `base_estimator=` (below) are kept because
    # the file targets an older scikit-learn; newer releases dropped/renamed
    # them -- confirm against the installed version before upgrading.
    inner_cv = PurgedKFold(n_splits=n_splits, t1=t1, pct_embargo=pct_embargo)
    if rnd_search_iter == 0:
        search = GridSearchCV(estimator=pipe_clf, param_grid=search_params,
                              scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False)
    else:
        # n_iter must be passed explicitly; otherwise rnd_search_iter is
        # silently ignored and the library default is used.
        search = RandomizedSearchCV(estimator=pipe_clf, param_distributions=search_params,
                                    scoring=scoring, cv=inner_cv, n_jobs=n_jobs, iid=False,
                                    n_iter=rnd_search_iter)
    best_pipe = search.fit(feat, label, **fit_params).best_estimator_
    # Fit validated model on the entirety of the data
    if bagging[0] > 0:
        # None means "not specified": use BaggingClassifier's own default of 1.0
        # instead of crashing on float(None).
        max_samples = 1. if bagging[1] is None else float(bagging[1])
        bag_est = BaggingClassifier(base_estimator=MyPipeline(best_pipe.steps),
                                   n_estimators=int(bagging[0]), max_samples=max_samples,
                                   max_features=float(bagging[2]), n_jobs=n_jobs)
        # Sample weights are optional: .get() yields None when none were given.
        weight_key = best_pipe.steps[-1][0] + '__sample_weight'
        # Fit the bagging ensemble itself (not the un-bagged pipeline).
        bag_est = bag_est.fit(feat, label, sample_weight=fit_params.get(weight_key))
        best_pipe = Pipeline([('bag', bag_est)])
    return best_pipe

In [3]:
from scipy.stats import rv_continuous


class LogUniformGen(rv_continuous):
    # Log-uniform distribution on the support [self.a, self.b].
    # Only the CDF is defined here; the remaining methods come from
    # rv_continuous.

    def _cdf(self, x):
        """Return log(x / a) / log(b / a), the log-uniform CDF at ``x``."""
        log_offset = np.log(x / self.a)
        log_span = np.log(self.b / self.a)
        return log_offset / log_span
    
def log_uniform(a=1, b=np.exp(1)):
    """Create a ``LogUniformGen`` distribution object with bounds ``a`` and ``b``.

    Parameters
    ----------
    a : lower bound, passed to ``LogUniformGen(a=...)``. Defaults to 1.
    b : upper bound, passed to ``LogUniformGen(b=...)``. Defaults to e.

    Returns
    -------
    A ``LogUniformGen`` instance named ``'log_uniform'``.
    """
    distribution = LogUniformGen(a=a, b=b, name='log_uniform')
    return distribution

In [4]:
# Draw a sample from the log-uniform distribution bounded by a and b.
a = 1e-3
b = 1e3
size = 10000
# Seed the draw so that re-running the notebook reproduces the same sample.
vals = log_uniform(a=a, b=b).rvs(size=size, random_state=0)

In [5]:
# Confirm that one value was drawn per requested sample.
vals.shape

(10000,)

# 9.1

In [7]:
from finance_ml.datasets import get_cls_data

In [14]:
# Build the data set used by the searches below: 10 features requested,
# 5 informative and 0 redundant, 10000 samples.
# Later cells read label['bin'] as the target and label['t1'] for the purged CV.
# NOTE(review): no seed is passed, so the data may differ between runs --
# check whether get_cls_data accepts one.
X, label = get_cls_data(n_features=10, n_informative=5, n_redundant=0, n_samples=10000)
# Preview the first rows of the features and of the label frame.
print(X.head())
print(label.head())

                                 I_0       I_1       I_2       I_3       I_4  \
1980-03-17 07:14:33.589988  2.105359  2.861661  0.104159  0.686149  1.369429   
1980-03-18 07:14:33.589988 -0.330754  1.464379 -1.405119  0.396713 -1.722305   
1980-03-19 07:14:33.589988 -0.461334 -0.160432 -2.169501 -0.137535  0.398229   
1980-03-20 07:14:33.589988 -1.573667  3.110105  0.073939  1.232501  1.069429   
1980-03-21 07:14:33.589988  0.528677  1.538982 -1.603758  2.056413  0.777722   

                                 N_0       N_1       N_2       N_3       N_4  
1980-03-17 07:14:33.589988 -0.868903 -1.297125 -0.160205 -0.481024  0.841338  
1980-03-18 07:14:33.589988  0.471952 -1.443687 -0.433773  0.123114 -0.102970  
1980-03-19 07:14:33.589988 -0.278979 -1.860566  0.909540 -0.396742  2.455228  
1980-03-20 07:14:33.589988  0.700720 -1.097145  0.157145 -1.699373  1.167458  
1980-03-21 07:14:33.589988 -0.644594 -0.304476  0.682256 -0.644368  0.280994  
                            bin       w      

In [28]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# 9.1: exhaustive grid search over C and gamma for an RBF-kernel SVC.
name = 'svc'
kernel = 'rbf'
params_grid = {
    name + '__C': [1e-2, 1e-1, 1, 10, 100],
    name + '__gamma': [1e-2, 1e-1, 1, 10, 100],
}
fit_params = {}

# probability=True so the classifier can supply the probability estimates
# that the 'neg_log_loss' scorer works from.
clf = SVC(probability=True, kernel=kernel)
pipe_clf = Pipeline([(name, clf)])

# rnd_search_iter=0 selects the GridSearchCV branch of clf_hyper_fit;
# bagging[0] == 0 leaves bagging switched off.
clf = clf_hyper_fit(
    X, label['bin'], t1=label['t1'], pipe_clf=pipe_clf,
    search_params=params_grid, scoring='neg_log_loss', n_splits=3,
    bagging=[0, None, 1.], rnd_search_iter=0, n_jobs=-1, pct_embargo=0.,
    **fit_params
)

# 9.2

In [31]:
# 9.2: randomized search, drawing C and gamma from log-uniform distributions.
name = 'svc'
params_dist = {name + '__C': log_uniform(a=1e-2, b=1e2),
               name + '__gamma': log_uniform(a=1e-2, b=1e2)}
kernel = 'rbf'
clf = SVC(kernel=kernel, probability=True)
pipe_clf = Pipeline([(name, clf)])
fit_params = dict()

# A non-zero rnd_search_iter selects the RandomizedSearchCV branch of
# clf_hyper_fit. Pass the distributions built above (params_dist), not the
# discrete grid left over from the 9.1 cell.
clf = clf_hyper_fit(X, label['bin'], t1=label['t1'], pipe_clf=pipe_clf, scoring='neg_log_loss',
                    search_params=params_dist, n_splits=3, bagging=[0, None, 1.],
                    rnd_search_iter=25, n_jobs=-1, pct_embargo=0., **fit_params)