In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the time-series dataset; the path is relative to the notebook's working directory.
df = pd.read_parquet("datasets/timeseries.parquet")

# EDA

## Take a Quick Look at the Data Structure

In [3]:
# Quick look at the first rows only: ten rows are enough to see the column
# layout, and a 100-row dump just bloats the saved notebook.
df.head(10)

Unnamed: 0_level_0,2yr_yield_90d_change,T10y2y_90d_change,high_yield_90d_change,vug_vtv_60d_ret,t1
build_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2006-01-03,0.105012,-2.0,0.036786,-0.017685,2006-03-30
2006-01-04,0.122249,-1.428571,0.038835,-0.024207,2006-03-31
2006-01-05,0.146949,-1.428571,0.042718,-0.023621,2006-04-03
2006-01-06,0.150432,-1.733333,0.052478,-0.026593,2006-04-04
2006-01-09,0.150432,-1.733333,0.048497,-0.028333,2006-04-05
2006-01-10,0.137255,-1.741935,0.05814,-0.024177,2006-04-06
2006-01-11,0.130435,-1.733333,0.073786,-0.022117,2006-04-07
2006-01-12,0.137184,-1.733333,0.07451,-0.025376,2006-04-10
2006-01-13,0.111922,-1.714286,0.05703,-0.026654,2006-04-11
2006-01-17,0.111922,-1.714286,0.051181,-0.022997,2006-04-12


In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,2yr_yield_90d_change,T10y2y_90d_change,high_yield_90d_change,vug_vtv_60d_ret
count,4188.0,4188.0,4188.0,4188.0
mean,-0.0002,-0.03269,-0.005079,0.005173
std,0.433406,0.714401,0.11609,0.04889
min,-1.647799,-2.0,-0.666667,-0.196183
25%,-0.169492,-0.261746,-0.078016,-0.019326
50%,0.029271,-0.066176,-0.015262,0.004006
75%,0.222222,0.225417,0.064,0.030814
max,1.338583,2.0,0.449213,0.223951


In [5]:
import matplotlib.pyplot as plt

# Font sizes applied to every figure drawn after this cell.
plt.rcParams.update({
    "font.size": 14,
    "axes.labelsize": 14,
    "axes.titlesize": 14,
    "legend.fontsize": 14,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
})

# One 50-bin histogram per remaining column; t1 is dropped first because it
# holds dates rather than a feature or the target.
df.drop(columns=["t1"]).hist(bins=50, figsize=(12, 8))
plt.show()

## Create a Test Set

In [6]:
import purged_kfold

In [7]:
# Split into train and test parts, passing the t1 column and a 0.2 fraction to
# the project's purged splitter.
# NOTE(review): the two parts add up to fewer rows than df (see the length
# check in the next cell) -- presumably rows whose t1 window straddles the
# split boundary are purged; confirm in purged_kfold.train_test_split.
df_train, df_test = purged_kfold.train_test_split(df, df["t1"], 0.2)

In [8]:
# Split sanity check: train rows, test rows, their sum, and rows in the full frame.
len(df_train), len(df_test), len(df_train) + len(df_test), len(df)

(3291, 838, 4129, 4188)

In [9]:
# Features: every training column except the target and t1.
train_X = df_train.drop(columns=["vug_vtv_60d_ret", "t1"])
# Target alone, for estimators fitted directly.
train_y = df_train["vug_vtv_60d_ret"]
# Target together with t1, the form handed to purged_kfold.cv_score.
train_Y = df_train[["vug_vtv_60d_ret", "t1"]]

## Discover and Visualize the Data to Gain Insights

In [10]:
# Pairwise correlations between the features and the target.
# t1 holds dates, so it is dropped explicitly (as the histogram cell already
# does): pandas >= 2.0 no longer discards non-numeric columns silently in
# DataFrame.corr(), so leaving it in is version-dependent.
corr_matrix = df_train.drop(columns=["t1"]).corr()

In [11]:
# Correlation of every column with the target, largest (most positive) first.
corr_matrix["vug_vtv_60d_ret"].sort_values(ascending=False)

vug_vtv_60d_ret          1.000000
T10y2y_90d_change        0.221856
high_yield_90d_change    0.029721
2yr_yield_90d_change    -0.223042
Name: vug_vtv_60d_ret, dtype: float64

In [12]:
from pandas.plotting import scatter_matrix

# Grid of pairwise scatter plots over the training frame.
scatter_matrix(frame=df_train, figsize=(12, 8))
plt.show()

In [13]:
# Target against the 2-year yield change; low alpha so dense regions stand out.
df_train.plot.scatter(x="2yr_yield_90d_change", y="vug_vtv_60d_ret", alpha=0.1, grid=True)
plt.show()

In [14]:
# Target against the 10y-2y spread change; low alpha so dense regions stand out.
df_train.plot.scatter(x="T10y2y_90d_change", y="vug_vtv_60d_ret", alpha=0.1, grid=True)
plt.show()

In [15]:
# Target against the high-yield change; low alpha so dense regions stand out.
df_train.plot.scatter(x="high_yield_90d_change", y="vug_vtv_60d_ret", alpha=0.1, grid=True)
plt.show()

## Transformation Pipelines

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


def preprocessing():
    """Build a new, unfitted preprocessing pipeline.

    Returns:
        A scikit-learn ``Pipeline`` with a single ``"standardize"`` step
        wrapping a ``StandardScaler``. A fresh object is created on every
        call, so each model gets its own scaler.
    """
    steps = [("standardize", StandardScaler())]
    return Pipeline(steps)

# Regression

## Select and Train a Model

In [22]:
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

# Baseline: standardized features into ordinary least squares, scored on the
# same data it was fitted on (in-sample fit only).
lin_reg = make_pipeline(preprocessing(), LinearRegression())
lin_reg.fit(train_X, train_y)
predictions = lin_reg.predict(train_X)
lin_r2 = r2_score(train_y, predictions)
# RMSE as sqrt(MSE): the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so this form runs on
# every version.
lin_rmse = np.sqrt(mean_squared_error(train_y, predictions))
lin_r2, lin_rmse

(0.1005627065415633, 0.031973454127755443)

In [69]:
def lin_reg_func():
    """Return a fresh, unfitted scaler + linear-regression pipeline."""
    return make_pipeline(preprocessing(), LinearRegression())


# Three-fold purged cross-validation of the linear baseline.
# NOTE(review): the values reported under "neg_mean_squared_error" are on the
# scale of the in-sample RMSE from the previous cell (~0.03), not of an MSE --
# confirm what purged_kfold.cv_score actually computes for that key.
scores = purged_kfold.cv_score(
    lin_reg_func,
    train_X,
    train_Y,
    n_folds=3,
    scoring=["neg_mean_squared_error", "r2_score"],
)
scores.agg(["mean", "std"])

Unnamed: 0,neg_mean_squared_error,r2_score
mean,-0.035193,-0.175435
std,0.005469,0.206774


In [70]:
from sklearn.ensemble import RandomForestRegressor

# In-sample fit of a default random forest. The seed is fixed (matching the
# cross-validated forest in the next cell) so a re-run reproduces these numbers.
forest_reg = make_pipeline(preprocessing(), RandomForestRegressor(random_state=42))
forest_reg.fit(train_X, train_y)
predictions = forest_reg.predict(train_X)
forest_r2 = r2_score(train_y, predictions)
# RMSE as sqrt(MSE): mean_squared_error(..., squared=False) was deprecated in
# scikit-learn 1.4 and removed in 1.6.
forest_rmse = np.sqrt(mean_squared_error(train_y, predictions))
forest_r2, forest_rmse

(0.9444760218916632, 0.007944094429669705)

In [71]:
def forest_reg():
    """Return a fresh, unfitted scaler + seeded random-forest pipeline."""
    return make_pipeline(preprocessing(), RandomForestRegressor(random_state=42))


# Three-fold purged cross-validation of the random forest.
# NOTE(review): this cell rebinds forest_reg (previously the fitted pipeline)
# to a factory and forest_rmse (previously a scalar) to the score table.
forest_rmse = purged_kfold.cv_score(
    forest_reg,
    train_X,
    train_Y,
    n_folds=3,
    scoring=["neg_mean_squared_error", "r2_score"],
)
forest_rmse.agg(["mean", "std"])

Unnamed: 0,neg_mean_squared_error,r2_score
mean,-0.040982,-0.645107
std,0.004252,0.529071


In [72]:
from sklearn.svm import SVR

# In-sample fit of a support-vector regressor with default hyperparameters.
svm_reg = make_pipeline(preprocessing(), SVR())
svm_reg.fit(train_X, train_y)
predictions = svm_reg.predict(train_X)
svm_r2 = r2_score(train_y, predictions)
# RMSE as sqrt(MSE): mean_squared_error(..., squared=False) was deprecated in
# scikit-learn 1.4 and removed in 1.6.
svm_rmse = np.sqrt(mean_squared_error(train_y, predictions))
svm_r2, svm_rmse

(0.11637182481215591, 0.03169121489405202)

In [73]:
def svm_reg():
    """Return a fresh, unfitted scaler + default SVR pipeline."""
    return make_pipeline(preprocessing(), SVR())


# Three-fold purged cross-validation of the SVR.
# NOTE(review): this cell rebinds svm_reg (previously the fitted pipeline) to
# a factory and svm_rmse (previously a scalar) to the score table.
svm_rmse = purged_kfold.cv_score(
    svm_reg,
    train_X,
    train_Y,
    n_folds=3,
    scoring=["neg_mean_squared_error", "r2_score"],
)
svm_rmse.agg(["mean", "std"])

Unnamed: 0,neg_mean_squared_error,r2_score
mean,-0.032986,5.1e-05
std,0.008031,0.047046


In [74]:
import xgboost as xgb

# In-sample fit of a gradient-boosted regressor with default hyperparameters.
xgb_reg = make_pipeline(preprocessing(), xgb.XGBRegressor())
xgb_reg.fit(train_X, train_y)
predictions = xgb_reg.predict(train_X)
xgb_r2 = r2_score(train_y, predictions)
# RMSE as sqrt(MSE): mean_squared_error(..., squared=False) was deprecated in
# scikit-learn 1.4 and removed in 1.6.
xgb_rmse = np.sqrt(mean_squared_error(train_y, predictions))
xgb_r2, xgb_rmse

(0.9008837291043885, 0.01061393900906928)

In [75]:
def xgb_reg():
    """Return a fresh, unfitted scaler + default XGBRegressor pipeline."""
    return make_pipeline(preprocessing(), xgb.XGBRegressor())


# Three-fold purged cross-validation of the boosted regressor.
# NOTE(review): this cell rebinds xgb_reg (previously the fitted pipeline) to
# a factory and xgb_rmse (previously a scalar) to the score table.
xgb_rmse = purged_kfold.cv_score(
    xgb_reg,
    train_X,
    train_Y,
    n_folds=3,
    scoring=["neg_mean_squared_error", "r2_score"],
)
xgb_rmse.agg(["mean", "std"])

Unnamed: 0,neg_mean_squared_error,r2_score
mean,-0.041638,-0.681789
std,0.005612,0.506503


# Fine-Tune Your Model

In [76]:
from sklearn.model_selection import GridSearchCV

# Random-forest regressor behind the shared preprocessing step.
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing()),
    ("random_forest", RandomForestRegressor(random_state=42, max_features=None)),
])
# Shallow trees only: 3 depths x 3 forest sizes = 9 candidates.
param_grid = [
    {
        'random_forest__max_depth': [1, 2, 4],
        'random_forest__n_estimators': [10, 100, 1000],
    },
]
# Folds come from the project's purged splitter, built from the t1 column.
cv = purged_kfold.PurgedKFold(train_Y["t1"], n_folds=3)
cv_iter = cv.split(train_X, train_y)

grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=cv_iter,
    scoring='neg_root_mean_squared_error',
)
_ = grid_search.fit(train_X, train_y)

In [77]:
# Grid-search results, best first (scores are negated RMSE, so higher is better).
cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(by="mean_test_score", ascending=False)
cv_res[["params", "mean_test_score", "std_test_score"]]

Unnamed: 0,params,mean_test_score,std_test_score
3,"{'random_forest__max_depth': 2, 'random_forest...",-0.035402,0.00384
0,"{'random_forest__max_depth': 1, 'random_forest...",-0.035437,0.005862
5,"{'random_forest__max_depth': 2, 'random_forest...",-0.035496,0.003872
2,"{'random_forest__max_depth': 1, 'random_forest...",-0.035503,0.00571
1,"{'random_forest__max_depth': 1, 'random_forest...",-0.035527,0.005666
4,"{'random_forest__max_depth': 2, 'random_forest...",-0.035594,0.003815
8,"{'random_forest__max_depth': 4, 'random_forest...",-0.037121,0.004088
6,"{'random_forest__max_depth': 4, 'random_forest...",-0.037309,0.004054
7,"{'random_forest__max_depth': 4, 'random_forest...",-0.03737,0.003633


In [78]:
# Support-vector regressor behind the shared preprocessing step.
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing()),
    ("svr", SVR()),
])
# Earlier sweeps, kept for reference: a linear kernel with C from 10 to 30000,
# and an rbf gamma range of 0.01 to 3.0. The grid below narrows gamma.
# NOTE(review): in the saved results every C value scores identically for a
# given gamma, so C appears to have no effect here -- possibly because SVR's
# epsilon is left at its default while the target is small in magnitude;
# confirm, and consider adding svr__epsilon to the grid.
param_grid = [
    {
        'svr__kernel': ['rbf'],
        'svr__C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
        'svr__gamma': [1.5, 1.75, 2.0, 2.1, 2.25, 2.5, 2.75],
    },
]
# Folds come from the project's purged splitter, built from the t1 column.
cv = purged_kfold.PurgedKFold(train_Y["t1"], n_folds=3)
cv_iter = cv.split(train_X, train_y)

grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=cv_iter,
    scoring='neg_root_mean_squared_error',
)
_ = grid_search.fit(train_X, train_y)

In [79]:
# SVR grid-search results, best first (scores are negated RMSE, so higher is better).
svm_cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(by="mean_test_score", ascending=False)
svm_cv_res[["params", "mean_test_score", "std_test_score"]]

Unnamed: 0,params,mean_test_score,std_test_score
0,"{'svr__C': 1.0, 'svr__gamma': 1.5, 'svr__kerne...",-0.034477,0.005459
21,"{'svr__C': 30.0, 'svr__gamma': 1.5, 'svr__kern...",-0.034477,0.005459
28,"{'svr__C': 100.0, 'svr__gamma': 1.5, 'svr__ker...",-0.034477,0.005459
42,"{'svr__C': 1000.0, 'svr__gamma': 1.5, 'svr__ke...",-0.034477,0.005459
7,"{'svr__C': 3.0, 'svr__gamma': 1.5, 'svr__kerne...",-0.034477,0.005459
14,"{'svr__C': 10.0, 'svr__gamma': 1.5, 'svr__kern...",-0.034477,0.005459
35,"{'svr__C': 300.0, 'svr__gamma': 1.5, 'svr__ker...",-0.034477,0.005459
1,"{'svr__C': 1.0, 'svr__gamma': 1.75, 'svr__kern...",-0.034583,0.005402
22,"{'svr__C': 30.0, 'svr__gamma': 1.75, 'svr__ker...",-0.034583,0.005402
29,"{'svr__C': 100.0, 'svr__gamma': 1.75, 'svr__ke...",-0.034583,0.005402


In [80]:
# Gradient-boosted regressor behind the shared preprocessing step.
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing()),
    ("xgb", xgb.XGBRegressor()),
])
# Depth-1 trees only; the sweep is over the learning rate (eta).
param_grid = [
    {
        'xgb__max_depth': [1],
        'xgb__eta': [0.03, 0.05, 0.06, 0.07, 0.1, 0.3],
    },
]
# Folds come from the project's purged splitter, built from the t1 column.
cv = purged_kfold.PurgedKFold(train_Y["t1"], n_folds=3)
cv_iter = cv.split(train_X, train_y)

grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=cv_iter,
    scoring='neg_root_mean_squared_error',
)
_ = grid_search.fit(train_X, train_y)

In [81]:
# XGBoost grid-search results, best first (scores are negated RMSE, so higher is better).
xgb_cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(by="mean_test_score", ascending=False)
xgb_cv_res[["params", "mean_test_score", "std_test_score"]]

Unnamed: 0,params,mean_test_score,std_test_score
1,"{'xgb__eta': 0.05, 'xgb__max_depth': 1}",-0.034598,0.004533
2,"{'xgb__eta': 0.06, 'xgb__max_depth': 1}",-0.034888,0.004569
3,"{'xgb__eta': 0.07, 'xgb__max_depth': 1}",-0.035209,0.004448
4,"{'xgb__eta': 0.1, 'xgb__max_depth': 1}",-0.035682,0.004143
5,"{'xgb__eta': 0.3, 'xgb__max_depth': 1}",-0.037149,0.003496
0,"{'xgb__eta': 0.03, 'xgb__max_depth': 1}",-0.039916,0.005008


# Classification

## Select and Train a Model

In [17]:
# Turn the regression target into a classification problem:
#   vug_vtv_60d_ret -> sign of the return clipped to [0, 1], i.e. 1.0 for a
#                      positive return and 0.0 for a zero or negative one;
#   w               -> absolute return, used as the sample weight.
cls_train_Y = train_Y.assign(
    vug_vtv_60d_ret=np.sign(train_Y["vug_vtv_60d_ret"]).clip(0, 1),
    w=train_Y["vug_vtv_60d_ret"].abs(),
)
cls_train_y = cls_train_Y["vug_vtv_60d_ret"]
cls_train_w = cls_train_Y["w"]

In [18]:
# Class balance: number of training rows per label value.
counts = cls_train_Y.groupby("vug_vtv_60d_ret").count()
counts

Unnamed: 0_level_0,t1,w
vug_vtv_60d_ret,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,1488,1488
1.0,1803,1803


In [19]:
# Baseline to beat: the (unweighted) accuracy of always predicting the positive class.
guess_positive_accuracy = counts.loc[1.0, "t1"] / counts["t1"].sum()
guess_positive_accuracy

0.5478577939835916

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score

# In-sample fit of a weighted logistic regression. The weights are routed to
# the final pipeline step through the "logisticregression__" prefix.
log_reg = make_pipeline(preprocessing(), LogisticRegression())
log_reg.fit(train_X, cls_train_y, logisticregression__sample_weight=cls_train_w)

predicted_class = log_reg.predict(train_X)
predicted_probs = log_reg.predict_proba(train_X)

# Both metrics use the same sample weights as the fit.
log_accuracy = accuracy_score(cls_train_y, predicted_class, sample_weight=cls_train_w)
log_loss_ = log_loss(cls_train_y, predicted_probs, sample_weight=cls_train_w)
log_loss_, log_accuracy

(0.6201794349888833, 0.6241172398239985)

In [24]:
# Three-fold purged cross-validation of the weighted logistic regression.
# NOTE(review): despite its name, log_reg_func is a Pipeline instance, whereas
# the regression cells pass a zero-argument factory (a lambda) as the first
# argument of purged_kfold.cv_score -- confirm which form cv_score expects.
log_reg_func = Pipeline([("standardize", StandardScaler()), ('log', LogisticRegression())])
scores = purged_kfold.cv_score(log_reg_func, train_X, cls_train_Y, n_folds=3, scoring=["neg_log_loss", "accuracy"],
                               has_weights=True)
scores.agg(["mean", "std"])

Unnamed: 0,neg_log_loss,accuracy
mean,-0.755882,0.512251
std,0.106812,0.132271


In [25]:
from sklearn.ensemble import RandomForestClassifier

# In-sample fit of a weighted random-forest classifier. The seed is fixed
# (matching the tuned forest later in the notebook) so a re-run reproduces
# these numbers.
# NOTE(review): the name forest_reg is reused from the regression section
# although this pipeline is a classifier.
forest_reg = make_pipeline(preprocessing(), RandomForestClassifier(random_state=42))
forest_reg.fit(train_X, cls_train_y, randomforestclassifier__sample_weight=cls_train_w)
predicted_probs = forest_reg.predict_proba(train_X)
predicted_class = forest_reg.predict(train_X)
# Both metrics use the same sample weights as the fit.
forest_loss = log_loss(cls_train_y, predicted_probs, sample_weight=cls_train_w)
forest_accuracy = accuracy_score(cls_train_y, predicted_class, sample_weight=cls_train_w)
forest_loss, forest_accuracy

(0.09833784116993635, 1.0)

In [26]:
# Three-fold purged cross-validation of the weighted random-forest classifier.
# The forest is seeded so the cross-validated scores are reproducible.
# NOTE(review): the final step is named 'log' although it holds a random
# forest (apparently copied from the logistic-regression cell). It is left
# unchanged because purged_kfold.cv_score may route the sample weights by
# step name -- confirm before renaming.
forest_cls_func = Pipeline([
    ("standardize", StandardScaler()),
    ('log', RandomForestClassifier(random_state=42)),
])
scores = purged_kfold.cv_score(forest_cls_func, train_X, cls_train_Y, n_folds=3, scoring=["neg_log_loss", "accuracy"],
                               has_weights=True)
scores.agg(["mean", "std"])

Unnamed: 0,neg_log_loss,accuracy
mean,-0.867198,0.528308
std,0.118149,0.069525


In [27]:
from sklearn.svm import SVC

# In-sample fit of a weighted support-vector classifier.
# probability=True makes SVC fit its probability estimates on internally
# shuffled data, so the seed is fixed to keep predict_proba (and the log loss
# below) reproducible.
svm_cls = make_pipeline(preprocessing(), SVC(probability=True, random_state=42))
svm_cls.fit(train_X, cls_train_y, svc__sample_weight=cls_train_w)
predicted_probs = svm_cls.predict_proba(train_X)
predicted_class = svm_cls.predict(train_X)
# Both metrics use the same sample weights as the fit.
svm_loss = log_loss(cls_train_y, predicted_probs, sample_weight=cls_train_w)
svm_accuracy = accuracy_score(cls_train_y, predicted_class, sample_weight=cls_train_w)
svm_loss, svm_accuracy

(0.5673400957654541, 0.6873481237309548)

In [43]:
# Three-fold purged cross-validation of the weighted support-vector classifier.
# The SVC is seeded because probability=True fits its probability estimates on
# internally shuffled data; without a seed the log loss changes between runs.
svc_cls_func = Pipeline([
    ("standardize", StandardScaler()),
    ('svc', SVC(probability=True, random_state=42)),
])
# The splitter is built here and passed in explicitly through cv_gen.
cv_gen = purged_kfold.PurgedKFold(cls_train_Y.t1, 3)
scores = purged_kfold.cv_score(svc_cls_func, train_X, cls_train_Y, cv_gen=cv_gen, scoring=["neg_log_loss", "accuracy"], has_weights=True)
scores.agg(["mean", "std"])

Unnamed: 0,neg_log_loss,accuracy
mean,-0.756013,0.487998
std,0.019573,0.030238


In [182]:
# In-sample fit of a weighted gradient-boosted classifier. The weights are
# routed to the final pipeline step through the "xgbclassifier__" prefix.
xgb_cls = make_pipeline(preprocessing(), xgb.XGBClassifier())
xgb_cls.fit(train_X, cls_train_y, xgbclassifier__sample_weight=cls_train_w)

predicted_class = xgb_cls.predict(train_X)
predicted_probs = xgb_cls.predict_proba(train_X)

# Both metrics use the same sample weights as the fit.
xgb_accuracy = accuracy_score(cls_train_y, predicted_class, sample_weight=cls_train_w)
xgb_loss = log_loss(cls_train_y, predicted_probs, sample_weight=cls_train_w)
xgb_loss, xgb_accuracy

(0.32001637946255607, 0.8661235005652731)

In [129]:
def xgb_cls_func():
    """Return a fresh, unfitted scaler + default XGBClassifier pipeline."""
    return Pipeline([("standardize", StandardScaler()), ('xgb', xgb.XGBClassifier())])


# Three-fold purged cross-validation of the weighted boosted classifier.
scores = purged_kfold.cv_score(
    xgb_cls_func,
    train_X,
    cls_train_Y,
    n_folds=3,
    scoring=["neg_log_loss", "accuracy"],
    has_weights=True,
)
scores.agg(["mean", "std"])

Unnamed: 0,neg_log_loss,accuracy
mean,-0.878299,0.494194
std,0.089502,0.039329


## Fine-Tune Your Model

In [56]:
from sklearn.model_selection import GridSearchCV

# Random-forest classifier behind the shared preprocessing step.
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing()),
    ("random_forest", RandomForestClassifier(random_state=42, max_features=None)),
])
# Shallow trees only: 3 depths x 3 forest sizes = 9 candidates.
param_grid = [
    {
        'random_forest__max_depth': [1, 2, 4],
        'random_forest__n_estimators': [10, 100, 1000],
    },
]
# Folds come from the project's purged splitter, built from the t1 column.
cv = purged_kfold.PurgedKFold(train_Y["t1"], n_folds=3)
cv_iter = cv.split(train_X, cls_train_y)

grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=cv_iter,
    scoring='accuracy',
)
# Fitted without sample weights (passing random_forest__sample_weight=cls_train_w
# was tried and left disabled).
# NOTE(review): the best mean accuracy in the saved results is below the
# always-positive baseline computed earlier in the notebook.
_ = grid_search.fit(train_X, cls_train_y)

In [57]:
# Random-forest classifier grid-search results, highest mean accuracy first.
cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(by="mean_test_score", ascending=False)
cv_res[["params", "mean_test_score", "std_test_score"]]

Unnamed: 0,params,mean_test_score,std_test_score
0,"{'random_forest__max_depth': 1, 'random_forest...",0.455789,0.019819
1,"{'random_forest__max_depth': 1, 'random_forest...",0.454877,0.022192
2,"{'random_forest__max_depth': 1, 'random_forest...",0.454573,0.021857
3,"{'random_forest__max_depth': 2, 'random_forest...",0.452446,0.024689
6,"{'random_forest__max_depth': 4, 'random_forest...",0.437861,0.019414
5,"{'random_forest__max_depth': 2, 'random_forest...",0.435734,0.028605
4,"{'random_forest__max_depth': 2, 'random_forest...",0.431176,0.034327
7,"{'random_forest__max_depth': 4, 'random_forest...",0.424491,0.023077
8,"{'random_forest__max_depth': 4, 'random_forest...",0.421149,0.025042


In [96]:
# Support-vector classifier behind the shared preprocessing step.
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing()),
    ("svc", SVC()),
])
# Two sub-grids: a linear kernel swept over C, and an rbf kernel swept over
# C and gamma.
param_grid = [
    {
        'svc__kernel': ['linear'],
        'svc__C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0],
    },
    {
        'svc__kernel': ['rbf'],
        'svc__C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
        'svc__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
    },
]
# Folds come from the project's purged splitter, built from the t1 column.
cv = purged_kfold.PurgedKFold(train_Y["t1"], n_folds=3)
cv_iter = cv.split(train_X, cls_train_y)

grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=cv_iter,
    scoring='accuracy',
)
# Sample weights are forwarded to the SVC step for every fit.
_ = grid_search.fit(train_X, cls_train_y, svc__sample_weight=cls_train_w)

In [97]:
# SVC grid-search results, highest mean accuracy first.
cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(by="mean_test_score", ascending=False)
cv_res[["params", "mean_test_score", "std_test_score"]]

Unnamed: 0,params,mean_test_score,std_test_score
8,"{'svc__C': 1.0, 'svc__gamma': 0.01, 'svc__kern...",0.547858,0.035407
13,"{'svc__C': 1.0, 'svc__gamma': 3.0, 'svc__kerne...",0.5433,0.021469
41,"{'svc__C': 300.0, 'svc__gamma': 0.3, 'svc__ker...",0.534488,0.033322
31,"{'svc__C': 30.0, 'svc__gamma': 3.0, 'svc__kern...",0.527803,0.023525
25,"{'svc__C': 10.0, 'svc__gamma': 3.0, 'svc__kern...",0.526284,0.022739
24,"{'svc__C': 10.0, 'svc__gamma': 1.0, 'svc__kern...",0.525372,0.028801
47,"{'svc__C': 1000.0, 'svc__gamma': 0.3, 'svc__ke...",0.524461,0.035636
49,"{'svc__C': 1000.0, 'svc__gamma': 3.0, 'svc__ke...",0.523549,0.03376
37,"{'svc__C': 100.0, 'svc__gamma': 3.0, 'svc__ker...",0.523549,0.030365
43,"{'svc__C': 300.0, 'svc__gamma': 3.0, 'svc__ker...",0.523245,0.030571


In [51]:
# SVC with probability estimates, tuned over C only and scored on both log
# loss and accuracy. The SVC is seeded because probability=True fits its
# probability estimates on internally shuffled data; without a seed the log
# loss (which selects the refit model) changes between runs.
full_pipeline = Pipeline([
    ("standardize", StandardScaler()),
    ('svc', SVC(probability=True, random_state=42)),
])
param_grid = [
    {'svc__C': [1.0, 2, 5, 10]},
]
# Here the splitter object itself is handed to GridSearchCV (other cells pass
# the result of cv.split(...) instead).
cv = purged_kfold.PurgedKFold(cls_train_Y.t1, n_folds=3)
grid_search = GridSearchCV(full_pipeline, param_grid, cv=cv, scoring=['neg_log_loss', 'accuracy'], refit="neg_log_loss")
# Fitted without sample weights.
_ = grid_search.fit(train_X, cls_train_y)

In [52]:
# Probability-SVC grid-search results, highest mean accuracy first, with the
# log-loss columns alongside.
cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(by="mean_test_accuracy", ascending=False)
cv_res[["params", "mean_test_accuracy", "std_test_accuracy", "mean_test_neg_log_loss", "std_test_neg_log_loss"]]

Unnamed: 0,params,mean_test_accuracy,std_test_accuracy,mean_test_neg_log_loss,std_test_neg_log_loss
1,{'svc__C': 2},0.529626,0.025686,-0.78826,0.023074
3,{'svc__C': 10},0.528411,0.031651,-0.903147,0.040774
0,{'svc__C': 1.0},0.526588,0.024689,-0.766461,0.024327
2,{'svc__C': 5},0.524765,0.027147,-0.844637,0.017397


In [54]:
import xgboost as xgb

# Gradient-boosted classifier behind the shared preprocessing step.
full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessing()),
    ("xgb", xgb.XGBClassifier()),
])
# 3 tree depths x 6 learning rates (eta) = 18 candidates.
param_grid = [
    {
        'xgb__max_depth': [1, 3, 4],
        'xgb__eta': [0.03, 0.05, 0.06, 0.07, 0.1, 0.3],
    },
]
# Folds come from the project's purged splitter, built from the t1 column.
cv = purged_kfold.PurgedKFold(train_Y["t1"], n_folds=3)
cv_iter = cv.split(train_X, cls_train_y)

grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    cv=cv_iter,
    scoring='accuracy',
)
# Fitted without sample weights (passing xgb__sample_weight=cls_train_w was
# tried and left disabled).
_ = grid_search.fit(train_X, cls_train_y)

In [55]:
# XGBoost classifier grid-search results, highest mean accuracy first.
cv_res = pd.DataFrame(grid_search.cv_results_).sort_values(by="mean_test_score", ascending=False)
cv_res[["params", "mean_test_score", "std_test_score"]]

Unnamed: 0,params,mean_test_score,std_test_score
12,"{'xgb__eta': 0.1, 'xgb__max_depth': 1}",0.522334,0.061642
3,"{'xgb__eta': 0.05, 'xgb__max_depth': 1}",0.519903,0.062788
9,"{'xgb__eta': 0.07, 'xgb__max_depth': 1}",0.517472,0.061971
0,"{'xgb__eta': 0.03, 'xgb__max_depth': 1}",0.516256,0.045043
6,"{'xgb__eta': 0.06, 'xgb__max_depth': 1}",0.508356,0.046483
17,"{'xgb__eta': 0.3, 'xgb__max_depth': 4}",0.506229,0.04208
16,"{'xgb__eta': 0.3, 'xgb__max_depth': 3}",0.504102,0.047127
15,"{'xgb__eta': 0.3, 'xgb__max_depth': 1}",0.503798,0.072524
14,"{'xgb__eta': 0.1, 'xgb__max_depth': 4}",0.503494,0.073003
8,"{'xgb__eta': 0.06, 'xgb__max_depth': 4}",0.49924,0.07632
