In [1]:
from sklearn.datasets import fetch_california_housing 
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn import ensemble
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from mlxtend.feature_selection import ColumnSelector
from skopt import BayesSearchCV

  from numpy.core.umath_tests import inner1d


In [2]:
data = fetch_california_housing()

In [3]:
df = pd.DataFrame(data=data.data, columns=data.feature_names, index=None)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [4]:
train_X, test_X, train_y, test_y = model_selection.train_test_split(df, data.target, train_size=0.5)



In [5]:
med_inc = Pipeline([
    ("selector", ColumnSelector(cols=['MedInc'])),
    ("scaler", preprocessing.StandardScaler()),
])

house_age = Pipeline([
    ("selector", ColumnSelector(cols=['HouseAge'])),
    ("scaler", preprocessing.StandardScaler()),
])

ave_rooms = Pipeline([
    ("selector", ColumnSelector(cols=['AveRooms'])),
    ("scaler", preprocessing.StandardScaler()),
])

ave_bedrms = Pipeline([
    ("selector", ColumnSelector(cols=['AveBedrms'])),
    ("scaler", preprocessing.StandardScaler()),
])

population = Pipeline([
    ("selector", ColumnSelector(cols=['Population'])),
    ("scaler", preprocessing.StandardScaler()),
])

ave_occup = Pipeline([
    ("selector", ColumnSelector(cols=['AveOccup'])),
    ("scaler", preprocessing.StandardScaler()),
])

latitude = Pipeline([
    ("selector", ColumnSelector(cols=['Latitude'])),
    ("scaler", preprocessing.StandardScaler()),
])

longitude = Pipeline([
    ("selector", ColumnSelector(cols=['Longitude'])),
    ("scaler", preprocessing.StandardScaler()),
])

In [6]:
features = FeatureUnion([
    ('med_inc', med_inc),
    ('house_age', house_age),
    ('ave_rooms', ave_rooms),
    ('ave_bedrms', ave_bedrms),
    ('population', population),
    ('ave_occup', ave_occup),
    ('latitude', latitude),
    ('longitude', longitude),
])

In [7]:
features.fit_transform(train_X)

array([[ 3.1453035 , -1.96819645,  0.56663288, ..., -0.02521668,
        -0.89116512,  0.5931743 ],
       [-0.83886423,  0.66474506, -0.06964893, ..., -0.05173696,
         1.12852018, -0.46979039],
       [-1.02775412, -0.45226043, -1.05568989, ...,  0.00591558,
        -0.74623399,  0.65305963],
       ...,
       [-0.15687955,  1.2232478 , -0.11993659, ..., -0.05351869,
         0.78255557, -1.16346219],
       [ 1.80077402, -1.80862423,  1.24229514, ...,  0.02692304,
        -0.75090919,  1.30680787],
       [-1.35987404, -0.53204654, -0.37359129, ..., -0.05033411,
        -0.89116512,  1.31179832]])

In [8]:
algo = linear_model.LinearRegression()
algo = ensemble.GradientBoostingRegressor()

In [9]:
pipeline = Pipeline([
    ('features', features),
    ('algo', algo),
])
pipeline.get_params()

{'memory': None, 'steps': [('features', FeatureUnion(n_jobs=1,
          transformer_list=[('med_inc', Pipeline(memory=None,
        steps=[('selector', ColumnSelector(cols=['MedInc'], drop_axis=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('house_age', Pipeline(memory=None,
        steps=[('selector', ColumnSelector(cols=['HouseAge'], drop...itude'], drop_axis=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))]))],
          transformer_weights=None)),
  ('algo',
   GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
                max_leaf_nodes=None, min_impurity_decrease=0.0,
                min_impurity_split=None, min_samples_leaf=1,
                min_samples_split=2, min_weight_fraction_leaf=0.0,
                n_estimators=100, presort='auto', random_state=None,
                subsample=1.0, verbose=0, warm_star

In [10]:
pipeline.fit(train_X, train_y)

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('med_inc', Pipeline(memory=None,
     steps=[('selector', ColumnSelector(cols=['MedInc'], drop_axis=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('house_age', Pipeline(memory=None,
     steps=[(...s=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False))])

In [11]:
predictions = pipeline.predict(test_X)
metrics.mean_squared_error(test_y, predictions)

0.2881289005799861

In [12]:
params = {
    "algo__alpha": [0.01, 1],
    'algo__n_estimators': [10, 1000],
}
cv = BayesSearchCV(pipeline, params, cv=5, n_jobs=-1, verbose=2, n_iter=10)

In [13]:
cv.fit(train_X, train_y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] algo__alpha=0.767458778270336, algo__n_estimators=769 ...........
[CV] algo__alpha=0.767458778270336, algo__n_estimators=769 ...........
[CV] algo__alpha=0.767458778270336, algo__n_estimators=769 ...........
[CV] algo__alpha=0.767458778270336, algo__n_estimators=769 ...........
[CV]  algo__alpha=0.767458778270336, algo__n_estimators=769, total=  11.2s
[CV] algo__alpha=0.767458778270336, algo__n_estimators=769 ...........
[CV]  algo__alpha=0.767458778270336, algo__n_estimators=769, total=  11.1s
[CV]  algo__alpha=0.767458778270336, algo__n_estimators=769, total=  11.3s
[CV]  algo__alpha=0.767458778270336, algo__n_estimators=769, total=  11.4s
[CV]  algo__alpha=0.767458778270336, algo__n_estimators=769, total=   5.7s
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   17.0s finished


[CV] algo__alpha=0.7678180168946732, algo__n_estimators=953 ..........
[CV] algo__alpha=0.7678180168946732, algo__n_estimators=953 ..........
[CV] algo__alpha=0.7678180168946732, algo__n_estimators=953 ..........
[CV] algo__alpha=0.7678180168946732, algo__n_estimators=953 ..........
[CV]  algo__alpha=0.7678180168946732, algo__n_estimators=953, total=  13.1s
[CV] algo__alpha=0.7678180168946732, algo__n_estimators=953 ..........
[CV]  algo__alpha=0.7678180168946732, algo__n_estimators=953, total=  13.2s
[CV]  algo__alpha=0.7678180168946732, algo__n_estimators=953, total=  13.2s
[CV]  algo__alpha=0.7678180168946732, algo__n_estimators=953, total=  13.3s
[CV]  algo__alpha=0.7678180168946732, algo__n_estimators=953, total=   7.1s
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   20.4s finished


[CV] algo__alpha=0.20339710153737395, algo__n_estimators=569 .........
[CV] algo__alpha=0.20339710153737395, algo__n_estimators=569 .........
[CV] algo__alpha=0.20339710153737395, algo__n_estimators=569 .........
[CV] algo__alpha=0.20339710153737395, algo__n_estimators=569 .........
[CV]  algo__alpha=0.20339710153737395, algo__n_estimators=569, total=   8.7s
[CV] algo__alpha=0.20339710153737395, algo__n_estimators=569 .........
[CV]  algo__alpha=0.20339710153737395, algo__n_estimators=569, total=   8.9s
[CV]  algo__alpha=0.20339710153737395, algo__n_estimators=569, total=   8.9s
[CV]  algo__alpha=0.20339710153737395, algo__n_estimators=569, total=   8.9s
[CV]  algo__alpha=0.20339710153737395, algo__n_estimators=569, total=   4.4s
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.2s finished


[CV] algo__alpha=0.21814637471597098, algo__n_estimators=728 .........
[CV] algo__alpha=0.21814637471597098, algo__n_estimators=728 .........
[CV] algo__alpha=0.21814637471597098, algo__n_estimators=728 .........
[CV] algo__alpha=0.21814637471597098, algo__n_estimators=728 .........
[CV]  algo__alpha=0.21814637471597098, algo__n_estimators=728, total=  11.6s
[CV] algo__alpha=0.21814637471597098, algo__n_estimators=728 .........
[CV]  algo__alpha=0.21814637471597098, algo__n_estimators=728, total=  11.6s
[CV]  algo__alpha=0.21814637471597098, algo__n_estimators=728, total=  11.8s
[CV]  algo__alpha=0.21814637471597098, algo__n_estimators=728, total=  11.8s
[CV]  algo__alpha=0.21814637471597098, algo__n_estimators=728, total=   6.1s
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   17.9s finished


[CV] algo__alpha=0.08827980449943333, algo__n_estimators=248 .........
[CV] algo__alpha=0.08827980449943333, algo__n_estimators=248 .........
[CV] algo__alpha=0.08827980449943333, algo__n_estimators=248 .........
[CV] algo__alpha=0.08827980449943333, algo__n_estimators=248 .........
[CV]  algo__alpha=0.08827980449943333, algo__n_estimators=248, total=   3.6s
[CV] algo__alpha=0.08827980449943333, algo__n_estimators=248 .........
[CV]  algo__alpha=0.08827980449943333, algo__n_estimators=248, total=   3.6s
[CV]  algo__alpha=0.08827980449943333, algo__n_estimators=248, total=   3.7s
[CV]  algo__alpha=0.08827980449943333, algo__n_estimators=248, total=   3.8s
[CV]  algo__alpha=0.08827980449943333, algo__n_estimators=248, total=   3.6s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] algo__alpha=0.5551634207625608, algo__n_estimators=274 ..........
[CV] algo__alpha=0.5551634207625608, algo__n_estimators=274 ..........
[CV] algo__alpha=0.5551634207625608, algo__n_estimators=274 ..........
[CV] algo__alpha=0.5551634207625608, algo__n_estimators=274 ..........
[CV]  algo__alpha=0.5551634207625608, algo__n_estimators=274, total=   5.9s
[CV] algo__alpha=0.5551634207625608, algo__n_estimators=274 ..........
[CV]  algo__alpha=0.5551634207625608, algo__n_estimators=274, total=   5.9s
[CV]  algo__alpha=0.5551634207625608, algo__n_estimators=274, total=   6.0s
[CV]  algo__alpha=0.5551634207625608, algo__n_estimators=274, total=   6.1s
[CV]  algo__alpha=0.5551634207625608, algo__n_estimators=274, total=   2.0s
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.1s finished


[CV] algo__alpha=0.2860586862819768, algo__n_estimators=40 ...........
[CV] algo__alpha=0.2860586862819768, algo__n_estimators=40 ...........
[CV] algo__alpha=0.2860586862819768, algo__n_estimators=40 ...........
[CV] algo__alpha=0.2860586862819768, algo__n_estimators=40 ...........
[CV]  algo__alpha=0.2860586862819768, algo__n_estimators=40, total=   0.8s
[CV]  algo__alpha=0.2860586862819768, algo__n_estimators=40, total=   0.7s
[CV]  algo__alpha=0.2860586862819768, algo__n_estimators=40, total=   0.8s
[CV] algo__alpha=0.2860586862819768, algo__n_estimators=40 ...........
[CV]  algo__alpha=0.2860586862819768, algo__n_estimators=40, total=   0.8s
[CV]  algo__alpha=0.2860586862819768, algo__n_estimators=40, total=   0.4s
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished


[CV] algo__alpha=0.8308319668921218, algo__n_estimators=470 ..........
[CV] algo__alpha=0.8308319668921218, algo__n_estimators=470 ..........
[CV] algo__alpha=0.8308319668921218, algo__n_estimators=470 ..........
[CV] algo__alpha=0.8308319668921218, algo__n_estimators=470 ..........
[CV]  algo__alpha=0.8308319668921218, algo__n_estimators=470, total=   7.4s
[CV] algo__alpha=0.8308319668921218, algo__n_estimators=470 ..........
[CV]  algo__alpha=0.8308319668921218, algo__n_estimators=470, total=   7.5s
[CV]  algo__alpha=0.8308319668921218, algo__n_estimators=470, total=   7.6s
[CV]  algo__alpha=0.8308319668921218, algo__n_estimators=470, total=   7.6s
[CV]  algo__alpha=0.8308319668921218, algo__n_estimators=470, total=   3.6s
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.2s finished


[CV] algo__alpha=0.7378867409172963, algo__n_estimators=394 ..........
[CV] algo__alpha=0.7378867409172963, algo__n_estimators=394 ..........
[CV] algo__alpha=0.7378867409172963, algo__n_estimators=394 ..........
[CV] algo__alpha=0.7378867409172963, algo__n_estimators=394 ..........
[CV]  algo__alpha=0.7378867409172963, algo__n_estimators=394, total=   5.7s
[CV] algo__alpha=0.7378867409172963, algo__n_estimators=394 ..........
[CV]  algo__alpha=0.7378867409172963, algo__n_estimators=394, total=   5.7s
[CV]  algo__alpha=0.7378867409172963, algo__n_estimators=394, total=   5.9s
[CV]  algo__alpha=0.7378867409172963, algo__n_estimators=394, total=   5.8s
[CV]  algo__alpha=0.7378867409172963, algo__n_estimators=394, total=   3.1s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.9s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] algo__alpha=0.2500256720607585, algo__n_estimators=431 ..........
[CV] algo__alpha=0.2500256720607585, algo__n_estimators=431 ..........
[CV] algo__alpha=0.2500256720607585, algo__n_estimators=431 ..........
[CV] algo__alpha=0.2500256720607585, algo__n_estimators=431 ..........
[CV]  algo__alpha=0.2500256720607585, algo__n_estimators=431, total=   6.3s
[CV] algo__alpha=0.2500256720607585, algo__n_estimators=431 ..........
[CV]  algo__alpha=0.2500256720607585, algo__n_estimators=431, total=   6.4s
[CV]  algo__alpha=0.2500256720607585, algo__n_estimators=431, total=   6.4s
[CV]  algo__alpha=0.2500256720607585, algo__n_estimators=431, total=   6.4s
[CV]  algo__alpha=0.2500256720607585, algo__n_estimators=431, total=   3.4s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.8s finished


BayesSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('med_inc', Pipeline(memory=None,
     steps=[('selector', ColumnSelector(cols=['MedInc'], drop_axis=False)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))])), ('house_age', Pipeline(memory=None,
     steps=[(...s=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_iter=10, n_jobs=-1, n_points=1,
       optimizer_kwargs=None, pre_dispatch='2*n_jobs', random_state=None,
       refit=True, return_train_score=False, scoring=None,
       search_spaces={'algo__alpha': [0.01, 1], 'algo__n_estimators': [10, 1000]},
       verbose=2)

In [14]:
best_estimator = cv.best_estimator_
predictions = best_estimator.predict(test_X)

print(cv.best_params_)
print(cv.best_score_)
print(metrics.mean_squared_error(test_y, predictions))

{'algo__alpha': 0.7678180168946732, 'algo__n_estimators': 953}
0.8260185648433542
0.23335393356090664
