# Regression
***

# Import

In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from vecstack import stacking

# Prepare data

In [2]:
# Load the California housing data as a (features, target) pair
X, y = fetch_california_housing(return_X_y=True)

# Split into train and test parts: 20% of the rows are held out as the test set,
# and the fixed random_state keeps the split reproducible.
# y_test is used only at the very end, to score the final prediction.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Initialize 1st level models

In [3]:
# Caution! These models and hyperparameter values are for demonstration
# only and should not be taken as recommendations.

# Settings shared by every 1st level model
shared_params = dict(random_state=0, n_jobs=-1, n_estimators=100, max_depth=3)

models = [
    ExtraTreesRegressor(**shared_params),
    RandomForestRegressor(**shared_params),
    # XGBoost additionally takes an explicit learning rate
    XGBRegressor(learning_rate=0.1, **shared_params),
]

# Perform stacking

In [4]:
S_train, S_test = stacking(
    # list of 1st level models
    models,
    # data: train features, train target, test features
    X_train, y_train, X_test,
    # regression task (set to False for classification)
    regression=True,
    # mode: out-of-fold predictions for the train set; the test set is
    # predicted in each fold and the fold predictions are averaged
    mode='oof_pred_bag',
    # do not save result and log (set to '.' to save in the current dir)
    save_dir=None,
    # metric: callable used to score each fold
    metric=mean_absolute_error,
    # number of folds
    n_folds=4,
    # shuffle the data
    shuffle=True,
    # ensure reproducibility
    random_state=0,
    # print all info
    verbose=2,
)

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [ExtraTreesRegressor]
    fold  0:  [0.65383143]
    fold  1:  [0.65059961]
    fold  2:  [0.66233582]
    fold  3:  [0.64449777]
    ----
    MEAN:     [0.65281616] + [0.00643746]
    FULL:     [0.65281616]

model  1:     [RandomForestRegressor]
    fold  0:  [0.58416160]
    fold  1:  [0.56449564]
    fold  2:  [0.57730149]
    fold  3:  [0.55014073]
    ----
    MEAN:     [0.56902487] + [0.01298795]
    FULL:     [0.56902487]

model  2:     [XGBRegressor]
    fold  0:  [0.37287275]
    fold  1:  [0.36827074]
    fold  2:  [0.37315715]
    fold  3:  [0.36447933]
    ----
    MEAN:     [0.36969499] + [0.00358177]
    FULL:     [0.36969499]



# Look at the result

We now have out-of-fold (OOF) predictions from the 1st level models, so we can build the 2nd level model.  
But first, let's look at the result.  
We have three 1st level models, so we expect three columns in both `S_train` and `S_test`.  

In [5]:
# Preview the first five rows of S_train; one column is expected per 1st level model
S_train[:5]

array([[2.1381431 , 1.89449961, 1.85192811],
       [2.29310757, 1.89309918, 2.92809105],
       [2.07256939, 1.89449961, 2.10903692],
       [1.51938275, 1.53835871, 1.37909698],
       [1.93450337, 2.737813  , 3.23252964]])

In [6]:
# Preview the first five rows of S_test; one column is expected per 1st level model
S_test[:5]

array([[2.12570438, 1.88503507, 1.57685581],
       [2.57631542, 2.67168873, 2.70525175],
       [2.06940157, 1.88669837, 1.69479051],
       [1.64434775, 1.20196782, 0.96695787],
       [2.33799194, 2.98206787, 3.8881467 ]])

# Apply 2nd level model

In [7]:
# Build the 2nd level model and fit it on the stacked 1st level
# predictions (S_train) against the true training targets.
# fit() returns the estimator itself, so construction and fitting are chained.
model = XGBRegressor(
    random_state=0,
    n_jobs=-1,
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
).fit(S_train, y_train)

# Predict the test target from the stacked test-set features
y_pred = model.predict(S_test)

# Score the final prediction against the held-out targets
final_score = mean_absolute_error(y_test, y_pred)
print('Final prediction score: [%.8f]' % final_score)

Final prediction score: [0.35320658]
