# StackingTransformer (scikit-learn API for stacking)
# Regression
# 2-level stacking: step-by-step and using Pipeline
***

# Import

In [1]:
import pandas as pd
from sklearn.base import clone
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
try:
    # Standalone joblib: `sklearn.externals.joblib` was deprecated in
    # scikit-learn 0.21 and removed in 0.23
    import joblib
except ImportError:
    # Fallback for old environments that lack the standalone package
    from sklearn.externals import joblib
from xgboost import XGBRegressor
from vecstack import StackingTransformer

# Prepare data

In [2]:
# Load the dataset returned by `load_boston` and separate features from target
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older release - confirm the pinned version
boston = load_boston()
X = boston.data
y = boston.target

# Hold out 20% of the rows as a test set (fixed seed keeps the split reproducible).
# This mirrors the usual setup of a machine learning task:
# X_train and y_train for fitting, X_test for prediction.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Initialize 1st level estimators

In [3]:
# Caution! The estimators and parameter values below only demonstrate
# the API; they are not tuned and should not be taken as a recommendation.

# StackingTransformer expects a list of (name, estimator) tuples,
# where each name is an arbitrary but unique string.
estimators_L1 = [
    (
        'et',
        ExtraTreesRegressor(
            random_state=0,
            n_jobs=-1,
            n_estimators=100,
            max_depth=3,
        ),
    ),
    (
        'rf',
        RandomForestRegressor(
            random_state=0,
            n_jobs=-1,
            n_estimators=100,
            max_depth=3,
        ),
    ),
    (
        'xgb',
        XGBRegressor(
            random_state=0,
            n_jobs=-1,
            learning_rate=0.1,
            n_estimators=100,
            max_depth=3,
        ),
    ),
]

# Initialize StackingTransformer

In [4]:
stack = StackingTransformer(
    # 1st level (base) estimators
    estimators=estimators_L1,
    # True -> regression task; set to False for classification
    regression=True,
    # Variant 'A': out-of-fold predictions for the train set; the test set is
    # predicted in each fold and the mean of those predictions is used
    variant='A',
    # Callable used to score the predictions of each fold
    metric=mean_absolute_error,
    # Number of folds
    n_folds=4,
    # Shuffle the data
    shuffle=True,
    # Fixed seed to ensure reproducibility
    random_state=0,
    # 2 -> print all info
    verbose=2,
)

***

# 1. Step-by-step approach

# Fit StackingTransformer

In [5]:
# Fit the 1st level estimators on the train set.
# Because `verbose=2` was requested, the score of every fold is printed,
# followed by the mean and std for each estimator.
stack = stack.fit(X_train, y_train)

task:         [regression]
metric:       [mean_absolute_error]
variant:      [A]
n_estimators: [3]

estimator  0: [et: ExtraTreesRegressor]
    fold  0:  [3.20733439]
    fold  1:  [2.87943130]
    fold  2:  [2.53026486]
    fold  3:  [2.83618694]
    ----
    MEAN:     [2.86330437] + [0.23993093]

estimator  1: [rf: RandomForestRegressor]
    fold  0:  [3.11110485]
    fold  1:  [2.78404210]
    fold  2:  [2.55707729]
    fold  3:  [2.32209992]
    ----
    MEAN:     [2.69358104] + [0.29117900]

estimator  2: [xgb: XGBRegressor]
    fold  0:  [2.40318942]
    fold  1:  [2.37286943]
    fold  2:  [1.89121526]
    fold  3:  [1.95382805]
    ----
    MEAN:     [2.15527554] + [0.23404984]



# Get stacked features: transform (predict) train set and test set

In [6]:
# Stacked features for the train set: one column per 1st level estimator.
# The transformer reports when it recognises the data it was fitted on.
S_train = stack.transform(X_train)

Train set was detected.
Transforming...

estimator  0: [et: ExtraTreesRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  1: [rf: RandomForestRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  2: [xgb: XGBRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE



In [7]:
# Stacked features for the test set, produced by the models from every fold
S_test = stack.transform(X_test)

Transforming...

estimator  0: [et: ExtraTreesRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  1: [rf: RandomForestRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  2: [xgb: XGBRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE



# Look at the result

Let's look at our stacked features.  
We have three 1st level estimators, so we expect to get three columns in `S_train` and `S_test`.  

In [8]:
# Preview the first five rows of the stacked train features
S_train[:5]

array([[27.21782522, 28.23561508, 27.78520966],
       [22.25443115, 22.32927929, 22.57203102],
       [26.03879794, 25.80114661, 26.27923012],
       [21.82927308, 21.30478775, 21.39201546],
       [13.02143285, 12.04667683,  8.88440514]])

In [9]:
# Preview the first five rows of the stacked test features
S_test[:5]

array([[24.89602382, 23.85490698, 24.85046005],
       [20.85135955, 25.05068336, 26.30952454],
       [23.13164045, 21.56864103, 23.67526102],
       [13.47709586, 11.81606315, 11.02050447],
       [21.93179664, 21.30652111, 21.75125122]])

# Apply 2nd level estimator

In [10]:
# 2nd level estimator: it learns from the stacked features
# instead of the raw inputs
final_estimator = XGBRegressor(
    random_state=0,
    n_jobs=-1,
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
)

# Train on the stacked train features
final_estimator = final_estimator.fit(S_train, y_train)

# Predict from the stacked test features
y_pred = final_estimator.predict(S_test)

# Report the mean absolute error of the final prediction on the test set
print(
    'Final prediction score: [%.8f]'
    % mean_absolute_error(y_test, y_pred)
)

Final prediction score: [2.78409081]


# Some useful StackingTransformer attributes

In [11]:
# Number of base (1st level) estimators held by the fitted transformer
# Type: int
stack.n_estimators_

3

In [12]:
# Scores for each estimator (rows) in each fold (columns),
# i.e. the same per-fold values that were printed during fit
# Type: 2d numpy array
stack.scores_

array([[3.20733439, 2.8794313 , 2.53026486, 2.83618694],
       [3.11110485, 2.7840421 , 2.55707729, 2.32209992],
       [2.40318942, 2.37286943, 1.89121526, 1.95382805]])

In [13]:
# Mean and std of the fold scores for each estimator
# Type: list of (name, mean, std) tuples
stack.mean_std_

[('et', 2.8633043735634116, 0.23993092887498238),
 ('rf', 2.6935810393014306, 0.2911789973137302),
 ('xgb', 2.15527553747196, 0.23404984189134637)]

In [14]:
# Tabulate the per-estimator (name, mean, std) tuples for easier reading
df = pd.DataFrame.from_records(
    stack.mean_std_,
    columns=['name', 'mean', 'std'],
)
# Lowest mean first: the metric is an error, so the best estimator is on top
df.sort_values(by='mean', ascending=True)

Unnamed: 0,name,mean,std
2,xgb,2.155276,0.23405
1,rf,2.693581,0.291179
0,et,2.863304,0.239931


***

# 2. Pipeline

StackingTransformer is fully scikit-learn compatible, so we can easily implement an **arbitrary number of stacking layers** using Pipeline.


In [15]:
# Specify steps of Pipeline.
# Pipeline fits the objects it is given in place, so passing `stack` and
# `final_estimator` directly would refit (and, via `set_params`, reconfigure)
# the very objects used in the step-by-step section. `clone` builds unfitted
# copies with identical parameters, which keeps the Pipeline independent of
# the earlier cells and makes this cell safe to re-run.
steps = [('stack', clone(stack)),
         ('final_estimator', clone(final_estimator))]

# Init Pipeline
pipe = Pipeline(steps)

In [16]:
# With several stacking layers, the Pipeline steps would look like this:
# steps = [('stack_L1', stack_L1),
#          ('stack_L2', stack_L2),
#          ('stack_L99', stack_L99), # :-)
#          ('final_estimator', final_estimator)]

# Ability to set parameters of nested estimators and transformers

Following scikit-learn naming convention we can access parameters of nested estimators and transformers.  
Each nested level (should not be confused with stacking level) is separated by double underscore `__`.  
For example, when using Pipeline we may want StackingTransformer to be silent.  
We can access StackingTransformer parameter `verbose` through `pipe` instance as `stack__verbose`.  
We can also set any parameter of 1st level estimators inside StackingTransformer.  
We can access XGBoost parameter `learning_rate` through `pipe` instance as `stack__xgb__learning_rate`.

In [17]:
# Silence the StackingTransformer step: nested parameters are addressed
# as <step name>__<parameter name>
pipe = pipe.set_params(stack__verbose=0)
# Example only (left disabled): override a parameter of the 1st level
# estimator named 'xgb' inside the 'stack' step
# pipe = pipe.set_params(stack__xgb__learning_rate=0.555)

# Fit and predict using Pipeline

In [18]:
# Fit every step of the Pipeline on the raw train set
pipe = pipe.fit(X_train, y_train)

# Send the raw test set through the whole Pipeline
y_pred_pipe = pipe.predict(X_test)

# Report the mean absolute error of the Pipeline prediction on the test set
print(
    'Final prediction score using Pipeline: [%.8f]'
    % mean_absolute_error(y_test, y_pred_pipe)
)

Final prediction score using Pipeline: [2.78409081]


# Ability to save fitted Pipeline

In [19]:
# Save Pipeline
# The fitted Pipeline is written to a file in the current working directory;
# assigning the return value of `joblib.dump` to `_` keeps the cell output empty
_ = joblib.dump(pipe, 'pipe_with_stack.pkl')

In [20]:
# Restore the Pipeline from disk.
# NOTE: the file is unpickled on load, so only load files you created or trust.
pipe_loaded = joblib.load('pipe_with_stack.pkl')

# The restored Pipeline is already fitted, so it can predict right away
y_pred_pipe_loaded = pipe_loaded.predict(X_test)

# Report the mean absolute error of the restored Pipeline on the test set
print(
    'Final prediction score using loaded Pipeline: [%.8f]'
    % mean_absolute_error(y_test, y_pred_pipe_loaded)
)

Final prediction score using loaded Pipeline: [2.78409081]


***

# Conclusion

Step-by-step approach is useful when we need direct access to stacked features `S_train` and `S_test`.  
Pipeline approach is useful when we need to represent many steps in a single object.  
Both approaches give exactly the same score.