# StackingTransformer (scikit-learn API for stacking)
# Regression
# 2-level stacking: step-by-step and using Pipeline
***

# Import

In [1]:
import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from vecstack import StackingTransformer
from xgboost import XGBRegressor

# Prepare data

In [2]:
# Load the California housing data as features (X) and target (y)
X, y = fetch_california_housing(return_X_y=True)

# Hold out 20% of the rows as a test set, with a fixed seed.
# This is the usual setup: X_train and y_train for fitting, X_test for
# prediction, and y_test for scoring the predictions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Initialize 1st level estimators

In [3]:
# Caution! The estimators and parameter values below are for
# demonstration only and should not be taken as recommendations.

# List of (name, estimator) tuples.
# Each name is an arbitrary unique label for its estimator object.
estimators_L1 = [
    (
        'et',
        ExtraTreesRegressor(
            n_estimators=100,
            max_depth=3,
            random_state=0,
            n_jobs=-1,
        ),
    ),
    (
        'rf',
        RandomForestRegressor(
            n_estimators=100,
            max_depth=3,
            random_state=0,
            n_jobs=-1,
        ),
    ),
    (
        'xgb',
        XGBRegressor(
            n_estimators=100,
            max_depth=3,
            learning_rate=0.1,
            random_state=0,
            n_jobs=-1,
        ),
    ),
]

# Initialize StackingTransformer

In [4]:
stack = StackingTransformer(
    # Base (1st level) estimators
    estimators=estimators_L1,
    # Regression task (set to False for classification)
    regression=True,
    # Variant 'A': oof predictions for the train set; the test set is
    # predicted in each fold and the mean of those predictions is used
    variant='A',
    # Metric: a callable
    metric=mean_absolute_error,
    # Number of folds
    n_folds=4,
    # Shuffle the data
    shuffle=True,
    # Fixed seed to ensure reproducibility
    random_state=0,
    # Print all info
    verbose=2,
)

***

# 1. Step-by-step approach

# Fit StackingTransformer

In [5]:
# Fit the StackingTransformer on the train set
# (the per-fold scores of each estimator are printed below)
stack = stack.fit(X_train, y_train)

task:         [regression]
metric:       [mean_absolute_error]
variant:      [A]
n_estimators: [3]

estimator  0: [et: ExtraTreesRegressor]
    fold  0:  [0.65383143]
    fold  1:  [0.65059961]
    fold  2:  [0.66233582]
    fold  3:  [0.64449777]
    ----
    MEAN:     [0.65281616] + [0.00643746]

estimator  1: [rf: RandomForestRegressor]
    fold  0:  [0.58416160]
    fold  1:  [0.56449564]
    fold  2:  [0.57730149]
    fold  3:  [0.55014073]
    ----
    MEAN:     [0.56902487] + [0.01298795]

estimator  2: [xgb: XGBRegressor]
    fold  0:  [0.37287275]
    fold  1:  [0.36827074]
    fold  2:  [0.37315715]
    fold  3:  [0.36447933]
    ----
    MEAN:     [0.36969499] + [0.00358177]



# Get stacked features: transform (predict) train set and test set

In [6]:
# Stacked features for the train set
S_train = stack.transform(X_train)

Train set was detected.
Transforming...

estimator  0: [et: ExtraTreesRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  1: [rf: RandomForestRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  2: [xgb: XGBRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE



In [7]:
# Stacked features for the test set
S_test = stack.transform(X_test)

Transforming...

estimator  0: [et: ExtraTreesRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  1: [rf: RandomForestRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE

estimator  2: [xgb: XGBRegressor]
    model from fold  0: done
    model from fold  1: done
    model from fold  2: done
    model from fold  3: done
    ----
    DONE



# Look at the result

Let's look at our stacked features.  
We have three 1st level estimators, so we expect to get three columns in `S_train` and `S_test`.  

In [8]:
# First five rows of the train-set stacked features
# (one column per 1st level estimator)
S_train[:5]

array([[2.1381431 , 1.89449961, 1.85192811],
       [2.29310757, 1.89309918, 2.92809105],
       [2.07256939, 1.89449961, 2.10903692],
       [1.51938275, 1.53835871, 1.37909698],
       [1.93450337, 2.737813  , 3.23252964]])

In [9]:
# First five rows of the test-set stacked features
# (one column per 1st level estimator)
S_test[:5]

array([[2.12570438, 1.88503507, 1.57685581],
       [2.57631542, 2.67168873, 2.70525175],
       [2.06940157, 1.88669837, 1.69479051],
       [1.64434775, 1.20196782, 0.96695787],
       [2.33799194, 2.98206787, 3.8881467 ]])

# Apply 2nd level estimator

In [10]:
# 2nd level estimator, trained on the stacked features
final_estimator = XGBRegressor(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=0,
    n_jobs=-1,
)

# Fit on the train-set stacked features
final_estimator = final_estimator.fit(S_train, y_train)

# Predict from the test-set stacked features
y_pred = final_estimator.predict(S_test)

# Final prediction score (mean absolute error against y_test)
print('Final prediction score: [%.8f]' % mean_absolute_error(y_test, y_pred))

Final prediction score: [0.35320658]


# Some useful StackingTransformer attributes

In [11]:
# Number of base (1st level) estimators
# Type: int
stack.n_estimators_

3

In [12]:
# Scores for each estimator (rows) in each fold (columns);
# these are the same per-fold values that were printed during fit
# Type: 2d numpy array
stack.scores_

array([[0.65383143, 0.65059961, 0.66233582, 0.64449777],
       [0.5841616 , 0.56449564, 0.57730149, 0.55014073],
       [0.37287275, 0.36827074, 0.37315715, 0.36447933]])

In [13]:
# Mean and std of the fold scores for each estimator
# Type: list of (name, mean, std) tuples
stack.mean_std_

[('et', np.float64(0.6528161581671601), np.float64(0.006437456871932304)),
 ('rf', np.float64(0.5690248659195717), np.float64(0.012987952596095562)),
 ('xgb', np.float64(0.3696949910225933), np.float64(0.00358176792828805))]

In [14]:
# Tabulate the per-estimator mean and std of the fold scores
df = pd.DataFrame.from_records(
    stack.mean_std_,
    columns=['name', 'mean', 'std'],
)
# Ascending order of 'mean' (best on the top)
df.sort_values(by='mean')

Unnamed: 0,name,mean,std
2,xgb,0.369695,0.003582
1,rf,0.569025,0.012988
0,et,0.652816,0.006437


***

# 2. Pipeline

StackingTransformer is fully scikit-learn compatible, so we can easily implement an **arbitrary number of stacking levels** using Pipeline.


In [15]:
# Specify steps of Pipeline.
# Pipeline keeps the objects it is given without copying them, so passing
# `stack` and `final_estimator` directly would let any later
# `pipe.set_params(...)` or `pipe.fit(...)` silently modify and re-fit the
# objects used in the step-by-step section. Unfitted clones with the same
# parameters keep the two approaches independent.
steps = [('stack', clone(stack)),
         ('final_estimator', clone(final_estimator))]

# Init Pipeline
pipe = Pipeline(steps)

In [16]:
# Illustration only (not executed): with several stacking levels
# the Pipeline steps would be:
# steps = [('stack_L1', stack_L1),
#          ('stack_L2', stack_L2),
#          ('stack_L99', stack_L99),  # :-)
#          ('final_estimator', final_estimator)]

# Ability to set parameters of nested estimators and transformers

Following the scikit-learn naming convention, we can access the parameters of nested estimators and transformers.  
Each nested level (not to be confused with a stacking level) is separated by a double underscore `__`.  
For example, when using Pipeline we may want StackingTransformer to be silent.  
We can access the StackingTransformer parameter `verbose` through the `pipe` instance as `stack__verbose`.  
We can also set any parameter of the 1st level estimators inside StackingTransformer.  
For example, we can access the XGBoost parameter `learning_rate` through the `pipe` instance as `stack__xgb__learning_rate`.

In [17]:
# Silence the 'stack' step (StackingTransformer) of the Pipeline
pipe = pipe.set_params(stack__verbose=0)
# Example (disabled): set a parameter of the 1st level 'xgb' estimator
# pipe = pipe.set_params(stack__xgb__learning_rate=0.555)

# Fit and predict using Pipeline

In [18]:
# Fit all Pipeline steps on the train set
pipe = pipe.fit(X_train, y_train)

# Predict the test set with the whole Pipeline
y_pred_pipe = pipe.predict(X_test)

# Final prediction score (mean absolute error against y_test)
print(
    'Final prediction score using Pipeline: [%.8f]'
    % mean_absolute_error(y_test, y_pred_pipe)
)

Final prediction score using Pipeline: [0.35320658]


# Ability to save fitted Pipeline

In [19]:
# Save the fitted Pipeline to a file (path is relative to the working directory).
# Assigning to `_` keeps the return value of dump out of the cell output.
_ = joblib.dump(pipe, 'pipe_with_stack.pkl')

In [20]:
# Load the Pipeline back from the file written by the previous cell
# NOTE(review): joblib files are pickle-based as far as I know, so only
# load files that come from a trusted source - confirm against joblib docs
pipe_loaded = joblib.load('pipe_with_stack.pkl')

# Predict the test set using the loaded Pipeline
y_pred_pipe_loaded = pipe_loaded.predict(X_test)

# Final prediction score (mean absolute error against y_test)
print('Final prediction score using loaded Pipeline: [%.8f]' % mean_absolute_error(y_test, y_pred_pipe_loaded))

Final prediction score using loaded Pipeline: [0.35320658]


***

# Conclusion

The step-by-step approach is useful when we need direct access to the stacked features `S_train` and `S_test`.  
The Pipeline approach is useful when we need to represent many steps in a single object.  
Both approaches give exactly the same score.