In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston

A simple walk through of a stacking example using a holdout validation scheme. 

Simple holdout scheme: 
1. Split data into (train_data) and (test_data).
2. Split (train_data) into three parts: (partA) and (partB) and (partC).
3. Fit N diverse models on (partA), predict for (partB), (partC), (test_data) getting meta-features (partB_meta), (partC_meta) and (test_meta) respectively.
4. Fit a metamodel to a (partB_meta) while validating its hyperparameters on (partC_meta).
5. When the metamodel is validated, fit it to [(partB_meta), (partC_meta)] and predict for (test_meta).

(Holdout scheme suggested in Advanced Machine Learning course (Coursera))

In [70]:
np.random.seed(3) 

In [71]:
X, y = load_boston(return_X_y=True)

# Steps 1 and 2
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.1)
X_partA, X_temp, y_partA, y_temp = train_test_split(X_train,y_train, test_size=0.5)
X_partB, X_partC, y_partB, y_partC = train_test_split(X_temp,y_temp, test_size=0.5)


In [72]:
print(X_train.shape)
print(y_train.shape)

(455, 13)
(455,)


In [73]:
# Step 3

In [74]:
# Specifiy base learners.
m1 = RandomForestRegressor()
m2 = LinearRegression()

In [75]:
m1.fit(X_partA, y_partA)
m2.fit(X_partA, y_partA)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [76]:
meta_B1 = m1.predict(X_partB)
meta_B2 = m2.predict(X_partB)
meta_B = np.c_[meta_B1, meta_B2]
meta_C1 = m1.predict(X_partC)
meta_C2 = m2.predict(X_partC)
meta_C = np.c_[meta_C1, meta_C2]

In [77]:
meta_test_1 = m1.predict(X_test)
meta_test_2 = m2.predict(X_test)
meta_test = np.c_[meta_test_1, meta_test_2]

In [78]:
# meta model
meta_m = LinearRegression()

# Step 4
meta_m.fit(meta_B, y_partB)
y_meta_C_pred = meta_m.predict(meta_C)

In [79]:
from sklearn.metrics import r2_score
r2_score(y_meta_C_pred, y_partC)

0.81174986319822573

In [83]:
meta_all = np.row_stack((meta_B, meta_C))
y_all = np.concatenate([y_partB, y_partC])

# Step 5
meta_m.fit(meta_all, y_all)
y_test_pred = meta_m.predict(meta_test)
stacked = r2_score(y_test_pred, y_test)

In [84]:
meta_test_1 = m1.predict(X_test)
meta_test_2 = m2.predict(X_test)
rf_base = r2_score(meta_test_1, y_test)
lr_base = r2_score(meta_test_2, y_test)
print("Random Forest base: {}".format(rf_base))
print("Linear Regression base: {}".format(lr_base))
print("Stacked: {}".format(stacked))

Random Forest base: 0.8813375465704971
Linear Regression base: 0.7491895824299648
Stacked: 0.8833472311976777
