<a href="https://colab.research.google.com/github/venkatrebba/model_stacking/blob/main/model_stacking_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 85 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [3]:
# First import necessary libraries
import pandas as pd
from sklearn.ensemble import StackingRegressor

# Decision trees
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Neural networks
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Add, Input, Dense, Dropout, BatchNormalization, Embedding, Flatten, Concatenate
from tensorflow.keras.optimizers import Adam
#from keras.layers.merge import concatenate
from tensorflow.keras.layers import concatenate

from tensorflow.keras import regularizers
from keras.regularizers import l1
from keras.regularizers import l2

# Wrapper to make neural network compitable with StackingRegressor
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Linear model as meta-learn
from sklearn.linear_model import LinearRegression

# Create generic dataset for regression
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

In [4]:
# Create dummy regression dataset
X, y = make_regression(n_targets=1, random_state=42)

# Convert to pandas
X = pd.DataFrame(X)
y = pd.DataFrame(y)

#Rename column
y = y.rename(columns={0: 'target'})

# Split into validation set
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42)

In [5]:
#Peak at our dummy data
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
55,-0.531214,0.91539,-0.116766,-0.54954,2.693034,-0.307808,0.292193,0.814152,-0.187329,1.228981,...,-0.321243,1.724697,-1.104863,0.482067,-0.095464,0.534817,-0.243384,0.833334,-2.084113,-0.385022
88,-0.606503,0.450603,-1.161784,-1.869816,0.855556,-1.751829,0.975312,0.251474,0.515628,-0.415967,...,-1.209477,-1.88315,-2.906988,-0.194269,-0.729602,-0.096624,0.041326,-0.369207,0.712712,0.892954
26,-0.518638,-1.077721,0.622944,0.343159,1.236832,-0.195435,-0.864285,0.239873,1.135621,0.792711,...,-0.978266,0.377263,-1.880288,1.485477,-0.398445,-0.75072,1.056101,-1.056112,1.974079,-0.63359
42,1.964725,0.120296,0.711615,0.514439,-0.599375,-0.22097,3.078881,-0.748487,1.119575,-0.575818,...,1.277677,-0.190339,-0.985726,1.551152,-0.450065,-0.530501,1.502357,0.955142,-1.037246,1.687142
69,0.997632,-0.856852,-0.256341,-1.126054,-1.090208,0.49356,-0.068123,0.558336,0.363971,0.92941,...,-1.859337,0.126268,0.209534,1.856647,0.110535,-1.121244,-1.563034,0.210668,-1.055089,1.509131


In [6]:
y_train.head()

Unnamed: 0,target
55,-68.509481
88,-162.534908
26,-249.837367
42,-83.817726
69,-325.953752


In [12]:
'''
 A neural network to show you how to incorporate models that may not be compatable with the stacking model
 (You don't need to understand this code for the purposes of the tutorial, just need to know that certain models can crash the stacking regressor without the appropriate wrapper around it)
'''
def create_neural_network(input_shape, depth=5, batch_mod=2, num_neurons=20, drop_rate=0.1, learn_rate=.01,
                      r1_weight=0.02,
                      r2_weight=0.02):
    '''A neural network architecture built using keras functional API'''
    act_reg = l1(r2_weight)
    kern_reg = l1(r1_weight)
    
    inputs = Input(shape=(input_shape,))
    batch1 = BatchNormalization()(inputs)
    hidden1 = Dense(num_neurons, activation='relu', kernel_regularizer=kern_reg, activity_regularizer=act_reg)(batch1)
    dropout1 = Dropout(drop_rate)(hidden1)
    hidden2 = Dense(int(num_neurons/2), activation='relu', kernel_regularizer=kern_reg, activity_regularizer=act_reg)(dropout1)
    
    skip_list = [batch1]
    last_layer_in_loop = hidden2
    
    for i in range(depth):
        added_layer = concatenate(skip_list + [last_layer_in_loop])
        skip_list.append(added_layer)
        b1 = None
        #Apply batch only on every i % N layers
        if i % batch_mod == 2:
            b1 = BatchNormalization()(added_layer)
        else:
            b1 = added_layer
        
        h1 = Dense(num_neurons, activation='relu', kernel_regularizer=kern_reg, activity_regularizer=act_reg)(b1)
        d1 = Dropout(drop_rate)(h1)
        h2 = Dense(int(num_neurons/2), activation='relu', kernel_regularizer=kern_reg, activity_regularizer=act_reg)(d1)
        d2 = Dropout(drop_rate)(h2)
        h3 =  Dense(int(num_neurons/2), activation='relu', kernel_regularizer=kern_reg, activity_regularizer=act_reg)(d2)
        d3 = Dropout(drop_rate)(h3)
        h4 =  Dense(int(num_neurons/2), activation='relu', kernel_regularizer=kern_reg, activity_regularizer=act_reg)(d3)
        last_layer_in_loop = h4
        c1 = concatenate(skip_list + [last_layer_in_loop])
        output = Dense(1, activation='sigmoid')(c1)
    
    model = Model(inputs=inputs, outputs=output)
    optimizer = Adam()
    optimizer.learning_rate = learn_rate
    
    model.compile(optimizer=optimizer,
                  loss='mse',
                  metrics=['accuracy'])
    return model

In [8]:
def get_stacking(input_shape=None):
    '''A stacking model that consists of CatBoostRegressor,
    XGBRegressor, a linear model, and some neural networks'''
    # First we create a list called "level0", which consists of our base models"
    # Basically, you'll want to pick a assortment of your favorite machine learning models
    # These models will get passed down to the meta-learner later
    level0 = list()
    level0.append(('cat', CatBoostRegressor(verbose=False)))
    level0.append(('cat2', CatBoostRegressor(verbose=False, learning_rate=.0001)))
    level0.append(('xgb', XGBRegressor()))
    level0.append(('xgb2', XGBRegressor(max_depth=5, learning_rate=.0001)))
    level0.append(('linear', LinearRegression()))
    #Create 5 neural networks using our function above
    for i in range(5):
        # Wrap our neural network in a Keras Regressor to make it
        #compatible with StackingRegressor
        keras_reg = KerasRegressor(
                create_neural_network, # Pass in function
                input_shape=input_shape, # Pass in the dimensions to above function
                epochs=10,
                batch_size=32,
                verbose=False)
        keras_reg._estimator_type = "regressor"
        # Append to our list
        level0.append(('nn_{num}'.format(num=i), keras_reg))
    # The "meta-learner" designated as the level1 model
    # In my experience Linear Regression performs best
    # but feel free to experiment with other models
    level1 = LinearRegression()
    # Create the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=2, verbose=1)
    return model

In [9]:
#Get our input dimensions for neural network
input_dimensions = len(X_train.columns)
# Create stacking model
model = get_stacking(input_dimensions)
model.fit(X_train, y_train.values.ravel())
# Creating a temporary dataframe so we can see how each of our models performed
temp = pd.DataFrame(y_val)
# The stacked models predictions, which should perform the best
temp['stacking_prediction'] = model.predict(X_val)
# Get each model in the stacked model to see how they individually perform
for m in model.named_estimators_:
        temp[m] = model.named_estimators_[m].predict(X_val)





[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   12.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   12.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   12.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   12.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.3s finished


In [10]:
# See how each of our models correlate with our target
# In most instances of running the program the stacked predictions should outperform any singular model
print("Correlations with target column")
print(temp.corr()['target'])

Correlations with target column
target                 1.000000
stacking_prediction    0.899293
cat                    0.824192
cat2                   0.537589
xgb                    0.673921
xgb2                   0.365042
linear                 0.827606
nn_0                   0.187451
nn_1                  -0.067408
nn_2                  -0.217531
nn_3                   0.157990
nn_4                  -0.059617
Name: target, dtype: float64


In [11]:
# See what our meta-learner is thinking (the linear regression)
print("Coeffecients of each specific model")
for coef in zip(model.named_estimators_, model.final_estimator_.coef_):
    print(coef)

Coeffecients of each specific model
('cat', 2.4844867098989205)
('cat2', -3.725911357362978)
('xgb', -0.2161600134651347)
('xgb2', 5.833034380925381)
('linear', 0.4857881762839509)
('nn_0', -28.52141951111702)
('nn_1', -6.807170295067638)
('nn_2', -40.74306213378174)
('nn_3', 34.65444379753925)
('nn_4', 22.27845288144535)
