## Yahia Chammami

# BLENDING:

**Blending** is an ensemble machine learning algorithm.

It is a colloquial name for a variant of stacked generalization (a stacking ensemble) in which, instead of fitting the meta-model on **out-of-fold predictions** made by the base models, the meta-model is fit on predictions made on a **holdout dataset**.

- **Blending**: Stacking-type ensemble where the meta-model is trained on predictions made on a **holdout dataset**.
- **Stacking**: Stacking-type ensemble where the meta-model is trained on **out-of-fold predictions** made during k-fold cross-validation.

In [1]:
from numpy import hstack
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
 
def get_dataset():
    """Generate the synthetic classification dataset used in this notebook.

    Returns:
        The ``(X, y)`` pair produced by ``make_classification`` for 10,000
        samples with 20 features (15 informative, 5 redundant), seeded with
        ``random_state=7`` so repeated calls use the same generator settings.
    """
    settings = {
        'n_samples': 10000,
        'n_features': 20,
        'n_informative': 15,
        'n_redundant': 5,
        'random_state': 7,
    }
    features, labels = make_classification(**settings)
    return features, labels

In [2]:
def get_models():
    """Build the base models of the blending ensemble.

    Returns:
        A list of ``(name, estimator)`` tuples, in this order: logistic
        regression, k-nearest neighbours, decision tree, support vector
        classifier and Gaussian naive Bayes. Every estimator is created with
        its default constructor arguments and is not yet fitted.
    """
    return [
        ('lr', LogisticRegression()),
        ('knn', KNeighborsClassifier()),
        ('cart', DecisionTreeClassifier()),
        ('svm', SVC()),
        ('bayes', GaussianNB()),
    ]

In [3]:
def fit_ensemble(models, X_train, X_val, y_train, y_val):
    """Fit the base models and the blending meta-model.

    Each base model is fitted in place on the training split and then asked
    for predictions on the validation (holdout) split. Those predictions, one
    column per base model, become the input features on which the meta-model
    (a ``LogisticRegression``) is fitted against ``y_val``.

    Args:
        models: Iterable of ``(name, estimator)`` pairs. Each estimator must
            provide ``fit(X, y)`` and ``predict(X)``. The estimators are
            fitted in place, so the same objects can afterwards be handed to
            ``predict_ensemble``.
        X_train: Features used to fit the base models.
        X_val: Holdout features the base models predict on.
        y_train: Targets used to fit the base models.
        y_val: Holdout targets used to fit the meta-model.

    Returns:
        The fitted ``LogisticRegression`` meta-model (the blender).

    Raises:
        ValueError: If ``models`` yields no base model.
    """
    holdout_columns = []
    for _, model in models:
        model.fit(X_train, y_train)
        yhat = model.predict(X_val)
        # One column per base model: (n_samples,) -> (n_samples, 1).
        holdout_columns.append(yhat.reshape(len(yhat), 1))

    # Fail with an explicit message instead of numpy's opaque
    # "need at least one array to concatenate" raised by hstack([]).
    if not holdout_columns:
        raise ValueError('fit_ensemble requires at least one base model')

    # Each base model's predictions form one input feature of the meta-model.
    meta_X = hstack(holdout_columns)

    blender = LogisticRegression()
    blender.fit(meta_X, y_val)
    return blender

In [4]:
def predict_ensemble(models, blender, X_test):
    """Predict with the blending ensemble.

    Every base model predicts on ``X_test``; the predictions are stacked side
    by side (one column per base model, in the order given by ``models``) and
    passed to the meta-model, whose predictions are returned.

    Args:
        models: Iterable of ``(name, estimator)`` pairs whose estimators
            provide ``predict(X)``. They are expected to be already fitted,
            in the same order that was used to fit ``blender``.
        blender: Meta-model providing ``predict(X)``.
        X_test: Features to predict on.

    Returns:
        Whatever ``blender.predict`` returns for the stacked base predictions.

    Raises:
        ValueError: If ``models`` yields no base model.
    """
    base_columns = []
    for _, model in models:
        yhat = model.predict(X_test)
        # One column per base model: (n_samples,) -> (n_samples, 1).
        base_columns.append(yhat.reshape(len(yhat), 1))

    # Fail with an explicit message instead of numpy's opaque
    # "need at least one array to concatenate" raised by hstack([]).
    if not base_columns:
        raise ValueError('predict_ensemble requires at least one base model')

    # Each base model's predictions form one input feature of the meta-model.
    meta_X = hstack(base_columns)
    return blender.predict(meta_X)

In [5]:
# Build the synthetic classification problem.
X, y = get_dataset()

# Reserve half of the samples as the final test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1
)

# From the remaining half, hold out 33% as the validation set on which the
# meta-model is fitted; the base models are fitted on the rest.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.33, random_state=1
)

# Report the shape of each partition.
print('Train: %s, Val: %s, Test: %s' % tuple(
    split.shape for split in (X_train, X_val, X_test)
))

# Create the base models, then fit them together with the blender.
models = get_models()
blender = fit_ensemble(models, X_train, X_val, y_train, y_val)

# Score the blended predictions on the untouched test set; accuracy is
# printed as a percentage.
yhat = predict_ensemble(models, blender, X_test)
score = accuracy_score(y_test, yhat)
print('Blending Accuracy: %.3f' % (score * 100))

Train: (3350, 20), Val: (1650, 20), Test: (5000, 20)
Blending Accuracy: 97.820
