In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

# Seed NumPy's global random state so NumPy-driven randomness repeats across runs
np.random.seed(42)
# Notebook magic: draw matplotlib figures inline in the cell output
%matplotlib inline

In [2]:
# Creating train, validation and test set
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# as_frame=False pins the return type to NumPy arrays. Newer scikit-learn
# releases default to as_frame='auto', which hands back pandas objects for
# dense datasets, so without the flag the types flowing through the rest of
# the notebook would depend on the installed version.
X, y = fetch_openml('mnist_784', return_X_y=True, as_frame=False)

# First split: hold out 20000 samples; what remains trains the base models.
x_train, x_rest, y_train, y_rest = train_test_split(
    X, y, test_size=20000, random_state=42)

# Second split: carve the held-out samples into a validation part (used to
# build the blender's training data) and a 10000-sample test part.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_rest, y_rest, test_size=10000, random_state=42)

In [3]:
# (rows, columns) of each split, in train / validation / test order
tuple(split.shape for split in (x_train, x_valid, x_test))

((50000, 784), (10000, 784), (10000, 784))

In [4]:
# Scaling the data: every split is divided by 255
# (presumably to bring 0-255 pixel intensities into [0, 1] -- confirm the raw range).
# NOTE: each name is rebound to its own scaled version, so running this cell
# again without first re-running the split cell divides by 255 a second time.
x_train, x_valid, x_test = (split / 255 for split in (x_train, x_valid, x_test))

In [5]:
"""
Training classifiers for Ensemble
"""
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

In [6]:
# Base estimators of the ensemble: two tree ensembles and one linear model,
# all created with the same random_state.
rf = RandomForestClassifier(random_state=42)
ext = ExtraTreesClassifier(random_state=42)
# L1-penalised logistic regression fitted with the 'saga' solver and a
# stopping tolerance of 0.1.
lr = LogisticRegression(
    penalty='l1',
    solver='saga',
    tol=0.1,
    random_state=42,
)

In [7]:
# Fit each base estimator on the training split, reporting progress
# with a 1-based counter.
predictors = [rf, ext, lr]
for position, estimator in enumerate(predictors, start=1):
    print("Training Model:", position)
    estimator.fit(x_train, y_train)

Training Model: 1
Training Model: 2
Training Model: 3


In [8]:
# Collect each base estimator's predictions on the validation split.
# They are kept as a plain list (one entry per estimator, in `predictors`
# order) and become the blender's training features.
x_val_prediction = []

for position, estimator in enumerate(predictors, start=1):
    print("Predicting Model:", position)
    validation_labels = estimator.predict(x_valid)
    x_val_prediction.append(validation_labels)

Predicting Model: 1
Predicting Model: 2
Predicting Model: 3


In [18]:
# Predictions in the form of numpy array
# Stack the per-model prediction vectors and transpose, so that each row is
# one validation sample and each column is one base model's prediction.
#
# The isinstance guard keeps this cell idempotent: the name is rebound from
# the list built in the previous cell to the stacked array, and an
# unconditional transpose would flip the matrix back to
# (n_models, n_samples) whenever the cell is executed a second time.
if isinstance(x_val_prediction, list):
    x_val_prediction = np.array(x_val_prediction).transpose()

In [35]:
# Training a blender
# The blender is a 50-tree random forest fitted on the base models'
# validation-set predictions against the validation labels.
# oob_score=True asks the forest to compute an out-of-bag score during fit.
blender = RandomForestClassifier(
    n_estimators=50,
    oob_score=True,
    random_state=42,
)
blender.fit(X=x_val_prediction, y=y_valid)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [36]:
# Accuracy on x_val_prediction / y_valid -- the same data the blender was
# fitted on, so this is a training-set figure rather than a held-out estimate.
blender.score(X=x_val_prediction, y=y_valid)

0.9756

In [38]:
# Out-of-bag score of the blender; populated because it was built with oob_score=True
blender.oob_score_

0.9662

In [39]:
# Checking stacking on the test set
# Run every base estimator on the test split and arrange the outputs as one
# row per sample and one column per estimator (same order as `predictors`).
x_test_prediction = np.array(
    [estimator.predict(x_test) for estimator in predictors]
).transpose()

In [40]:
# Final result
# Score of the blender on the base models' test-split predictions,
# measured against the test labels.
blender.score(X=x_test_prediction, y=y_test)

0.9674