In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

# Seed NumPy's global random generator with a fixed value for repeatable runs.
np.random.seed(42)
# IPython magic: show matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Creating train, validation and test set
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Download the 'mnist_784' dataset from OpenML as a (features, labels) pair.
X, y = fetch_openml('mnist_784', return_X_y=True)

# Stage 1: hold back 20,000 samples; everything else becomes the training set.
x_train, x_rest, y_train, y_rest = train_test_split(
    X, y, random_state=42, test_size=20000
)

# Stage 2: cut the held-back samples into validation (remainder) and
# test (10,000) partitions, using the same fixed seed.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_rest, y_rest, random_state=42, test_size=10000
)

In [3]:
# Sanity check: display the shape of each feature partition produced by the splits.
x_train.shape, x_valid.shape, x_test.shape

((50000, 784), (10000, 784), (10000, 784))

In [4]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): `scaler` is created but not used by any cell visible here;
# the features are rescaled by the plain division below instead.
scaler = StandardScaler()

# Divide every partition by 255 (presumably the maximum pixel intensity - confirm).
# Caution: the same names are rebound, so re-running this cell divides a second time.
x_train, x_valid, x_test = (
    partition / 255 for partition in (x_train, x_valid, x_test)
)

In [5]:
"""
Training classifiers for Ensemble
"""
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

In [6]:
# Base estimators for the ensemble; each one gets the same fixed seed (42).
rf = RandomForestClassifier(random_state=42)
ext = ExtraTreesClassifier(random_state=42)
# L1-penalised logistic regression fitted with the 'saga' solver and a
# stopping tolerance of 0.1.
lr = LogisticRegression(
    penalty='l1',
    solver='saga',
    tol=0.1,
    random_state=42,
)

In [7]:
from sklearn.metrics import accuracy_score
def val_score(y, y_pred):
    """Print the accuracy of the predictions `y_pred` against the labels `y`.

    The value is computed with `accuracy_score`. Nothing is returned, so a
    call placed on the last line of a cell produces printed text only.
    """
    accuracy = accuracy_score(y, y_pred)
    print("Validation accuracy:", accuracy)

In [8]:
# Fit every base estimator on the training partition, announcing its
# 1-based position in `predictors` before each fit.
predictors = [rf, ext, lr]
for number, model in enumerate(predictors, start=1):
    print("Training Model:", number)
    model.fit(x_train, y_train)

Training Model: 1
Training Model: 2
Training Model: 3


In [9]:
# Report the validation accuracy of each base estimator, in `predictors` order.
for model in predictors:
    valid_pred = model.predict(x_valid)
    val_score(y_valid, valid_pred)

Validation accuracy: 0.9677
Validation accuracy: 0.9689
Validation accuracy: 0.9219


In [10]:
# Creating Ensemble 
from sklearn.ensemble import VotingClassifier
# Ensemble of the three base estimators combined with 'hard' voting.
voting_hard = VotingClassifier(
    voting='hard',
    estimators=[('lr', lr), ('rf', rf), ('ext', ext)],
)

# Fit on the training partition; as the last expression, the fitted
# ensemble is also what the cell displays.
voting_hard.fit(x_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l1',
                                                 random_state=42, solver='saga',
                                                 tol=0.1, verbose=0,
                                                 warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                   

In [11]:
# In the recorded run hard voting scores 0.9673: well above lr (0.9219) but
# marginally below rf (0.9677) and ext (0.9689) on the same validation set
val_score(y_valid, voting_hard.predict(x_valid))

Validation accuracy: 0.9673


In [12]:
# Soft-voting variant of the hard-voting ensemble fitted above.
# A fitted VotingClassifier can be switched to 'soft' without training again:
# the fitted base estimators are the same either way, and only the way their
# outputs are combined at prediction time changes. Building and fitting a
# second ensemble would refit all three estimators for no benefit.
#
# A shallow copy is used instead of flipping `voting_hard.voting` in place, so
# `voting_hard` keeps hard voting for the later test-set evaluation. The copy
# shares the fitted estimators with `voting_hard`; neither object is refit.
import copy

voting_soft = copy.copy(voting_hard)
voting_soft.voting = 'soft'

# Display the ensemble, as the original `fit(...)` call did by returning it.
voting_soft

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l1',
                                                 random_state=42, solver='saga',
                                                 tol=0.1, verbose=0,
                                                 warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                   

In [13]:
# Validation accuracy of the soft-voting ensemble, printed via val_score
val_score(y_valid, voting_soft.predict(x_valid))

Validation accuracy: 0.9543


In [14]:
# Soft voting scored lower than hard voting on the validation set in this run,
# so the hard-voting ensemble is the one evaluated on the held-out test set

voting_hard.score(x_test, y_test)

0.9662

In [15]:
# Test-set accuracy of each base estimator, in `predictors` order.
for model in predictors:
    test_accuracy = model.score(x_test, y_test)
    print(test_accuracy)

0.9674
0.9692
0.9219
