# Exercise 8

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [4]:
# Single seed reused by both train/test splits and by every estimator below,
# so that a fresh "Restart & Run All" reproduces the same numbers.
random_state: int = 132

In [5]:
from sklearn.datasets import fetch_openml

# Fetch MNIST ('mnist_784', version 1) from OpenML; cache=True reuses the local
# copy on later runs instead of downloading again.
# as_frame=False pins the return type to NumPy arrays: newer scikit-learn
# releases default to as_frame="auto", which returns pandas objects for this
# dataset and would silently change the types of X and Y downstream.
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)

In [4]:
# Pixel features and digit labels as delivered by the OpenML bunch.
X, Y = mnist["data"], mnist["target"]

# Hold out 10,000 samples as the final test set; the remainder is split again
# into training and validation data in the next cell.
X_train_valid, X_test, Y_train_valid, Y_test = train_test_split(
    X,
    Y,
    test_size=10000,
    random_state=random_state,
)

In [21]:
# Carve a 10,000-sample validation set out of the non-test data; what is left
# becomes the training set.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X_train_valid,
    Y_train_valid,
    test_size=10000,
    random_state=random_state,
)

In [23]:
# Base learners for the ensemble, all seeded with the shared random_state.
# probability=True makes the SVC expose predict_proba, which the soft-voting
# step further down relies on.
svc_clf = SVC(probability=True, random_state=random_state)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=random_state)
ext_clf = ExtraTreesClassifier(n_estimators=100, random_state=random_state)
linear_clf = LinearSVC(random_state=random_state)

In [24]:
# Order matters: results below are reported in this order
# (kernel SVM, random forest, extra trees, linear SVM).
estimators = [
    svc_clf,
    rf_clf,
    ext_clf,
    linear_clf,
]

In [25]:
# Train each base learner on the training split. The estimator is printed
# before fitting so the output shows which model is currently training.
for estimator in estimators:
    print(estimator)
    estimator.fit(X=X_train, y=Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=132, shrinking=True, tol=0.001,
    verbose=False)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=132,
                       verbose=0, warm_start=False)
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None



In [26]:
# Sanity check of the split sizes: (train, validation, test).
tuple(map(len, (X_train, X_valid, X_test)))

(50000, 10000, 10000)

In [27]:
# Validation accuracy of each individual model, in the order of `estimators`.
# For classifiers, .score() is the accuracy of .predict() against the labels.
[est.score(X_valid, Y_valid) for est in estimators]

[0.9774, 0.9683, 0.9713, 0.8694]

In [28]:
from sklearn.ensemble import VotingClassifier

In [29]:
# (name, estimator) pairs expected by VotingClassifier; the names become the
# keys usable with set_params (e.g. to drop a model later).
estimators_with_names = list(zip(
    ("svm_clf", "random_forest_clf", "extra_tree_clf", "linear_clf"),
    (svc_clf, rf_clf, ext_clf, linear_clf),
))

# No `voting` argument is given, so the ensemble starts with the default
# voting mode; n_jobs=-1 fits the members in parallel on all available cores.
voting_clf = VotingClassifier(estimators=estimators_with_names, n_jobs=-1)

In [30]:
# Fit the ensemble on the training split. VotingClassifier trains its own
# clones, so the individually fitted models above are not reused.
voting_clf.fit(X=X_train, y=Y_train)

VotingClassifier(estimators=[('svm_clf',
                              SVC(C=1.0, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=0.0,
                                  decision_function_shape='ovr', degree=3,
                                  gamma='scale', kernel='rbf', max_iter=-1,
                                  probability=True, random_state=132,
                                  shrinking=True, tol=0.001, verbose=False)),
                             ('random_forest_clf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=N...
                                                   n_estimators=100,
                                                   n_jobs=None, oob_score=False,
                                                   random_state=132, verbose=0,
                                

In [31]:
# Validation accuracy of the ensemble with all four members.
voting_clf.score(X=X_valid, y=Y_valid)

0.9719

In [33]:
# Remove the weakest member (LinearSVC, ~0.87 validation accuracy) from the
# already-fitted ensemble without retraining the other models.
#
# 1) Mark it as dropped in the configuration so that `estimators` and the
#    fitted `estimators_` list stay consistent (relevant e.g. if weights are
#    ever set, since they are matched against the non-dropped entries).
voting_clf.set_params(linear_clf="drop")

# 2) Remove the fitted clone. Filtering by type instead of `del ...[3]` makes
#    the cell idempotent: re-running it is a no-op rather than an IndexError.
voting_clf.estimators_ = [
    fitted
    for fitted in voting_clf.estimators_
    if not isinstance(fitted, LinearSVC)
]

In [34]:
# Validation accuracy after removing the LinearSVC member.
voting_clf.score(X=X_valid, y=Y_valid)

0.9731

In [35]:
# Switch to soft voting (class probabilities instead of predicted labels).
# No refit is needed: only the prediction rule changes. The trailing ';'
# suppresses the estimator repr that set_params() would otherwise display.
voting_clf.set_params(voting="soft");

In [36]:
# Validation accuracy of the three remaining members under soft voting.
voting_clf.score(X=X_valid, y=Y_valid)

0.9791

In [38]:
# Inspect the fitted members still taking part in the vote.
voting_clf.estimators_

[SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
     max_iter=-1, probability=True, random_state=132, shrinking=True, tol=0.001,
     verbose=False),
 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=132,
                        verbose=0, warm_start=False),
 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='auto',
                      max_leaf_nodes=None,

In [39]:
# Final check: accuracy of the ensemble on the held-out test set, which was
# not used for fitting or for any of the validation comparisons above.
voting_clf.score(X=X_test, y=Y_test)

0.9812