In [47]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

import numpy as np


# EXERCISE 8


In [7]:
mnist = fetch_openml("mnist_784", version=1, as_frame=False)

X, y = mnist["data"], mnist["target"]


  warn(


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5)


In [16]:
for_clf = RandomForestClassifier(
    bootstrap=True, oob_score=True, n_jobs=-1, random_state=2
).fit(X_train, y_train)
for_clf.oob_score_


0.963265306122449

In [17]:
y_pred = for_clf.predict(X_val)
accuracy_score(y_val, y_pred)


0.9685714285714285

In [18]:
ext_clf = ExtraTreesClassifier(
    bootstrap=True, oob_score=True, n_jobs=-1, random_state=26
).fit(X_train, y_train)
ext_clf.oob_score_


0.961469387755102

In [19]:
y_pred = ext_clf.predict(X_val)
accuracy_score(y_val, y_pred)


0.9680952380952381

In [20]:
svm = SVC(kernel="sigmoid", gamma="auto", probability=True).fit(X_train, y_train)
svm.score(X_val, y_val)


0.11019047619047619

In [21]:
y_pred = svm.predict(X_val)
accuracy_score(y_val, y_pred)


0.11019047619047619

In [24]:
lin_svc = LinearSVC(random_state=3456).fit(X_train, y_train)
y_pred = lin_svc.predict(X_val)
accuracy_score(y_val, y_pred)




0.8268571428571428

In [25]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


lin_svc = make_pipeline(StandardScaler(), LinearSVC(random_state=4)).fit(
    X_train, y_train
)
y_pred = lin_svc.predict(X_val)
accuracy_score(y_val, y_pred)




0.9034285714285715

In [37]:
logreg = make_pipeline(
    StandardScaler(), LogisticRegression(random_state=45, max_iter=1000)
).fit(X_train, y_train)


In [38]:
estimators = [for_clf, ext_clf, svm, logreg]
[estimator.score(X_val, y_val) for estimator in estimators]


[0.9685714285714285,
 0.9680952380952381,
 0.11019047619047619,
 0.9143809523809524]

In [41]:
soft_vote = VotingClassifier(
    [
        (
            "for",
            RandomForestClassifier(
                bootstrap=True, oob_score=True, n_jobs=-1, random_state=2
            ),
        ),
        (
            "ext",
            ExtraTreesClassifier(
                bootstrap=True, oob_score=True, n_jobs=-1, random_state=26
            ),
        ),
        # ('svc', make_pipeline(StandardScaler(), LinearSVC(random_state=4))),
        (
            "logreg",
            make_pipeline(
                StandardScaler(), LogisticRegression(random_state=45, max_iter=1000)
            ),
        ),
    ],
    voting="soft",
    n_jobs=-1,
)

soft_vote.fit(X_train, y_train)
soft_vote.score(X_val, y_val)


0.9523809523809523

In [43]:
hard_vote = VotingClassifier(
    [
        (
            "for",
            RandomForestClassifier(
                bootstrap=True, oob_score=True, n_jobs=-1, random_state=2
            ),
        ),
        (
            "ext",
            ExtraTreesClassifier(
                bootstrap=True, oob_score=True, n_jobs=-1, random_state=26
            ),
        ),
        # ('svc', make_pipeline(StandardScaler(), LinearSVC(random_state=4))),
        (
            "logreg",
            make_pipeline(
                StandardScaler(), LogisticRegression(random_state=45, max_iter=1000)
            ),
        ),
    ],
    voting="hard",
    n_jobs=-1,
)

hard_vote.fit(X_train, y_train)
hard_vote.score(X_val, y_val)


0.9676190476190476

In [51]:
hard_vote.score(X_test, y_test)


0.969047619047619

In [52]:
soft_vote.score(X_test, y_test)


0.9486666666666667

In [45]:
[estimator.score(X_test, y_test) for estimator in estimators]


[0.9697142857142858,
 0.9692380952380952,
 0.11161904761904762,
 0.9122857142857143]

In [50]:
[
    estimator.score(X_test, y_test.astype(np.int64))
    for estimator in hard_vote.estimators_
]


[0.9697142857142858, 0.9692380952380952, 0.9122857142857143]

# EXERCISE 9


In [64]:
X_for_train = for_clf.predict(X_train)
X_ext_train = ext_clf.predict(X_train)
X_log_train = logreg.predict(X_train)

X_blend_train = np.concatenate(
    [X_for_train[:, None], X_ext_train[:, None], X_log_train[:, None]], axis=1
)


In [65]:
X_for_test = for_clf.predict(X_test)
X_ext_test = ext_clf.predict(X_test)
X_log_test = logreg.predict(X_test)

X_blend_test = np.concatenate(
    (X_for_test[:, None], X_ext_test[:, None], X_log_test[:, None]), axis=1
)


In [70]:
from sklearn.linear_model import SGDClassifier

sgd_clf = make_pipeline(
    StandardScaler(),
    SGDClassifier(
        loss="squared_hinge",
        alpha=0.0001,
        n_jobs=-1,
        random_state=345,
        early_stopping=False,
    ),
).fit(X_blend_train, y_train)
sgd_clf.score(X_blend_test, y_test)


0.4901904761904762

In [74]:
rnd_for_blend = RandomForestClassifier(
    random_state=34, oob_score=True, n_jobs=-1, n_estimators=250
).fit(X_blend_train, y_train)
rnd_for_blend.score(X_blend_test, y_test)


0.9693333333333334

In [75]:
rnd_for_blend.oob_score_


1.0

In [76]:
rnd_for_blend.oob_decision_function_


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [78]:
hard_vote.estimators_


[RandomForestClassifier(n_jobs=-1, oob_score=True, random_state=2),
 ExtraTreesClassifier(bootstrap=True, n_jobs=-1, oob_score=True, random_state=26),
 Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('logisticregression',
                  LogisticRegression(max_iter=1000, random_state=45))])]

In [80]:
estimators = [
    (
        "for",
        RandomForestClassifier(
            bootstrap=True, oob_score=True, n_jobs=-1, random_state=2
        ),
    ),
    (
        "ext",
        ExtraTreesClassifier(
            bootstrap=True, oob_score=True, n_jobs=-1, random_state=26
        ),
    ),
    # ('svc', make_pipeline(StandardScaler(), LinearSVC(random_state=4))),
    (
        "logreg",
        make_pipeline(
            StandardScaler(), LogisticRegression(random_state=45, max_iter=1000)
        ),
    ),
]


In [82]:
from sklearn.ensemble import StackingClassifier


stack_clf = StackingClassifier(
    estimators=estimators,
    cv=10,
    stack_method="auto",
    n_jobs=-1,
    final_estimator=rnd_for_blend,
).fit(X[:60000], y[:60000])
stack_clf.score(X_test, y_test)


0.9960952380952381