# MNIST Exercise

##  Imports

In [5]:
# Load (or reload) the autoreload extension and set it to mode 2, so imported
# modules are re-imported automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [6]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier, 
                              VotingClassifier, BaggingClassifier)

## Models

In [7]:
def load_data(name='mnist_784', version=1):
    """Download a dataset from OpenML via ``fetch_openml``.

    Parameters
    ----------
    name : str, default 'mnist_784'
        OpenML dataset name to fetch.
    version : int, default 1
        Version of the dataset to fetch.

    Returns
    -------
    The object returned by ``fetch_openml``; this notebook reads its
    ``'data'`` and ``'target'`` entries.
    """
    return fetch_openml(name, version=version)


mnist = load_data()

In [8]:
# `np.int` was deprecated in NumPy 1.20 and removed in 1.24, so an explicit
# fixed-width integer dtype is used to keep this cell running on current NumPy.
X, y = mnist['data'], mnist['target'].astype(np.int64)

In [9]:
# Contiguous row ranges: rows [0, 50000) train, [50000, 60000) validate,
# and everything from row 60000 onward is the test split.
train_rows = slice(None, 50000)
valid_rows = slice(50000, 60000)
test_rows = slice(60000, None)

X_train, y_train = X[train_rows], y[train_rows]
X_valid, y_valid = X[valid_rows], y[valid_rows]
X_test, y_test = X[test_rows], y[test_rows]

In [10]:
# Shapes of the feature splits, in train / valid / test order.
tuple(features.shape for features in (X_train, X_valid, X_test))

((50000, 784), (10000, 784), (10000, 784))

In [11]:
# Shapes of the label splits, in train / valid / test order.
tuple(labels.shape for labels in (y_train, y_valid, y_test))

((50000,), (10000,), (10000,))

In [12]:
# Baseline 1: support vector classifier with default hyperparameters,
# fitted on the training split and scored on the validation split.
svc_clf1 = SVC()
svc_clf1.fit(X_train, y_train)
y_pred = svc_clf1.predict(X_valid)
accuracy_score(y_true=y_valid, y_pred=y_pred)

0.9802

In [13]:
# Baseline 2: random forest with default hyperparameters. The seed is fixed so
# the validation accuracy is reproducible on every Restart & Run All.
rf_clf_1 = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rf_clf_1.predict(X_valid)
accuracy_score(y_valid, y_pred)

0.9716

In [14]:
# Baseline 3: extremely randomized trees with default hyperparameters. The seed
# is fixed so the validation accuracy is reproducible on every Restart & Run All.
ext_clf_1 = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)
y_pred = ext_clf_1.predict(X_valid)
accuracy_score(y_valid, y_pred)

0.976

In [15]:
# Running an ensemble classifier: hard (majority) voting over new, unfitted
# instances of the same three model types used above. The two tree ensembles
# are seeded so the ensemble's validation accuracy is reproducible.
svc_clf = SVC()
rf_clf = RandomForestClassifier(random_state=42)
ext_clf = ExtraTreesClassifier(random_state=42)

voting_clf = VotingClassifier(
    [('svc', svc_clf), ('rf', rf_clf), ('ext', ext_clf)],
    voting='hard'
)
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_valid)
accuracy_score(y_valid, y_pred)

0.9777

### Running the best estimator (SVC) on the test set

In [16]:
# Final check: score the SVC that was fitted on the training split against
# the held-out test split.
y_pred = svc_clf1.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9785