### Question 1

train a voting classifier

In [3]:
# common imports
import pandas as pd
import numpy as np
import sklearn

print(sklearn.__version__)

0.18.1


In [1]:
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original', data_home="datasets/mnist")

In [29]:
# split data into to training, validation and testing set

idx = np.random.permutation(len(mnist["data"]))
data = mnist["data"][idx]
target = mnist["target"][idx]

X_train, y_train = data[:40000:10], target[:40000:10]
X_val, y_val = data[40000:50000:10], target[40000:50000:10]
X_test, y_test = data[50000::10], target[50000::10]

In [38]:
# train multiple classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

X_train_scaled = StandardScaler().fit_transform(X_train.astype(np.float64))
X_val_scaled = StandardScaler().fit_transform(X_val.astype(np.float64))

classifiers = [RandomForestClassifier(),
              ExtraTreesClassifier(),
              SVC(decision_function_shape = "ovr")]

accuracy_scores = {}
for clf in classifiers:
    name = clf.__class__.__name__
    clf.fit(X_train_scaled, y_train)
    pred_y = clf.predict(X_val_scaled)
    accuracy_scores[name] = accuracy_score(pred_y, y_val)

In [39]:
for key, value in accuracy_scores.items():
    print(key, value)

RandomForestClassifier 0.803
ExtraTreesClassifier 0.898
SVC 0.914


In [43]:
from sklearn.ensemble import VotingClassifier
rnd_clf = RandomForestClassifier(random_state=42)
tree_clf = ExtraTreesClassifier(random_state=42)
svc_clf = SVC(probability=True, random_state=42)

voting_clf = VotingClassifier([("rnd", rnd_clf),
                               ("extra_tree", tree_clf),
                               ("svc", svc_clf)], voting="soft")

voting_clf.fit(X_train_scaled, y_train)
accuracy_score(voting_clf.predict(X_val_scaled), y_val)

0.93500000000000005

In [44]:
X_train, y_train = data[:40000], target[:40000]
X_val, y_val = data[40000:50000], target[40000:50000]
X_test, y_test = data[50000:], target[50000:]

X_train_scaled = StandardScaler().fit_transform(X_train)
X_val_scaled = StandardScaler().fit_transform(X_val)

voting_clf.fit(X_train_scaled, y_train)
accuracy_score(voting_clf.predict(X_val_scaled), y_val)



0.96850000000000003

### Question 2

train a blender

In [46]:
X_train, y_train = data[:40000:10], target[:40000:10]
X_val, y_val = data[40000:50000:10], target[40000:50000:10]
X_test, y_test = data[50000::10], target[50000::10]

X_train_scaled = StandardScaler().fit_transform(X_train)
X_val_scaled = StandardScaler().fit_transform(X_val)

rnd_clf = RandomForestClassifier(random_state=42)
tree_clf = ExtraTreesClassifier(random_state=42)
svc_clf = SVC(random_state=42)

rnd_clf.fit(X_train_scaled, y_train)
y_rnd_pred = rnd_clf.predict(X_val_scaled)

tree_clf.fit(X_train_scaled, y_train)
y_tree_pred = tree_clf.predict(X_val_scaled)

svc_clf.fit(X_train_scaled, y_train)
y_svc_pred = svc_clf.predict(X_val_scaled)



In [47]:
print(y_rnd_pred.shape, y_tree_pred.shape, y_svc_pred.shape)

(1000,) (1000,) (1000,)


In [51]:
blend = np.c_[y_rnd_pred, y_tree_pred, y_svc_pred, y_val]
blend

array([[ 0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.],
       [ 9.,  9.,  9.,  9.],
       ..., 
       [ 9.,  4.,  9.,  9.],
       [ 7.,  7.,  7.,  7.],
       [ 4.,  4.,  4.,  4.]])

In [54]:
blender_clf = RandomForestClassifier(random_state=42)
blender_clf.fit(blend[:,:3], blend[:,3])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [55]:
X_test_scaled = StandardScaler().fit_transform(X_test)



In [56]:
pred_rnd = rnd_clf.predict(X_test_scaled)
pred_tree = tree_clf.predict(X_test_scaled)
pred_svc = svc_clf.predict(X_test_scaled)

In [57]:
y_test_pred = blender_clf.predict(np.c_[pred_rnd, pred_tree, pred_svc])

In [58]:
accuracy_score(y_test_pred, y_test)

0.91449999999999998