In [1]:
%%javascript

/**
 * Switch CodeMirror's `indentWithTabs` option for the whole notebook.
 *
 * @param {boolean} [use_tabs=true] - value to store in `indentWithTabs`;
 *     treated as `true` when omitted (i.e. when it is `undefined`).
 */
IPython.tab_as_tab_everywhere = function(use_tabs) {
    var indent_with_tabs = (use_tabs === undefined) ? true : use_tabs;

    // Update the editor of every cell that already exists in the notebook.
    IPython.notebook.get_cells().forEach(function(cell) {
        cell.code_mirror.options.indentWithTabs = indent_with_tabs;
    });

    // Editors created later read their initial options from the defaults.
    CodeMirror.defaults.indentWithTabs = indent_with_tabs;
};

// Apply the setting immediately, using the default (tabs).
IPython.tab_as_tab_everywhere();

<IPython.core.display.Javascript object>

In [89]:
import mnist_decoder as md
from helper import *
import numpy as np
import pickle
import random

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [3]:
# Read the full train/test images and labels from the gzipped IDX files under
# ../data using the mnist_decoder helpers. The trailing underscore marks the
# full, un-subsampled data.
# NOTE(review): the return values are presumably NumPy arrays (later cells
# index them with a list of indices and read `.shape`) — confirm in mnist_decoder.
train_images_ = md.get_images("../data/train-images-idx3-ubyte.gz")
train_labels_ = md.get_labels("../data/train-labels-idx1-ubyte.gz")
test_images_ = md.get_images("../data/t10k-images-idx3-ubyte.gz")
test_labels_ = md.get_labels("../data/t10k-labels-idx1-ubyte.gz")

In [40]:
def image_to_feature_imageVec(images, labels):
	"""Flatten every image into one row of raw pixel values.

	Parameters
	----------
	images : array-like, shape (n_samples, ...)
		One image per entry along the first axis. Any per-image shape is
		accepted (e.g. 28 x 28); all remaining axes are flattened.
	labels : array-like, shape (n_samples,)
		One label per image.

	Returns
	-------
	X : numpy.ndarray, shape (n_samples, n_pixels)
		The flattened images.
	y : numpy.ndarray, shape (n_samples,)
		The labels.

		Both arrays use the common (promoted) dtype of ``images`` and
		``labels``, matching the result of concatenating a label with its
		pixel row.

	Raises
	------
	ValueError
		If the number of images differs from the number of labels.
	"""
	images = np.asarray(images)
	labels = np.asarray(labels)

	n = images.shape[0]
	if n != labels.shape[0]:
		# Fail loudly: callers unpack two return values, so returning None
		# here would only surface later as an unrelated unpacking error.
		raise ValueError(
			"error: the number of samples (%d) is different from the number of labels (%d)"
			% (n, labels.shape[0])
		)

	# Computed explicitly because reshape(n, -1) cannot infer the row width
	# when there are zero samples.
	n_pixels = int(np.prod(images.shape[1:]))
	common_dtype = np.result_type(images.dtype, labels.dtype)

	X = images.reshape(n, n_pixels).astype(common_dtype)
	y = labels.reshape(n).astype(common_dtype)
	return X, y

In [13]:
# Seed the stdlib RNG so the subsample below is the same on every run, then
# draw distinct indices (random.sample draws without replacement):
# 6000 for the training subset and 1000 for the test subset.
# NOTE(review): 60000 and 10000 are assumed to equal the number of loaded
# train/test samples — confirm against the shapes of train_images_/test_images_.
random.seed(20220319)
sp_idx_train = random.sample(range(60000), 6000)
sp_idx_test = random.sample(range(10000), 1000)

In [14]:
# subset
train_images = train_images_[sp_idx_train]
train_labels = train_labels_[sp_idx_train]
test_images = test_images_[sp_idx_test]
test_labels = test_labels_[sp_idx_test]

#### image vec feature

In [94]:
# Feature option 1: use the flattened pixel vector of every image as features.
# NOTE(review): the three-feature cell further down assigns the same
# train_X/train_y/test_X/test_y names, so whichever of the two cells ran last
# decides what the models are trained on.
train_X, train_y = image_to_feature_imageVec(train_images, train_labels)
test_X, test_y = image_to_feature_imageVec(test_images, test_labels)

#### three-feature vec

In [80]:
# Feature option 2: build features with image_to_feature instead.
# NOTE(review): image_to_feature is not defined in the visible cells —
# presumably it comes from `from helper import *`; confirm.
# NOTE(review): this overwrites train_X/train_y/test_X/test_y produced by the
# pixel-vector cell above. The recorded execution counts (In [94] above vs.
# In [80] here) suggest the saved model scores used the pixel-vector features;
# running the notebook top to bottom would use these features instead.
train_X, train_y = image_to_feature(train_images, train_labels)
test_X, test_y = image_to_feature(test_images, test_labels)

#### show image

In [None]:
# Display the third image of the training subset as a sanity check.
# NOTE(review): `plt` is not imported explicitly in the visible import cell —
# presumably it is provided by `from helper import *`; confirm.
plt.imshow(train_images[2,:,:])

### check if the classes are balanced

In [17]:
# Tally how many test samples carry each label; the resulting dict is shown
# as the cell output.
ys = {}

for y in test_y:
    if y not in ys:
        ys[y] = 0
    ys[y] += 1

ys

{5.0: 92,
 2.0: 102,
 0.0: 91,
 6.0: 104,
 4.0: 89,
 8.0: 97,
 9.0: 114,
 3.0: 106,
 7.0: 90,
 1.0: 115}

## Base models

### fit the SVM model

In [118]:
# Baseline 1: support-vector classifier, requesting a one-vs-rest decision
# function shape and leaving every other hyperparameter at its default.
clf = svm.SVC(decision_function_shape='ovr')

# Train on the current train_X/train_y features.
clf.fit(train_X, train_y)

# clf.score(test_X, test_y) # 0.954

0.954

In [122]:
# Mean accuracy of the SVM as a (train, test) tuple.
(clf.score(train_X, train_y), clf.score(test_X, test_y)) # (0.987, 0.954)

(0.987, 0.954)

### fit the random forest model

In [None]:
# Baseline 2: random forest with default hyperparameters; random_state is
# fixed so the fitted forest is the same on every run.
clfrf = RandomForestClassifier(random_state=5)

clfrf.fit(train_X, train_y)

In [123]:
# Mean accuracy of the random forest as a (train, test) tuple.
(clfrf.score(train_X, train_y), clfrf.score(test_X, test_y)) # (1.0, 0.943)

(1.0, 0.943)

### scaler and Perceptron

In [99]:
# Standardize the features. The scaler is fitted on the training features
# only, and that same fitted transform is then applied to the test features,
# so no test-set statistics leak into preprocessing.
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
test_X_scaled = scaler.transform(test_X)

In [None]:
# Baseline 3a: Perceptron with default settings, trained on the unscaled features.
clf_perceptron = Perceptron()
clf_perceptron.fit(train_X, train_y)

In [124]:
# Mean accuracy of the unscaled-feature Perceptron as a (train, test) tuple.
(clf_perceptron.score(train_X, train_y), clf_perceptron.score(test_X, test_y)) # (0.9048333333333334, 0.821)

(0.9048333333333334, 0.821)

In [None]:
# Baseline 3b: a fresh Perceptron trained on the standardized features.
# This rebinds clf_perceptron, so the unscaled-feature model is no longer
# reachable under that name afterwards.
clf_perceptron = Perceptron()
clf_perceptron.fit(train_X_scaled, train_y)

In [125]:
# Mean accuracy of the scaled-feature Perceptron as a (train, test) tuple;
# it must be scored on the scaled features it was trained on.
(clf_perceptron.score(train_X_scaled, train_y), clf_perceptron.score(test_X_scaled, test_y)) # (0.8665, 0.812)

(0.8665, 0.812)

## Boosting

### fit AdaBoost with random forest

In [136]:
# AdaBoost ensemble of 10 boosting rounds, each round using a random forest
# (default hyperparameters) as the base estimator.
rf_ada = RandomForestClassifier()

clf_ada_rf = AdaBoostClassifier(rf_ada, n_estimators=10, random_state=0).fit(train_X, train_y)

In [137]:
# Mean accuracy of AdaBoost + random forest as a (train, test) tuple.
(clf_ada_rf.score(train_X, train_y), clf_ada_rf.score(test_X, test_y)) # (1.0, 0.933)

(1.0, 0.933)

### fit adaBoost with svm
This approach is still buggy and takes a very long time to run, so the cell below is left commented out.

In [69]:
# svm_ada = svm.SVC(decision_function_shape='ovr')

# clf_ada_svm = AdaBoostClassifier(
# 	svm_ada, 
# 	n_estimators=10, 
# 	random_state=0, 
# 	algorithm='SAMME'
# ).fit(train_X, train_y)

# clf_ada_svm.score(test_X, test_y)

### fit adaBoost with Perceptron

In [138]:
# AdaBoost ensemble of 10 boosting rounds with a Perceptron base estimator,
# trained on the standardized features. algorithm='SAMME' is the variant that
# works from predicted class labels only, which suits Perceptron because it
# does not expose class probabilities.
perceptron_ada = Perceptron()
clf_ada_perceptron = AdaBoostClassifier(
	perceptron_ada, n_estimators=10, random_state=0, algorithm='SAMME'
).fit(train_X_scaled, train_y)

In [139]:
# Mean accuracy of AdaBoost + Perceptron as a (train, test) tuple, scored on
# the scaled features the ensemble was trained on.
# NOTE(review): the inline value below comes from an earlier run; the output
# saved with this cell is (0.9715, 0.855). Re-run and reconcile the two.
(clf_ada_perceptron.score(train_X_scaled, train_y), 
 clf_ada_perceptron.score(test_X_scaled, test_y)) # (0.9886666666666667, 0.869)

(0.9715, 0.855)

## Bagging

### bagging using random forest

In [None]:
# Bagging ensemble of 10 random forests (default hyperparameters), trained on
# the unscaled features.
rf_bag = RandomForestClassifier()

clf_bag_rf = BaggingClassifier(
	rf_bag, n_estimators=10, random_state=0
).fit(train_X, train_y)

In [133]:
# Mean accuracy of bagged random forests as a (train, test) tuple.
(clf_bag_rf.score(train_X, train_y), clf_bag_rf.score(test_X, test_y) ) # (0.998, 0.935)

(0.998, 0.935)

### bagging using svm

In [None]:
# Bagging ensemble of 10 SVMs configured like the baseline SVM above,
# trained on the unscaled features.
svm_bag = svm.SVC(decision_function_shape='ovr')

clf_bag_svm = BaggingClassifier(
	svm_bag, n_estimators=10, random_state=0
).fit(train_X, train_y)

In [134]:
# Mean accuracy of bagged SVMs as a (train, test) tuple.
(clf_bag_svm.score(train_X, train_y), clf_bag_svm.score(test_X, test_y)) # (0.9826666666666667, 0.951)

(0.9826666666666667, 0.951)

### bagging with perceptron

In [None]:
# Bagging ensemble of 10 Perceptrons with default settings.
# NOTE(review): this is trained on the unscaled train_X, whereas the AdaBoost
# Perceptron above uses train_X_scaled — confirm the difference is intended
# before comparing the two results.
ptron_bag = Perceptron()

clf_bag_ptron = BaggingClassifier(
	ptron_bag, n_estimators=10, random_state=0
).fit(train_X, train_y)

In [135]:
# Mean accuracy of bagged Perceptrons as a (train, test) tuple.
(clf_bag_ptron.score(train_X, train_y), clf_bag_ptron.score(test_X, test_y)) # (0.939, 0.887)

(0.939, 0.887)

### save the fitted model

In [None]:
# Persist the fitted baseline SVM (`clf`) to disk with pickle.
# The `with` block closes the file as soon as the dump finishes, which
# guarantees the bytes are flushed; previously the handle was opened in one
# cell, written in another, and never closed.
# Only unpickle this file from a trusted location: pickle.load can execute
# arbitrary code embedded in the file.
with open('default_parameter_svm.obj', 'wb') as file_save_clf:
	pickle.dump(clf, file_save_clf)