<a href="https://colab.research.google.com/github/victorhmota/cursoml/blob/main/projeto_mnist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
from sklearn.datasets import fetch_openml

# as_frame=False asks for NumPy arrays: later cells index rows positionally
# (X[0], X[:60000]) and call .reshape, which do not work on a DataFrame.
# `version` is passed as an integer, the type documented for this parameter.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

In [None]:
# Pull the feature matrix and the label vector out of the fetched bunch.
X, y = mnist['data'], mnist['target']

In [None]:
# Show both shapes at once: a cell only displays its last expression, so
# evaluating them on separate lines would silently drop X.shape.
X.shape, y.shape

In [None]:
# Take the first sample and view its flat pixel vector as a 28x28 grid.
some_digit = X[0]
some_digit_image = some_digit.reshape((28, 28))

# Draw the grid with the "binary" colormap and hide the axes.
plt.imshow(some_digit_image, interpolation='nearest', cmap=mpl.cm.binary)
plt.axis('off')
plt.show()

In [None]:
# Label stored for the first sample (the same index as the image X[0]).
y[0]

'5'

In [None]:
# The first 60,000 samples form the training set; the remainder is held out
# for testing.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [None]:
# Boolean targets for the binary "is this digit a 5?" task. Labels are
# compared against the string '5'.
y_train_5, y_test_5 = (y_train == '5'), (y_test == '5')

In [None]:
from sklearn.linear_model import SGDClassifier

# Binary 5-vs-rest detector; random_state is fixed so repeated runs use the
# same seed.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

In [None]:
# Classify the single digit selected earlier; the list wrapper turns the one
# sample into a one-row batch.
sgd_clf.predict([some_digit])

In [None]:
from sklearn.model_selection import cross_val_score

# 3-fold cross-validated accuracy of the binary 5-detector on the training set.
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring='accuracy')

In [None]:
#Multiclass classification

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Two-step pipeline: standardise the features, then fit an SGD classifier
# (max_iter is set to 10 here; the seed is fixed at 42).
pipe = Pipeline([('std_scaler', StandardScaler()),
                 ('estimator', SGDClassifier(max_iter=10, random_state=42))])

# Grid over the SGD loss and alpha; the 'estimator__' prefix routes each
# parameter to the pipeline step named 'estimator'.
# NOTE(review): newer scikit-learn releases rename the 'log' loss to
# 'log_loss' — confirm against the installed version before re-running.
param_grid = [{'estimator__loss':['hinge', 'log'], 'estimator__alpha':[1e-4, 1e-2, 1]}]

In [None]:
# Evaluate every grid combination with 5-fold cross-validation on the full
# multiclass labels (y_train, not the binary y_train_5), with parallel fits
# (n_jobs=-1) and verbose progress output.
grid_search = GridSearchCV(pipe, param_grid, cv=5, verbose=10, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# One row per parameter combination, followed by the standard deviation
# ("Std") and the mean ("Score") of its cross-validated test score.
results = pd.DataFrame(grid_search.cv_results_["params"]).assign(
    Std=grid_search.cv_results_['std_test_score'],
    Score=grid_search.cv_results_["mean_test_score"],
)

# Best-scoring combinations first.
results.sort_values("Score", ascending=False)

In [None]:
#Implementando o melhor modelo

In [None]:
# Fresh pipeline with the same steps as `pipe`, configured with the parameter
# combination selected by the grid search.
model = Pipeline([('std_scaler', StandardScaler()),
                  ('estimator', SGDClassifier(max_iter=10, random_state=42))])
model.set_params(**grid_search.best_params_)

In [None]:
# Train the selected configuration on the full training set (multiclass labels).
model.fit(X_train, y_train)

In [None]:
# Rebind some_digit to the first test sample (it previously held X[0]) and
# predict its class with the multiclass model.
some_digit = X_test[0]
model.predict([some_digit])

In [None]:
# True label of the first test sample, for comparison with the prediction.
y_test[0]

In [None]:
#Testando esquema OvO

In [None]:
from sklearn.multiclass import OneVsOneClassifier

# Same pipeline steps as before, but with max_iter raised to 1000; the loss and
# alpha still come from the grid search (which was run with max_iter=10).
model = Pipeline([('std_scaler', StandardScaler()), ('estimator', SGDClassifier(max_iter = 1000, random_state=42))])
model.set_params(**grid_search.best_params_)

# Wrap the whole pipeline (scaler included) in a one-vs-one multiclass scheme.
ovo_clf = OneVsOneClassifier(model)

ovo_clf.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the one-vs-one classifier on the training set.
scores = cross_val_score(ovo_clf, X_train, y_train, cv=5)

In [None]:
# Per-fold scores returned by the cross-validation above.
scores

In [None]:
from sklearn.metrics import confusion_matrix
# Predict the held-out test set and tabulate true labels against predictions.
y_pred = ovo_clf.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

In [None]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the one-vs-one model trained without augmentation.
accuracy_score(y_test, y_pred)

0.927

In [None]:
#Implementando o 'data augmentation'

In [None]:
# Import from the public scipy.ndimage namespace; the
# scipy.ndimage.interpolation submodule path is deprecated.
from scipy.ndimage import shift

In [None]:
def shift_image(image, dx, dy, shape=(28, 28)):
  """Return a flattened copy of `image` translated by (dx, dy) pixels.

  Args:
    image: array holding one image; it is reshaped to `shape` before shifting.
    dx: shift along the column axis; positive values move content right.
    dy: shift along the row axis; positive values move content down.
    shape: 2-D (rows, columns) layout of the image. Defaults to (28, 28).

  Returns:
    1-D array with the shifted pixels; vacated pixels are filled with 0.
  """
  grid = image.reshape(shape)
  # scipy's shift takes offsets in axis order (rows, columns), hence [dy, dx].
  shifted = shift(grid, [dy, dx], cval=0, mode='constant')
  return shifted.reshape([-1])

In [None]:
# Start the augmented set as plain Python lists holding the original training
# samples and labels, so more entries can be appended to them.
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)

In [None]:
# Rebuild the lists from the original training set first, so that re-running
# this cell cannot append the shifted copies a second time (and so it still
# works after the names have been rebound to NumPy arrays by the shuffle).
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)

# Add four copies of every training image, each shifted by one pixel
# (right, left, down, up), keeping the original label.
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
  for image, label in zip(X_train, y_train):
    X_train_augmented.append(shift_image(image, dx, dy))
    y_train_augmented.append(label)

In [None]:
# Number of samples in the augmented training set (originals plus shifted copies).
len(X_train_augmented)

In [None]:
#Embaralhamento
shuffle_index = np.random.permutation(len(X_train_augmented))
X_train_augmented = np.array(X_train_augmented)[shuffle_index]
y_train_augmented = np.array(y_train_augmented)[shuffle_index]

In [None]:
#Procurando por melhores parâmetros com o gridsearch

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# Repeat the grid search on the augmented data, reusing the `pipe` and
# `param_grid` objects defined earlier; cv is 3 here instead of 5.
grid_search2 = GridSearchCV(pipe, param_grid, cv=3, verbose=10, n_jobs=-1)
grid_search2.fit(X_train_augmented, y_train_augmented)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   49.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:  9.6min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('std_scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('estimator',
                                        SGDClassifier(alpha=0.0001,
                                                      average=False,
                                                      class_weight=None,
                                                      early_stopping=False,
                                                      epsilon=0.1, eta0=0.0,
                                                      fit_intercept=True,
                                                      l1_ratio=0.15,
                                                      learning_rate='optimal',
                  

In [None]:
# Parameter combination selected by the search on the augmented data.
grid_search2.best_params_

{'estimator__alpha': 0.0001, 'estimator__loss': 'log'}

In [None]:
# Pipeline with the same steps as `pipe` (max_iter=10), configured with the
# parameters selected on the augmented data and trained on that data.
model2 = Pipeline([('std_scaler', StandardScaler()), ('estimator', SGDClassifier(max_iter=10, random_state=42))])

model2.set_params(**grid_search2.best_params_)
model2.fit(X_train_augmented, y_train_augmented)



Pipeline(memory=None,
         steps=[('std_scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('estimator',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='log', max_iter=10,
                               n_iter_no_change=5, n_jobs=None, penalty='l2',
                               power_t=0.5, random_state=42, shuffle=True,
                               tol=0.001, validation_fraction=0.1, verbose=0,
                               warm_start=False))],
         verbose=False)

In [None]:
# Predict the first test sample with the model trained on augmented data.
some_digit2 = X_test[0]
digit_predict = model2.predict([some_digit2])

In [None]:
# Predicted class for the first test sample.
digit_predict

In [None]:
# True label of the first test sample, for comparison with digit_predict.
y_test[0]

In [None]:
#Utilizando método OvO

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
from sklearn.multiclass import OneVsOneClassifier
# NOTE: this rebinds both `model2` and `ovo_clf`; after this cell, `ovo_clf`
# refers to the classifier trained on the augmented data, not the earlier one.
# max_iter is raised to 1000; loss and alpha come from grid_search2.
model2 = Pipeline([('std_scaler', StandardScaler()), ('estimator', SGDClassifier(max_iter = 1000, random_state=42))])
model2.set_params(**grid_search2.best_params_)
ovo_clf = OneVsOneClassifier(model2)

ovo_clf.fit(X_train_augmented, y_train_augmented)


OneVsOneClassifier(estimator=Pipeline(memory=None,
                                      steps=[('std_scaler',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('estimator',
                                              SGDClassifier(alpha=0.0001,
                                                            average=False,
                                                            class_weight=None,
                                                            early_stopping=False,
                                                            epsilon=0.1,
                                                            eta0=0.0,
                                                            fit_intercept=True,
                                                            l1_ratio=0

In [None]:
# Test-set predictions from the one-vs-one model trained on augmented data.
y_pred2 = ovo_clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true test labels against the augmented model's predictions.
conf_matrix2 = confusion_matrix(y_test, y_pred2)
conf_matrix2

array([[ 965,    1,    1,    0,    1,    6,    2,    2,    2,    0],
       [   0, 1124,    4,    1,    0,    1,    2,    0,    3,    0],
       [   6,    3,  955,   11,   13,    1,    7,   11,   22,    3],
       [   1,    4,    9,  925,    2,   28,    2,   16,   21,    2],
       [   1,    1,    7,    1,  944,    0,    5,    2,    1,   20],
       [   9,    4,    6,   38,    6,  802,    9,    1,   12,    5],
       [  12,    6,    5,    1,    5,   13,  914,    1,    1,    0],
       [   1,    4,   14,    6,    6,    0,    0,  974,    0,   23],
       [   6,    5,    5,   19,   10,   20,    6,   10,  887,    6],
       [   4,    9,    3,    8,   23,    3,    0,   18,    4,  937]])

In [None]:
from sklearn.metrics import accuracy_score
# Test-set accuracy of the one-vs-one model trained on augmented data.
accuracy_score(y_test, y_pred2)

0.9427

In [None]:
#Comparando sem 'data augmentation' e com 'data augmentation'.

Os resultados anteriores apontaram acurácia de 0.927 no conjunto de teste, enquanto os resultados utilizando 'data augmentation' alcançaram 0.9427. Em resumo, a utilização desse método representou um ganho de 0.0157 de acurácia.