In [None]:
import pickle

import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.base import clone, BaseEstimator

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score

import matplotlib as mpl
import matplotlib.pyplot as plt

### Loading Dataset

In [None]:
# Load the previously saved dataset object from the local file 'mnist.pkl'.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source — confirm where it was produced.
with open('mnist.pkl', 'rb') as bunch:
    mnist = pickle.load(bunch)

In [None]:
# Feature matrix: the 'data' entry converted to a NumPy array.
X = mnist['data'].to_numpy()
# Labels: the 'target' entry converted to a NumPy array and cast to unsigned
# 8-bit integers, so they can be compared against integer digits.
y = mnist['target'].to_numpy().astype('uint8')


In [None]:
def pretty_print_mnist_number(number: np.ndarray):
    """Print a 28x28 image as a text grid of integer pixel values.

    Parameters
    ----------
    number : np.ndarray or array-like
        Exactly 784 values (flat or already 28x28). Each value is passed
        through ``int()`` before being formatted.

    Returns
    -------
    None
        The grid is written to stdout: one image row per line, each pixel
        right-aligned in a 3-character column.

    Raises
    ------
    ValueError
        If ``number`` cannot be reshaped to (28, 28).
    """
    image = np.asarray(number).reshape(28, 28)
    # Build each row with join instead of repeated string concatenation.
    rows = (''.join(f'{int(pixel):>3}' for pixel in row) for row in image)
    # Every row ends with '\n' and print() appends one more, so the output
    # finishes with a blank line (same as the original implementation).
    print(''.join(f'{row}\n' for row in rows))

### Treino e Teste
O dataset já vem embaralhado aleatoriamente e tem 70 mil amostras;
vamos usar as primeiras 60 mil para treino e as 10 mil restantes para teste.



In [None]:
# Index at which the dataset is cut: rows before it are used for training,
# rows from it onward for testing.
split_threshold = 60_000


In [None]:
# Features: first `split_threshold` rows for training, the rest for testing.
X_train = X[:split_threshold]
X_test = X[split_threshold:]

# Labels, cut at the same position so they stay aligned with the features.
y_train = y[:split_threshold]
y_test = y[split_threshold:]

## Treinando um classificador binário
Vamos treinar um classificador que verifica apenas se uma imagem
é o número 5 ou não.



In [None]:
# Boolean targets for the binary task: True where the label is the digit 5.
y_train_5 = y_train == 5
y_test_5 = y_test == 5

In [None]:
# Linear classifier trained with stochastic gradient descent; the fixed seed
# makes its results repeatable between runs.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X=X_train, y=y_train_5)

In [None]:
# Out-of-fold predictions from 3-fold cross-validation: every training sample
# is predicted by a model that did not see it during fitting.
y_train_pred_sdg = cross_val_predict(
    estimator=sgd_clf,
    X=X_train,
    y=y_train_5,
    cv=3,
)

In [None]:
# Share of SGD cross-validated predictions that match the true "is 5" labels.
accuracy_score(y_true=y_train_5, y_pred=y_train_pred_sdg)

In [None]:
# Confusion matrix for the SGD predictions (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_train_5, y_pred=y_train_pred_sdg)

In [None]:
# Precision of the SGD predictions: of the samples predicted as 5, how many really are 5.
precision_score(y_true=y_train_5, y_pred=y_train_pred_sdg)

In [None]:
# Recall of the SGD predictions: of the real 5s, how many were detected.
recall_score(y_true=y_train_5, y_pred=y_train_pred_sdg)

In [None]:
# F1 score of the SGD predictions (harmonic mean of precision and recall).
f1_score(y_true=y_train_5, y_pred=y_train_pred_sdg)

Usando o Never5Classifier, um classificador que sempre diz que a imagem não é o 5

In [None]:
class Never5Classifier(BaseEstimator):
    """Baseline estimator that answers False ("not a 5") for every sample."""

    def fit(self, X, y=None):
        """Do nothing and return the estimator itself; the inputs are ignored."""
        return self

    def predict(self, X):
        """Return an all-False boolean array with one entry per item in ``X``."""
        n_samples = len(X)
        return np.zeros(n_samples, dtype=bool)

In [None]:
# Baseline model: fitting is a no-op, it exists only for comparison.
n5_clf = Never5Classifier()
n5_clf.fit(X=X_train, y=y_train_5)

In [None]:
# Out-of-fold predictions of the baseline from 3-fold cross-validation.
y_train_pred_n5 = cross_val_predict(
    estimator=n5_clf,
    X=X_train,
    y=y_train_5,
    cv=3,
)

In [None]:
# Accuracy of the always-False baseline: equals the share of non-5 samples.
accuracy_score(y_true=y_train_5, y_pred=y_train_pred_n5)

In [None]:
# Confusion matrix for the baseline predictions (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_train_5, y_pred=y_train_pred_n5)

In [None]:
# The baseline never predicts the positive class, so precision is 0/0.
# zero_division=0 keeps the reported value at 0.0 without triggering
# scikit-learn's UndefinedMetricWarning.
precision_score(y_train_5, y_train_pred_n5, zero_division=0)

In [None]:
# Recall of the baseline predictions: of the real 5s, how many were detected.
recall_score(y_true=y_train_5, y_pred=y_train_pred_n5)

In [None]:
# F1 score of the baseline predictions (harmonic mean of precision and recall).
f1_score(y_true=y_train_5, y_pred=y_train_pred_n5)

### Classificador de regressão logística

log_reg = LogisticRegression()

In [None]:
# Use the actual LogisticRegression estimator for this section, so the
# metrics computed from `log_reg` evaluate logistic regression rather than
# repeating the Never5Classifier baseline.
# NOTE(review): with default settings the solver may emit a
# ConvergenceWarning on unscaled pixel data — consider scaling the features
# or raising max_iter if that happens.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train_5)

In [None]:
# Out-of-fold predictions of `log_reg` from 3-fold cross-validation.
y_train_pred_log = cross_val_predict(
    estimator=log_reg,
    X=X_train,
    y=y_train_5,
    cv=3,
)

In [None]:
# Share of `log_reg` cross-validated predictions that match the true labels.
accuracy_score(y_true=y_train_5, y_pred=y_train_pred_log)

In [None]:
# Confusion matrix for the `log_reg` predictions (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_train_5, y_pred=y_train_pred_log)

In [None]:
# Precision of the `log_reg` predictions: of the samples predicted as 5, how many really are 5.
precision_score(y_true=y_train_5, y_pred=y_train_pred_log)

In [None]:
# Recall of the `log_reg` predictions: of the real 5s, how many were detected.
recall_score(y_true=y_train_5, y_pred=y_train_pred_log)

In [None]:
# F1 score of the `log_reg` predictions (harmonic mean of precision and recall).
f1_score(y_true=y_train_5, y_pred=y_train_pred_log)