# Classificador Binário


![alt text](https://jakelearnsdatascience.files.wordpress.com/2017/02/lda_binary.png "Binary Classification")

## Carregando base Mnist

![alt text](https://corochann.com/wp-content/uploads/2017/02/mnist_plot.png)

In [1]:
from scipy.io import loadmat

mnist_raw = loadmat("mnist-original.mat")
mnist = {
    "data": mnist_raw["data"].T,
    "target": mnist_raw["label"][0],
    "COL_NAMES": ["label", "data"],
    "DESCR": "mldata.org dataset: mnist-original",
    }



X,y = mnist['data'], mnist['target']

### MNIST banco de dados de dígitos manuscritos composto por:
* um conjunto de treinamento de 60.000 exemplos 
* um conjunto de testes de 10.000 exemplos. 
* Os dígitos foram normalizados em tamanho e centralizados em uma imagem de tamanho fixo


In [2]:
import numpy as np

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

shuffle_index = np.random.permutation(60000)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(60000, 784) (10000, 784) (60000,) (10000,)


## Redefinindo a base como true para 5 e false para outro número
## Classificador binário para identificar se é 5 ou se não é 5 

In [3]:
# target / class / y
print(y_train)

[0. 0. 0. ... 9. 9. 9.]


In [4]:
# Start by only trying to ID "five" digits.

y_train_5 = (y_train == 5) # create target vectors
y_test_5  = (y_test == 5)

# 5 = True
# other = False

print(y_train_5.shape, y_train_5)
print(y_test_5.shape, y_test_5)

(60000,) [False False False ... False False False]
(10000,) [False False False ... False False False]


## SGD Classifier

In [5]:
# SGD classifier: good at handling large DBs
#                 also good at handling one-at-a-time learning
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

# Predição

In [6]:
import matplotlib as plt

import random
number = random.sample(range(1,9999), 1)

print(number[0])
pred_digit = number[0]

#print(y_test_5[pred_digit], sgd_clf.predict(X_test[pred_digit]))
print(y_test_5[pred_digit])
print(X_test[pred_digit])
#print(sgd_clf.predict(X_test[pred_digit]))


9563
False
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0  17 196 254 254 254 195  74   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0  13 191 253 253
 253 253 253 253 209  80   0   0   0   0

## Medindo qualidade do classificador
## Matrix de confusão
## Mátricas precisão, recall e f1

In [11]:
from sklearn.metrics import accuracy_score

y_train_pred = sgd_clf.predict(X_train)

#y_test_5[pred_digit], X_test[


print("Accuracy Score", accuracy_score(y_train_5, y_train_pred)*100)

Accuracy Score 96.83666666666667


In [13]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_train_5, y_train_pred)
print(cm)

[[53669   910]
 [  988  4433]]


In [14]:
# precision, recall, f1 metrics

from sklearn.metrics import precision_score, recall_score, f1_score

print("precision:",precision_score(y_train_5, y_train_pred))
print("recall:",recall_score(y_train_5, y_train_pred))

# F1 score favors classifiers with similar precision & recall.
print("f1:",f1_score(y_train_5, y_train_pred))

precision: 0.829683698296837
recall: 0.8177458033573142
f1: 0.8236714975845411


# Dados de teste

In [15]:
from sklearn.metrics import classification_report

y_test_pred = sgd_clf.predict(X_test)

classes = ['Digit all', 'Digit 5']
print(classification_report(y_test_5, y_test_pred, target_names=classes))

              precision    recall  f1-score   support

   Digit all       0.98      0.98      0.98      9108
     Digit 5       0.81      0.83      0.82       892

    accuracy                           0.97     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.97      0.97      0.97     10000



In [16]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test_5, y_test_pred)
print(cm)

[[8939  169]
 [ 156  736]]


In [17]:
from sklearn.metrics import accuracy_score

print("Accuracy Score", accuracy_score(y_test_5, y_test_pred)*100)

Accuracy Score 96.75
