In [1]:
from sklearn.datasets import load_digits
from sklearn.datasets import fetch_openml
import numpy as np

# Download the MNIST dataset (70,000 images, 784 pixel features each) from OpenML.
# as_frame=False forces plain NumPy arrays: recent scikit-learn releases return
# a pandas DataFrame by default, which breaks the positional indexing used in
# later cells (e.g. X[134] and X_train[shuffle_index]).
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# mnist


In [2]:
import numpy as np

# Feature matrix: one row per image.
X = mnist["data"]
# Labels, cast to unsigned 8-bit integers so they can be compared with
# numeric digits (e.g. y == 5).
y = mnist["target"].astype(np.uint8)
X.shape


(70000, 784)

In [3]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# some_digit = X[1000]
# some_digit_image = some_digit.reshape(28, 28)
# plt.imshow(some_digit_image, cmap= matplotlib.cm.binary, interpolation="nearest")
# plt.axis("off")
# plt.show()



In [4]:
# Split the dataset in train and test set
import numpy as np
# The first 60,000 rows form the training set, the remaining rows the test set.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
# Shuffle the training set with a fixed seed so every "Restart & Run All"
# produces the same ordering; the permutation length is taken from the data
# rather than hard-coded.
shuffle_index = np.random.RandomState(42).permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]


In [5]:
# Binary classifier

from sklearn.linear_model import SGDClassifier
# Boolean targets for the "5 vs. not-5" task: True for the 5s, False for everything else.
y_train_5 = y_train == 5
y_test_5 = y_test == 5

# Classifier trained with stochastic gradient descent; the random state is
# fixed so the fit is repeatable.
sgd_clf = SGDClassifier(random_state=42, tol=1e-3)
sgd_clf.fit(X_train, y_train_5)


SGDClassifier(random_state=42)

In [6]:
# Collect the index of every test image that the binary classifier predicts
# as a "5". The whole test set is predicted in a single call, and every
# sample is covered, including the one at index 0.
array_5 = [
    index
    for index, is_five in enumerate(sgd_clf.predict(X_test))
    if is_five
]

print(array_5)


[15, 23, 45, 53, 59, 98, 102, 127, 129, 132, 152, 155, 162, 165, 167, 182, 187, 207, 217, 219, 240, 245, 253, 261, 283, 289, 313, 317, 319, 333, 347, 351, 356, 364, 375, 395, 397, 402, 412, 433, 483, 491, 509, 518, 538, 540, 570, 588, 638, 645, 654, 692, 694, 710, 711, 729, 739, 766, 779, 785, 797, 812, 827, 856, 869, 897, 935, 955, 978, 1003, 1014, 1028, 1041, 1046, 1070, 1073, 1085, 1102, 1113, 1131, 1135, 1144, 1153, 1168, 1190, 1202, 1221, 1233, 1234, 1252, 1258, 1281, 1285, 1339, 1340, 1370, 1376, 1405, 1406, 1447, 1460, 1471, 1473, 1476, 1487, 1493, 1510, 1521, 1550, 1587, 1598, 1616, 1618, 1629, 1635, 1639, 1653, 1672, 1677, 1693, 1747, 1752, 1755, 1761, 1774, 1810, 1833, 1846, 1847, 1860, 1866, 1879, 1896, 1902, 1911, 1931, 1940, 1948, 1967, 1973, 1999, 2003, 2021, 2029, 2030, 2047, 2073, 2077, 2078, 2100, 2103, 2113, 2114, 2134, 2159, 2180, 2207, 2214, 2215, 2237, 2241, 2279, 2282, 2346, 2393, 2413, 2445, 2449, 2452, 2476, 2487, 2515, 2518, 2525, 2534, 2540, 2542, 2545, 2546, 

In [7]:
from sklearn.model_selection import cross_val_score
# 3-fold cross-validated accuracy of the binary "5" detector on the training set.
cross_val_score(
    estimator=sgd_clf,
    X=X_train,
    y=y_train_5,
    scoring="accuracy",
    cv=3,
)


array([0.9575, 0.9632, 0.9652])

In [8]:
from sklearn.base import BaseEstimator
# Dummy classifier that predicts False ("not a 5") for every sample
class Never5Clf(BaseEstimator):
    """Baseline estimator that ignores its input and always predicts False.

    Used to show what accuracy a classifier reaches on the "5 vs. not-5"
    task without learning anything.
    """
    def __init__(self):
        pass
    def fit(self, X, y=None):
        """Nothing to learn; return the estimator itself.

        Returning ``self`` follows the scikit-learn estimator convention, so
        the object works with tools that chain on the result of ``fit``.
        """
        return self
    def predict(self, X, y=None):
        """Return an all-False boolean array of shape ``(len(X), 1)``."""
        return np.zeros((len(X), 1), dtype=bool)

# Score the always-False baseline with the same 3-fold accuracy protocol
# used for the SGD classifier.
never_5_clf = Never5Clf()
cross_val_score(
    estimator=never_5_clf,
    X=X_train,
    y=y_train_5,
    scoring="accuracy",
    cv=3,
)

array([0.90735, 0.9076 , 0.914  ])

In [9]:
from sklearn.model_selection import cross_val_predict
# Cross-validated predictions (3 folds) for every sample of the training set
y_pred = cross_val_predict(
    estimator=sgd_clf,
    X=X_train,
    y=y_train_5,
    cv=3,
)


In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
# Evaluate the cross-validated predictions of the binary classifier.
# A cell only displays its last expression, so the intermediate metrics are
# printed explicitly instead of being computed and thrown away.
print(confusion_matrix(y_train_5, y_pred))
print("precision:", precision_score(y_train_5, y_pred))  # TP / (TP + FP)
print("recall:", recall_score(y_train_5, y_pred))  # TP / (TP + FN)
f1_score(y_train_5, y_pred)

0.7706993569131833

In [11]:
# Apply a custom decision threshold to a single sample.
some_digit = X[134]
# The one-sample score gets its own name: `y_scores` holds the decision scores
# of the whole training set (computed in the following cell), and re-running
# this cell must not overwrite them.
some_digit_score = sgd_clf.decision_function([some_digit])
threshold = 200
y_some_digit_pred = (some_digit_score > threshold)
y_some_digit_pred

array([False])

In [12]:
# Cross-validated decision scores (rather than class labels) for every training sample
y_scores = cross_val_predict(
    estimator=sgd_clf,
    X=X_train,
    y=y_train_5,
    method="decision_function",
    cv=3,
)

In [13]:
from sklearn.metrics import precision_recall_curve

# Precision and recall of the "5" detector for a range of decision thresholds,
# computed from the cross-validated decision scores. The plotting helpers below
# drop the last precision/recall entry to line the arrays up with `thresholds`.
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    precisions, recalls -- sequences whose last element is dropped so that
                           they line up with `thresholds`.
    thresholds          -- x coordinates shared by both curves.

    Draws on the current matplotlib figure; returns nothing.
    """
    # plot(x, y, format, label)
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recalls")
    plt.xlabel("Threshold")
    # Legend locations are spelled with a space ("upper left"); the
    # underscore form "upper_left" is not a valid location string.
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
    plt.xlim([-50000, 50000])
    
# plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
# plt.show()
    

In [14]:
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y axis) against recall (x axis).

    precisions, recalls -- sequences of equal length; the last element of
                           each is dropped before plotting.

    Draws on the current matplotlib figure; returns nothing.
    """
    # Recall is the x coordinate and precision the y coordinate, so the
    # plotted data agrees with the axis labels set below.
    plt.plot(recalls[:-1], precisions[:-1], color="blue")
    plt.ylabel("Precisions")
    plt.xlabel("Recalls")
    
# plot_precision_vs_recall(precisions, recalls)
# plt.show()

In [15]:
# Raise the decision threshold to 3800 to reach at least 90% precision
y_train_pred_90 = y_scores > 3800

precision_score(y_true=y_train_5, y_pred=y_train_pred_90)

0.9415730337078652

In [16]:
# The price of the higher precision: recall drops at the same threshold
recall_score(y_true=y_train_5, y_pred=y_train_pred_90)

0.4637520752628666

In [17]:
# ROC curve
from sklearn.metrics import roc_curve

# False/true positive rates over the decision thresholds.
# Note: this rebinds `thresholds`, previously set by precision_recall_curve.
fpr, tpr, thresholds = roc_curve(y_true=y_train_5, y_score=y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve: true positive rate against false positive rate.

    fpr, tpr -- x and y coordinates of the curve.
    label    -- optional legend label attached to the curve.

    Draws on the current matplotlib figure; returns nothing.
    """
    plt.plot(fpr, tpr, label=label, linewidth=2)
    # Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], 'k--')
    # Pin both axes to the [0, 1] range.
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xlabel('False Positive Rate')
    plt.ylabel("True Positive Rate")
        

In [18]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve of the binary "5" detector
roc_auc_score(y_true=y_train_5, y_score=y_scores)

0.960532666003226

In [19]:
# Try random forest clf
# from sklearn.ensemble import RandomForestClassifier
# 
# forest_clf = RandomForestClassifier(random_state=42)
# y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")
# 

In [20]:
# y_scores_forest = y_probas_forest[:, 1]
# fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
# 


In [21]:
# Multiclass classification
# NOTE: this refits the same `sgd_clf` object on the full digit labels, which
# replaces the binary "5 vs. rest" model trained earlier. Any earlier cell that
# uses `sgd_clf` will see the multiclass model if it is re-run after this one.
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])


array([3], dtype=uint8)

In [22]:
# Decision scores of the sample for every class.
# A cell only displays its last expression, so the intermediate values are
# printed explicitly instead of being evaluated and discarded.
some_digit_scores = sgd_clf.decision_function([some_digit])
print(some_digit_scores)

# Position of the highest score, i.e. the index of the predicted class in `classes_`
print(np.argmax(some_digit_scores))
# All the classes known to the multiclass classifier
print(sgd_clf.classes_)
# Look up the class stored at index 5
sgd_clf.classes_[5]


5

In [23]:
# Multiclass classification with OneVsOneClassifier
from sklearn.multiclass import OneVsOneClassifier
# Wrap a seeded SGD classifier in a one-vs-one strategy, train it on the
# digit labels and classify the sample.
ovo_clf = OneVsOneClassifier(estimator=SGDClassifier(random_state=42))
ovo_clf.fit(X=X_train, y=y_train)
ovo_clf.predict([some_digit])


array([3], dtype=uint8)

In [24]:
# Number of binary classifiers trained by the one-vs-one strategy: one per
# pair of classes, n * (n - 1) / 2, i.e. 10 * 9 / 2 = 45 for the ten digits.
len(ovo_clf.estimators_)

45

In [25]:
# RandomForestClassifier
# from sklearn.ensemble import RandomForestClassifier
# forest_clf = RandomForestClassifier(random_state=42)
# forest_clf.fit(X_train, y_train)
# 

In [26]:
# forest_clf.predict(some_digit.reshape(1, -1))

In [27]:
from sklearn.preprocessing import StandardScaler
# Standardize the pixel features, then re-evaluate the multiclass SGD
# classifier with 3-fold cross-validated accuracy.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(
    estimator=sgd_clf,
    X=X_train_scaled,
    y=y_train,
    scoring="accuracy",
    cv=3,
)




array([0.90205, 0.8981 , 0.897  ])

In [28]:
# Error analysis: confusion matrix of the cross-validated multiclass
# predictions on the scaled training set.
y_train_pred = cross_val_predict(
    estimator=sgd_clf,
    X=X_train_scaled,
    y=y_train,
    cv=3,
)
conf_mx = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
conf_mx





array([[5583,    0,   19,    7,    8,   38,   32,    5,  230,    1],
       [   0, 6416,   44,   17,    3,   44,    4,    8,  195,   11],
       [  25,   27, 5254,   79,   66,   23,   69,   40,  365,   10],
       [  29,   22,  107, 5227,    2,  195,   23,   43,  416,   67],
       [  12,   14,   38,    8, 5214,    8,   34,   22,  331,  161],
       [  29,   17,   26,  149,   52, 4441,   77,   19,  541,   70],
       [  29,   18,   48,    2,   47,   85, 5544,    5,  140,    0],
       [  22,   13,   53,   20,   44,   10,    5, 5683,  191,  224],
       [  16,   62,   39,   86,    2,  114,   26,    9, 5454,   43],
       [  23,   21,   29,   58,  119,   31,    1,  179,  361, 5127]],
      dtype=int64)