In [None]:
from sklearn.datasets import fetch_openml
import pandas as pd

# Fetch MNIST (OpenML dataset "mnist_784", version 1); as_frame=False asks for
# array output instead of pandas objects.
mnist = fetch_openml(
    "mnist_784",
    version=1,
    as_frame=False,
)

In [None]:
# Split the fetched dataset into its feature matrix and its label vector.
X = mnist.data
y = mnist.target

In [None]:
X

In [None]:
X.shape

In [None]:
y

In [None]:
y.shape

Display sample images from the MNIST dataset using Matplotlib's `imshow()`

In [None]:
import matplotlib.pyplot as plt


def plot_digits(image):
    """Draw a single digit image on the current Matplotlib axes.

    Parameters
    ----------
    image : array-like
        Pixel values that can be reshaped to 28 x 28 (i.e. 784 values).

    The image is rendered with the "binary" colormap and the axis ticks and
    frame are hidden. Nothing is returned; the caller is responsible for
    creating the figure/subplot and for calling ``plt.show()``.
    """
    pixels = image.reshape(28, 28)
    plt.imshow(pixels, cmap="binary")
    plt.axis("off")


# Sample images picked by row index; each name records the digit the row is
# expected to show (checked against the labels in the next cell).
digit_one = X[24]
digit_five = X[0]
digit_four = X[2]

# One row of three panels, drawn left to right.
plt.figure(figsize=(6, 3))
panels = [
    ("digit_one", digit_one),
    ("digit_four", digit_four),
    ("digit_five", digit_five),
]
for position, (panel_title, digit) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plot_digits(digit)
    plt.title(panel_title)

plt.show()

In [None]:
# Labels of all three sample images, in the order they are plotted
# (digit_one = X[24], digit_four = X[2], digit_five = X[0]), so every name
# used for a sample image can be checked against its true label.
y[24], y[2], y[0]

In [None]:
# Show the first 400 images as a 20 x 20 grid of small tiles.
plt.figure(figsize=(9, 9))
for position, img_data in enumerate(X[:400], start=1):
    # Subplot positions are 1-based, hence start=1 above.
    plt.subplot(20, 20, position)
    plot_digits(img_data)
# Optional: plt.subplots_adjust(wspace=0, hspace=0) would change the spacing
# between tiles; it is intentionally left disabled here.
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The labels are passed as
# `stratify` so the split is stratified on them, and the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

Training a Binary Classifier

In [None]:
# Binary targets for the "is it a 5?" task: True where the label equals the
# string "5", False everywhere else.
y_train_5 = (y_train == "5")
y_test_5 = (y_test == "5")

In [None]:
from sklearn.linear_model import SGDClassifier

# Train on the boolean "is it a 5?" targets, so this model acts as a binary
# detector for the digit 5.
sgd_classifier = SGDClassifier(random_state=42)
sgd_classifier.fit(X_train, y_train_5)

In [None]:
# Ask the 5-detector about digit_four (X[2]). That image is expected to be a
# 4 rather than a 5, so the expected output is array([False]).
sgd_classifier.predict([digit_four])

In [None]:
# digit_five (X[0]) is expected to be a 5, so the 5-detector should answer
# array([True]) here.
sgd_classifier.predict([digit_five])

Measuring Accuracy using Cross-Validation
Measuring the performance of the SGDClassifier using k-fold cross-validation

In [None]:
# This cell will take some time to run

from sklearn.model_selection import cross_val_score

# Accuracy of the 5-detector on each of 5 cross-validation folds.
score = cross_val_score(
    sgd_classifier,
    X_train,
    y_train_5,
    cv=5,
    scoring="accuracy",
)

In [None]:
score

In [None]:
pd.Series(score).describe()

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Hand-rolled 5-fold stratified cross-validation: for each fold, fit a fresh
# copy of the classifier on the training part and print its accuracy on the
# held-out part.
skfold = StratifiedKFold(n_splits=5)

for train_idx, test_idx in skfold.split(X_train, y_train_5):
    # clone() gives an unfitted estimator with the same hyper-parameters, so
    # nothing learned on one fold carries over to the next.
    clone_clf = clone(sgd_classifier)
    X_train_fold = X_train[train_idx]
    X_test_fold = X_train[test_idx]
    y_train_fold = y_train_5[train_idx]
    y_test_fold = y_train_5[test_idx]

    clone_clf.fit(X_train_fold, y_train_fold)
    y_pred = clone_clf.predict(X_test_fold)
    # Count the matches with the array's own vectorised .sum(); the builtin
    # sum() would walk the array element by element in Python.
    n_correct = (y_pred == y_test_fold).sum()
    print(n_correct / len(y_pred))

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline: a DummyClassifier with default settings, fitted on the same data.
dummy_clf = DummyClassifier()
dummy_clf.fit(X_train, y_train_5)

# Prints True if the baseline ever predicts the positive class on the
# training set, False otherwise.
dummy_predictions = dummy_clf.predict(X_train)
print(dummy_predictions.any())

# Cross-validated accuracy of the baseline, for comparison with the SGD model.
cross_val_score(dummy_clf, X_train, y_train_5, cv=5, scoring="accuracy")

In [None]:
from sklearn.model_selection import cross_val_predict

# Cross-validated predictions of the 5-detector for every training sample
# (5 folds).
y_train_pred = cross_val_predict(
    sgd_classifier,
    X_train,
    y_train_5,
    cv=5,
)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels vs. cross-validated predictions.
cm = confusion_matrix(y_true=y_train_5, y_pred=y_train_pred)
cm

In [None]:
# Use the true labels as the "predictions" to see what the confusion matrix
# of a perfect classifier looks like.
y_train_perfect_predictions = y_train_5
confusion_matrix(y_true=y_train_5, y_pred=y_train_perfect_predictions)

In [None]:
from sklearn.metrics import precision_score, recall_score

# Precision = TP / (TP + FP).
# Author's figures: 4047 / (4047 + 1450) [Refers to Confusion Matrix]
precision_score(y_true=y_train_5, y_pred=y_train_pred)

In [None]:
# Recall = TP / (TP + FN).
# Author's figures: 4047 / (4047 + 1003) [Refers to Confusion Matrix]
recall_score(y_true=y_train_5, y_pred=y_train_pred)

In [None]:
from sklearn.metrics import f1_score

f1_score(y_train_5, y_train_pred)

In [None]:
y_score = sgd_classifier.decision_function([digit_five])
y_score

In [None]:
threshold = 0
y_digit_five_score = y_score > threshold
y_digit_five_score

In [None]:
threshold = 3000
y_digit_five_score = y_score > threshold
y_digit_five_score

In [None]:
# Cross-validated decision scores (not class labels) for every training sample.
# NOTE: this rebinds `y_score`, which until now held the decision score of the
# single image digit_five.
y_score = cross_val_predict(
    sgd_classifier,
    X_train,
    y_train_5,
    cv=5,
    method="decision_function",
)

In [None]:
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_score)

In [None]:
# Position of the first curve point whose threshold is >= the chosen
# `threshold` (set in an earlier cell); used for the two marker dots.
idx = (thresholds >= threshold).argmax()

plt.figure(figsize=(8, 4))

# precisions/recalls carry one more entry than thresholds, so the last entry
# is dropped to align them with the x-axis values.
plt.plot(thresholds, precisions[:-1], "b--", linewidth=2, label="Precision")
plt.plot(thresholds, recalls[:-1], "g-", linewidth=2, label="Recall")
plt.vlines(
    threshold, ymin=0, ymax=1.0, colors="k", linestyles="dotted", label="Threshold"
)

# Mark precision and recall at the chosen threshold.
plt.plot(thresholds[idx], precisions[idx], "bo")
plt.plot(thresholds[idx], recalls[idx], "go")

plt.axis([-50000, 50000, 0, 1])
plt.grid()
plt.title("Precision and Recall vs Threshold")
plt.xlabel("Threshold")
plt.ylabel("Precision and Recall")
plt.legend(loc="center right")

plt.show()

In [None]:
import matplotlib.patches as patches

# Precision plotted against recall, with an arrow annotating the direction in
# which the decision threshold increases.
plt.figure(figsize=(8, 6))
plt.plot(recalls, precisions, "b-", label="Precision vs Recall Curve", linewidth=2)
plt.title("Precision vs Recall Curve")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid()
plt.legend()

# Curved arrow in data coordinates from (0.79, 0.60) to (0.61, 0.78).
threshold_arrow = patches.FancyArrowPatch(
    posA=(0.79, 0.60),
    posB=(0.61, 0.78),
    connectionstyle="arc3,rad=.2",
    arrowstyle="Simple, tail_width=1.5, head_width=8, head_length=10",
    color="#444444",
)
plt.gca().add_patch(threshold_arrow)
plt.text(0.56, 0.62, "Higher\nthreshold", color="#333333")
plt.show()

In [None]:
# precision_recall_curve returns one more precision than thresholds: the final
# precision has no matching threshold. Search only the entries that do have
# one, so the resulting index is always valid for `thresholds`.
reaches_90_precision = precisions[:-1] >= 0.90
if not reaches_90_precision.any():
    # Without this guard argmax() would silently return 0 (or, on the
    # unsliced array, an index past the end of `thresholds`).
    raise ValueError("No decision threshold reaches 90% precision.")

# argmax() on a boolean array gives the position of the first True.
idx_for_90_precision = reaches_90_precision.argmax()
threshold_for_90_precision = thresholds[idx_for_90_precision]
threshold_for_90_precision

In [None]:
# Classify as positive every sample whose score reaches the 90%-precision
# threshold, then check the precision actually obtained.
y_train_pred_90 = (y_score >= threshold_for_90_precision)
precision_score(y_true=y_train_5, y_pred=y_train_pred_90)

In [None]:
recall_score(y_train_5, y_train_pred_90)

In [None]:
from sklearn.metrics import roc_curve

# False-positive rate, true-positive rate and thresholds of the ROC curve.
# NOTE: this rebinds `thresholds`, which previously held the output of
# precision_recall_curve; the earlier precision/recall plotting cells rely on
# that older array.
fpr, tpr, thresholds = roc_curve(
    y_train_5,
    y_score,
)

In [None]:
# Locate the first ROC point whose threshold is at or below the 90%-precision
# threshold, and read off its true/false positive rates.
idx_for_threshold_at_90 = (thresholds <= threshold_for_90_precision).argmax()
tpr_90 = tpr[idx_for_threshold_at_90]
fpr_90 = fpr[idx_for_threshold_at_90]

In [None]:
# ROC curve of the 5-detector, the diagonal for reference, and the operating
# point that corresponds to the 90%-precision threshold.
plt.plot(fpr, tpr, label="ROC curve", linewidth=2)
plt.plot([0, 1], [0, 1], "k", label="Random Classifier's ROC Curve")
plt.plot([fpr_90], [tpr_90], "ko", label="Threshold for 90% precision")
plt.grid()

# Curved arrow in data coordinates annotating the direction of increasing
# threshold.
roc_arrow = patches.FancyArrowPatch(
    posA=(0.25, 0.89),
    posB=(0.09, 0.65),
    connectionstyle="arc3, rad=.4",
    arrowstyle="Simple, tail_width=1.5, head_width=10, head_length=10",
    color="black",
)
plt.gca().add_patch(roc_arrow)
plt.text(0.15, 0.73, "Higher\nthreshold", color="black")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_train_5, y_score)

Selecting Model

In [None]:
# Selecting the RandomForestClassifier model

from sklearn.ensemble import RandomForestClassifier

forest_classifier = RandomForestClassifier(random_state=42)

In [None]:
y_prob_foreest = cross_val_predict(
    forest_classifier, X_train, y_train_5, cv=5, method="predict_proba"
)

In [None]:
y_prob_foreest[:2]

In [None]:
# Take the second probability column as the score for the positive class
# (presumably the "is a 5" class; check forest classes_ ordering if in doubt).
y_score_forest = y_prob_foreest[:, 1]

forest_pr_curve = precision_recall_curve(y_train_5, y_score_forest)
precisions_forest, recalls_forest, thresholds_forest = forest_pr_curve

In [None]:
# Precision/recall curves of both models on shared axes for comparison.
plt.plot(recalls_forest, precisions_forest, "k-", label="Random Forest", linewidth=2)
plt.plot(recalls, precisions, "b--", label="SGD", linewidth=2)
plt.axis([0, 1, 0, 1])
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid()
plt.legend()
plt.show()

In [None]:
# Turn the second probability column into hard predictions with a 0.5 cut-off.
y_train_pred_forest = (y_prob_foreest[:, 1] >= 0.5)

In [None]:
f1_score(y_train_5, y_train_pred_forest)

In [None]:
roc_auc_score(y_train_5, y_score_forest)

Multiclass Classification:
Binary classifiers can be combined to solve a multiclass classification problem.
Several binary models, each distinguishing between particular digits, are trained and wrapped inside a single model.

SVC does this. By default, SVC uses the one-vs-one (OvO) strategy for multiclass classification.

In [None]:
from sklearn.svm import SVC

svc_classifier = SVC(random_state=42)
svc_classifier.fit(X_train, y_train)

In [None]:
svc_classifier.predict([digit_one])

In [None]:
svc_classifier.predict([digit_five])

In [None]:
svc_classifier.predict([digit_four])

In [None]:
digit_five_scores = svc_classifier.decision_function([digit_five])
digit_five_scores.round(3)

In [None]:
digit_four_scores = svc_classifier.decision_function([digit_four])
digit_four_scores.round(3)

In [None]:
class_id = digit_five_scores.argmax()
class_id

In [None]:
class_id_four = digit_four_scores.argmax()
class_id_four

In [None]:
svc_classifier.classes_

In [None]:
svc_classifier.classes_[class_id]

In [None]:
svc_classifier.classes_[class_id_four]

Converting OvO to OvR:
Training an OvR classifier on the whole dataset takes a lot of time, so we do this on a small subset of the data.

In [None]:
from sklearn.multiclass import OneVsRestClassifier

# One-vs-rest wrapper around SVC. As the section text states, fitting this on
# the whole training set takes a long time, so only a small subset (the first
# N_OVR_SAMPLES rows of the training split) is used.
N_OVR_SAMPLES = 2000

ovr_classifier = OneVsRestClassifier(SVC(random_state=42))
ovr_classifier.fit(X_train[:N_OVR_SAMPLES], y_train[:N_OVR_SAMPLES])

In [None]:
ovr_classifier.predict([digit_one])

In [None]:
ovr_classifier.predict([digit_four])

In [None]:
ovr_classifier.predict([digit_five])

In [None]:
# NOTE: this rebinds `sgd_classifier`. Up to here the name referred to the
# binary 5-detector fitted on y_train_5; from this cell on it is a new model
# fitted on the full digit labels in y_train.
sgd_classifier = SGDClassifier(random_state=42)
sgd_classifier.fit(X=X_train, y=y_train)

In [None]:
sgd_classifier.predict([digit_one])