# Imports

In [None]:
import numpy as np
import os
%load_ext autoreload
%autoreload 2
import sys
import os


# Get the current folder path and go up one level ('..') to the project root
project_root = os.path.abspath('..')

# Add the root to the system path so Python can find 'src'
if project_root not in sys.path:
    sys.path.append(project_root)
%load_ext autoreload
%autoreload 2
from keras.src.metrics import Precision
from keras.src.metrics.metrics_utils import confusion_matrix

np.random.seed(42) #to make results reproducible

# Matplotlib and seaborn configuration
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Function to save Figures

In [None]:
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under ``IMAGES_PATH``.

    Args:
        fig_id: Base file name of the image, without extension.
        tight_layout: If true, ``plt.tight_layout()`` is called before saving.
        fig_extension: File extension; also passed to ``plt.savefig`` as ``format``.
        resolution: Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Importing Dataset

In [None]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

In [None]:
print(mnist["frame"])
print(mnist["feature_names"]) #28 x 28 = 784 Pixels
print(mnist["target_names"]) # Class

In [None]:
print(mnist["categories"])

In [None]:
print("description", mnist["DESCR"])
print("details", mnist["details"])
print("categories", mnist["categories"])
print("url", mnist["url"])

In [None]:
X , y = mnist["data"], mnist["target"]
X.shape

In [None]:
y.shape

In [None]:
# Display the first image in X as a 28x28 picture.
some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
plt.axis('off')
# Save BEFORE plt.show(): with the inline backend, show() finalizes the figure,
# so a savefig() issued afterwards would write an empty image.
save_fig("some_digit_plot")
plt.show()

In [None]:
y[0]

In [None]:
y = y.astype(np.uint8)

In [None]:
def plot_digit(data):
    """Draw a single digit as a 28x28 image with the axes hidden.

    Args:
        data: Array of 784 values; it is reshaped to (28, 28) before plotting.
    """
    side = 28
    pixels = data.reshape(side, side)
    plt.imshow(pixels, interpolation="nearest", cmap=mpl.cm.binary)
    plt.axis("off")

In [None]:
def plot_digits(instances, images_per_row=10, **options):
    """Tile several 28x28 digits into one grid image and draw it.

    Digits are laid out left to right, top to bottom. If the last row is not
    full, the remaining space is filled with zeros.

    Args:
        instances: Sequence of arrays, each reshapeable to (28, 28).
        images_per_row: Maximum number of digits per grid row; capped at
            ``len(instances)``.
        **options: Extra keyword arguments forwarded to ``plt.imshow``.

    Raises:
        ValueError: If ``instances`` is empty or ``images_per_row`` is below 1
            (previously this surfaced as an opaque ZeroDivisionError).
    """
    size = 28
    n_instances = len(instances)
    if n_instances == 0:
        raise ValueError("plot_digits() needs at least one instance to draw")
    if images_per_row < 1:
        raise ValueError("images_per_row must be at least 1")

    images_per_row = min(n_instances, images_per_row)
    n_rows = (n_instances - 1) // images_per_row + 1  # ceiling division
    n_empty = n_rows * images_per_row - n_instances

    images = [instance.reshape(size, size) for instance in instances]
    # Blank filler that completes the last row; it is zero-width (and never
    # sliced into a row) when the grid is already full.
    images.append(np.zeros((size, size * n_empty)))

    row_images = [
        np.concatenate(images[start:start + images_per_row], axis=1)
        for start in range(0, n_rows * images_per_row, images_per_row)
    ]
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap=mpl.cm.binary, **options)
    plt.axis("off")

In [None]:
plt.figure(figsize=(9,9))
example_images = X[:100]
plot_digits(example_images, images_per_row=10)
save_fig("more_digits_plot")
plt.show()

In [None]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Binary Classifier

In [None]:
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5 )

In [None]:
from sklearn.linear_model import SGDClassifier
#Stochastic Gradient Descent (SGD)
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd_clf.fit(X_train, y_train_5)

In [None]:
sgd_clf.predict([some_digit])

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict

cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")

In [None]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier()
dummy_clf.fit(X_train, y_train_5)
print(any(dummy_clf.predict(X_train)))

`False` means the dummy classifier did not predict a 5 for any training image.

In [None]:
cross_val_score(dummy_clf, X_train, y_train_5, cv=3 , scoring="accuracy" )

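It reaches about 90 percent accuracy, but only because roughly 10 percent of the images are 5s: always answering "not 5" is right about 90 percent of the time.
This is why accuracy is generally not the preferred performance measure for classifiers,
especially when the dataset is skewed, i.e. when some classes are much more frequent than others. For such cases we use the confusion matrix, precision, and recall, which are introduced in the sections below.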



In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Manual 3-fold stratified cross-validation: fit a copy of the classifier on each
# training split and print its accuracy on the held-out fold.
skfolds = StratifiedKFold(n_splits=3)  # add shuffle=True if the dataset is not shuffled
for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)  # new copy per fold, so no fold reuses a fitted model
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Count the matches with NumPy; the built-in sum() would walk the array
    # element by element in Python.
    n_correct = np.sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))


# Confusion Matrices
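A confusion matrix counts how often instances of class A are classified as class B, i.e. how often the model confuses one class with another.
1. To compute it, we first need a set of predictions to compare with the actual targets.
2. Don't use the test set for this; use `cross_val_predict` on the training set instead.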


In [None]:
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5 , cv=3)

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_train_5, y_train_pred)
cm

True Negative (TN): The model correctly ignored the non-5 images (53,892 times).

False Positive (Type I): The model made a "False Alarm." It thought a non-5 was a 5 (687 times).

False Negative (Type II): The model "Missed" the target. It failed to see a real 5 (1,891 times).

True Positive (TP): The model correctly spotted the 5s (3,530 times).

In [None]:
y_train_perfect_predictions = y_train_5
confusion_matrix(y_train_5, y_train_perfect_predictions)

The confusion matrix gives us a lot of information, but sometimes we want a more concise metric. In that case we can look at the precision of the classifier.

# Precision and Recall


In [None]:
from sklearn.metrics import  precision_score, recall_score

precision_score(y_train_5,y_train_pred)

In [None]:
#computes the precision: TP / (FP + TP)
cm[1, 1] / (cm[0, 1] + cm[1, 1])

In [None]:
recall_score(y_train_5,y_train_pred)

In [None]:
#computes the recall: TP / (FN + TP)
cm[1, 1] / (cm[1, 0] + cm[1, 1])

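To summarize both in a single metric, precision and recall are combined into the F1 score, which is their harmonic mean: $F_1 = \dfrac{2}{\frac{1}{\text{precision}} + \frac{1}{\text{recall}}} = \dfrac{TP}{TP + \frac{FN + FP}{2}}$.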

In [None]:
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)

In [None]:
#computes the f1 score
cm[1, 1] / (cm[1, 1] + (cm[1, 0] + cm[0, 1]) / 2)

In [None]:
# 1. Look at the confusion matrix
print(confusion_matrix(y_train_5, y_train_pred))

# 2. Check Precision, Recall, and F1 Score
# Precision: when the model claims an image is a 5, how often is it actually correct?
print("Precision:", precision_score(y_train_5, y_train_pred))
# Recall: out of all the actual 5s in the dataset, what fraction did the model find?
print("Recall:", recall_score(y_train_5, y_train_pred))
# F1 score: precision and recall combined into a single number
print("F1 Score:", f1_score(y_train_5, y_train_pred))


# Precision/Recall Trade-off

In [None]:
# decision_function() returns the raw score instead of a class, so the decision
# threshold can be chosen by hand rather than relying on predict().
y_scores = sgd_clf.decision_function([some_digit])
threshold = 0  # initial threshold, which gives the same result as predict()
y_scores  # last expression of the cell, so the notebook actually displays the score

In [None]:
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred  # display the prediction obtained with the current threshold

In [None]:
#increasing threshold
threshold = 3000
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred

Raising the threshold decreases recall: with a threshold of 0 the classifier predicted that the image is a 5, but with a threshold of 3000 the prediction changes to `False`. So the threshold really does control the decision.

Now, which threshold should we use? **First**, use `cross_val_predict`, but with `method="decision_function"` so that it returns decision scores instead of predictions.


In [None]:
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv = 3, method= "decision_function")

**Second** use the precision_recall_curve to compute precision and recall for all the possible thresholds

In [None]:
from sklearn.metrics import  precision_recall_curve
precisions , recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

In [None]:
print(type(precisions))
print(len(precisions))
print(precisions.shape)
print(precisions[0:5])

In [None]:
# precisions and recalls are sliced with [:-1] so their lengths match thresholds.
plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
plt.plot(thresholds, recalls[:-1], "r--", label="Recall", linewidth=2)
# Marks the most recently assigned `threshold` (3000 in the cell further up).
plt.vlines(threshold, 0, 1.0, "k", "dotted", label="threshold")
plt.xlabel("Threshold")
plt.legend(loc="center right")  # without a legend the label= values are never drawn
save_fig("precision_recall_curve")
plt.show()

In [None]:
#importing from different files
from src.extras import plot_precision_recall_vs_threshold

plot_precision_recall_vs_threshold(precisions, recalls, thresholds, target_precision=0.90, save_name="precision_recall_vs_threshold_plot")