In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import datasets

# Load the Fashion-MNIST train/test splits (images plus integer class labels)
(train_images, train_labels), (test_images, test_labels) = datasets.fashion_mnist.load_data()

# Human-readable name for each label index; list position k names class k
CLASS_NAMES = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
NUM_CLASSES  = len(CLASS_NAMES)

# Dataset dimensions are read from the loaded arrays rather than hard-coded,
# so they cannot drift from the data.
# Assumes images are stored as (examples, height, width) with height == width.
m_train = train_images.shape[0]
m_test  = test_images.shape[0]
num_px  = train_images.shape[1]

# Flatten every image into one column: resulting shape (pixels per image, examples)
train_set_x_flatten = train_images.reshape(m_train, -1).T
test_set_x_flatten  = test_images.reshape(m_test, -1).T

# Rescale pixel intensities by 1/255 (assumes 8-bit images, i.e. values in [0, 255])
train_set_x = train_set_x_flatten / 255.
test_set_x  = test_set_x_flatten  / 255.

# Labels as row vectors of shape (1, examples), matching the column layout of X
train_set_y = train_labels.reshape(1, -1)
test_set_y  = test_labels.reshape(1, -1)

def sigmoid(z):
    """
    Compute the sigmoid of z in a numerically stable way

    The textbook form 1 / (1 + exp(-z)) overflows inside exp() for large
    negative z (NumPy emits a RuntimeWarning). This version only ever
    exponentiates a non-positive number, so exp() stays within (0, 1].

    Arguments:
    z -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(z), same shape as z (a numpy scalar when z is a scalar)
    """

    z = np.asarray(z)

    # exp(-|z|) is at most 1, so it cannot overflow
    e = np.exp(-np.abs(z))

    # z >= 0: 1 / (1 + exp(-z))        (the textbook form, safe here)
    # z <  0: exp(z) / (1 + exp(z))    (algebraically identical, safe here)
    s = np.where(z >= 0, 1 / (1 + e), e / (1 + e))

    # Indexing with () turns a 0-d result back into a numpy scalar and
    # leaves arrays of higher rank unchanged.
    return s[()]

def initialize_with_zeros(dim):
    """
    Build the starting parameters for logistic regression: an all-zero
    weight column vector and a zero bias.

    Argument:
    dim -- number of weights to create (one per input feature)

    Returns:
    w -- numpy array of zeros with shape (dim, 1)
    b -- the bias, initialized to the integer 0
    """

    b = 0
    w = np.zeros((dim, 1), dtype=float)

    # Sanity checks on the freshly created parameters
    assert w.shape == (dim, 1)
    assert isinstance(b, (float, int))

    return w, b

# GRADED FUNCTION: propagate

def propagate(w, b, X, Y):
    """
    Run one forward pass (activation and cost) and one backward pass
    (gradients) of binary logistic regression.

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px, 1)
    b -- bias, a scalar
    X -- data of size (num_px * num_px, number of examples)
    Y -- true "label" vector (containing 0 or 1) of size (1, number of examples)

    Return:
    grads -- dictionary with keys "dw" and "db":
             dw -- gradient of the cost with respect to w, same shape as w
             db -- gradient of the cost with respect to b; the sum keeps its
                   dimensions, so it comes back as an array rather than a bare scalar
    cost -- negative log-likelihood cost for logistic regression (shape ())
    """

    m = X.shape[1]
    eps = 1e-8  # added inside both logarithms so log(0) is never evaluated

    # Forward pass: predicted probabilities, then the averaged cross-entropy
    A = sigmoid(np.dot(w.T, X) + b)
    positive_term = Y * np.log(A + eps)
    negative_term = (1 - Y) * np.log(1 - A + eps)
    cost = -(1/m) * np.sum(positive_term + negative_term)

    # Backward pass: both gradients are built from the prediction error A - Y
    dw = (1/m) * np.dot(X, (A - Y).T)
    db = (1/m) * np.sum(A - Y, axis=1, keepdims=True)

    assert dw.shape == w.shape
    assert db.dtype == float
    cost = np.squeeze(cost)
    assert cost.shape == ()

    return {"dw": dw, "db": db}, cost

# GRADED FUNCTION: optimize

def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
    """
    This function optimizes w and b by running a gradient descent algorithm

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px, 1)
    b -- bias, a scalar
    X -- data of shape (num_px * num_px, number of examples)
    Y -- true "label" vector (containing 0 or 1), of shape (1, number of examples)
    num_iterations -- number of iterations of the optimization loop
    learning_rate -- learning rate of the gradient descent update rule
    print_cost -- True to print the cost every 100 iterations

    Returns:
    params -- dictionary containing the weights w and bias b
    grads -- dictionary containing the gradients "dw" and "db" from the last
             iteration; both are None when num_iterations is 0
    costs -- list of the costs recorded every 100 iterations, this will be used to plot the learning curve.
    """

    costs = []

    # Bound up front so that num_iterations == 0 returns cleanly instead of
    # raising UnboundLocalError when the result dictionary is built.
    dw, db = None, None

    for i in range(num_iterations):

        # Cost and gradients at the current parameters
        grads, cost = propagate(w, b, X, Y)
        dw = grads["dw"]
        db = grads["db"]

        # Gradient descent update
        w = w - learning_rate * dw
        b = b - learning_rate * db

        # Record (and optionally print) the cost every 100 iterations
        if i % 100 == 0:
            costs.append(cost)
            if print_cost:
                print ("Cost after iteration %i: %f" %(i, cost))

    params = {"w": w,
              "b": b}

    grads = {"dw": dw,
             "db": db}

    return params, grads, costs

# GRADED FUNCTION: predict

def predict(w, b, X):
    """
    Predict whether the label is 0 or 1 using learned logistic regression parameters (w, b)

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px, 1)
    b -- bias, a scalar
    X -- data of size (num_px * num_px, number of examples)

    Returns:
    Y_prediction -- a numpy array of shape (1, number of examples) holding the
                    prediction for each example in X as 0.0 or 1.0
    """

    m = X.shape[1]
    w = w.reshape(X.shape[0], 1)

    # Probability of the positive class for every example
    A = sigmoid(np.dot(w.T, X) + b)

    # Threshold all probabilities at once instead of looping element by element:
    # A <= 0.5 maps to 0, everything else maps to 1.
    Y_prediction = np.where(A <= 0.5, 0.0, 1.0)

    assert(Y_prediction.shape == (1, m))

    return Y_prediction

# GRADED FUNCTION: train_all_models

def train_all_models(X_train, Y_train, num_classes, num_iterations = 2000, learning_rate = 0.5, print_cost = False):
    """
    Trains one binary logistic regression model per class using the one-vs-all approach.
    For each class k, relabels Y so that class k is 1 and all others are 0,
    then trains and saves the parameters using the optimize() function.

    A banner naming the class is printed before each model is trained,
    regardless of print_cost.

    Arguments:
    X_train -- training set represented by a numpy array of shape (num_px * num_px, m_train)
    Y_train -- training labels represented by a numpy array of shape (1, m_train), values 0-9
    num_classes -- number of classes to train (10 for fashion MNIST)
    num_iterations -- hyperparameter representing the number of iterations to optimize the parameters
    learning_rate -- hyperparameter representing the learning rate used in the update rule of optimize()
    print_cost -- Set to true to print the cost every 100 iterations

    Returns:
    all_params -- list of dictionaries, one per class, each containing the weights w and bias b
    all_costs -- list of cost lists, one per class, used to plot the learning curves
    """

    all_params = []
    all_costs  = []

    for k in range(num_classes):
        # num_classes is caller-supplied, so it may exceed the name table;
        # fall back to the numeric id instead of failing with IndexError.
        class_name = CLASS_NAMES[k] if k < len(CLASS_NAMES) else str(k)
        print ("\n--- Training model for class %i: %s vs. rest ---" %(k, class_name))

        # Binary targets for this model: 1 where the label is k, 0 elsewhere
        Y_binary = (Y_train == k).astype(int)

        # Every model starts from all-zero parameters
        w, b = initialize_with_zeros(X_train.shape[0])

        # Gradient descent; the final gradients are not needed here
        parameters, _, costs = optimize(w, b, X_train, Y_binary, num_iterations, learning_rate, print_cost)

        all_params.append(parameters)
        all_costs.append(costs)

    return all_params, all_costs

# GRADED FUNCTION: predict_class

def predict_class(all_params, X):
    """
    Score every example with each one-vs-all model and pick, per example,
    the class whose model returns the largest sigmoid output.

    Arguments:
    all_params -- list of dictionaries containing the weights w and bias b for each class,
                  as returned by train_all_models
    X -- data of size (num_px * num_px, number of examples)

    Returns:
    predicted_labels -- a numpy array of shape (m,) containing the index of the winning model for each example
    probabilities -- a numpy array of shape (num_classes, m) containing the sigmoid output of each model
    """

    n_models = len(all_params)
    n_examples = X.shape[1]
    probabilities = np.zeros((n_models, n_examples))

    # Row k of the table holds the output of model k for all examples
    for k, model_params in enumerate(all_params):
        logits = np.dot(model_params["w"].T, X) + model_params["b"]
        probabilities[k, :] = sigmoid(logits).ravel()

    # The winning class is the row index of the column-wise maximum
    predicted_labels = probabilities.argmax(axis=0)

    return predicted_labels, probabilities

# GRADED FUNCTION: model

def model(X_train, Y_train, X_test, Y_test, num_iterations = 2000, learning_rate = 0.5, print_cost = False):
    """
    Builds the one-vs-all logistic regression model by calling the functions you've implemented previously

    Arguments:
    X_train -- training set represented by a numpy array of shape (num_px * num_px, m_train)
    Y_train -- training labels represented by a numpy array (vector) of shape (1, m_train)
    X_test -- test set represented by a numpy array of shape (num_px * num_px, m_test)
    Y_test -- test labels represented by a numpy array (vector) of shape (1, m_test)
    num_iterations -- hyperparameter representing the number of iterations to optimize the parameters
    learning_rate -- hyperparameter representing the learning rate used in the update rule of optimize()
    print_cost -- Set to true to print the cost every 100 iterations

    Returns:
    d -- dictionary containing information about the model: the per-class cost
         histories, train/test predictions, trained parameters, the
         hyperparameters used, and the train/test accuracy in percent.
    """

    # Train one binary classifier per class
    all_params, all_costs = train_all_models(X_train, Y_train, NUM_CLASSES, num_iterations, learning_rate, print_cost)

    # Predicted class index for every train/test example
    Y_prediction_train, _ = predict_class(all_params, X_train)
    Y_prediction_test,  _ = predict_class(all_params, X_test)

    # Multi-class accuracy is the share of exact label matches.
    # The binary-style formula 100 - mean(|prediction - label|) * 100 is only
    # valid for 0/1 labels; with class indices it measures how far apart the
    # label numbers are, not how often the prediction is right.
    train_accuracy = np.mean(Y_prediction_train == Y_train.flatten()) * 100
    test_accuracy  = np.mean(Y_prediction_test == Y_test.flatten()) * 100

    print("train accuracy: {} %".format(train_accuracy))
    print("test accuracy: {} %".format(test_accuracy))

    d = {"all_costs": all_costs,
         "Y_prediction_test": Y_prediction_test,
         "Y_prediction_train" : Y_prediction_train,
         "all_params" : all_params,
         "learning_rate" : learning_rate,
         "num_iterations": num_iterations,
         "train_accuracy": train_accuracy,
         "test_accuracy": test_accuracy}

    return d

# Train the one-vs-all classifiers on the prepared data and keep the result dictionary
d = model(
    train_set_x,
    train_set_y,
    test_set_x,
    test_set_y,
    num_iterations=2000,
    learning_rate=0.5,
    print_cost=True,
)


--- Training model for class 0: T-shirt/top vs. rest ---
Cost after iteration 0: 0.693147
Cost after iteration 100: 0.116054
Cost after iteration 200: 0.110959
Cost after iteration 300: 0.108221
Cost after iteration 400: 0.106463
Cost after iteration 500: 0.105232
Cost after iteration 600: 0.104316
Cost after iteration 700: 0.103603
Cost after iteration 800: 0.103027
Cost after iteration 900: 0.102548
Cost after iteration 1000: 0.102141
Cost after iteration 1100: 0.101788
Cost after iteration 1200: 0.101476
Cost after iteration 1300: 0.101197
Cost after iteration 1400: 0.100946
Cost after iteration 1500: 0.100717
Cost after iteration 1600: 0.100506
Cost after iteration 1700: 0.100312
Cost after iteration 1800: 0.100132
Cost after iteration 1900: 0.099964

--- Training model for class 1: Trouser vs. rest ---
Cost after iteration 0: 0.693147
Cost after iteration 100: 0.042528
Cost after iteration 200: 0.035553
Cost after iteration 300: 0.032043
Cost after iteration 400: 0.029849
Cost af

For each of the 10 Fashion MNIST classes, the original labels are converted into binary labels: samples of the target class are labeled 1, and all others are labeled 0. A separate logistic regression model is trained for each class using gradient descent, resulting in 10 independent models, each with its own weights and bias.

During inference, an input image is evaluated by all 10 models. Each model outputs a probability that the image belongs to its class, and the final prediction is the class with the highest probability.

Model performance is measured by computing accuracy on both the training and test datasets. Comparing these accuracies helps assess how well the model generalizes, with a large gap indicating potential overfitting.

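After 2000 gradient-descent iterations for each of the 10 per-class models, the reported results were:

Train accuracy: 45.398%
Test accuracy: 42.78%

Note: these figures were computed as 100 − mean(|predicted label − true label|) × 100. That expression equals classification accuracy only when the labels are binary (0/1). With the ten class labels 0–9 used here, it reflects the mean absolute distance between label indices rather than the fraction of correctly classified images, so the values above should not be read as true classification accuracy.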