In [2]:
# Two different logistic regression models will be implemented:
# - a custom logistic regression, where I'll write my own cost function, gradient evaluation, and gradient descent
# - a standard logistic regression from TensorFlow

In [3]:
# Custom Logistic Regression
# Note to self:
# there are 3 key functions to build out and understand:
# 1. Hypothesis function
#         - for binary classification this is the sigmoid function; for the 10-class problem in this notebook it is softmax, the multi-class generalization of the sigmoid
#         - this is the function that maps an input to a predicted output
# 2. Cost function
#         - for binary classification this is the binary cross-entropy (log loss) function; with softmax it becomes the multi-class cross-entropy
#         - this is the function that measures how well our model performs by comparing the estimated output to the actual label
# 3. Gradient function
#         - in our case, this is the partial derivative of our cost function with respect to each parameter
#         - gradient descent uses it to find the parameters that minimize the cost function (i.e. reduce error)

In [4]:
import numpy as np
import copy, math
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist

In [100]:
# Load the MNIST train and test splits (images and labels) through Keras
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Preprocess: flatten every sample into a single row, then normalize by dividing by 255.0
X_train_flattened = np.reshape(X_train, (X_train.shape[0], -1)) / 255.0
X_test_flattened = np.reshape(X_test, (X_test.shape[0], -1)) / 255.0

# Sanity check: one label per training sample
print(y_train.shape)

(60000,)


In [101]:
# Model dimensions: rows of the flattened training matrix are samples, columns are features
num_samples, num_features = np.shape(X_train_flattened)
num_classes = 10

# Fixed seed so the small random starting weights are the same on every run
np.random.seed(0)
W = 0.01 * np.random.randn(num_features, num_classes)
b = np.zeros((num_classes,))

In [94]:
# Define softmax function
def softmax(z):
    """Row-wise softmax of a 2-D array of scores.

    Each row of ``z`` is turned into a set of non-negative values that sum to 1.
    The row maximum is subtracted before exponentiating, so the largest
    exponent is 0 and ``np.exp`` cannot overflow; the result is unchanged
    because the shift cancels in the ratio.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    unnormalized = np.exp(z - row_max)
    row_total = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / row_total

# Define cost function (cross-entropy loss)
def compute_cost(X, y, W, b):
    """Mean cross-entropy loss of the softmax model ``softmax(X @ W + b)``.

    Parameters
    ----------
    X : ndarray, shape (m, n_features)
        One sample per row.
    y : integer array, shape (m,)
        Class index of each sample; used to select the true-class column.
    W : ndarray, shape (n_features, n_classes)
        Weight matrix.
    b : ndarray, shape (n_classes,)
        Bias vector, broadcast over the rows of ``X @ W``.

    Returns
    -------
    float
        Average negative log-probability assigned to the true classes.

    Notes
    -----
    The log-probabilities are computed directly with the log-sum-exp
    formulation instead of ``np.log(softmax(z))``. Taking the log of an
    already-exponentiated probability returns ``inf`` as soon as that
    probability underflows to 0, whereas this form stays finite.
    """
    m = X.shape[0]
    z = np.dot(X, W) + b
    # Shift by the row maximum so the largest exponent is exactly 0 (no overflow)
    shifted = z - np.max(z, axis=1, keepdims=True)
    # log of the softmax denominator for every row
    log_normalizer = np.log(np.sum(np.exp(shifted), axis=1))
    # log-softmax evaluated only at each sample's true class
    true_class_log_prob = shifted[np.arange(m), y] - log_normalizer
    cost = -np.sum(true_class_log_prob) / m
    return cost

# Define gradient computation
def compute_gradients(X, y, W, b):
    """Gradients of the mean softmax cross-entropy with respect to ``W`` and ``b``.

    ``X`` holds one sample per row and ``y`` one integer class index per
    sample, used to pick the true-class entry in each row of probabilities.

    Returns
    -------
    (dw, db)
        ``dw`` is ``X.T`` dotted with the error matrix and ``db`` is that
        matrix summed over samples; both are divided by the sample count.
    """
    sample_count = X.shape[0]
    # Predicted probabilities minus 1 at every true-class position
    error = softmax(np.dot(X, W) + b)
    error[np.arange(sample_count), y] -= 1
    grad_w = np.dot(X.T, error) / sample_count
    grad_b = np.sum(error, axis=0) / sample_count
    return grad_w, grad_b

# Define gradient descent
def gradient_descent(X, y, W, b, alpha, num_iterations, print_every=100):
    """Run batch gradient descent and return the fitted parameters.

    Parameters
    ----------
    X, y : training inputs and integer class labels, passed unchanged to
        ``compute_gradients`` and ``compute_cost``.
    W, b : initial weights and biases. They are copied on entry, so the
        caller's arrays are NOT modified; use the returned values.
    alpha : learning rate (step size applied to each gradient).
    num_iterations : number of update steps to perform.
    print_every : report the cost every this many iterations (default 100,
        the original cadence). Pass 0 or None to disable printing.

    Returns
    -------
    (W, b, cost_history)
        The updated parameters and a list with the cost after every step.
    """
    # Work on private copies: the in-place updates below would otherwise
    # silently overwrite the arrays the caller passed in.
    W = np.array(W, copy=True)
    b = np.array(b, copy=True)

    cost_history = []
    for i in range(num_iterations):
        dw, db = compute_gradients(X, y, W, b)
        W -= alpha * dw
        b -= alpha * db
        cost = compute_cost(X, y, W, b)
        cost_history.append(cost)
        if print_every and i % print_every == 0:
            print(f"Iteration {i}, Cost: {cost}")
    return W, b, cost_history

Iteration 0, Cost: 2.3042258768700883
Iteration 100, Cost: 1.5549540445669334
Iteration 200, Cost: 1.1942657719673913
Iteration 300, Cost: 0.9997031688065626
Iteration 400, Cost: 0.880412571824683
Iteration 500, Cost: 0.7998649578765522
Iteration 600, Cost: 0.7416089520333589
Iteration 700, Cost: 0.6973207742112889
Iteration 800, Cost: 0.6623705783600958
Iteration 900, Cost: 0.6339835234050717
Train Accuracy: 86.07%
Test Accuracy: 87.08%


In [75]:
# ONLY FOR REFERENCE: hypothesis function for a binary classification problem
def sigmoid(X, w, b):
    """Logistic (sigmoid) hypothesis ``1 / (1 + exp(-(w . X + b)))``.

    Parameters
    ----------
    X : input features, combined with ``w`` through ``np.dot(w, X)``
    w : weights (a vector)
    b : bias added to the dot product

    Returns
    -------
    The sigmoid of the linear term.
    """
    linear_term = np.dot(w, X) + b
    denominator = 1 + np.exp(-linear_term)
    return 1 / denominator

In [None]:
# Train the model with batch gradient descent
alpha = 0.01           # learning rate
num_iterations = 1000  # number of gradient steps

W, b, cost_history = gradient_descent(
    X=X_train_flattened,
    y=y_train,
    W=W,
    b=b,
    alpha=alpha,
    num_iterations=num_iterations,
)

# Evaluate the model
def predict(X, W, b):
    """Return the most probable class index for every row of ``X``.

    The scores ``X . W + b`` are passed through ``softmax`` and the column
    with the highest probability in each row is selected.
    """
    class_probabilities = softmax(np.dot(X, W) + b)
    return np.argmax(class_probabilities, axis=1)

# Predicted class labels for the training and test splits
y_train_pred = predict(X_train_flattened, W, b)
y_test_pred = predict(X_test_flattened, W, b)

# Accuracy: percentage of predictions that equal the true label
train_accuracy = 100 * np.mean(y_train_pred == y_train)
test_accuracy = 100 * np.mean(y_test_pred == y_test)

print(f"Train Accuracy: {train_accuracy:.2f}%")
print(f"Test Accuracy: {test_accuracy:.2f}%")