### An Implementation Of Batch Gradient Descent With Early Stopping For Softmax Regression Without Using Scikit-learn

In [1]:
import numpy as np
np.random.seed(2042)

from sklearn import datasets
iris = datasets.load_iris()

In [2]:
def my_test_split(X, y, test_ratio = 0.2, validation_ratio = 0.2):
    total_size = len(X)
    test_size = int(total_size * test_ratio)
    valid_size = int(total_size * validation_ratio)
    train_size = total_size - test_size - valid_size
    
    permutation_indices = np.random.permutation(total_size)
    
    X_train = X[permutation_indices[:train_size]]
    y_train = y[permutation_indices[:train_size]]
    X_valid = X[permutation_indices[train_size:-test_size]]
    y_valid = y[permutation_indices[train_size:-test_size]]
    X_test = X[permutation_indices[-test_size:]]
    y_test = y[permutation_indices[-test_size:]]
    
    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [3]:
def encode_output(y_output, possible_outputs):
    m = len(y_output)
    encoding = np.zeros((m, possible_outputs))
    encoding[np.arange(m), y_output] = 1
    return encoding

In [4]:
X = iris["data"][:, (2,3)]
y = iris["target"]

X_with_bias = np.c_[np.ones([len(X), 1]), X]

X_train, y_train, X_valid, y_valid, X_test, y_test = my_test_split(X_with_bias, y)

y_train_one_hot = encode_output(y_train, len(np.unique(y)))
y_valid_one_hot = encode_output(y_valid, len(np.unique(y)))
y_test_one_hot = encode_output(y_test, len(np.unique(y)))

In [5]:
def softmax(logits):
    exps = np.exp(logits)
    exps_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exps_sums

$J(\mathbf{\Theta}) =
- \dfrac{1}{m}\sum\limits_{i=1}^{m}\sum\limits_{k=1}^{K}{y_k^{(i)}\log\left(\hat{p}_k^{(i)}\right)}$

$\nabla_{\mathbf{\theta}^{(k)}} \, J(\mathbf{\Theta}) = \dfrac{1}{m} \sum\limits_{i=1}^{m}{ \left ( \hat{p}^{(i)}_k - y_k^{(i)} \right ) \mathbf{x}^{(i)}}$

In [23]:
n_inputs = X_train.shape[1]
n_outputs = len(np.unique(y))
m = len(X_train)
epsilon = 1e-7
iterations = 5001
eta = 0.01

theta = np.random.rand(n_inputs, n_outputs)
for i in range(iterations):
    logits = X_train.dot(theta)
    y_proba = softmax(logits)
    loss = -np.mean(np.sum(np.log(y_proba + epsilon) * y_train_one_hot, axis = 1))
    
    if i % 500 == 0:
        print(f"iteration: {i}, loss: {loss}")
    
    error = y_proba - y_train_one_hot

    gradients = 1/m * (X_train.T.dot(error))
    theta = theta - eta * gradients

iteration: 0, loss: 2.424039402976285
iteration: 500, loss: 0.7899842731479666
iteration: 1000, loss: 0.6593003448623166
iteration: 1500, loss: 0.5806238905682448
iteration: 2000, loss: 0.5285180290455324
iteration: 2500, loss: 0.49106178196638345
iteration: 3000, loss: 0.462411443393596
iteration: 3500, loss: 0.4394672725278143
iteration: 4000, loss: 0.4204536876487131
iteration: 4500, loss: 0.4042834955699425
iteration: 5000, loss: 0.39025269659925144


In [24]:
theta

array([[ 3.58781604, -0.19997109, -2.56327463],
       [-0.2063446 ,  0.86748759,  0.76527503],
       [-1.0967094 ,  0.05889889,  1.90669719]])

In [25]:
logits = X_valid.dot(theta)
y_proba = softmax(logits)
y_predict = np.argmax(y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

0.9666666666666667

### With Regularization

In [31]:
n_inputs = X_train.shape[1]
n_outputs = len(np.unique(y))
m = len(X_train)
epsilon = 1e-7
iterations = 5001
eta = 0.1
alpha = 0.1

r_theta = np.random.rand(n_inputs, n_outputs)
for i in range(iterations):
    logits = X_train.dot(r_theta)
    y_proba = softmax(logits)
    entropy_loss = -np.mean(np.sum(np.log(y_proba + epsilon) * y_train_one_hot, axis = 1))
    l2_loss = 0.5 * np.sum(np.square(theta[1:]))
    loss = entropy_loss + l2_loss
    
    if i % 500 == 0:
        print(f"iteration: {i}, loss: {loss}")
    
    error = y_proba - y_train_one_hot
    gradients = 1/m * (X_train.T.dot(error)) + np.r_[np.zeros([1, n_outputs]), alpha * r_theta[1:]]
    r_theta = r_theta - eta * gradients

iteration: 0, loss: 4.262891533168492
iteration: 500, loss: 3.5710017605869813
iteration: 1000, loss: 3.518807242407099
iteration: 1500, loss: 3.496356208517777
iteration: 2000, loss: 3.484053376463778
iteration: 2500, loss: 3.476647793473481
iteration: 3000, loss: 3.4719793370568914
iteration: 3500, loss: 3.4689591834383267
iteration: 4000, loss: 3.466974372112718
iteration: 4500, loss: 3.465656835895248
iteration: 5000, loss: 3.4647764965015195


In [32]:
logits = X_valid.dot(r_theta)
y_proba = softmax(logits)
y_predict = np.argmax(y_proba, axis=1)

accuracy_score = np.mean(y_predict == y_valid)
accuracy_score

1.0

### With Early Stopping