### Batch Gradient Descent with Early Stopping for Softmax Regression (without Scikit-Learn)

In [27]:
# Load the iris dataset
# (scikit-learn serves as the data source here; the `data` and `target`
# attributes of the returned object are read in the next cell)

from sklearn import datasets

iris = datasets.load_iris()

In [28]:
# Keep only the petal length and petal width features (columns 2 and 3);
# the class labels are taken as-is

petal_columns = [2, 3]
X = iris.data[:, petal_columns]
y = iris.target

In [29]:
# Prepend a column of ones to every instance so the bias term is handled
# like any other weight

import numpy as np

X_bias = np.hstack([np.ones((len(X), 1)), X])

In [30]:
# Create training, validation and test sets

test_pct = 0.2
validation_pct = 0.2
full_size = len(X_bias)

test_size = int(full_size * test_pct)
validation_size = int(full_size * validation_pct)
train_size = full_size - test_size - validation_size

# NOTE: no random seed is set, so the shuffle (and therefore the split)
# differs from run to run.
rnd_indices = np.random.permutation(full_size)

# Use explicit positive boundaries. Negative offsets are fragile here:
# with test_size == 0, `[train_size:-test_size]` would be empty and
# `[-test_size:]` would select the *entire* array.
valid_end = train_size + validation_size
train_indices = rnd_indices[:train_size]
valid_indices = rnd_indices[train_size:valid_end]
test_indices = rnd_indices[valid_end:]

X_train = X_bias[train_indices]
y_train = y[train_indices]
X_valid = X_bias[valid_indices]
y_valid = y[valid_indices]
X_test = X_bias[test_indices]
y_test = y[test_indices]

In [31]:
# Define a function for one hot encoding

def to_one_hot(y, n_classes=None):
    """Convert a vector of integer class labels into a one-hot matrix.

    Parameters
    ----------
    y : array-like of int, shape (m,)
        Class indices; each value selects the column set to 1 in its row.
    n_classes : int, optional
        Number of columns in the result. When omitted it is inferred as
        ``max(y) + 1``. Pass it explicitly when ``y`` is a subset (for
        example a validation or test split) that may not contain the
        highest class, so that every split gets the same width.

    Returns
    -------
    numpy.ndarray of float, shape (m, n_classes)
        Row ``i`` is all zeros except for a 1 in column ``y[i]``.
    """
    y = np.asarray(y)
    m = len(y)
    if m == 0:
        # An empty label vector has no maximum; return an empty matrix
        # with the requested width (or zero columns when none was given).
        return np.zeros((0, 0 if n_classes is None else n_classes))
    if n_classes is None:
        n_classes = y.max() + 1
    Y_one_hot = np.zeros((m, n_classes))
    # Pair each row index with its label to set exactly one entry per row.
    Y_one_hot[np.arange(m), y] = 1
    return Y_one_hot

In [32]:
# One-hot encode the labels of the training, validation and test splits.
# NOTE: each call infers the number of classes from the labels it receives,
# so a split that lacks the highest class would get fewer columns.

Y_train_one_hot, Y_valid_one_hot, Y_test_one_hot = (
    to_one_hot(labels) for labels in (y_train, y_valid, y_test)
)

In [33]:
# Define the softmax function (turns each row of logits into class probabilities)

def softmax(logits):
    """Apply the softmax function to each row of a 2-D array of logits.

    Parameters
    ----------
    logits : numpy.ndarray, shape (m, k)
        Raw class scores, one row per instance.

    Returns
    -------
    numpy.ndarray, shape (m, k)
        Non-negative values that sum to 1 along each row.
    """
    # Subtracting the row maximum leaves the result mathematically unchanged
    # (the factor cancels in the ratio) but keeps np.exp from overflowing to
    # inf, which would otherwise turn the output into nan for large logits.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums

In [34]:
# Model dimensions: one input per column of X_train (bias column included)
# and one output per distinct class label seen in the training set

n_inputs = X_train.shape[-1]
n_outputs = np.unique(y_train).size

In [35]:
# Fit the softmax model with batch gradient descent

eta = 0.01            # learning rate
n_iterations = 5001   # range(5001) includes iteration 5000, which is logged
m = len(X_train)
epsilon = 1e-7        # added inside the log so log(0) is never evaluated

# Random starting weights: one column of parameters per class
Theta = np.random.randn(n_inputs, n_outputs)

for iteration in range(n_iterations):
    logits = np.dot(X_train, Theta)
    Y_proba = softmax(logits)
    if iteration % 500 == 0:
        # Cross-entropy loss, averaged over the training instances
        loss = -(Y_train_one_hot * np.log(Y_proba + epsilon)).sum(axis=1).mean()
        print(iteration, loss)
    error = Y_proba - Y_train_one_hot
    gradients = 1/m * np.dot(X_train.T, error)
    Theta = Theta - eta * gradients

0 8.39514458132667
500 0.7253711503438807
1000 0.6254153737178114
1500 0.5604715418952129
2000 0.515653335946687
2500 0.4826760094805529
3000 0.45705418324167074
3500 0.43627500011700276
4000 0.4188575765076855
4500 0.403883253749611
5000 0.3907546566169793


In [36]:
# Evaluate on the validation set: pick the most probable class for each
# instance and report the fraction of correct predictions

logits = np.dot(X_valid, Theta)
Y_proba = softmax(logits)
y_predict = Y_proba.argmax(axis=1)

accuracy_score = (y_predict == y_valid).mean()
accuracy_score

0.9