### An Implementation Of Batch Gradient Descent With Early Stopping For Softmax Regression Without Using Scikit-learn

In [18]:
import numpy as np
# Seed NumPy's global RNG so the np.random calls used further down
# (np.random.permutation, np.random.rand) give the same values on every run.
np.random.seed(2042)

from sklearn import datasets
# Load the iris dataset bundled with scikit-learn; later cells read its "data" and "target" entries.
iris = datasets.load_iris()

In [19]:
def my_test_split(X, y, test_ratio = 0.2, validation_ratio = 0.2):
    """Shuffle ``X`` and ``y`` together and split them into train/validation/test parts.

    Parameters
    ----------
    X, y : numpy arrays of equal length along axis 0 (indexed with integer arrays).
    test_ratio : fraction of the samples assigned to the test set.
    validation_ratio : fraction of the samples assigned to the validation set.

    Returns
    -------
    X_train, y_train, X_valid, y_valid, X_test, y_test

    Raises
    ------
    ValueError
        If the ratios are negative or leave a negative number of training samples.

    The shuffle uses ``np.random.permutation``, so it follows NumPy's global seed.
    """
    total_size = len(X)
    test_size = int(total_size * test_ratio)
    valid_size = int(total_size * validation_ratio)
    train_size = total_size - test_size - valid_size
    if test_size < 0 or valid_size < 0 or train_size < 0:
        raise ValueError("test_ratio and validation_ratio must be non-negative and sum to at most 1")

    permutation_indices = np.random.permutation(total_size)

    # Explicit end offsets instead of negative slices: ``[-0:]`` would select the
    # whole permutation (and ``[a:-0]`` nothing) when the test set is empty.
    valid_end = train_size + valid_size
    train_idx = permutation_indices[:train_size]
    valid_idx = permutation_indices[train_size:valid_end]
    test_idx = permutation_indices[valid_end:]

    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]
    # The test targets come from ``y`` (previously they were taken from ``X``).
    X_test, y_test = X[test_idx], y[test_idx]

    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [45]:
def encode_output(y_output, possible_outputs):
    """One-hot encode a sequence of integer class labels.

    Returns a float array of shape ``(len(y_output), possible_outputs)`` that is
    zero everywhere except for a 1 in column ``y_output[i]`` of row ``i``.
    """
    sample_count = len(y_output)
    row_index = np.arange(sample_count)
    one_hot = np.zeros((sample_count, possible_outputs))
    # Paired fancy indexing: exactly one (row, column) cell is set per sample.
    one_hot[row_index, y_output] = 1
    return one_hot

In [47]:
# Keep only feature columns 2 and 3 of the iris data matrix.
feature_columns = [2, 3]
X = iris["data"][:, feature_columns]
y = iris["target"]

# Prepend a column of ones so the bias is learned as an ordinary weight.
bias_column = np.ones([len(X), 1])
X_with_bias = np.c_[bias_column, X]

# One-hot encode the targets, then shuffle and split features and targets together.
n_classes = len(np.unique(y))
y_one_hot = encode_output(y, n_classes)
X_train, y_train, X_valid, y_valid, X_test, y_test = my_test_split(X_with_bias, y_one_hot)

In [48]:
def softmax(logits):
    """Row-wise softmax of a 2-D array of scores.

    Each row of ``logits`` is turned into a probability distribution
    (non-negative entries that sum to 1). The output has the same shape as the input.
    """
    logits = np.asarray(logits)
    # Subtracting each row's maximum does not change the result mathematically,
    # but it keeps np.exp from overflowing to inf (which made the ratio nan).
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exps_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exps_sums

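Cross-entropy cost function, averaged over the $m$ training instances and summed over the $K$ classes:

$J(\mathbf{\Theta}) =
- \dfrac{1}{m}\sum\limits_{i=1}^{m}\sum\limits_{k=1}^{K}{y_k^{(i)}\log\left(\hat{p}_k^{(i)}\right)}$

Gradient of the cost with respect to the parameter vector $\mathbf{\theta}^{(k)}$ of class $k$:

$\nabla_{\mathbf{\theta}^{(k)}} \, J(\mathbf{\Theta}) = \dfrac{1}{m} \sum\limits_{i=1}^{m}{ \left ( \hat{p}^{(i)}_k - y_k^{(i)} \right ) \mathbf{x}^{(i)}}$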

In [62]:
# Problem dimensions: one weight row per input column, one weight column per class.
n_inputs = X_train.shape[1]
n_outputs = np.unique(y).size
epsilon = 1e-7  # added inside the log so a zero probability never produces -inf
iterations = 1

theta = np.random.rand(n_inputs, n_outputs)
# NOTE(review): the loop only evaluates the cross-entropy loss; theta is never
# updated here, so no training happens in this cell.
for i in range(iterations):
    logits = X_train @ theta
    y_proba = softmax(logits)
    per_sample_log_likelihood = (y_train * np.log(y_proba + epsilon)).sum(axis=1)
    loss = -per_sample_log_likelihood.mean()
    
    

1.3398890678549158

[[-0.50491504 -1.52010368 -1.72735338]
 [-0.71980296 -1.24963939 -1.48481612]
 [-0.52870168 -1.45894568 -1.72513284]
 [-0.45839366 -1.55355148 -1.85657792]
 [-0.59994152 -1.36402782 -1.63205426]]


In [3]:
def get_gradient_scalar(x_i, theta_matrix, output_class_k, true_outcome):
    '''Return p_k^(i) hat - y_k^(i) for a single sample.

    x_i            -- 1-D feature vector of the sample
    theta_matrix   -- parameter matrix with one column per class
    output_class_k -- index of the class whose gradient factor is wanted
    true_outcome   -- class label of the sample

    p_k^(i) hat is the softmax probability of class k; y_k^(i) is 1 when
    true_outcome equals output_class_k and 0 otherwise.
    '''
    matrix_product = x_i @ theta_matrix
    # Shift by the largest score before exponentiating: the softmax value is
    # unchanged, but np.exp can no longer overflow to inf and yield nan.
    exps = np.exp(matrix_product - np.max(matrix_product))
    pki_hat = exps[output_class_k] / np.sum(exps) # probability

    yki = 1 if output_class_k == true_outcome else 0
    return pki_hat - yki

In [4]:
def get_theta_gradient(output_class_k, X, y, theta_matrix):
    '''Return the cost gradient with respect to the parameters of one class.

    Computes (1/m) * sum_i (p_k^(i) hat - y_k^(i)) * x^(i) over all m rows of X.

    output_class_k -- index of the class (column of theta_matrix)
    X              -- 2-D feature matrix, one sample per row
    y              -- class labels, one per row of X
    theta_matrix   -- parameter matrix with one column per class

    The result has shape (1, X.shape[1]).
    '''
    m = X.shape[0]

    # Softmax probability of class k for every sample at once; the row maximum
    # is subtracted first so np.exp cannot overflow.
    logits = X @ theta_matrix
    exps = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    pk_hat = exps[:, output_class_k] / np.sum(exps, axis=1)

    # Indicator y_k^(i): 1 where the label equals class k, 0 elsewhere.
    yk = (np.asarray(y) == output_class_k).astype(float)

    # Weighted sum of the sample rows replaces the per-sample Python loop.
    theta_gradient = ((pk_hat - yk) @ X).reshape(1, -1)

    return theta_gradient / m

In [5]:
# Learning-schedule constants read by get_step_multiplier.
t0 = 1
t1 = 1

def get_step_multiplier(alpha, epoch):
    """Return ``alpha`` scaled by the decay factor ``t0 / (t1 + epoch)``."""
    decay = t0 / (t1 + epoch)
    return alpha * decay

In [6]:
from sklearn.metrics import accuracy_score

def my_batch_GD(X_train, y_train, X_valid, y_valid, num_output_classes, alpha = 1, epochs = 10):
    '''Fit softmax-regression parameters with batch gradient descent.

    X_train, y_train   -- training features (one sample per row) and class labels
    X_valid, y_valid   -- validation features and class labels used for model selection
    num_output_classes -- number of classes (columns of the parameter matrix)
    alpha              -- base step size, decayed per epoch by get_step_multiplier
    epochs             -- number of full passes over the training set

    All epochs are always run; after each one the validation accuracy is measured
    and the parameters with the highest accuracy seen so far are remembered.
    Prints the final and the best parameter matrices and returns the best one.
    '''
    theta_matrix = np.random.rand(X_train[0].size, num_output_classes)
    # Snapshot rather than alias: theta_matrix is updated in place below, so a plain
    # reference would silently follow it and end up holding the final parameters
    # whenever no epoch beats the initial accuracy.
    best_theta_matrix = np.copy(theta_matrix)
    best_accuracy_score = accuracy_score(y_valid, np.argmax(X_valid @ theta_matrix, axis=1))

    for e in range(epochs):
        # One gradient row per class, stacked once -> (num_output_classes, n_features).
        delta_theta_matrix = np.vstack(
            [get_theta_gradient(k, X_train, y_train, theta_matrix) for k in range(num_output_classes)]
        )

        step_multiplier = get_step_multiplier(alpha, e)
        theta_matrix -= step_multiplier * np.transpose(delta_theta_matrix)

        temp_accuracy = accuracy_score(y_valid, np.argmax(X_valid @ theta_matrix, axis=1))
        if temp_accuracy > best_accuracy_score:
            best_accuracy_score = temp_accuracy
            best_theta_matrix = np.copy(theta_matrix)

    print("theta_matrix:")
    print(theta_matrix)
    print("\nbest_theta_matrix:")
    print(best_theta_matrix)
    return best_theta_matrix

In [7]:
# Quick run of my_batch_GD on a tiny hand-made dataset with three output classes.
toy_X_train = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
toy_y_train = np.array([0, 1, 1, 0])
toy_X_valid = np.array([[13, 14, 15]])
toy_y_valid = np.array([1])
print(my_batch_GD(toy_X_train, toy_y_train, toy_X_valid, toy_y_valid, 3))

theta_matrix:
[[0.76113877 0.69342476 0.48133725]
 [0.65674116 0.97044804 0.23801772]
 [1.02029329 0.42895845 0.46294118]]

best_theta_matrix:
[[-1.71797406  3.16133731  0.49253752]
 [-2.26535748  3.8710058   0.2595586 ]
 [-2.34479115  3.76216141  0.49482266]]
[[-1.71797406  3.16133731  0.49253752]
 [-2.26535748  3.8710058   0.2595586 ]
 [-2.34479115  3.76216141  0.49482266]]


In [8]:
def fit_and_make_prediction(X_train, y_train, X_valid, y_valid, num_output_classes, alpha=1.0, epochs = 10):
    '''Train with my_batch_GD and predict classes for both data sets.

    Returns a tuple (validation predictions, training predictions, theta_matrix),
    where each prediction is the index of the highest score in X @ theta_matrix.
    '''
    theta_matrix = my_batch_GD(X_train, y_train, X_valid, y_valid, num_output_classes, alpha, epochs)

    def predict(features):
        # The class with the largest raw score wins; no softmax is applied here.
        return np.argmax(features @ theta_matrix, axis=1)

    return predict(X_valid), predict(X_train), theta_matrix

In [11]:
# Reference regressor
from sklearn.linear_model import LogisticRegression

# Fit scikit-learn's multinomial logistic regression on the full data set as a baseline.
# NOTE(review): newer scikit-learn releases deprecate the multi_class argument -
# confirm against the installed version before upgrading.
softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=10, random_state=11)
softmax_reg.fit(X_with_bias, y)

y_pred = softmax_reg.predict(X_with_bias)
# Count the agreements once and reuse the numbers in every report line.
total = len(y_pred)
matches = int((y_pred == y).sum())
print(matches)
print(total)
print(f"Logistic Regression: {matches} / {total}: {matches / total}")
print(f"Accuracy Score: {accuracy_score(y, y_pred)}")

144
150
Logistic Regression: 144 / 150: 0.96
Accuracy Score: 0.96


In [12]:
def _to_class_labels(targets):
    """Return 1-D class indices, decoding one-hot rows with argmax when ``targets`` is 2-D."""
    targets = np.asarray(targets)
    return targets.argmax(axis=1) if targets.ndim == 2 else targets

# The split cell stores one-hot encoded targets, while my_batch_GD and
# accuracy_score compare against integer class labels. Decode first so this cell
# also works after Restart & Run All; 1-D label arrays pass through unchanged.
y_train_labels = _to_class_labels(y_train)
y_valid_labels = _to_class_labels(y_valid)

valid_pred, train_pred, my_theta_matrix = fit_and_make_prediction(
    X_train, y_train_labels, X_valid, y_valid_labels, 3, epochs = 50
)

# Number of predictions that agree with the true labels.
valid_pred_count = int(np.sum(valid_pred == y_valid_labels))
train_pred_count = int(np.sum(train_pred == y_train_labels))

print(f"\nmy validation count: {valid_pred_count} / {len(valid_pred)}")
print(f"validation accuracy score: {accuracy_score(y_valid_labels, valid_pred)}")

print(f"my training count: {train_pred_count} / {len(train_pred)}")
print(f"training accuracy score: {accuracy_score(y_train_labels, train_pred)}")

print(f"\nmy validation predictions  : {valid_pred}")
print(f"true validation predictions: {y_valid_labels}")

print()
print(f"my training predictions  : {train_pred}")
print(f"true training predictions: {y_train_labels}")

theta_matrix:
[[ 1.26164891  0.0095573  -0.33291927]
 [ 0.19846246  0.8023991   0.93816814]
 [ 0.34243093  0.40465132  0.35505729]]

best_theta_matrix:
[[ 1.02195519  0.04039591 -0.12406416]
 [ 0.2423502   0.79856735  0.89811216]
 [ 0.44094503  0.45976293  0.20143157]]

my validation count: 23 / 30
validation accuracy score: 0.7666666666666667
my training count: 57 / 90
training accuracy score: 0.6333333333333333

my validation predictions  : [0 0 0 0 1 2 1 1 0 1 0 1 1 1 1 0 1 1 0 1 1 0 0 0 1 0 1 0 0 1]
true validation predictions: [0 0 0 0 2 2 2 2 0 1 0 1 1 2 1 0 2 1 0 1 2 0 0 0 0 0 1 0 0 1]

my training predictions  : [0 0 1 1 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 0 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 0 0 1 1 0 1
 1 0 0 1 1 0 0 0 1 1 1 1 1 1 1 0]
true training predictions: [0 0 1 1 2 2 2 1 0 2 0 1 1 1 0 1 1 1 1 0 2 0 2 1 2 2 0 2 2 1 1 1 2 2 1 1 1
 2 2 2 1 2 0 0 1 2 2 2 1 1 0 0 2 1 1 1 0 2 2 1 0 0 0 2 1 2 2 1 0 0 2 1 0 1
 2 0 0 2 2 