### An Implementation Of Batch Gradient Descent With Early Stopping For Softmax Regression Without Using Scikit-learn

In [1]:
import numpy as np
from sklearn import datasets

# Seed NumPy's global generator so the shuffled split and the random
# initial parameters drawn later in the notebook repeat on every run.
np.random.seed(2042)

# scikit-learn is used here only as the source of the iris data.
iris = datasets.load_iris()

In [2]:
def my_test_split(X, y, test_ratio = 0.2, validation_ratio = 0.2):
    """Shuffle X and y together and split them into train/validation/test sets.

    Parameters
    ----------
    X, y : numpy arrays with the same length along axis 0.
    test_ratio : fraction of the samples placed in the test set.
    validation_ratio : fraction of the samples placed in the validation set.

    The test and validation sizes are ``int(len(X) * ratio)`` (rounded down);
    every remaining sample goes to the training set.

    Returns
    -------
    X_train, y_train, X_valid, y_valid, X_test, y_test

    Raises
    ------
    ValueError
        If X and y have different lengths, or if the ratios produce a
        negative size for any of the three splits.

    The shuffle uses NumPy's global random state (``np.random.permutation``).
    """
    total_size = len(X)
    if len(y) != total_size:
        raise ValueError("X and y must contain the same number of samples")

    test_size = int(total_size * test_ratio)
    valid_size = int(total_size * validation_ratio)
    train_size = total_size - test_size - valid_size
    if test_size < 0 or valid_size < 0 or train_size < 0:
        raise ValueError(
            "test_ratio and validation_ratio must be non-negative and sum to at most 1"
        )

    permutation_indices = np.random.permutation(total_size)

    # Use explicit positive boundaries. Slicing with -test_size breaks when
    # test_size == 0: [a:-0] is empty and [-0:] is the whole array.
    valid_end = train_size + valid_size
    train_indices = permutation_indices[:train_size]
    valid_indices = permutation_indices[train_size:valid_end]
    test_indices = permutation_indices[valid_end:]

    X_train, y_train = X[train_indices], y[train_indices]
    X_valid, y_valid = X[valid_indices], y[valid_indices]
    X_test, y_test = X[test_indices], y[test_indices]

    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [3]:
def encode_output(y_output, possible_outputs):
    """One-hot encode integer class labels.

    Returns a float array of shape (len(y_output), possible_outputs) that is
    zero everywhere except for a 1 in column y_output[i] of each row i.
    """
    n_samples = len(y_output)
    one_hot = np.zeros(shape=(n_samples, possible_outputs))
    row_ids = np.arange(n_samples)
    # Pair each row index with its label to set exactly one entry per row.
    one_hot[row_ids, y_output] = 1
    return one_hot

In [4]:
# Keep only feature columns 2 and 3 of the iris data; the targets are the
# integer class labels.
X = iris["data"][:, (2, 3)]
y = iris["target"]

# Prepend a column of ones so the first row of the parameter matrix acts as
# the bias term.
X_with_bias = np.hstack([np.ones((len(X), 1)), X])

(X_train, y_train,
 X_valid, y_valid,
 X_test, y_test) = my_test_split(X_with_bias, y)

# One-hot targets for each split; the class count comes from the full label
# vector so every split is encoded with the same width.
y_train_one_hot, y_valid_one_hot, y_test_one_hot = (
    encode_output(labels, len(np.unique(y)))
    for labels in (y_train, y_valid, y_test)
)

In [5]:
def softmax(logits):
    """Row-wise softmax of a 2-D array of scores.

    Parameters
    ----------
    logits : array of shape (n_samples, n_classes).

    Returns
    -------
    Array of the same shape whose rows are non-negative and sum to 1.

    The per-row maximum is subtracted before exponentiating. Softmax is
    unchanged by adding a constant to every score in a row, so the result is
    mathematically the same, but the largest exponent becomes exp(0) = 1 and
    np.exp can no longer overflow to inf (which would give inf / inf = NaN).
    """
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    exps_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exps_sums

$J(\mathbf{\Theta}) =
- \dfrac{1}{m}\sum\limits_{i=1}^{m}\sum\limits_{k=1}^{K}{y_k^{(i)}\log\left(\hat{p}_k^{(i)}\right)}$

$\nabla_{\mathbf{\theta}^{(k)}} \, J(\mathbf{\Theta}) = \dfrac{1}{m} \sum\limits_{i=1}^{m}{ \left ( \hat{p}^{(i)}_k - y_k^{(i)} \right ) \mathbf{x}^{(i)}}$

In [32]:
# Problem dimensions: m training rows, n_inputs columns (bias included),
# n_outputs distinct class labels.
m, n_inputs = X_train.shape
n_outputs = np.unique(y).size

epsilon = 1e-7      # added inside the log so a zero probability cannot give -inf
iterations = 5001
eta = 0.01          # learning rate

# Random starting parameters: one row per input column, one column per class.
theta = np.random.rand(n_inputs, n_outputs)
for i in range(iterations):
    # Forward pass and cross-entropy loss on the training set.
    logits = np.dot(X_train, theta)
    y_proba = softmax(logits)
    loss = -np.mean(np.sum(y_train_one_hot * np.log(y_proba + epsilon), axis=1))

    if i % 500 == 0:
        # Progress report: loss, current parameters, the squared non-bias
        # weights (rows 1 onward), and their sum.
        print(f"iteration: {i}, loss: {loss}")
        print(theta)
        print(np.square(theta[1:]))
        print(np.sum(np.square(theta[1:])))
        print()

    # Batch gradient step: average of (p_hat - y) * x over all training rows.
    error = y_proba - y_train_one_hot
    gradients = 1/m * np.dot(X_train.T, error)
    theta = theta - eta * gradients

iteration: 0, loss: 1.6256927652424578
[[0.64169972 0.4312359  0.37721392]
 [0.41944362 0.31001648 0.9673021 ]
 [0.87074017 0.41463444 0.96446695]]
[[0.17593295 0.09611022 0.93567336]
 [0.75818844 0.17192172 0.93019649]]
3.0680231831519

iteration: 500, loss: 0.7501153624129366
[[ 1.35293294  0.29544561 -0.19822901]
 [ 0.24106854  0.74861486  0.7070788 ]
 [ 0.55618596  0.54389953  1.14975608]]
[[0.05811404 0.56042421 0.49996043]
 [0.30934282 0.2958267  1.32193903]]
3.045607232779993

iteration: 1000, loss: 0.633863423393384
[[ 1.89681574  0.14017898 -0.58684518]
 [ 0.15746895  0.80807453  0.73121873]
 [ 0.34295402  0.53323638  1.37365116]]
[[0.02479647 0.65298445 0.53468083]
 [0.11761746 0.28434103 1.88691751]]
3.5013377467419904

iteration: 1500, loss: 0.5631385364010538
[[ 2.31316518  0.03738318 -0.90039882]
 [ 0.08619155  0.85943002  0.75114063]
 [ 0.1801523   0.50959716  1.5600921 ]]
[[0.00742898 0.73861996 0.56421224]
 [0.03245485 0.25968927 2.43388736]]
4.036292668045343

iterati

In [17]:
# Display the parameter matrix: one row per input column (bias row first),
# one column per class.
theta

array([[ 3.51152678, -0.24775141, -2.26775028],
       [-0.36420824,  0.64468723,  0.36937679],
       [-1.0708845 ,  0.2609794 ,  2.41911317]])

In [24]:
# Score the trained parameters on the validation split.
logits = np.dot(X_valid, theta)
y_proba = softmax(logits)
y_predict = y_proba.argmax(axis=1)  # most probable class for each row

# Fraction of validation rows whose predicted class matches the label.
accuracy_score = (y_predict == y_valid).mean()
accuracy_score

0.9666666666666667

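### With Regularization

**Note:** the cell below does not yet add a regularization term to the loss or to the gradients. It repeats the training loop above with a larger learning rate (`eta = 0.1`) and stores the result in `r_theta`.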

In [25]:
# Problem dimensions: m training rows, n_inputs columns (bias included),
# n_outputs distinct class labels.
m, n_inputs = X_train.shape
n_outputs = np.unique(y).size

epsilon = 1e-7      # added inside the log so a zero probability cannot give -inf
iterations = 5001
eta = 0.1           # learning rate

# Random starting parameters: one row per input column, one column per class.
r_theta = np.random.rand(n_inputs, n_outputs)
for i in range(iterations):
    # Forward pass and cross-entropy loss on the training set.
    logits = np.dot(X_train, r_theta)
    y_proba = softmax(logits)
    loss = -np.mean(np.sum(y_train_one_hot * np.log(y_proba + epsilon), axis=1))

    if i % 500 == 0:
        print(f"iteration: {i}, loss: {loss}")

    # Batch gradient step: average of (p_hat - y) * x over all training rows.
    error = y_proba - y_train_one_hot
    gradients = 1/m * np.dot(X_train.T, error)
    r_theta = r_theta - eta * gradients

iteration: 0, loss: 1.301694722520997
iteration: 500, loss: 0.3928030617844213
iteration: 1000, loss: 0.30658759857848306
iteration: 1500, loss: 0.2621561380281134
iteration: 2000, loss: 0.23316928328420883
iteration: 2500, loss: 0.21228612806606428
iteration: 3000, loss: 0.1963633650154607
iteration: 3500, loss: 0.18375171897389989
iteration: 4000, loss: 0.17347908052506197
iteration: 4500, loss: 0.16492763625647988
iteration: 5000, loss: 0.15768316862309792


In [28]:
# Score the r_theta parameters on the validation split.
logits = np.dot(X_valid, r_theta)
y_proba = softmax(logits)
y_predict = y_proba.argmax(axis=1)  # most probable class for each row

# Fraction of validation rows whose predicted class matches the label.
accuracy_score = (y_predict == y_valid).mean()
accuracy_score

0.9666666666666667

In [18]:
# def get_gradient_scalar(x_i, theta_matrix, output_class_k, true_outcome):
#     '''return p_k^(i) hat - y_k^(i)'''
    
#     matrix_product = x_i @ theta_matrix
#     pki_hat = np.exp(matrix_product[output_class_k]) / np.sum(np.exp(matrix_product)) # probability
    
#     yki = 1 if output_class_k == true_outcome else 0
#     return pki_hat - yki

In [19]:
# def get_theta_gradient(output_class_k, X, y, theta_matrix):
#     theta_gradient = np.zeros((1, X.shape[1]))
#     m = X.shape[0]
#     for x_i, true_outcome in zip(X, y):
#         theta_gradient += get_gradient_scalar(x_i, theta_matrix, output_class_k, true_outcome) * x_i
    
#     return theta_gradient / m

In [9]:
# t0, t1 = 1, 1

# def get_step_multiplier(alpha, epoch):
#     return alpha * (t0 / (t1 + epoch))

In [10]:
# from sklearn.metrics import accuracy_score

# def my_batch_GD(X_train, y_train, X_valid, y_valid, num_output_classes, alpha = 1, epochs = 10):
#     theta_matrix = np.random.rand(X_train[0].size, num_output_classes)
#     best_theta_matrix = theta_matrix
#     best_accuracy_score = accuracy_score(y_valid, np.argmax(X_valid @ theta_matrix, axis=1))
    
#     for e in range(epochs):
#         delta_theta_matrix = np.array([]).reshape((0, X_train.shape[1]))
# #         print(f"delta_theta_matrix: {delta_theta_matrix}")
#         for k in range(num_output_classes):
#             delta_theta_matrix = np.vstack(
#                 (delta_theta_matrix, np.array(get_theta_gradient(k, X_train, y_train, theta_matrix)))
#             )
# #         print(f"delta_theta_matrix: \n{delta_theta_matrix}, \ntheta_matrix: \n{theta_matrix}")
        
#         step_multiplier = get_step_multiplier(alpha, e)
#         theta_matrix -= step_multiplier * np.transpose(delta_theta_matrix)
        
#         temp_accuracy = accuracy_score(y_valid, np.argmax(X_valid @ theta_matrix, axis=1))
#         if temp_accuracy > best_accuracy_score:
#             best_accuracy_score = temp_accuracy
#             best_theta_matrix = np.copy(theta_matrix)
    
#     print("theta_matrix:")
#     print(theta_matrix)
#     print("\nbest_theta_matrix:")
#     print(best_theta_matrix)
#     return best_theta_matrix

In [11]:
# print(my_batch_GD(np.array( [ [1,2,3], [4,5,6], [7,8,9], [10,11,12] ] ), np.array([0, 1, 1, 0]),
#                   np.array( [ [13,14,15] ] ), np.array([1]), 3))

theta_matrix:
[[ 2.05066161  1.34734224 -2.02030864]
 [ 1.94817848  2.20419274 -1.91676995]
 [ 1.7032019   2.36560163 -2.27063944]]

best_theta_matrix:
[[ 2.05066161  1.34734224 -2.02030864]
 [ 1.94817848  2.20419274 -1.91676995]
 [ 1.7032019   2.36560163 -2.27063944]]
[[ 2.05066161  1.34734224 -2.02030864]
 [ 1.94817848  2.20419274 -1.91676995]
 [ 1.7032019   2.36560163 -2.27063944]]


In [20]:
# def fit_and_make_prediction(X_train, y_train, X_valid, y_valid, num_output_classes, alpha=1.0, epochs = 10):
#     theta_matrix = my_batch_GD(X_train, y_train, X_valid, y_valid, num_output_classes, alpha, epochs)
    
# #     print("theta_matrix:")
# #     print(theta_matrix)
    
#     valid_relative_weights = X_valid @ theta_matrix
#     train_relative_weights = X_train @ theta_matrix
#     return np.argmax(valid_relative_weights, axis=1), np.argmax(train_relative_weights, axis=1), theta_matrix

In [21]:
# # Reference regressor
# from sklearn.linear_model import LogisticRegression

# softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=10, random_state=11)
# softmax_reg.fit(X_with_bias, y)

# y_pred = softmax_reg.predict(X_with_bias)
# matches = len(y_pred[y_pred == y])
# total = len(y_pred)
# print(len(y_pred[y_pred == y]))
# print(len(y_pred))
# print(f"Logistic Regression: {matches} / {total}: {matches / total}")
# print(f"Accuracy Score: {accuracy_score(y, y_pred)}")

In [22]:
# valid_pred, train_pred, my_theta_matrix = fit_and_make_prediction(X_train, y_train, X_valid, y_valid, 3, epochs = 50)

# valid_pred_count = len(valid_pred[valid_pred == y_valid])
# train_pred_count = len(train_pred[train_pred == y_train])

# print(f"\nmy validation count: {valid_pred_count} / {len(valid_pred)}")
# print(f"validation accuracy score: {accuracy_score(y_valid, valid_pred)}")

# print(f"my training count: {train_pred_count} / {len(train_pred)}")
# print(f"training accuracy score: {accuracy_score(y_train, train_pred)}")

# print(f"\nmy validation predictions  : {valid_pred}")
# print(f"true validation predictions: {y_valid}")

# print()
# print(f"my training predictions  : {train_pred}")
# print(f"true training predictions: {y_train}")