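### An Implementation Of Batch Gradient Descent With Early Stopping For Softmax Regression Without Using Scikit-learn

The gradient computation and parameter updates are written with NumPy only; scikit-learn supplies just the iris dataset, the `accuracy_score` metric, and a reference `LogisticRegression` model for comparison.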

In [7]:
import numpy as np
np.random.seed(2042)

In [19]:
def my_test_split(X, y, test_ratio = 0.2, validation_ratio = 0.2):
    '''Randomly partition X and y into training, validation and test subsets.

    The rows are shuffled with ``np.random.permutation`` (global NumPy RNG),
    so seed NumPy beforehand for a reproducible split.

    Parameters:
        X: indexable array of samples; ``len(X)`` is the number of samples.
        y: array of targets, indexed with the same row indices as X.
        test_ratio: fraction of the samples placed in the test subset.
        validation_ratio: fraction of the samples placed in the validation subset.

    Returns:
        (X_train, y_train, X_valid, y_valid, X_test, y_test)

    Raises:
        ValueError: if the ratios produce a negative subset size.
    '''
    total_size = len(X)
    test_size = int(total_size * test_ratio)
    valid_size = int(total_size * validation_ratio)
    train_size = total_size - test_size - valid_size

    if test_size < 0 or valid_size < 0 or train_size < 0:
        raise ValueError(
            "test_ratio and validation_ratio must be non-negative and sum to at most 1"
        )

    permutation_indices = np.random.permutation(total_size)

    # Use explicit positive bounds: a negative bound such as ``[-test_size:]``
    # selects the *whole* array when test_size is 0.
    valid_end = train_size + valid_size
    train_indices = permutation_indices[:train_size]
    valid_indices = permutation_indices[train_size:valid_end]
    test_indices = permutation_indices[valid_end:]

    X_train, y_train = X[train_indices], y[train_indices]
    X_valid, y_valid = X[valid_indices], y[valid_indices]
    # The test targets must be taken from y (they were previously read from X).
    X_test, y_test = X[test_indices], y[test_indices]

    return X_train, y_train, X_valid, y_valid, X_test, y_test

In [20]:
def get_gradient_scalar(x_i, theta_matrix, output_class_k, true_outcome):
    '''Return p_k^(i) hat - y_k^(i) for a single sample.

    Parameters:
        x_i: feature vector of one sample.
        theta_matrix: weight matrix; ``x_i @ theta_matrix`` yields one score per class.
        output_class_k: index of the class whose residual is wanted.
        true_outcome: true class label of the sample.

    Returns:
        The softmax probability of class k minus 1 if k is the true class,
        otherwise minus 0.
    '''
    class_scores = x_i @ theta_matrix

    # Softmax is unchanged by subtracting a constant from every score; shifting
    # by the maximum keeps np.exp from overflowing to inf (and inf / inf = nan).
    exp_scores = np.exp(class_scores - np.max(class_scores))
    pki_hat = exp_scores[output_class_k] / np.sum(exp_scores) # probability

    yki = 1 if output_class_k == true_outcome else 0
    return pki_hat - yki

In [21]:
def get_theta_gradient(output_class_k, X, y, theta_matrix):
    '''Average, over all samples, the gradient contribution for class k.

    Each sample contributes its feature vector scaled by the residual that
    get_gradient_scalar returns for that sample and class. The result is a
    row vector of shape (1, number of feature columns in X).
    '''
    running_sum = np.zeros((1, X.shape[1]))
    sample_count = X.shape[0]

    for sample, label in zip(X, y):
        residual = get_gradient_scalar(sample, theta_matrix, output_class_k, label)
        running_sum += residual * sample

    return running_sum / sample_count

In [22]:
# Constants of the step-size schedule used by get_step_multiplier.
t0 = 1
t1 = 1

def get_step_multiplier(alpha, epoch):
    '''Return the step size for the given epoch: alpha scaled by t0 / (t1 + epoch).'''
    decay_factor = t0 / (t1 + epoch)
    return alpha * decay_factor

In [70]:
from sklearn.metrics import accuracy_score

def my_batch_GD(X_train, y_train, X_valid, y_valid, num_output_classes, alpha = 1, epochs = 10):
    '''Fit softmax-regression weights by batch gradient descent and return the
    weights that scored best on the validation set.

    The weight matrix is initialised with ``np.random.rand`` and updated once
    per epoch using the gradients from get_theta_gradient, with the step size
    for epoch ``e`` given by ``get_step_multiplier(alpha, e)``. All ``epochs``
    iterations are run; after each one the validation accuracy is measured and
    a snapshot of the weights is kept whenever it strictly improves.

    Parameters:
        X_train, y_train: training samples (one row per sample) and labels.
        X_valid, y_valid: validation samples and labels used to pick the best weights.
        num_output_classes: number of classes (columns of the weight matrix).
        alpha: base step size passed to get_step_multiplier.
        epochs: number of gradient-descent iterations.

    Returns:
        The weight matrix, of shape (features, num_output_classes), with the
        highest validation accuracy seen (the initial weights if no epoch
        improved on them).

    Side effects:
        Prints the final and the best weight matrices.
    '''
    theta_matrix = np.random.rand(X_train[0].size, num_output_classes)
    # Take a snapshot, not an alias: theta_matrix is updated in place below, so
    # a plain assignment would make the "best" weights follow every update.
    best_theta_matrix = np.copy(theta_matrix)
    best_accuracy_score = accuracy_score(y_valid, np.argmax(X_valid @ theta_matrix, axis=1))

    for e in range(epochs):
        # One (1, features) gradient row per class, stacked to (classes, features).
        delta_theta_matrix = np.vstack(
            [get_theta_gradient(k, X_train, y_train, theta_matrix) for k in range(num_output_classes)]
        )

        step_multiplier = get_step_multiplier(alpha, e)
        # Transpose to (features, classes) so it lines up with theta_matrix.
        theta_matrix -= step_multiplier * np.transpose(delta_theta_matrix)

        temp_accuracy = accuracy_score(y_valid, np.argmax(X_valid @ theta_matrix, axis=1))
        if temp_accuracy > best_accuracy_score:
            best_accuracy_score = temp_accuracy
            best_theta_matrix = np.copy(theta_matrix)

    print("theta_matrix:")
    print(theta_matrix)
    print("\nbest_theta_matrix:")
    print(best_theta_matrix)
    return best_theta_matrix

In [71]:
# Smoke test on a tiny hand-made data set: four 3-feature training rows,
# a single validation row, and three output classes.
print(
    my_batch_GD(
        X_train=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]),
        y_train=np.array([0, 1, 1, 0]),
        X_valid=np.array([[13, 14, 15]]),
        y_valid=np.array([1]),
        num_output_classes=3,
    )
)

theta_matrix:
[[ 0.85429727  1.65845611 -0.18244137]
 [ 0.81107944  0.97450803  0.01241281]
 [ 0.98072255  1.40100181 -0.21864462]]

best_theta_matrix:
[[-1.0723057   3.58501949 -0.18240178]
 [-1.48357023  3.26907852  0.01249199]
 [-1.68197382  4.06357941 -0.21852585]]
[[-1.0723057   3.58501949 -0.18240178]
 [-1.48357023  3.26907852  0.01249199]
 [-1.68197382  4.06357941 -0.21852585]]


In [72]:
def fit_and_make_prediction(X_train, y_train, X_valid, y_valid, num_output_classes, alpha=1.0, epochs = 10):
    '''Train with my_batch_GD, then predict a class for every validation and training row.

    Returns:
        (validation predictions, training predictions, fitted weight matrix),
        where each prediction is the index of the highest-scoring class.
    '''
    learned_theta = my_batch_GD(X_train, y_train, X_valid, y_valid, num_output_classes, alpha, epochs)

    # Linear scores per class; the arg-max over columns is the predicted label.
    scores_valid = X_valid @ learned_theta
    scores_train = X_train @ learned_theta
    predicted_valid, predicted_train = (
        np.argmax(scores, axis=1) for scores in (scores_valid, scores_train)
    )
    return predicted_valid, predicted_train, learned_theta

In [73]:
from sklearn import datasets
# Load scikit-learn's bundled iris data set; the following cell reads
# iris["data"] (features) and iris["target"] (class labels) from it.
iris = datasets.load_iris()

In [74]:
# Keep only feature columns 2 and 3 of the iris data
# (presumably petal length and width; confirm against iris["feature_names"]).
X = iris["data"][:, [2, 3]]
y = iris["target"]

# Prepend a column of ones so that one row of the weight matrix acts as the bias.
X_with_bias = np.hstack((np.ones((len(X), 1)), X))

# Shuffle and split with the default 60 / 20 / 20 ratios of my_test_split.
X_train, y_train, X_valid, y_valid, X_test, y_test = my_test_split(X_with_bias, y)

In [75]:
# Reference regressor
from sklearn.linear_model import LogisticRegression

# scikit-learn's multinomial logistic regression, fitted and scored on the
# full data set, is the yardstick for the hand-written implementation.
softmax_reg = LogisticRegression(
    multi_class="multinomial",
    solver="lbfgs",
    C=10,
    random_state=11,
).fit(X_with_bias, y)

y_pred = softmax_reg.predict(X_with_bias)
total = len(y_pred)
matches = np.count_nonzero(y_pred == y)  # predictions equal to the true label
print(matches)
print(total)
print(f"Logistic Regression: {matches} / {total}: {matches / total}")
print(f"Accuracy Score: {accuracy_score(y, y_pred)}")

144
150
Logistic Regression: 144 / 150: 0.96
Accuracy Score: 0.96


In [78]:
# Train the hand-written model for 50 epochs on the three iris classes and
# compare its predictions with the true labels of both subsets.
valid_pred, train_pred, my_theta_matrix = fit_and_make_prediction(
    X_train, y_train, X_valid, y_valid, num_output_classes=3, epochs=50
)

# Number of predictions that agree with the true labels.
valid_pred_count = np.count_nonzero(valid_pred == y_valid)
train_pred_count = np.count_nonzero(train_pred == y_train)

# Validation-set summary.
print(f"\nmy validation count: {valid_pred_count} / {len(valid_pred)}")
print(f"validation accuracy score: {accuracy_score(y_valid, valid_pred)}")

# Training-set summary.
print(f"my training count: {train_pred_count} / {len(train_pred)}")
print(f"training accuracy score: {accuracy_score(y_train, train_pred)}")

# Side-by-side label dumps for eyeballing individual mistakes.
print(f"\nmy validation predictions  : {valid_pred}")
print(f"true validation predictions: {y_valid}")

print()
print(f"my training predictions  : {train_pred}")
print(f"true training predictions: {y_train}")

theta_matrix:
[[ 1.48356796  0.41831849  0.1698247 ]
 [ 0.07265836  0.47269283  0.33817076]
 [-0.2215154   0.36102665  1.01339246]]

best_theta_matrix:
[[ 1.28931799  0.44259767  0.3397955 ]
 [ 0.1203259   0.45017675  0.31301931]
 [-0.13388611  0.39265436  0.89413547]]

my validation count: 24 / 30
validation accuracy score: 0.8
my training count: 72 / 90
training accuracy score: 0.8

my validation predictions  : [0 2 2 1 0 2 1 0 2 0 2 1 2 1 0 0 1 0 1 2 1 0 2 1 2 0 2 2 2 0]
true validation predictions: [0 1 2 1 0 2 1 0 2 0 2 2 2 2 0 0 1 0 1 2 1 0 1 1 1 0 2 1 2 0]

my training predictions  : [2 2 2 0 2 2 0 2 1 0 0 2 2 2 2 1 1 0 2 2 0 2 2 0 1 0 1 0 0 1 0 1 1 2 2 0 0
 2 0 1 0 1 2 2 2 2 0 2 2 1 1 1 2 2 1 0 2 2 1 1 0 1 0 2 2 2 1 2 0 1 2 1 2 0
 2 2 2 1 0 1 2 2 1 2 2 0 1 0 2 0]
true training predictions: [2 1 1 0 1 2 0 1 1 0 0 2 1 2 2 1 1 0 2 2 0 2 2 0 1 0 1 0 0 1 0 1 1 2 2 0 0
 1 0 1 0 1 2 2 2 1 0 1 2 1 1 2 2 1 1 0 2 2 1 1 0 1 0 2 2 1 2 1 0 2 1 1 2 0
 2 2 2 1 0 1 2 1 1 2 2 0 2 0 1 0]
