### An Implementation Of Batch Gradient Descent With Early Stopping For Softmax Regression Without Using Scikit-learn

In [116]:
import numpy as np
np.random.seed(2042)

In [None]:
def my_test_split(X, y, test_ratio = 0.2, validation_ratio = 0.2):
    total_size = len(X)
    test_size = total_size * test_ratio
    valid_size = total_size * validation_ratio
    train_size = total_size - test_size - valid_size
    
    permutation_indices = np.ranndom.permutation(total_size)
    
    X_train = X[permutation_indices[:train_size]]
    y_train = y[permutation_indices[:train_size]]
    X_valid = X[permutation_indices[train_size:-test_size]]
    y_valid = y[permutation_indices[train_size:-test_size]]
    X_test = X[permutation_indices[-test_size:]]
    

In [63]:
def get_gradient_scalar(x_i, theta_matrix, output_class_k, true_outcome):
    '''return p_k^(i) hat - y_k^(i)'''
    
    matrix_product = x_i @ theta_matrix
    pki_hat = np.exp(matrix_product[output_class_k]) / np.sum(np.exp(matrix_product)) # probability
    
    yki = 1 if output_class_k == true_outcome else 0
    return pki_hat - yki

In [64]:
def get_theta_gradient(output_class_k, X, y, theta_matrix):
    theta_gradient = np.zeros((1, X.shape[1]))
    m = X.shape[0]
    for x_i, true_outcome in zip(X, y):
        theta_gradient += get_gradient_scalar(x_i, theta_matrix, output_class_k, true_outcome) * x_i
    
    return theta_gradient / m

In [65]:
t0, t1 = 1, 1

def get_step_multiplier(alpha, epoch):
    return alpha * (t0 / (t1 + epoch))

In [109]:
def my_batch_GD(X, y, num_output_classes, alpha = 1, epochs = 10):
    theta_matrix = np.random.rand(X[0].size, num_output_classes)
    
    for e in range(epochs):
        delta_theta_matrix = np.array([]).reshape((0, X.shape[1]))
#         print(f"delta_theta_matrix: {delta_theta_matrix}")
        for k in range(num_output_classes):
            delta_theta_matrix = np.vstack((delta_theta_matrix, np.array(get_theta_gradient(k, X, y, theta_matrix))))
#         print(f"delta_theta_matrix: \n{delta_theta_matrix}, \ntheta_matrix: \n{theta_matrix}")
        
        step_multiplier = get_step_multiplier(alpha, e)
        theta_matrix -= step_multiplier * np.transpose(delta_theta_matrix)
    return theta_matrix

In [110]:
print(my_batch_GD(np.array( [ [1,2,3], [4,5,6], [7,8,9], [10,11,12] ] ), np.array([0, 1, 1, 0]), 3))

[[ 2.14574938  1.90136613 -3.19411938]
 [ 2.41716302  2.12054434 -4.1087619 ]
 [ 3.13768481  2.49425445 -4.62433411]]


In [108]:
def fit_and_make_prediction(X_train, y_train, X_test, num_output_classes, alpha=1.0, epochs = 10):
    theta_matrix = my_batch_GD(X_train, y_train, num_output_classes, alpha, epochs)
    print(theta_matrix)
    relative_weights = X_test @ theta_matrix
    return np.argmax(relative_weights, axis=1)

In [99]:
from sklearn import datasets
iris = datasets.load_iris()

In [112]:
X = iris["data"][:, (2,3)]
y = iris["target"]

X_with_bias = np.c_[np.ones([len(X), 1]), X]

In [113]:
# Reference regressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=10, random_state=11)
softmax_reg.fit(X_with_bias, y)

y_pred = softmax_reg.predict(X_with_bias)
matches = len(y_pred[y_pred == y])
total = len(y_pred)
print(len(y_pred[y_pred == y]))
print(len(y_pred))
print(f"Logistic Regression: {matches} / {total}: {matches / total}")
print(f"Accuracy Score: {accuracy_score(y, y_pred)}")

144
150
Logistic Regression: 144 / 150: 0.96
Accuracy Score: 0.96


In [115]:
my_imp_y_pred = fit_and_make_prediction(X, y, X, 3, epochs = 40)

# print(my_imp_y_pred)
my_imp_matches = len(my_imp_y_pred[my_imp_y_pred == y])
my_imp_total = len(my_imp_y_pred)
print(f"My Implementation Of Batch Gradient Descent: {my_imp_matches} / {my_imp_total}")
print(f"Accuracy Score: {accuracy_score(y, my_imp_y_pred)}")
print(f"my predictions: {my_imp_y_pred}")

[[0.57588492 0.72834736 0.63385184]
 [0.05161284 0.37855156 0.83367592]]
My Implementation Of Batch Gradient Descent: 50 / 150
Accuracy Score: 0.3333333333333333
my predictions: [1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 2 2 2 1 1 1 2 1 2 1 1 2 1 1 1 1 2 1 1 1 1 1
 1 1 1 2 2 1 2 2 2 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
