In [1]:
import numpy as np
from sklearn.datasets import load_iris

data = load_iris()
print(data.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [35]:
# Implement batch gradient descent with early stopping for softmax regression
# - softmax for prediction
# - cross entropy as cost function
# - gradient of cross entropy as direction
# - parameter: no. of epochs since last minimum
# - parameter: learning rate


In [29]:
from sklearn.model_selection import train_test_split

# One hot encoded targets
target_onehot = np.zeros((data.target.size, data.target.max()+1), dtype=int)
target_onehot[np.arange(data.target.size), data.target] = 1

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(data.data, target_onehot, test_size=0.1, random_state=42)

In [3]:
## Softmax
 
# softmax score (these go through softmax to get the probs), 
# theta is the parameter matrix (k, n)
# X is the features (m, n)
# --> softmax scores for each sample (m, k)
def softmax_score(X, theta):
    return X @ np.transpose(theta)

# turn softmax scores into probabilities
# scores from softmax_scores() (m, k)
# --> probabilities for each sample (m, k)
def softmax(scores):
    nominator = np.exp(scores)
    denominator = np.sum(nominator, axis=1)[:, np.newaxis]
    return nominator / denominator


X = np.array([[2,2,3,4], [5,1,7,8]]).reshape(2,4)
theta_k = np.array([1,1,0,1]).reshape(-1,1)

softmax(X)

array([[8.25945394e-02, 8.25945394e-02, 2.24515236e-01, 6.10295685e-01],
       [3.50964520e-02, 6.42813940e-04, 2.59329652e-01, 7.04931082e-01]])

In [4]:
# cross entropy for batch (ie. cost function)
# p probabilities from softmax (m, k)
# y targets (m, k)
# --> cross entropy (scalar)
def cross_entropy(p, y):
    cost = -np.mean(np.transpose(np.log(p)) @ y)
    return cost

# cross entropy gradient matrix
# p probabilities from softmax (m, k)
# y targets (m, k)
# X features (m, n)
# --> gradient vector (k, n)
def cross_entropy_gradient(p, y, X):
    k = y.shape[1]
    m, n = X.shape
    gradient = np.zeros((k, n))
    for i in range(k):
        gradient[i,:] = np.mean((p[:,i] - y[:,i])[:, np.newaxis] * X, axis=0)
    return gradient

# Features (X) of shape (m, n)
X = np.array([[0.5, 1.2, 0.8],
              [1.5, 2.3, 1.8],
              [0.3, 1.1, 0.7],
              [1.0, 2.0, 1.0]])

# Predicted probabilities from softmax (p) of shape (m, k)
p = np.array([[0.7, 0.2, 0.1],
              [0.6, 0.3, 0.1],
              [0.8, 0.1, 0.1],
              [0.5, 0.4, 0.1]])

# One-hot encoded true labels (y) of shape (m, k)
y = np.array([[1, 0, 0],
              [0, 1, 0],
              [1, 0, 0],
              [0, 1, 0]])

cross_entropy_gradient(p, y, X)

array([[ 0.2975,  0.45  ,  0.3   ],
       [-0.38  , -0.615 , -0.4075],
       [ 0.0825,  0.165 ,  0.1075]])

In [28]:
# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

preprocessing = Pipeline([("scale", StandardScaler())])

In [33]:
# Preprocess data
X_train_preprocessed = preprocessing.fit_transform(X_train)

In [62]:
epochs = 10000
a = 1 # learning rate
threshold = 10
theta = np.random.random((y_train.shape[1], X_train_preprocessed.shape[1]))
min_cost = float('inf')
min_cost_epoch = 0
min_cost_theta = theta.copy()
cost_history = []

for e in range(epochs):
    scores = softmax_score(X_train_preprocessed, theta)
    probs = softmax(scores)
    cost = cross_entropy(probs, y_train)
    cost_history.append(cost)
    if cost < min_cost:
        min_cost_epoch = e
        min_cost = cost
        min_cost_theta = theta

    if e - min_cost_epoch >= threshold:
        print(cost)
        print(f"breaking at epoch: {e, min_cost_epoch}")
        break
    else:
        theta -= a * cross_entropy_gradient(probs, y_train, X_train_preprocessed)

# Final weights after training
theta = min_cost_theta

# Accuracy on training set
train_scores = softmax_score(X_train_preprocessed, theta)
train_probs = softmax(train_scores)
train_preds = np.argmax(train_probs, axis=1)
y_train_labels = np.argmax(y_train, axis=1)
accuracy = np.mean(train_preds == y_train_labels)
print(f"Train Accuracy: {accuracy * 100:.2f}%")

# Testing on test set
test_scores = softmax_score(X_test, theta)
test_probs = softmax(test_scores)
test_predictions = np.argmax(test_probs, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

# Accuracy
accuracy = np.mean(test_predictions == y_test_labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Train Accuracy: 84.44%
Test Accuracy: 60.00%


In [27]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load Iris dataset
data = load_iris()
X = data['data']
y = data['target']

# One-hot encode the target
target_onehot = np.zeros((y.size, y.max() + 1))
target_onehot[np.arange(y.size), y] = 1

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, target_onehot, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Softmax function
def softmax(scores):
    exp_scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))  # Numerical stability
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

# Cross entropy loss
def cross_entropy(p, y):
    return -np.mean(np.sum(y * np.log(p + 1e-8), axis=1))  # Adding small value for stability

# Gradient of the cross-entropy loss
def cross_entropy_gradient(p, y, X):
    return np.dot((p - y).T, X) / X.shape[0]

# Softmax scores
def softmax_score(X, theta):
    return np.dot(X, theta.T)

# Parameters
epochs = 1000
learning_rate = 0.01
early_stop_threshold = 20
num_classes = y_train.shape[1]
num_features = X_train.shape[1]

# Initialize weights
theta = np.random.randn(num_classes, num_features)

# Early stopping variables
min_cost = float('inf')
min_cost_epoch = 0
min_cost_theta = theta.copy()

# Gradient Descent Loop
for e in range(epochs):
    # Compute softmax probabilities
    scores = softmax_score(X_train, theta)
    probs = softmax(scores)
    
    # Compute cost
    cost = cross_entropy(probs, y_train)
    
    # Early stopping check
    if cost < min_cost:
        min_cost = cost
        min_cost_epoch = e
        min_cost_theta = theta.copy()  # Save best parameters
    
    if e - min_cost_epoch >= early_stop_threshold:
        print(f"Early stopping at epoch {e} (no improvement since epoch {min_cost_epoch})")
        break
    
    # Update parameters using gradient descent
    gradient = cross_entropy_gradient(probs, y_train, X_train)
    theta -= learning_rate * gradient

    if e % 100 == 0:
        print(f"Epoch {e}: Cost = {cost}")

# Final weights after training
theta = min_cost_theta

# Testing on test set
test_scores = softmax_score(X_test, theta)
test_probs = softmax(test_scores)
test_predictions = np.argmax(test_probs, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

# Accuracy
accuracy = np.mean(test_predictions == y_test_labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Epoch 0: Cost = 0.6884630265432256
Epoch 100: Cost = 0.533918007268345
Epoch 200: Cost = 0.4766152401901341
Epoch 300: Cost = 0.4468655680295043
Epoch 400: Cost = 0.4281875941826558
Epoch 500: Cost = 0.4150928121699509
Epoch 600: Cost = 0.40526130506428687
Epoch 700: Cost = 0.3975373827332128
Epoch 800: Cost = 0.39127325654512013
Epoch 900: Cost = 0.38607261777292406
Test Accuracy: 90.00%
