### P5: CE Loss Manual

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
def load_and_preprocess_data():
    """Load the scikit-learn wine dataset and return standardized splits.

    The samples are split 70/30 into train and test sets, stratified by
    class label with ``random_state=42``. The scaler is fitted on the
    training features only and then applied to both splits, so no test-set
    statistics leak into preprocessing.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    dataset = load_wine()
    features, labels = dataset.data, dataset.target

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.3,
        random_state=42,
        stratify=labels,
    )

    # Fit on the training split only, then reuse the same statistics for test.
    standardizer = StandardScaler().fit(X_train)
    return (
        standardizer.transform(X_train),
        standardizer.transform(X_test),
        y_train,
        y_test,
    )

def one_hot_encode(y, num_classes=3):
    """Convert integer class labels into a one-hot encoded matrix.

    Parameters:
    - y: array-like of integer class labels in ``[0, num_classes)``. Any
      shape is accepted and flattened first, so a column vector of shape
      (n, 1) is treated the same as a flat vector of shape (n,).
    - num_classes: number of classes, i.e. number of output columns

    Returns:
    - float array of shape (n, num_classes) with exactly one 1 per row

    Raises:
    - ValueError: if any label is negative
    - IndexError: if any label is >= num_classes (raised by NumPy indexing)
    """
    # Flatten so that the row index and the label index below have the same
    # shape; a (n, 1) label array would otherwise broadcast against
    # np.arange(n) and set every entry of the matrix to 1.
    labels = np.asarray(y).reshape(-1)
    n = labels.shape[0]
    y_one_hot = np.zeros((n, num_classes))

    if n == 0:
        return y_one_hot

    # Negative indices are valid in NumPy and would silently wrap around to
    # the last classes, producing a wrong encoding instead of an error.
    if labels.min() < 0:
        raise ValueError("class labels must be non-negative integers")

    y_one_hot[np.arange(n), labels] = 1
    return y_one_hot

In [3]:
def relu(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    floor = 0
    return np.maximum(floor, z)

def relu_derivative(z):
    """Gradient of ReLU: 1.0 where ``z`` is strictly positive, else 0.0."""
    positive_mask = z > 0
    return positive_mask.astype(float)

def sigmoid(z):
    """Logistic sigmoid, 1 / (1 + exp(-z)), applied element-wise.

    The input is clipped to [-500, 500] before exponentiation so that
    ``np.exp`` cannot overflow.
    """
    bounded = np.clip(z, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def sigmoid_derivative(z):
    """Gradient of the sigmoid at ``z``: sigmoid(z) * (1 - sigmoid(z))."""
    activation = sigmoid(z)
    complement = 1 - activation
    return activation * complement

def softmax(z):
    """Row-wise softmax over axis 1.

    Each row's maximum is subtracted before exponentiation; this leaves the
    result unchanged mathematically but keeps ``np.exp`` from overflowing.
    """
    row_max = np.max(z, axis=1, keepdims=True)
    unnormalized = np.exp(z - row_max)
    normalizer = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / normalizer

In [4]:
def cross_entropy_loss(y_true, y_pred):
    """Mean cross-entropy between one-hot targets and predicted probabilities.

    Parameters:
    - y_true: array whose first dimension is the number of samples, holding
      the target distribution for each sample
    - y_pred: predicted probabilities, same shape as ``y_true``

    Returns:
    - the summed cross-entropy divided by the number of samples

    A small constant (1e-10) is added to the predictions before taking the
    logarithm so that a predicted probability of exactly 0 does not give -inf.
    """
    eps = 1e-10
    num_samples = y_true.shape[0]
    weighted_log_probs = y_true * np.log(y_pred + eps)
    return -np.sum(weighted_log_probs) / num_samples

In [5]:
class MaxLikelihoodNeuralNetwork:
    """One-hidden-layer softmax classifier trained with full-batch gradient descent.

    Architecture: input -> dense -> ReLU or sigmoid -> dense -> softmax.
    Training minimizes the mean cross-entropy between the softmax output and
    the one-hot encoded labels.
    """

    # Activations understood by forward() and backward().
    _SUPPORTED_ACTIVATIONS = ('relu', 'sigmoid')

    def __init__(self, input_size, hidden_size, output_size,
                 hidden_activation='relu', random_seed=42):
        """
        Initialize neural network

        Parameters:
        - input_size: number of input features
        - hidden_size: number of neurons in hidden layer
        - output_size: number of output classes (3 for wine)
        - hidden_activation: 'relu' or 'sigmoid'
        - random_seed: seed for the weight initialization

        Raises:
        - ValueError: if hidden_activation is not 'relu' or 'sigmoid'
        """
        # Reject unknown names up front; forward() and backward() would
        # otherwise silently treat anything that is not 'relu' as sigmoid.
        if hidden_activation not in self._SUPPORTED_ACTIVATIONS:
            raise ValueError(
                f"hidden_activation must be one of {self._SUPPORTED_ACTIVATIONS}, "
                f"got {hidden_activation!r}"
            )

        # A private RandomState produces the same stream as np.random.seed()
        # followed by np.random.randn(), but leaves the global NumPy generator
        # untouched for the rest of the program.
        rng = np.random.RandomState(random_seed)

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.hidden_activation = hidden_activation

        # Standard-normal weights scaled by sqrt(2 / fan_in); biases start at 0.
        self.W1 = rng.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))

        self.W2 = rng.randn(hidden_size, output_size) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, output_size))

        # Intermediate values of the most recent forward pass, read by backward().
        self.cache = {}

    def forward(self, X):
        """Forward propagation

        X -> [W1, b1] -> z1 -> ReLU/Sigmoid -> a1 -> [W2, b2] -> z2 -> Softmax -> output

        Stores the intermediate values in self.cache (overwriting the previous
        pass) and returns the softmax output, one row per sample.
        """
        n = X.shape[0]

        z1 = X @ self.W1 + self.b1

        if self.hidden_activation == 'relu':
            a1 = relu(z1)
        else:
            a1 = sigmoid(z1)

        z2 = a1 @ self.W2 + self.b2
        output = softmax(z2)
        self.cache = {
            'X': X,
            'z1': z1,
            'a1': a1,
            'z2': z2,
            'output': output,
            'n': n
        }

        return output

    def backward(self, y_true):
        """Backward propagation

        Uses the simplified derivative: d_loss/d_z2 = (output - y_true) / n

        Reads the values cached by the most recent forward() call, so
        forward() must have been run on the batch that y_true belongs to.

        Parameters:
        - y_true: one-hot targets with the same shape as the cached output

        Returns:
        - (dW1, db1, dW2, db2): gradients of the mean cross-entropy loss
        """
        X = self.cache['X']
        z1 = self.cache['z1']
        a1 = self.cache['a1']
        output = self.cache['output']
        n = self.cache['n']

        # Softmax combined with cross-entropy collapses to this difference;
        # dividing by n matches the mean taken in the loss.
        dz2 = (output - y_true) / n

        dW2 = a1.T @ dz2
        db2 = np.sum(dz2, axis=0, keepdims=True)

        da1 = dz2 @ self.W2.T

        if self.hidden_activation == 'relu':
            dz1 = da1 * relu_derivative(z1)
        else:
            dz1 = da1 * sigmoid_derivative(z1)

        dW1 = X.T @ dz1
        db1 = np.sum(dz1, axis=0, keepdims=True)

        return dW1, db1, dW2, db2

    def update_parameters(self, dW1, db1, dW2, db2, learning_rate):
        """Apply one in-place gradient-descent step to all weights and biases."""
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2

    def train(self, X_train, y_train, X_test, y_test,
              epochs=1000, learning_rate=0.1, print_every=100):
        """Train with full-batch gradient descent and record per-epoch metrics.

        Parameters:
        - X_train, y_train: training features and integer class labels
        - X_test, y_test: held-out features and integer class labels
        - epochs: number of gradient steps (one full-batch step per epoch)
        - learning_rate: step size for update_parameters
        - print_every: print a progress line every this many epochs;
          0 or None disables printing

        Returns:
        - (train_losses, test_losses, train_accuracies, test_accuracies),
          four lists with one entry per epoch. The training loss is measured
          before the epoch's weight update; the other three metrics are
          measured after it. Accuracies are percentages.
        """
        y_train_oh = one_hot_encode(y_train, self.output_size)
        y_test_oh = one_hot_encode(y_test, self.output_size)

        train_losses = []
        test_losses = []
        train_accuracies = []
        test_accuracies = []

        for epoch in range(epochs):
            output = self.forward(X_train)

            train_loss = cross_entropy_loss(y_train_oh, output)

            dW1, db1, dW2, db2 = self.backward(y_train_oh)

            self.update_parameters(dW1, db1, dW2, db2, learning_rate)

            # Evaluate with the updated weights. The test set is pushed
            # through the network once and that output is reused for both
            # the loss and the accuracy.
            train_acc = self.accuracy(X_train, y_train)

            test_output = self.forward(X_test)
            test_loss = cross_entropy_loss(y_test_oh, test_output)
            test_acc = np.mean(np.argmax(test_output, axis=1) == y_test) * 100

            train_losses.append(train_loss)
            test_losses.append(test_loss)
            train_accuracies.append(train_acc)
            test_accuracies.append(test_acc)

            # Guard against print_every == 0, which would otherwise raise
            # ZeroDivisionError in the modulo below.
            if print_every and (epoch + 1) % print_every == 0:
                print(f"Epoch {epoch+1}/{epochs} | "
                      f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}% | "
                      f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.2f}%")

        return train_losses, test_losses, train_accuracies, test_accuracies

    def predict(self, X):
        """Return the index of the most probable class for each row of X."""
        output = self.forward(X)
        predictions = np.argmax(output, axis=1)
        return predictions

    def accuracy(self, X, y_true):
        """Return the percentage of rows of X whose prediction equals y_true."""
        predictions = self.predict(X)
        accuracy = np.mean(predictions == y_true) * 100
        return accuracy
        
def main():
    """Train the classifier on the wine data and print progress to stdout."""
    banner = "=" * 70
    print(banner)
    print("PROBLEM 5: MAX LIKELIHOOD NEURAL NETWORK CLASSIFIER")
    print(banner)

    X_train, X_test, y_train, y_test = load_and_preprocess_data()

    model = MaxLikelihoodNeuralNetwork(
        input_size=X_train.shape[1],
        hidden_size=10,
        output_size=3,
        hidden_activation='relu',
    )

    # train() also returns the per-epoch loss/accuracy histories; they are
    # not used here, only the progress lines it prints.
    model.train(
        X_train,
        y_train,
        X_test,
        y_test,
        epochs=1000,
        learning_rate=0.1,
        print_every=200,
    )

# Run the training demo only when this file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

PROBLEM 5: MAX LIKELIHOOD NEURAL NETWORK CLASSIFIER
Epoch 200/1000 | Train Loss: 0.0241 | Train Acc: 100.00% | Test Loss: 0.0506 | Test Acc: 100.00%
Epoch 400/1000 | Train Loss: 0.0089 | Train Acc: 100.00% | Test Loss: 0.0386 | Test Acc: 100.00%
Epoch 600/1000 | Train Loss: 0.0052 | Train Acc: 100.00% | Test Loss: 0.0341 | Test Acc: 100.00%
Epoch 800/1000 | Train Loss: 0.0036 | Train Acc: 100.00% | Test Loss: 0.0318 | Test Acc: 100.00%
Epoch 1000/1000 | Train Loss: 0.0027 | Train Acc: 100.00% | Test Loss: 0.0304 | Test Acc: 100.00%


Cross-entropy loss is better suited to classification than squared loss because it treats the problem as "what is the probability of each class?" instead of just measuring numerical differences. With softmax, the network outputs probabilities that sum to 1. Cross-entropy then measures how wrong these probabilities are compared to the "true" answer. This gives the network a stronger learning signal (larger gradients) when it is making a wrong prediction, which helps it learn faster.