In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class MLP:
    """Two-layer perceptron for binary classification with verbose tracing.

    Architecture: input -> dense (ReLU) -> dense (sigmoid). Training uses
    mini-batch gradient descent on binary cross-entropy plus an L2 penalty on
    the two weight matrices. The constructor and the training loop print
    tutorial-style explanations of each step.

    Attributes set by ``__init__``:
        W1, b1: weights ``(input_size, hidden_size)`` and bias ``(1, hidden_size)``
            of the hidden layer.
        W2, b2: weights ``(hidden_size, 1)`` and bias ``(1, 1)`` of the output layer.
        learning_rate: step size of each gradient update.
        lambda_reg: L2 regularization strength.

    Attributes set by ``forward``: ``z1``, ``a1``, ``z2``, ``a2`` (the
    pre-activations and activations of the most recent forward pass).
    """

    def __init__(self, input_size, hidden_size=16, learning_rate=0.001, lambda_reg=0.01):
        """Create the network, print its configuration and initialise parameters.

        Args:
            input_size: number of input features.
            hidden_size: number of hidden-layer units.
            learning_rate: gradient-descent step size.
            lambda_reg: L2 regularization coefficient.
        """
        self.learning_rate = learning_rate
        self.lambda_reg = lambda_reg

        # Print initialization
        print("\nInitializing Network:")
        print(f"Input size: {input_size}")
        print(f"Hidden size: {hidden_size}")
        print(f"Learning rate: {learning_rate}")
        print(f"L2 regularization: {lambda_reg}\n")
        print("Explanation: The input size is the number of features in the input data. The hidden size represents the number of neurons in the hidden layer. The learning rate controls how much the model adjusts weights with each update, and L2 regularization helps to prevent overfitting.")

        # He initialization: standard-normal draws scaled by sqrt(2 / fan_in).
        # Biases start at zero.
        self.W1 = np.random.randn(input_size, hidden_size) * np.sqrt(2.0 / input_size)
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, 1) * np.sqrt(2.0 / hidden_size)
        self.b2 = np.zeros((1, 1))

        print("Initial weights and biases:")
        print(f"W1 shape: {self.W1.shape}, mean: {self.W1.mean():.6f}, std: {self.W1.std():.6f}")
        print(f"W2 shape: {self.W2.shape}, mean: {self.W2.mean():.6f}, std: {self.W2.std():.6f}")
        print(f"b1 shape: {self.b1.shape}")
        print(f"b2 shape: {self.b2.shape}")
        print("Explanation: W1 and W2 are the weight matrices for the connections between the layers. b1 and b2 are the bias vectors. The weights are initialized using a method to ensure efficient training.")
        print("Explanation of variables:")
        print("W1: Weight matrix connecting the input layer to the hidden layer.")
        print("b1: Bias vector for the hidden layer.")
        print("W2: Weight matrix connecting the hidden layer to the output layer.")
        print("b2: Bias vector for the output layer.")

    def sigmoid(self, x):
        """Return the element-wise logistic function of ``x``.

        The input is clipped to [-500, 500] so ``np.exp`` cannot overflow.
        """
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def relu(self, x):
        """Return ``max(0, x)`` element-wise."""
        return np.maximum(0, x)

    def relu_derivative(self, x):
        """Return 1 where ``x > 0`` and 0 elsewhere (ReLU sub-gradient)."""
        return np.where(x > 0, 1, 0)

    def forward(self, X, print_details=False):
        """Run a forward pass and cache the intermediate values on ``self``.

        Args:
            X: input matrix with one sample per row.
            print_details: when true, print every intermediate array.

        Returns:
            ``self.a2``, the sigmoid output for each input row.
        """
        if print_details:
            print("\nForward Propagation Steps:")
            print(f"Input X shape: {X.shape}")
            print(f"Input X: {X}")
            print("Explanation: Forward propagation involves passing the input through the network to get the output predictions.")

        # First layer
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.relu(self.z1)

        if print_details:
            print(f"Z1 shape: {self.z1.shape}, mean: {self.z1.mean():.6f}")
            print(f"Z1: {self.z1}")
            print(f"A1 shape: {self.a1.shape}, mean: {self.a1.mean():.6f}")
            print(f"A1: {self.a1}")
            print("Explanation: Z1 is the linear combination of inputs and weights for the first layer, and A1 is the output after applying the ReLU activation function.")
            print("Explanation of variables:")
            print("Z1: Linear combination of inputs and weights for the first layer (input layer to hidden layer).")
            print("A1: Output after applying the ReLU activation function to Z1.")

        # Output layer
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = self.sigmoid(self.z2)

        if print_details:
            print(f"Z2 shape: {self.z2.shape}, mean: {self.z2.mean():.6f}")
            print(f"Z2: {self.z2}")
            print(f"A2 shape: {self.a2.shape}, mean: {self.a2.mean():.6f}")
            print(f"A2 (Output Prediction): {self.a2}")
            print("Explanation: Z2 is the linear combination of the hidden layer outputs and weights for the output layer, and A2 is the final output after applying the sigmoid activation function.")
            print("Explanation of variables:")
            print("Z2: Linear combination of hidden layer outputs and weights for the output layer.")
            print("A2: Output prediction after applying the sigmoid activation function to Z2.")

        return self.a2

    def compute_cost(self, y_pred, y_true, print_details=False):
        """Return mean binary cross-entropy plus the L2 weight penalty.

        Args:
            y_pred: predicted probabilities, same shape as ``y_true``.
            y_true: ground-truth labels (0 or 1).
            print_details: when true, print the individual cost terms.

        Returns:
            The scalar total cost.
        """
        m = y_true.shape[0]

        # Cross-entropy loss; the 1e-15 offset keeps log() away from zero.
        cross_entropy = -np.mean(y_true * np.log(y_pred + 1e-15) +
                               (1 - y_true) * np.log(1 - y_pred + 1e-15))

        # L2 regularization over both weight matrices (biases are excluded).
        l2_reg = (self.lambda_reg / (2 * m)) * (np.sum(np.square(self.W1)) + np.sum(np.square(self.W2)))

        total_cost = cross_entropy + l2_reg

        if print_details:
            print("\nCost Computation:")
            print(f"Cross-entropy loss: {cross_entropy:.6f}")
            print(f"L2 regularization term: {l2_reg:.6f}")
            print(f"Total cost: {total_cost:.6f}")
            print("Explanation: The total cost is computed using cross-entropy loss, which measures how well the predicted outputs match the true labels, plus an L2 regularization term to penalize large weights.")

        return total_cost

    def train(self, X, y, epochs=1000, batch_size=32, early_stopping_patience=50, print_frequency=100):
        """Fit the network with shuffled mini-batch gradient descent.

        Args:
            X: training inputs, one sample per row.
            y: training labels; reshaped internally to a column vector.
            epochs: maximum number of passes over the data.
            batch_size: number of samples per mini-batch (the last batch of an
                epoch may be smaller).
            early_stopping_patience: stop once the training loss has failed to
                improve for this many consecutive epochs.
            print_frequency: print detailed traces and metrics every this many
                epochs.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        m = X.shape[0]
        y = y.reshape(-1, 1)

        best_loss = float('inf')
        patience_counter = 0

        print("\nStarting Training:")
        print(f"Total samples: {m}")
        print(f"Batch size: {batch_size}")
        print(f"Total epochs: {epochs}\n")
        print("Explanation: Training involves multiple epochs, where each epoch goes through all the data samples. Batch size is the number of samples used in each iteration to update the weights.")

        for epoch in range(epochs):
            epoch_print = (epoch + 1) % print_frequency == 0
            indices = np.random.permutation(m)

            for i in range(0, m, batch_size):
                batch_indices = indices[i:i + batch_size]
                X_batch = X[batch_indices]
                y_batch = y[batch_indices]

                # Only the first mini-batch of a reporting epoch is traced.
                verbose = epoch_print and i == 0

                # Forward pass
                output = self.forward(X_batch, print_details=verbose)

                if verbose:
                    print("\nBackpropagation Steps:")
                    print("Explanation: Backpropagation involves computing gradients for each parameter and updating the weights accordingly.")

                # Backpropagation.
                # The actual sample count of this batch gets its own name:
                # rebinding `batch_size` here would silently shrink the stride
                # used by every later epoch once a short final batch is seen.
                n_batch = X_batch.shape[0]

                # Output layer gradients (sigmoid + cross-entropy gives a - y)
                dz2 = output - y_batch
                dW2 = (np.dot(self.a1.T, dz2) + self.lambda_reg * self.W2) / n_batch
                db2 = np.sum(dz2, axis=0, keepdims=True) / n_batch

                if verbose:
                    print(f"dZ2 shape: {dz2.shape}, mean: {dz2.mean():.6f}")
                    print(f"dZ2: {dz2}")
                    print(f"dW2 shape: {dW2.shape}, mean: {dW2.mean():.6f}")
                    print(f"dW2: {dW2}")
                    print(f"db2 shape: {db2.shape}, mean: {db2.mean():.6f}")
                    print(f"db2: {db2}")
                    print("Explanation: dZ2 is the gradient of the loss with respect to Z2. dW2 and db2 are the gradients of the weights and biases for the output layer.")
                    print("Explanation of variables:")
                    print("dZ2: Gradient of the loss with respect to the output Z2.")
                    print("dW2: Gradient of the weights connecting the hidden layer to the output layer.")
                    print("db2: Gradient of the bias for the output layer.")

                # Hidden layer gradients
                da1 = np.dot(dz2, self.W2.T)
                dz1 = da1 * self.relu_derivative(self.z1)
                dW1 = (np.dot(X_batch.T, dz1) + self.lambda_reg * self.W1) / n_batch
                db1 = np.sum(dz1, axis=0, keepdims=True) / n_batch

                if verbose:
                    print(f"dA1 shape: {da1.shape}, mean: {da1.mean():.6f}")
                    print(f"dA1: {da1}")
                    print(f"dZ1 shape: {dz1.shape}, mean: {dz1.mean():.6f}")
                    print(f"dZ1: {dz1}")
                    print(f"dW1 shape: {dW1.shape}, mean: {dW1.mean():.6f}")
                    print(f"dW1: {dW1}")
                    print(f"db1 shape: {db1.shape}, mean: {db1.mean():.6f}")
                    print(f"db1: {db1}")
                    print("Explanation: dA1 is the gradient passed from the output layer to the hidden layer. dZ1, dW1, and db1 are the gradients for the hidden layer.")
                    print("Explanation of variables:")
                    print("dA1: Gradient of the loss propagated back to the hidden layer.")
                    print("dZ1: Gradient of the activation function applied to Z1.")
                    print("dW1: Gradient of the weights connecting the input layer to the hidden layer.")
                    print("db1: Gradient of the bias for the hidden layer.")

                # Update weights
                self.W2 -= self.learning_rate * dW2
                self.b2 -= self.learning_rate * db2
                self.W1 -= self.learning_rate * dW1
                self.b1 -= self.learning_rate * db1

                if verbose:
                    print("\nUpdated Weights:")
                    print(f"W1 mean: {self.W1.mean():.6f}, std: {self.W1.std():.6f}")
                    print(f"W1: {self.W1}")
                    print(f"W2 mean: {self.W2.mean():.6f}, std: {self.W2.std():.6f}")
                    print(f"W2: {self.W2}")
                    print("Explanation: After calculating the gradients, the weights are updated using the learning rate to minimize the cost function.")

            # The loss is evaluated every epoch so that the early-stopping
            # patience is measured in epochs; it is only printed on reporting
            # epochs.
            output_full = self.forward(X)
            loss = self.compute_cost(output_full, y, print_details=epoch_print)

            if epoch_print:
                accuracy = np.mean((output_full >= 0.5) == y)
                print(f"\nEpoch {epoch + 1}/{epochs}")
                print(f"Loss: {loss:.4f}")
                print(f"Accuracy: {accuracy:.4f}")
                print("-" * 50)
                print("Explanation: The loss and accuracy are metrics that help monitor the performance of the model during training.")

            # Early stopping
            if loss < best_loss:
                best_loss = loss
                patience_counter = 0
            else:
                patience_counter += 1

            if patience_counter >= early_stopping_patience:
                print("Early stopping triggered!")
                print("Explanation: Early stopping is used to prevent overfitting by stopping the training if the loss does not improve for a certain number of epochs.")
                break

# ---------------------------------------------------------------------------
# End-to-end driver: load the breast-cancer dataset, standardise it, train the
# MLP defined above and report its accuracy on a held-out split.
# ---------------------------------------------------------------------------

# Fetch the dataset bundled with scikit-learn
print("Loading and preprocessing data...")
data = load_breast_cancer()
X, y = data.data, data.target

# Report the raw dataset dimensions
print("Dataset Variables:")
print(f"X shape (features): {X.shape}")
print(f"y shape (labels): {y.shape}")
print("Explanation: X contains the input features for each sample, and y contains the corresponding labels indicating the classification.")

# Hold out 20% of the samples for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print("Explanation: The dataset is split into training and testing sets to evaluate the model's performance on unseen data.")

# Standardise features; statistics come from the training split only and are
# then applied unchanged to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Explanation: Feature scaling is done using StandardScaler to normalize the features to have a mean of 0 and a standard deviation of 1.")

# Build the network sized to the feature count, then train it
input_size = X_train.shape[1]
mlp = MLP(
    input_size=input_size,
    hidden_size=16,
    learning_rate=0.001,
    lambda_reg=0.01,
)
mlp.train(X_train_scaled, y_train, epochs=1000, batch_size=32, print_frequency=100)

# Evaluate on the held-out split; labels are reshaped to a column so they
# line up with the (n, 1) prediction array.
print("\nFinal Evaluation:")
y_test_column = y_test.reshape(-1, 1)
test_predictions = mlp.forward(X_test_scaled, print_details=True)
test_loss = mlp.compute_cost(test_predictions, y_test_column, print_details=True)
predicted_positive = test_predictions >= 0.5
test_accuracy = np.mean(predicted_positive == y_test_column)
print(f"Test Accuracy: {test_accuracy:.4f}")
print("Explanation: The test accuracy represents how well the model generalizes to unseen data after training.")


Streaming output truncated to the last 5000 lines.
   5.21379568e-02  2.99835003e-04 -5.80857182e-05 -4.74924466e-04
  -5.41994884e-02 -1.24028089e-03  1.93174978e-02 -1.06656567e-03
  -1.62612124e-03 -4.24168559e-02  1.86446921e-01  6.86067139e-02]
 [ 1.86634119e-04  4.23464538e-03  3.10917005e-02  1.17496639e-03
  -2.70888104e-02 -5.51750347e-04 -3.22405375e-05 -3.01319885e-04
   3.17133734e-02  4.66422964e-04 -1.01898868e-02  6.52337650e-04
  -1.27946785e-03  2.46123776e-02 -9.79134704e-02 -3.54246808e-02]
 [ 4.16730366e-04  4.09117242e-03  5.46315173e-02  1.62207398e-03
  -5.10478267e-02  7.13057773e-04  1.10507795e-04 -1.14934145e-03
   5.57453611e-02 -1.32962978e-03 -1.87004156e-02 -7.37875892e-04
  -1.51109681e-03  4.34723080e-02 -1.80411780e-01 -6.52982402e-02]
 [-1.52809789e-04 -1.49416826e-03  1.51650152e-02  4.43093035e-05
  -1.56199837e-02 -5.28185404e-05 -5.19168381e-04 -1.11292849e-03
   1.60403532e-02 -1.35383289e-03 -5.60060639e-03 -1.28259041e-03
   2.609