# Notebook 07: MLP Fundamentals

Build intuition for Multi-Layer Perceptron (Neural Network) architecture.

## Learning Objectives
- Understand MLP architecture
- Visualize activation functions
- Learn forward pass mechanics
- Train basic MLP classifiers and regressors
- Understand loss curves and convergence

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, make_moons, make_circles, make_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

# Seed NumPy's global RNG so any np.random draws in this notebook are repeatable.
np.random.seed(42)
# Apply the seaborn-flavoured white-grid matplotlib style to every figure below.
plt.style.use('seaborn-v0_8-whitegrid')

## Part 1: Activation Functions

Activation functions introduce non-linearity, allowing neural networks to learn complex patterns.

In [None]:
# Define activation functions
# Shared input grid: 1000 evenly spaced points on [-5, 5], reused by the plotting cells.
x = np.linspace(-5, 5, 1000)

# Activation functions
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    The textbook form overflows inside ``exp(-x)`` for large negative ``x``
    and emits a RuntimeWarning. ``np.logaddexp(0, -x)`` computes
    ``log(1 + exp(-x))`` stably, so exponentiating its negation gives the
    same value with no overflow.

    Args:
        x: Real scalar or NumPy array.

    Returns:
        Sigmoid of ``x`` with values in [0, 1]; same shape as ``x``.
    """
    return np.exp(-np.logaddexp(0, -x))

def tanh(x):
    """Hyperbolic tangent activation; a thin wrapper over ``np.tanh``."""
    activated = np.tanh(x)
    return activated

def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    floor = 0
    return np.maximum(floor, x)

def leaky_relu(x, alpha=0.01):
    """Leaky ReLU: ``x`` where ``x > 0``, otherwise ``alpha * x``."""
    is_positive = x > 0
    leaked = alpha * x
    return np.where(is_positive, x, leaked)

# Derivatives (for understanding backpropagation)
def sigmoid_derivative(x):
    """Derivative of the sigmoid via the identity ``s'(x) = s(x) * (1 - s(x))``."""
    activation = sigmoid(x)
    complement = 1 - activation
    return activation * complement

def tanh_derivative(x):
    """Derivative of tanh: ``1 - tanh(x)**2``."""
    t = np.tanh(x)
    return 1 - t**2

def relu_derivative(x):
    """Slope of ReLU: 1 where ``x > 0``, otherwise 0 (including at ``x == 0``)."""
    active = x > 0
    return np.where(active, 1, 0)

In [None]:
# Visualize each activation next to its derivative on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (panel name, f(x), f'(x)). The Leaky ReLU slope is written inline:
# 1 for x > 0, else 0.01 (the default alpha of leaky_relu).
activations = [
    ('Sigmoid', sigmoid(x), sigmoid_derivative(x)),
    ('Tanh', tanh(x), tanh_derivative(x)),
    ('ReLU', relu(x), relu_derivative(x)),
    ('Leaky ReLU', leaky_relu(x), np.where(x > 0, 1, 0.01)),
]

for ax, (name, values, slopes) in zip(axes.ravel(), activations):
    # Function in solid blue, derivative in dashed red
    ax.plot(x, values, 'b-', lw=2, label=name)
    ax.plot(x, slopes, 'r--', lw=1, label='Derivative')

    # Thin black reference axes through the origin
    ax.axhline(y=0, color='k', linestyle='-', linewidth=0.5)
    ax.axvline(x=0, color='k', linestyle='-', linewidth=0.5)

    ax.set_xlabel('x')
    ax.set_ylabel('f(x)')
    ax.set_title(f'{name} Activation Function')
    ax.legend()
    ax.set_xlim(-5, 5)
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Overlay all four activations on one axis for direct comparison
plt.figure(figsize=(12, 6))

# (function, line format, legend label)
curves = [
    (sigmoid, 'b-', 'Sigmoid'),
    (tanh, 'r-', 'Tanh'),
    (relu, 'g-', 'ReLU'),
    (leaky_relu, 'm-', 'Leaky ReLU'),
]
for fn, fmt, curve_label in curves:
    plt.plot(x, fn(x), fmt, lw=2, label=curve_label)

# Thin black reference axes through the origin
plt.axhline(y=0, color='k', linestyle='-', linewidth=0.5)
plt.axvline(x=0, color='k', linestyle='-', linewidth=0.5)
plt.xlabel('x')
plt.ylabel('f(x)')
plt.title('Activation Functions Comparison')
plt.legend()
plt.xlim(-5, 5)
plt.ylim(-2, 5)  # clip the unbounded ReLU branches so the bounded curves stay readable
plt.grid(True)
plt.show()

# Text summary printed under the figure, one line per entry
property_notes = [
    "Activation Function Properties:",
    "- Sigmoid: Output (0, 1), good for probabilities, vanishing gradient",
    "- Tanh: Output (-1, 1), zero-centered, vanishing gradient",
    "- ReLU: Output [0, ∞), fast, no vanishing gradient, dead neurons",
    "- Leaky ReLU: Prevents dead neurons",
]
print("\n".join(property_notes))

## Part 2: MLP Architecture Basics

In [None]:
# Two interleaving half-moons: a small binary problem that is not linearly separable
X, y = make_moons(n_samples=500, noise=0.2, random_state=42)

# Hold out 20% of the samples for testing
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Standardize using statistics learned from the training split only,
# then apply the same transform to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scatter the raw (unscaled) points, coloured by class label
feature_1, feature_2 = X[:, 0], X[:, 1]
plt.figure(figsize=(8, 6))
plt.scatter(feature_1, feature_2, c=y, cmap='RdYlBu', edgecolors='black')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Classification Data (Moons)')
plt.show()

In [None]:
# Train a simple MLP on the scaled moons data
mlp_params = {
    'hidden_layer_sizes': (10,),  # single hidden layer with 10 neurons
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.001,
    'learning_rate_init': 0.01,
    'max_iter': 500,
    'random_state': 42,
}
mlp = MLPClassifier(**mlp_params)
mlp.fit(X_train_scaled, y_train)

# Collect the reported quantities first, then print the summary
n_features = X_train.shape[1]
n_classes = len(np.unique(y))
train_acc = mlp.score(X_train_scaled, y_train)
test_acc = mlp.score(X_test_scaled, y_test)

print("MLP Architecture:")
print(f"  Input layer: {n_features} features")
print(f"  Hidden layers: {mlp.hidden_layer_sizes}")
print(f"  Output layer: {n_classes} classes")
print("\nTraining:")
print(f"  Iterations: {mlp.n_iter_}")
print(f"  Final loss: {mlp.loss_:.6f}")
print("\nPerformance:")
print(f"  Training accuracy: {train_acc:.4f}")
print(f"  Test accuracy: {test_acc:.4f}")

In [None]:
# Report the shape and size of every weight matrix / bias vector in the fitted network
print("Network Parameters:")

per_layer_counts = []
for layer_no, (weights, biases) in enumerate(zip(mlp.coefs_, mlp.intercepts_), start=1):
    layer_count = weights.size + biases.size
    per_layer_counts.append(layer_count)

    print(f"\nLayer {layer_no}:")
    print(f"  Weights shape: {weights.shape}")
    print(f"  Biases shape: {biases.shape}")
    print(f"  Total parameters: {layer_count}")

# Grand total across all layers
total_params = sum(per_layer_counts)
print(f"\nTotal network parameters: {total_params}")

## Part 3: Effect of Architecture (Number of Neurons)

In [None]:
# Decision boundary function
def plot_decision_boundary(model, X, y, ax, title):
    """Shade a fitted model's predicted regions on ``ax`` and overlay the data.

    Args:
        model: Object exposing ``predict`` for two-column inputs.
        X: 2-D array; columns 0 and 1 become the horizontal and vertical axes.
        y: Per-sample values used to colour the scatter points.
        ax: Matplotlib axes to draw on.
        title: Title text for the axes.
    """
    step = 0.02    # spacing between grid points
    margin = 0.5   # padding added around the data range on each side

    col0, col1 = X[:, 0], X[:, 1]
    grid_x = np.arange(col0.min() - margin, col0.max() + margin, step)
    grid_y = np.arange(col1.min() - margin, col1.max() + margin, step)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Predict for every grid point, then fold the flat result back onto the grid
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = model.predict(grid_points).reshape(xx.shape)

    colormap = plt.cm.RdYlBu
    ax.contourf(xx, yy, Z, alpha=0.8, cmap=colormap)
    ax.scatter(col0, col1, c=y, cmap=colormap, edgecolors='black', s=30)
    ax.set_title(title)

# Sweep the width of a single hidden layer and draw one decision boundary per width
neuron_counts = [1, 2, 5, 10, 50, 100]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for ax, n_neurons in zip(axes, neuron_counts):
    mlp = MLPClassifier(
        hidden_layer_sizes=(n_neurons,),
        activation='relu',
        solver='adam',
        max_iter=500,
        random_state=42,
    )
    mlp.fit(X_train_scaled, y_train)

    # Accuracy is measured on the held-out split; the plot shows the training points
    acc = mlp.score(X_test_scaled, y_test)
    panel_title = f'{n_neurons} neurons\nAcc: {acc:.3f}'
    plot_decision_boundary(mlp, X_train_scaled, y_train, ax, panel_title)

plt.tight_layout()
plt.show()

## Part 4: Effect of Depth (Number of Layers)

In [None]:
# Sweep network depth: each tuple lists the hidden-layer widths, first layer first
architectures = [
    (10,),              # 1 hidden layer
    (20, 10),           # 2 hidden layers
    (30, 20, 10),       # 3 hidden layers
    (50, 30, 20, 10),   # 4 hidden layers
]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

for ax, arch in zip(axes, architectures):
    mlp = MLPClassifier(
        hidden_layer_sizes=arch,
        activation='relu',
        solver='adam',
        max_iter=500,
        random_state=42,
    )
    mlp.fit(X_train_scaled, y_train)

    # Accuracy is measured on the held-out split; the plot shows the training points
    acc = mlp.score(X_test_scaled, y_test)
    panel_title = f'Architecture: {arch}\nAcc: {acc:.3f}'
    plot_decision_boundary(mlp, X_train_scaled, y_train, ax, panel_title)

plt.tight_layout()
plt.show()

## Part 5: Effect of Activation Function

In [None]:
# Fit the same 50-neuron network with each hidden-layer activation
activations = ['identity', 'logistic', 'tanh', 'relu']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.ravel()

for ax, activation in zip(axes, activations):
    mlp = MLPClassifier(
        hidden_layer_sizes=(50,),
        activation=activation,
        solver='adam',
        max_iter=1000,
        random_state=42,
    )
    mlp.fit(X_train_scaled, y_train)

    # The title reports held-out accuracy and how many iterations training ran
    acc = mlp.score(X_test_scaled, y_test)
    panel_title = f'{activation}\nAcc: {acc:.3f}, Iter: {mlp.n_iter_}'
    plot_decision_boundary(mlp, X_train_scaled, y_train, ax, panel_title)

plt.tight_layout()
plt.show()

## Part 6: Loss Curves and Convergence

In [None]:
# Fit a two-hidden-layer MLP and inspect its recorded training loss
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 25),
    activation='relu',
    solver='adam',
    alpha=0.001,
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=42,
)
mlp.fit(X_train_scaled, y_train)

# Same curve twice, side by side: linear y-axis on the left, log y-axis on the right
plt.figure(figsize=(12, 5))

panels = [
    (plt.plot, 'Loss', 'Training Loss Curve'),
    (plt.semilogy, 'Loss (log scale)', 'Training Loss Curve (Log Scale)'),
]
for position, (draw, y_label, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    draw(mlp.loss_curve_, 'b-', linewidth=1)
    plt.xlabel('Iteration')
    plt.ylabel(y_label)
    plt.title(panel_title)
    plt.grid(True)

plt.tight_layout()
plt.show()

final_loss = mlp.loss_curve_[-1]
print(f"Final loss: {final_loss:.6f}")
print(f"Total iterations: {mlp.n_iter_}")

In [None]:
# Overlay the training-loss curves obtained with several initial learning rates
learning_rates = [0.0001, 0.001, 0.01, 0.1]

plt.figure(figsize=(12, 6))

# Settings shared by every run; only learning_rate_init varies
base_config = {
    'hidden_layer_sizes': (50,),
    'activation': 'relu',
    'solver': 'adam',
    'max_iter': 500,
    'random_state': 42,
}

for lr in learning_rates:
    mlp = MLPClassifier(learning_rate_init=lr, **base_config)
    mlp.fit(X_train_scaled, y_train)
    plt.plot(mlp.loss_curve_, label=f'lr={lr}')

plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Effect of Learning Rate on Convergence')
plt.legend()
plt.grid(True)
plt.show()

## Part 7: Solvers (Optimization Algorithms)

In [None]:
# Compare different solvers
solvers = ['sgd', 'adam', 'lbfgs']

plt.figure(figsize=(12, 6))

results = []

for solver in solvers:
    mlp = MLPClassifier(
        hidden_layer_sizes=(50,),
        activation='relu',
        solver=solver,
        learning_rate_init=0.01,  # only used by the 'sgd' and 'adam' solvers
        max_iter=500,
        random_state=42
    )
    mlp.fit(X_train_scaled, y_train)

    # Only the stochastic solvers ('sgd', 'adam') record a per-iteration
    # loss_curve_. With 'lbfgs' the attribute is never set, so reading it
    # unconditionally raises AttributeError. Plot the curve when it exists;
    # otherwise mark the solver's final loss as a horizontal reference line.
    loss_curve = getattr(mlp, 'loss_curve_', None)
    if loss_curve is not None:
        plt.plot(loss_curve, label=f'{solver}')
    else:
        plt.axhline(y=mlp.loss_, color='k', linestyle='--',
                    label=f'{solver} (final loss)')

    results.append({
        'solver': solver,
        'iterations': mlp.n_iter_,
        # loss_ is set by every solver; for sgd/adam it equals loss_curve_[-1]
        'final_loss': mlp.loss_,
        'test_acc': mlp.score(X_test_scaled, y_test)
    })

plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Comparison of Optimization Algorithms')
plt.legend()
plt.grid(True)
plt.show()

print("\nSolver Comparison:")
print(pd.DataFrame(results).to_string(index=False))

## Part 8: MLP for Regression

In [None]:
# Synthetic regression problem: 10 features (5 informative) with added noise
X_reg, y_reg = make_regression(
    n_samples=500,
    n_features=10,
    n_informative=5,
    noise=20,
    random_state=42,
)

# Hold out 20% of the samples for testing
reg_split = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = reg_split

# Standardize features using statistics from the training split only
scaler_reg = StandardScaler()
X_train_reg_scaled = scaler_reg.fit_transform(X_train_reg)
X_test_reg_scaled = scaler_reg.transform(X_test_reg)

# Fit a two-hidden-layer regressor
mlp_reg = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    alpha=0.001,
    learning_rate_init=0.001,
    max_iter=1000,
    random_state=42,
)
mlp_reg.fit(X_train_reg_scaled, y_train_reg)

# Score on the held-out split
y_pred_reg = mlp_reg.predict(X_test_reg_scaled)
r2 = r2_score(y_test_reg, y_pred_reg)
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))

print("MLP Regressor Results:")
print(f"  R² Score: {r2:.4f}")
print(f"  RMSE: {rmse:.4f}")
print(f"  Iterations: {mlp_reg.n_iter_}")

In [None]:
# Three diagnostics for the regressor: fit quality, residuals, training loss
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
ax_fit, ax_resid, ax_loss = axes

# Predicted vs actual, with a dashed y = x reference spanning the target range
lo, hi = y_test_reg.min(), y_test_reg.max()
ax_fit.scatter(y_test_reg, y_pred_reg, alpha=0.5)
ax_fit.plot([lo, hi], [lo, hi], 'r--', lw=2)
ax_fit.set_xlabel('Actual')
ax_fit.set_ylabel('Predicted')
ax_fit.set_title('Predicted vs Actual')

# Residuals (actual minus predicted) against the prediction, with a zero line
residuals = y_test_reg - y_pred_reg
ax_resid.scatter(y_pred_reg, residuals, alpha=0.5)
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted')
ax_resid.set_ylabel('Residuals')
ax_resid.set_title('Residual Plot')

# Training loss recorded during fitting
ax_loss.plot(mlp_reg.loss_curve_)
ax_loss.set_xlabel('Iteration')
ax_loss.set_ylabel('Loss')
ax_loss.set_title('Training Loss')

plt.tight_layout()
plt.show()

## Part 9: Early Stopping

In [None]:
# Hyperparameters shared by both models; only the early-stopping settings differ
shared_config = {
    'hidden_layer_sizes': (100, 50),
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.001,
    'learning_rate_init': 0.001,
    'max_iter': 1000,
    'random_state': 42,
}

# Model 1: early stopping enabled
mlp_early = MLPClassifier(
    early_stopping=True,       # enable early stopping
    validation_fraction=0.1,   # use 10% of training data for validation
    n_iter_no_change=10,       # stop if no improvement for 10 iterations
    **shared_config,
)
mlp_early.fit(X_train_scaled, y_train)

# Model 2: identical network trained without early stopping
mlp_no_early = MLPClassifier(early_stopping=False, **shared_config)
mlp_no_early.fit(X_train_scaled, y_train)

# Left panel: training loss of both runs. Right panel: validation score.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
loss_ax, val_ax = axes

loss_ax.plot(mlp_early.loss_curve_, label=f'Early stop (iter={mlp_early.n_iter_})')
loss_ax.plot(mlp_no_early.loss_curve_, label=f'No early stop (iter={mlp_no_early.n_iter_})')
loss_ax.set_xlabel('Iteration')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training Loss')
loss_ax.legend()
loss_ax.grid(True)

# Validation curve (only available with early stopping)
if hasattr(mlp_early, 'validation_scores_'):
    val_ax.plot(mlp_early.validation_scores_, 'g-', label='Validation')
    # Mark the iteration with the highest validation score
    best_iter = np.argmax(mlp_early.validation_scores_)
    val_ax.axvline(x=best_iter, color='r', linestyle='--', label=f'Best: {best_iter}')
    val_ax.set_xlabel('Iteration')
    val_ax.set_ylabel('Validation Score')
    val_ax.set_title('Validation Score (Early Stopping)')
    val_ax.legend()
    val_ax.grid(True)

plt.tight_layout()
plt.show()

# Iteration count and held-out accuracy for each model
summaries = (
    ("With early stopping", mlp_early),
    ("Without early stopping", mlp_no_early),
)
for heading, model in summaries:
    print(f"\n{heading}:")
    print(f"  Iterations: {model.n_iter_}")
    print(f"  Test accuracy: {model.score(X_test_scaled, y_test):.4f}")

## Part 10: Practical Tips

In [None]:
# Print the rule-of-thumb checklist; an empty string yields a blank separator line
tip_lines = [
    "Common MLP Configurations:",
    "=" * 60,
    "",
    "1. ARCHITECTURE",
    "   - Start with 1-2 hidden layers",
    "   - Neurons: Start with 50-100, tune based on data complexity",
    "   - Pyramid structure often works: (100, 50, 25)",
    "",
    "2. ACTIVATION",
    "   - ReLU: Default choice for hidden layers",
    "   - Tanh: Good for normalized data",
    "",
    "3. SOLVER",
    "   - Adam: Best for most cases",
    "   - LBFGS: Good for small datasets",
    "   - SGD: When you need fine control",
    "",
    "4. REGULARIZATION",
    "   - alpha: L2 penalty (0.0001 to 0.1)",
    "   - early_stopping: Prevents overfitting",
    "",
    "5. LEARNING RATE",
    "   - Start with 0.001 for Adam",
    "   - Reduce if loss oscillates",
    "   - Increase if convergence is too slow",
]
for tip_line in tip_lines:
    print(tip_line)

## Summary

In this notebook, you learned:

### MLP Architecture
- Input layer → Hidden layers → Output layer
- Each layer has neurons with weights and biases
- Activation functions add non-linearity

### Key Components
- **Activation Functions**: ReLU (default), tanh, sigmoid (called `'logistic'` in scikit-learn)
- **hidden_layer_sizes**: Architecture tuple, e.g., (100, 50)
- **Solvers**: Adam (default), SGD, LBFGS

### Training
- Loss curves show convergence
- Early stopping prevents overfitting
- Learning rate affects convergence speed

### Key Takeaways
- Always scale input features
- Start simple, increase complexity as needed
- Use early stopping to prevent overfitting
- Monitor loss curves for debugging

### Next Steps
Continue to **Notebook 08** for comprehensive MLP parameter space exploration!