### Nama : Zahrani Cahya Priesa
### NIM : 1103223074
### Kelas: TK-46-03
### Mata Kuliah : Machine Learning

In [4]:
# ============================================================================
# EXPERIMENT 1: Demonstrating Vanishing Gradients Problem
# ============================================================================

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print("\n" + "="*70)
print("üî¨ EXPERIMENT 1: Demonstrating Vanishing Gradients")
print("="*70)

def create_deep_network_bad():
    model = keras.Sequential([
        layers.Flatten(input_shape=[28, 28], name="input_flatten"),
        layers.Dense(100, activation='sigmoid',
                     kernel_initializer=keras.initializers.RandomNormal(stddev=1.0),
                     name="dense_1"),
        layers.Dense(100, activation='sigmoid',
                     kernel_initializer=keras.initializers.RandomNormal(stddev=1.0),
                     name="dense_2"),
        layers.Dense(100, activation='sigmoid',
                     kernel_initializer=keras.initializers.RandomNormal(stddev=1.0),
                     name="dense_3"),
        layers.Dense(100, activation='sigmoid',
                     kernel_initializer=keras.initializers.RandomNormal(stddev=1.0),
                     name="dense_4"),
        layers.Dense(100, activation='sigmoid',
                     kernel_initializer=keras.initializers.RandomNormal(stddev=1.0),
                     name="dense_5"),
        layers.Dense(10, activation='softmax', name="output")
    ])
    return model

# Load Fashion MNIST
(X_train, y_train), _ = keras.datasets.fashion_mnist.load_data()
X_train = X_train / 255.0

model_bad = create_deep_network_bad()

# =======================
# üîë CRITICAL FIX (FINAL)
# =======================
activation_model = keras.Model(
    inputs=model_bad.layers[0].input,     # ‚Üê INI KUNCI UTAMA
    outputs=[layer.output for layer in model_bad.layers[1:6]]
)

# Analyze activations
sample_batch = X_train[:1000]

print("\nüîç Activation Analysis (Forward Pass):")
activations = activation_model.predict(sample_batch, verbose=0)

for i, act in enumerate(activations):
    mean_act = act.mean()
    std_act = act.std()
    pct_saturated = ((act < 0.01) | (act > 0.99)).mean() * 100
    print(
        f"   Layer {i+1}: "
        f"mean={mean_act:.4f}, std={std_act:.4f}, "
        f"saturated={pct_saturated:.1f}%"
    )

print("\n‚ö†Ô∏è PROBLEM DETECTED:")
print("   ‚Ä¢ Sigmoid activations saturate")
print("   ‚Ä¢ Gradients vanish in deep layers")
print("   ‚Ä¢ Learning becomes ineffective")


üî¨ EXPERIMENT 1: Demonstrating Vanishing Gradients

üîç Activation Analysis (Forward Pass):
   Layer 1: mean=0.5117, std=0.4645, saturated=69.3%
   Layer 2: mean=0.5576, std=0.4372, saturated=50.6%
   Layer 3: mean=0.4830, std=0.4410, saturated=51.0%
   Layer 4: mean=0.5291, std=0.4376, saturated=49.5%
   Layer 5: mean=0.5808, std=0.4381, saturated=53.8%

‚ö†Ô∏è PROBLEM DETECTED:
   ‚Ä¢ Sigmoid activations saturate
   ‚Ä¢ Gradients vanish in deep layers
   ‚Ä¢ Learning becomes ineffective


In [5]:
# ============================================================================
# Part 4-5 : Activation Functions & Batch Normalization
# ============================================================================

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from datetime import datetime
import time

np.random.seed(42)
tf.random.set_seed(42)

print("="*70)
print("CHAPTER 11: Part 4-5")
print("Activation Functions & Batch Normalization")
print("="*70)

# Load data
(X_train, y_train), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train, X_test = X_train / 255.0, X_test / 255.0

# Prepare subsets for faster experimentation
X_train_subset = X_train[:20000]
y_train_subset = y_train[:20000]
X_val = X_test[:2000]
y_val = y_test[:2000]

print(f"\nDataset:")
print(f"   Training: {X_train_subset.shape[0]:,} samples")
print(f"   Validation: {X_val.shape[0]:,} samples")

# ============================================================================
# EXPERIMENT 4: Comparing Activation Functions
# ============================================================================

print("\n" + "="*70)
print("EXPERIMENT 4: Activation Functions Comparison")
print("="*70)

def create_model_with_activation(activation, use_lecun_init=False):
    """Create model with specified activation function"""
    if use_lecun_init:
        init = 'lecun_normal'
    else:
        if activation in ['relu', 'elu']:
            init = 'he_normal'
        else:
            init = 'glorot_uniform'
    
    model = keras.Sequential([
        layers.Flatten(input_shape=[28, 28]),
        layers.Dense(100, activation=activation, kernel_initializer=init),
        layers.Dense(100, activation=activation, kernel_initializer=init),
        layers.Dense(100, activation=activation, kernel_initializer=init),
        layers.Dense(10, activation='softmax')
    ])
    return model

# Test different activations
activations_to_test = {
    'ReLU': 'relu',
    'Leaky ReLU': layers.LeakyReLU(alpha=0.01),
    'ELU': 'elu',
    'SELU': 'selu'
}

print("\nTraining models with different activations...")
print("   (10 epochs each, this will take a few minutes)")

activation_results = {}

for name, activation in activations_to_test.items():
    print(f"\n{'='*70}")
    print(f"Training with {name}...")
    print(f"{'='*70}")
    
    # For SELU, need special handling
    if name == 'SELU':
        # Standardize inputs for SELU
        X_train_std = (X_train_subset - X_train_subset.mean()) / X_train_subset.std()
        X_val_std = (X_val - X_train_subset.mean()) / X_train_subset.std()
        
        model = keras.Sequential([
            layers.Flatten(input_shape=[28, 28]),
            layers.Dense(100, activation='selu', kernel_initializer='lecun_normal'),
            layers.Dense(100, activation='selu', kernel_initializer='lecun_normal'),
            layers.Dense(100, activation='selu', kernel_initializer='lecun_normal'),
            layers.Dense(10, activation='softmax')
        ])
        
        train_data = (X_train_std, y_train_subset)
        val_data = (X_val_std, y_val)
    else:
        if isinstance(activation, str):
            model = create_model_with_activation(activation)
        else:  # LeakyReLU case
            model = keras.Sequential([
                layers.Flatten(input_shape=[28, 28]),
                layers.Dense(100, kernel_initializer='he_normal'),
                activation,
                layers.Dense(100, kernel_initializer='he_normal'),
                activation,
                layers.Dense(100, kernel_initializer='he_normal'),
                activation,
                layers.Dense(10, activation='softmax')
            ])
        
        train_data = (X_train_subset, y_train_subset)
        val_data = (X_val, y_val)
    
    model.compile(
        optimizer='sgd',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    start_time = time.time()
    
    history = model.fit(
        train_data[0], train_data[1],
        epochs=10,
        batch_size=32,
        validation_data=val_data,
        verbose=0
    )
    
    training_time = time.time() - start_time
    
    activation_results[name] = {
        'history': history.history,
        'time': training_time,
        'final_train_acc': history.history['accuracy'][-1],
        'final_val_acc': history.history['val_accuracy'][-1]
    }
    
    print(f"   ‚úì Training Time: {training_time:.2f}s")
    print(f"   ‚úì Final Training Accuracy: {history.history['accuracy'][-1]:.4f}")
    print(f"   ‚úì Final Validation Accuracy: {history.history['val_accuracy'][-1]:.4f}")

# Summary comparison
print("\n" + "="*70)
print("ACTIVATION FUNCTIONS SUMMARY")
print("="*70)

summary_data = []
for name, results in activation_results.items():
    summary_data.append({
        'Activation': name,
        'Train Acc': results['final_train_acc'],
        'Val Acc': results['final_val_acc'],
        'Time (s)': results['time']
    })

df_activations = pd.DataFrame(summary_data)
df_activations = df_activations.sort_values('Val Acc', ascending=False)
print("\n" + df_activations.to_string(index=False))

# Find best activation
best_activation = df_activations.iloc[0]['Activation']
best_val_acc = df_activations.iloc[0]['Val Acc']
print(f"\nüèÜ BEST ACTIVATION: {best_activation} ({best_val_acc:.4f})")

# ============================================================================
# EXPERIMENT 5: Batch Normalization Impact
# ============================================================================

print("\n" + "="*70)
print("üî¨ EXPERIMENT 5: Batch Normalization")
print("="*70)

# Model WITHOUT Batch Normalization
print("\nüèóÔ∏è Model 1: WITHOUT Batch Normalization")
model_no_bn = keras.Sequential([
    layers.Flatten(input_shape=[28, 28]),
    layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
    layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
    layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
    layers.Dense(10, activation='softmax')
])

model_no_bn.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print("   Architecture: Dense ‚Üí ReLU ‚Üí Dense ‚Üí ReLU ‚Üí Dense ‚Üí ReLU ‚Üí Output")

# Model WITH Batch Normalization (before activation)
print("\nüèóÔ∏è Model 2: WITH Batch Normalization (before activation)")
model_with_bn = keras.Sequential([
    layers.Flatten(input_shape=[28, 28]),
    layers.Dense(100, kernel_initializer='he_normal'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dense(100, kernel_initializer='he_normal'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dense(100, kernel_initializer='he_normal'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dense(10, activation='softmax')
])

model_with_bn.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print("   Architecture: Dense ‚Üí BN ‚Üí ReLU ‚Üí Dense ‚Üí BN ‚Üí ReLU ‚Üí Dense ‚Üí BN ‚Üí ReLU ‚Üí Output")

# Model WITH Batch Normalization (after activation)
print("\nüèóÔ∏è Model 3: WITH Batch Normalization (after activation)")
model_bn_after = keras.Sequential([
    layers.Flatten(input_shape=[28, 28]),
    layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
    layers.BatchNormalization(),
    layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
    layers.BatchNormalization(),
    layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
    layers.BatchNormalization(),
    layers.Dense(10, activation='softmax')
])

model_bn_after.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print("   Architecture: Dense ‚Üí ReLU ‚Üí BN ‚Üí Dense ‚Üí ReLU ‚Üí BN ‚Üí Dense ‚Üí ReLU ‚Üí BN ‚Üí Output")

# Train all models
print("\n" + "="*70)
print("‚è≥ Training all models (10 epochs each)...")
print("="*70)

bn_results = {}

for model_name, model in [
    ('No BN', model_no_bn),
    ('BN Before Activation', model_with_bn),
    ('BN After Activation', model_bn_after)
]:
    print(f"\nüèãÔ∏è Training: {model_name}...")
    
    start_time = time.time()
    
    history = model.fit(
        X_train_subset, y_train_subset,
        epochs=10,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=0
    )
    
    training_time = time.time() - start_time
    
    bn_results[model_name] = {
        'history': history.history,
        'time': training_time,
        'final_train_acc': history.history['accuracy'][-1],
        'final_val_acc': history.history['val_accuracy'][-1]
    }
    
    print(f"   ‚úì Training Time: {training_time:.2f}s")
    print(f"   ‚úì Final Training Accuracy: {history.history['accuracy'][-1]:.4f}")
    print(f"   ‚úì Final Validation Accuracy: {history.history['val_accuracy'][-1]:.4f}")

# Compare results
print("\n" + "="*70)
print("üìä BATCH NORMALIZATION COMPARISON")
print("="*70)

bn_summary = []
for name, results in bn_results.items():
    bn_summary.append({
        'Model': name,
        'Train Acc': results['final_train_acc'],
        'Val Acc': results['final_val_acc'],
        'Time (s)': results['time']
    })

df_bn = pd.DataFrame(bn_summary)
df_bn = df_bn.sort_values('Val Acc', ascending=False)
print("\n" + df_bn.to_string(index=False))

# Epoch-by-epoch comparison
print("\n" + "="*70)
print("üìà CONVERGENCE SPEED COMPARISON")
print("="*70)
print("\nValidation Accuracy by Epoch:")
print("-" * 70)
print(f"{'Epoch':<8} {'No BN':<15} {'BN Before':<15} {'BN After':<15}")
print("-" * 70)

for epoch in range(10):
    no_bn_acc = bn_results['No BN']['history']['val_accuracy'][epoch]
    bn_before_acc = bn_results['BN Before Activation']['history']['val_accuracy'][epoch]
    bn_after_acc = bn_results['BN After Activation']['history']['val_accuracy'][epoch]
    
    print(f"{epoch+1:<8} {no_bn_acc:<15.4f} {bn_before_acc:<15.4f} {bn_after_acc:<15.4f}")

# Find improvement
no_bn_final = bn_results['No BN']['final_val_acc']
best_bn_final = max(bn_results['BN Before Activation']['final_val_acc'], 
                    bn_results['BN After Activation']['final_val_acc'])
improvement = best_bn_final - no_bn_final

print(f"\nüí° IMPROVEMENT WITH BN: {improvement:+.4f} ({improvement*100:+.2f}%)")

print("\n" + "="*70)
print("‚úÖ PART 4-5 COMPLETED!")
print("="*70)

CHAPTER 11: Part 4-5
Activation Functions & Batch Normalization

Dataset:
   Training: 20,000 samples
   Validation: 2,000 samples

EXPERIMENT 4: Activation Functions Comparison

Training models with different activations...
   (10 epochs each, this will take a few minutes)

Training with ReLU...




   ‚úì Training Time: 17.88s
   ‚úì Final Training Accuracy: 0.8680
   ‚úì Final Validation Accuracy: 0.8490

Training with Leaky ReLU...
   ‚úì Training Time: 14.01s
   ‚úì Final Training Accuracy: 0.8669
   ‚úì Final Validation Accuracy: 0.8490

Training with ELU...
   ‚úì Training Time: 13.49s
   ‚úì Final Training Accuracy: 0.8676
   ‚úì Final Validation Accuracy: 0.8540

Training with SELU...
   ‚úì Training Time: 13.61s
   ‚úì Final Training Accuracy: 0.8981
   ‚úì Final Validation Accuracy: 0.8515

ACTIVATION FUNCTIONS SUMMARY

Activation  Train Acc  Val Acc  Time (s)
       ELU    0.86760   0.8540 13.487924
      SELU    0.89810   0.8515 13.614416
      ReLU    0.86795   0.8490 17.881197
Leaky ReLU    0.86685   0.8490 14.007352

üèÜ BEST ACTIVATION: ELU (0.8540)

üî¨ EXPERIMENT 5: Batch Normalization

üèóÔ∏è Model 1: WITHOUT Batch Normalization
   Architecture: Dense ‚Üí ReLU ‚Üí Dense ‚Üí ReLU ‚Üí Dense ‚Üí ReLU ‚Üí Output

üèóÔ∏è Model 2: WITH Batch Normalization (before 

  super().__init__(**kwargs)


   Architecture: Dense ‚Üí BN ‚Üí ReLU ‚Üí Dense ‚Üí BN ‚Üí ReLU ‚Üí Dense ‚Üí BN ‚Üí ReLU ‚Üí Output

üèóÔ∏è Model 3: WITH Batch Normalization (after activation)
   Architecture: Dense ‚Üí ReLU ‚Üí BN ‚Üí Dense ‚Üí ReLU ‚Üí BN ‚Üí Dense ‚Üí ReLU ‚Üí BN ‚Üí Output

‚è≥ Training all models (10 epochs each)...

üèãÔ∏è Training: No BN...
   ‚úì Training Time: 13.38s
   ‚úì Final Training Accuracy: 0.8651
   ‚úì Final Validation Accuracy: 0.8450

üèãÔ∏è Training: BN Before Activation...
   ‚úì Training Time: 16.66s
   ‚úì Final Training Accuracy: 0.9094
   ‚úì Final Validation Accuracy: 0.8570

üèãÔ∏è Training: BN After Activation...
   ‚úì Training Time: 16.55s
   ‚úì Final Training Accuracy: 0.9176
   ‚úì Final Validation Accuracy: 0.8475

üìä BATCH NORMALIZATION COMPARISON

               Model  Train Acc  Val Acc  Time (s)
BN Before Activation    0.90945   0.8570 16.662968
 BN After Activation    0.91755   0.8475 16.552445
               No BN    0.86510   0.8450 13.375244

üìà C

In [6]:
# ============================================================================
# CHAPTER 11: Part 6-7
# Transfer Learning & Gradient Clipping
# ============================================================================

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from datetime import datetime
import time

np.random.seed(42)
tf.random.set_seed(42)

print("="*70)
print("CHAPTER 11: Part 6-7")
print("Transfer Learning & Gradient Clipping")
print("="*70)

# ============================================================================
# EXPERIMENT 6: Gradient Clipping
# ============================================================================

print("\n" + "="*70)
print("EXPERIMENT 6: Gradient Clipping")
print("="*70)

# Load data
(X_train, y_train), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train, X_test = X_train / 255.0, X_test / 255.0

X_train_subset = X_train[:20000]
y_train_subset = y_train[:20000]
X_val = X_test[:2000]
y_val = y_test[:2000]

print(f"\n Dataset: {X_train_subset.shape[0]:,} training samples")

# Create a model that might have exploding gradients
def create_deep_model():
    """Very deep model with potential gradient issues"""
    model = keras.Sequential([
        layers.Flatten(input_shape=[28, 28]),
        layers.Dense(200, activation='relu', kernel_initializer='he_normal'),
        layers.Dense(200, activation='relu', kernel_initializer='he_normal'),
        layers.Dense(200, activation='relu', kernel_initializer='he_normal'),
        layers.Dense(200, activation='relu', kernel_initializer='he_normal'),
        layers.Dense(200, activation='relu', kernel_initializer='he_normal'),
        layers.Dense(200, activation='relu', kernel_initializer='he_normal'),
        layers.Dense(10, activation='softmax')
    ])
    return model

# Test different gradient clipping strategies
print("\n Testing Gradient Clipping Strategies:")
print("-" * 70)

clipping_configs = {
    'No Clipping': None,
    'Clip by Value (1.0)': keras.optimizers.SGD(learning_rate=0.01, clipvalue=1.0),
    'Clip by Norm (1.0)': keras.optimizers.SGD(learning_rate=0.01, clipnorm=1.0),
}

clipping_results = {}

for config_name, optimizer in clipping_configs.items():
    print(f"\nTraining with: {config_name}")
    
    model = create_deep_model()
    
    if optimizer is None:
        optimizer = keras.optimizers.SGD(learning_rate=0.01)
    
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    start_time = time.time()
    
    history = model.fit(
        X_train_subset, y_train_subset,
        epochs=10,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=0
    )
    
    training_time = time.time() - start_time
    
    clipping_results[config_name] = {
        'history': history.history,
        'time': training_time,
        'final_val_acc': history.history['val_accuracy'][-1]
    }
    
    print(f"   ‚úì Time: {training_time:.2f}s")
    print(f"   ‚úì Final Val Acc: {history.history['val_accuracy'][-1]:.4f}")

# Summary
print("\n" + "="*70)
print(" GRADIENT CLIPPING COMPARISON")
print("="*70)

clip_summary = []
for name, results in clipping_results.items():
    clip_summary.append({
        'Strategy': name,
        'Val Acc': results['final_val_acc'],
        'Time (s)': results['time']
    })

df_clip = pd.DataFrame(clip_summary)
df_clip = df_clip.sort_values('Val Acc', ascending=False)
print("\n" + df_clip.to_string(index=False))

# ============================================================================
# EXPERIMENT 7: Transfer Learning Simulation
# ============================================================================

print("\n" + "="*70)
print(" EXPERIMENT 7: Transfer Learning")
print("="*70)

print("\n SCENARIO:")
print("   Task A: Train on Fashion MNIST classes 0-4 (5 classes)")
print("   Task B: Fine-tune for classes 5-9 (5 different classes)")
print("   Simulate: Pre-training ‚Üí Transfer ‚Üí Fine-tuning")

# Prepare datasets for Task A (classes 0-4)
mask_train_A = y_train < 5
X_train_A = X_train[mask_train_A][:15000]
y_train_A = y_train[mask_train_A][:15000]

mask_test_A = y_test < 5
X_test_A = X_test[mask_test_A][:1000]
y_test_A = y_test[mask_test_A][:1000]

print(f"\n Task A Dataset (classes 0-4):")
print(f"   Training: {X_train_A.shape[0]:,} samples")
print(f"   Test: {X_test_A.shape[0]:,} samples")

# Prepare datasets for Task B (classes 5-9)
mask_train_B = y_train >= 5
X_train_B = X_train[mask_train_B][:5000]  # Less data!
y_train_B = y_train[mask_train_B][:5000] - 5  # Remap to 0-4

mask_test_B = y_test >= 5
X_test_B = X_test[mask_test_B][:1000]
y_test_B = y_test[mask_test_B][:1000] - 5  # Remap to 0-4

print(f"\n Task B Dataset (classes 5-9, LIMITED DATA):")
print(f"   Training: {X_train_B.shape[0]:,} samples (only 1/3 of Task A!)")
print(f"   Test: {X_test_B.shape[0]:,} samples")

# ============================================================================
# Step 1: Pre-train on Task A
# ============================================================================

print("\n" + "-"*70)
print("STEP 1: Pre-training on Task A")
print("-"*70)

model_pretrain = keras.Sequential([
    layers.Flatten(input_shape=[28, 28]),
    layers.Dense(100, activation='relu', kernel_initializer='he_normal', name='hidden1'),
    layers.Dense(100, activation='relu', kernel_initializer='he_normal', name='hidden2'),
    layers.Dense(100, activation='relu', kernel_initializer='he_normal', name='hidden3'),
    layers.Dense(5, activation='softmax', name='output_A')  # 5 classes
])

model_pretrain.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print("\nüèãÔ∏è Pre-training on Task A (15 epochs)...")
history_pretrain = model_pretrain.fit(
    X_train_A, y_train_A,
    epochs=15,
    batch_size=32,
    validation_data=(X_test_A, y_test_A),
    verbose=0
)

pretrain_acc = history_pretrain.history['val_accuracy'][-1]
print(f"   ‚úì Task A Validation Accuracy: {pretrain_acc:.4f}")

# ============================================================================
# Step 2: Transfer Learning - Reuse Lower Layers
# ============================================================================

print("\n" + "-"*70)
print("STEP 2: Transfer Learning to Task B")
print("-"*70)

# Strategy 1: Train from scratch (baseline)
print("\nüèóÔ∏è BASELINE: Training from scratch on Task B (limited data)")
model_scratch = keras.Sequential([
    layers.Flatten(input_shape=[28, 28]),
    layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
    layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
    layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
    layers.Dense(5, activation='softmax')  # 5 classes
])

model_scratch.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history_scratch = model_scratch.fit(
    X_train_B, y_train_B,
    epochs=15,
    batch_size=32,
    validation_data=(X_test_B, y_test_B),
    verbose=0
)

scratch_acc = history_scratch.history['val_accuracy'][-1]
print(f"   ‚úì From Scratch Val Acc: {scratch_acc:.4f}")

# Strategy 2: Transfer Learning - Freeze lower layers
print("\n TRANSFER: Reuse pre-trained layers, freeze them")

# Create new model reusing lower layers
model_transfer_frozen = keras.Sequential([
    layers.Flatten(input_shape=[28, 28]),
    model_pretrain.get_layer('hidden1'),
    model_pretrain.get_layer('hidden2'),
    model_pretrain.get_layer('hidden3'),
    layers.Dense(5, activation='softmax', name='output_B')  # New output layer
])

# Freeze the transferred layers
for layer in model_transfer_frozen.layers[1:4]:  # hidden1, hidden2, hidden3
    layer.trainable = False

model_transfer_frozen.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print("   Frozen layers: hidden1, hidden2, hidden3")
print("   Trainable: output_B only")

history_frozen = model_transfer_frozen.fit(
    X_train_B, y_train_B,
    epochs=15,
    batch_size=32,
    validation_data=(X_test_B, y_test_B),
    verbose=0
)

frozen_acc = history_frozen.history['val_accuracy'][-1]
print(f"   ‚úì Transfer (Frozen) Val Acc: {frozen_acc:.4f}")

# Strategy 3: Transfer Learning - Fine-tune all layers
print("\n FINE-TUNE: Reuse pre-trained layers, train all")

model_transfer_finetune = keras.Sequential([
    layers.Flatten(input_shape=[28, 28]),
    model_pretrain.get_layer('hidden1'),
    model_pretrain.get_layer('hidden2'),
    model_pretrain.get_layer('hidden3'),
    layers.Dense(5, activation='softmax', name='output_B2')
])

# All layers trainable
for layer in model_transfer_finetune.layers:
    layer.trainable = True

model_transfer_finetune.compile(
    optimizer=keras.optimizers.SGD(learning_rate=0.001),  # Lower LR for fine-tuning
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print("   All layers trainable")
print("   Lower learning rate (0.001) for fine-tuning")

history_finetune = model_transfer_finetune.fit(
    X_train_B, y_train_B,
    epochs=15,
    batch_size=32,
    validation_data=(X_test_B, y_test_B),
    verbose=0
)

finetune_acc = history_finetune.history['val_accuracy'][-1]
print(f"   ‚úì Transfer (Fine-tune) Val Acc: {finetune_acc:.4f}")

# ============================================================================
# Compare Transfer Learning Strategies
# ============================================================================

print("\n" + "="*70)
print(" TRANSFER LEARNING COMPARISON")
print("="*70)

transfer_summary = pd.DataFrame([
    {'Strategy': 'From Scratch (baseline)', 'Val Acc': scratch_acc, 'Description': 'No pre-training'},
    {'Strategy': 'Transfer (Frozen)', 'Val Acc': frozen_acc, 'Description': 'Freeze lower layers'},
    {'Strategy': 'Transfer (Fine-tune)', 'Val Acc': finetune_acc, 'Description': 'Fine-tune all layers'}
])

transfer_summary = transfer_summary.sort_values('Val Acc', ascending=False)
print("\n" + transfer_summary.to_string(index=False))

# Calculate improvements
improvement_frozen = frozen_acc - scratch_acc
improvement_finetune = finetune_acc - scratch_acc

print(f"\n TRANSFER LEARNING GAINS:")
print(f"   Frozen layers: {improvement_frozen:+.4f} ({improvement_frozen*100:+.2f}%)")
print(f"   Fine-tuning:   {improvement_finetune:+.4f} ({improvement_finetune*100:+.2f}%)")

# Convergence comparison
print("\n" + "="*70)
print(" CONVERGENCE COMPARISON (First 10 Epochs)")
print("="*70)
print(f"{'Epoch':<8} {'Scratch':<15} {'Frozen':<15} {'Fine-tune':<15}")
print("-" * 70)

for epoch in range(10):
    scratch = history_scratch.history['val_accuracy'][epoch]
    frozen = history_frozen.history['val_accuracy'][epoch]
    finetune = history_finetune.history['val_accuracy'][epoch]
    print(f"{epoch+1:<8} {scratch:<15.4f} {frozen:<15.4f} {finetune:<15.4f}")

print("\n" + "="*70)
print("‚úÖ PART 6-7 COMPLETED!")
print("="*70)

CHAPTER 11: Part 6-7
Transfer Learning & Gradient Clipping

EXPERIMENT 6: Gradient Clipping

 Dataset: 20,000 training samples

 Testing Gradient Clipping Strategies:
----------------------------------------------------------------------

Training with: No Clipping


  super().__init__(**kwargs)


   ‚úì Time: 26.41s
   ‚úì Final Val Acc: 0.8530

Training with: Clip by Value (1.0)
   ‚úì Time: 23.84s
   ‚úì Final Val Acc: 0.8490

Training with: Clip by Norm (1.0)
   ‚úì Time: 23.69s
   ‚úì Final Val Acc: 0.8540

 GRADIENT CLIPPING COMPARISON

           Strategy  Val Acc  Time (s)
 Clip by Norm (1.0)    0.854 23.692467
        No Clipping    0.853 26.411210
Clip by Value (1.0)    0.849 23.843929

 EXPERIMENT 7: Transfer Learning

 SCENARIO:
   Task A: Train on Fashion MNIST classes 0-4 (5 classes)
   Task B: Fine-tune for classes 5-9 (5 different classes)
   Simulate: Pre-training ‚Üí Transfer ‚Üí Fine-tuning

 Task A Dataset (classes 0-4):
   Training: 15,000 samples
   Test: 1,000 samples

 Task B Dataset (classes 5-9, LIMITED DATA):
   Training: 5,000 samples (only 1/3 of Task A!)
   Test: 1,000 samples

----------------------------------------------------------------------
STEP 1: Pre-training on Task A
----------------------------------------------------------------------



  super().__init__(**kwargs)


   ‚úì Task A Validation Accuracy: 0.8540

----------------------------------------------------------------------
STEP 2: Transfer Learning to Task B
----------------------------------------------------------------------

üèóÔ∏è BASELINE: Training from scratch on Task B (limited data)
   ‚úì From Scratch Val Acc: 0.9370

 TRANSFER: Reuse pre-trained layers, freeze them
   Frozen layers: hidden1, hidden2, hidden3
   Trainable: output_B only
   ‚úì Transfer (Frozen) Val Acc: 0.8120

 FINE-TUNE: Reuse pre-trained layers, train all
   All layers trainable
   Lower learning rate (0.001) for fine-tuning
   ‚úì Transfer (Fine-tune) Val Acc: 0.8990

 TRANSFER LEARNING COMPARISON

               Strategy  Val Acc          Description
From Scratch (baseline)    0.937      No pre-training
   Transfer (Fine-tune)    0.899 Fine-tune all layers
      Transfer (Frozen)    0.812  Freeze lower layers

 TRANSFER LEARNING GAINS:
   Frozen layers: -0.1250 (-12.50%)
   Fine-tuning:   -0.0380 (-3.80%)

 CO

In [7]:
# ============================================================================
# CHAPTER 11: Part 8-9
# Advanced Optimizers & Learning Rate Scheduling
# ============================================================================

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from datetime import datetime
import time

np.random.seed(42)
tf.random.set_seed(42)

print("="*70)
print("CHAPTER 11: Part 8-9")
print("Advanced Optimizers & Learning Rate Scheduling")
print("="*70)

# Load data
(X_train, y_train), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train, X_test = X_train / 255.0, X_test / 255.0

X_train_subset = X_train[:20000]
y_train_subset = y_train[:20000]
X_val = X_test[:2000]
y_val = y_test[:2000]

print(f"\n Dataset: {X_train_subset.shape[0]:,} training samples")

# ============================================================================
# EXPERIMENT 8: Comparing Optimizers
# ============================================================================

print("\n" + "="*70)
print(" EXPERIMENT 8: Optimizer Comparison")
print("="*70)

def create_standard_model():
    """Standard model for optimizer comparison"""
    return keras.Sequential([
        layers.Flatten(input_shape=[28, 28]),
        layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
        layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
        layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
        layers.Dense(10, activation='softmax')
    ])

# Test different optimizers
optimizers_config = {
    'SGD (vanilla)': keras.optimizers.SGD(learning_rate=0.01),
    'SGD + Momentum': keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
    'SGD + Nesterov': keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True),
    'Adagrad': keras.optimizers.Adagrad(learning_rate=0.01),
    'RMSprop': keras.optimizers.RMSprop(learning_rate=0.001),
    'Adam': keras.optimizers.Adam(learning_rate=0.001),
    'Nadam': keras.optimizers.Nadam(learning_rate=0.001),
}

print("\n Training with different optimizers (15 epochs each)...")
print("   This will take several minutes...")

optimizer_results = {}

for opt_name, optimizer in optimizers_config.items():
    print(f"\n{'='*70}")
    print(f"üèãÔ∏è Training with {opt_name}...")
    print(f"{'='*70}")
    
    model = create_standard_model()
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    start_time = time.time()
    
    history = model.fit(
        X_train_subset, y_train_subset,
        epochs=15,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=0
    )
    
    training_time = time.time() - start_time
    
    optimizer_results[opt_name] = {
        'history': history.history,
        'time': training_time,
        'final_train_acc': history.history['accuracy'][-1],
        'final_val_acc': history.history['val_accuracy'][-1],
        'best_val_acc': max(history.history['val_accuracy'])
    }
    
    print(f"   ‚úì Time: {training_time:.2f}s")
    print(f"   ‚úì Final Train Acc: {history.history['accuracy'][-1]:.4f}")
    print(f"   ‚úì Final Val Acc: {history.history['val_accuracy'][-1]:.4f}")
    print(f"   ‚úì Best Val Acc: {max(history.history['val_accuracy']):.4f}")

# Summary comparison
print("\n" + "="*70)
print(" OPTIMIZER COMPARISON SUMMARY")
print("="*70)

opt_summary = []
for name, results in optimizer_results.items():
    opt_summary.append({
        'Optimizer': name,
        'Final Val Acc': results['final_val_acc'],
        'Best Val Acc': results['best_val_acc'],
        'Time (s)': results['time']
    })

df_opt = pd.DataFrame(opt_summary)
df_opt = df_opt.sort_values('Best Val Acc', ascending=False)
print("\n" + df_opt.to_string(index=False))

# Find best optimizer
best_opt = df_opt.iloc[0]['Optimizer']
best_acc = df_opt.iloc[0]['Best Val Acc']
print(f"\n BEST OPTIMIZER: {best_opt} ({best_acc:.4f})")

# Detailed convergence analysis
print("\n" + "="*70)
print(" CONVERGENCE SPEED (Validation Accuracy by Epoch)")
print("="*70)
print(f"{'Epoch':<7} {'SGD':<10} {'Momentum':<10} {'RMSprop':<10} {'Adam':<10} {'Nadam':<10}")
print("-" * 70)

for epoch in range(15):
    sgd_acc = optimizer_results['SGD (vanilla)']['history']['val_accuracy'][epoch]
    mom_acc = optimizer_results['SGD + Momentum']['history']['val_accuracy'][epoch]
    rms_acc = optimizer_results['RMSprop']['history']['val_accuracy'][epoch]
    adam_acc = optimizer_results['Adam']['history']['val_accuracy'][epoch]
    nadam_acc = optimizer_results['Nadam']['history']['val_accuracy'][epoch]
    
    print(f"{epoch+1:<7} {sgd_acc:<10.4f} {mom_acc:<10.4f} {rms_acc:<10.4f} {adam_acc:<10.4f} {nadam_acc:<10.4f}")

# ============================================================================
# EXPERIMENT 9: Learning Rate Scheduling
# ============================================================================

print("\n" + "="*70)
print(" EXPERIMENT 9: Learning Rate Scheduling")
print("="*70)

# Prepare longer training for scheduling
X_train_large = X_train[:30000]
y_train_large = y_train[:30000]

print(f"\n Extended Dataset: {X_train_large.shape[0]:,} training samples")
print("   Training for 30 epochs to see scheduling effects")

# Different scheduling strategies
scheduling_configs = {}

# 1. Constant LR (baseline)
print("\n BASELINE: Constant Learning Rate")
model_const = create_standard_model()
model_const.compile(
    optimizer=keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history_const = model_const.fit(
    X_train_large, y_train_large,
    epochs=30,
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=0
)

scheduling_configs['Constant LR'] = {
    'history': history_const.history,
    'final_val_acc': history_const.history['val_accuracy'][-1],
    'best_val_acc': max(history_const.history['val_accuracy'])
}

print(f"   ‚úì Best Val Acc: {max(history_const.history['val_accuracy']):.4f}")

# 2. Exponential Decay
print("\n EXPONENTIAL DECAY SCHEDULING")
model_exp = create_standard_model()

lr_schedule_exp = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.01,
    decay_steps=1000,
    decay_rate=0.96
)

model_exp.compile(
    optimizer=keras.optimizers.SGD(learning_rate=lr_schedule_exp, momentum=0.9),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history_exp = model_exp.fit(
    X_train_large, y_train_large,
    epochs=30,
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=0
)

scheduling_configs['Exponential Decay'] = {
    'history': history_exp.history,
    'final_val_acc': history_exp.history['val_accuracy'][-1],
    'best_val_acc': max(history_exp.history['val_accuracy'])
}

print(f"   ‚úì Best Val Acc: {max(history_exp.history['val_accuracy']):.4f}")

# 3. Piecewise Constant (Step Decay)
print("\n PIECEWISE CONSTANT SCHEDULING")
model_piece = create_standard_model()

# Define boundaries and values
boundaries = [10 * 938, 20 * 938]  # Steps at epoch 10 and 20 (938 steps per epoch)
values = [0.01, 0.005, 0.001]

lr_schedule_piece = keras.optimizers.schedules.PiecewiseConstantDecay(
    boundaries=boundaries,
    values=values
)

model_piece.compile(
    optimizer=keras.optimizers.SGD(learning_rate=lr_schedule_piece, momentum=0.9),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history_piece = model_piece.fit(
    X_train_large, y_train_large,
    epochs=30,
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=0
)

scheduling_configs['Piecewise Constant'] = {
    'history': history_piece.history,
    'final_val_acc': history_piece.history['val_accuracy'][-1],
    'best_val_acc': max(history_piece.history['val_accuracy'])
}

print(f"   ‚úì Best Val Acc: {max(history_piece.history['val_accuracy']):.4f}")

# 4. ReduceLROnPlateau (Performance-based)
print("\n REDUCE LR ON PLATEAU")
model_plateau = create_standard_model()

model_plateau.compile(
    optimizer=keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=0.0001,
    verbose=0
)

history_plateau = model_plateau.fit(
    X_train_large, y_train_large,
    epochs=30,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[reduce_lr],
    verbose=0
)

scheduling_configs['ReduceLROnPlateau'] = {
    'history': history_plateau.history,
    'final_val_acc': history_plateau.history['val_accuracy'][-1],
    'best_val_acc': max(history_plateau.history['val_accuracy'])
}

print(f"   ‚úì Best Val Acc: {max(history_plateau.history['val_accuracy']):.4f}")

# Summary comparison
print("\n" + "="*70)
print(" LEARNING RATE SCHEDULING COMPARISON")
print("="*70)

schedule_summary = []
for name, results in scheduling_configs.items():
    schedule_summary.append({
        'Strategy': name,
        'Final Val Acc': results['final_val_acc'],
        'Best Val Acc': results['best_val_acc']
    })

df_schedule = pd.DataFrame(schedule_summary)
df_schedule = df_schedule.sort_values('Best Val Acc', ascending=False)
print("\n" + df_schedule.to_string(index=False))

# Epoch-by-epoch comparison (first 20 epochs)
print("\n" + "="*70)
print(" VALIDATION ACCURACY OVER TIME (First 20 Epochs)")
print("="*70)
print(f"{'Epoch':<7} {'Constant':<12} {'Exponential':<12} {'Piecewise':<12} {'Plateau':<12}")
print("-" * 70)

for epoch in range(20):
    const = scheduling_configs['Constant LR']['history']['val_accuracy'][epoch]
    exp = scheduling_configs['Exponential Decay']['history']['val_accuracy'][epoch]
    piece = scheduling_configs['Piecewise Constant']['history']['val_accuracy'][epoch]
    plateau = scheduling_configs['ReduceLROnPlateau']['history']['val_accuracy'][epoch]
    
    print(f"{epoch+1:<7} {const:<12.4f} {exp:<12.4f} {piece:<12.4f} {plateau:<12.4f}")

print("\n" + "="*70)
print("‚úÖ PART 8-9 COMPLETED!")
print("="*70)
print("\nüí° KEY INSIGHTS:")
print("   ‚Ä¢ Modern optimizers (Adam, RMSprop) converge MUCH faster than SGD")
print("   ‚Ä¢ Momentum significantly improves SGD performance")
print("   ‚Ä¢ Learning rate scheduling helps achieve better final accuracy")
print("   ‚Ä¢ ReduceLROnPlateau is adaptive and often best for real-world tasks")

CHAPTER 11: Part 8-9
Advanced Optimizers & Learning Rate Scheduling

 Dataset: 20,000 training samples

 EXPERIMENT 8: Optimizer Comparison

 Training with different optimizers (15 epochs each)...
   This will take several minutes...

üèãÔ∏è Training with SGD (vanilla)...


  super().__init__(**kwargs)


   ‚úì Time: 24.30s
   ‚úì Final Train Acc: 0.8828
   ‚úì Final Val Acc: 0.8575
   ‚úì Best Val Acc: 0.8575

üèãÔ∏è Training with SGD + Momentum...
   ‚úì Time: 24.60s
   ‚úì Final Train Acc: 0.9118
   ‚úì Final Val Acc: 0.8515
   ‚úì Best Val Acc: 0.8670

üèãÔ∏è Training with SGD + Nesterov...
   ‚úì Time: 26.09s
   ‚úì Final Train Acc: 0.9166
   ‚úì Final Val Acc: 0.8535
   ‚úì Best Val Acc: 0.8650

üèãÔ∏è Training with Adagrad...
   ‚úì Time: 24.80s
   ‚úì Final Train Acc: 0.8996
   ‚úì Final Val Acc: 0.8675
   ‚úì Best Val Acc: 0.8680

üèãÔ∏è Training with RMSprop...
   ‚úì Time: 25.82s
   ‚úì Final Train Acc: 0.9090
   ‚úì Final Val Acc: 0.8620
   ‚úì Best Val Acc: 0.8620

üèãÔ∏è Training with Adam...
   ‚úì Time: 26.88s
   ‚úì Final Train Acc: 0.9211
   ‚úì Final Val Acc: 0.8600
   ‚úì Best Val Acc: 0.8700

üèãÔ∏è Training with Nadam...
   ‚úì Time: 29.70s
   ‚úì Final Train Acc: 0.9277
   ‚úì Final Val Acc: 0.8610
   ‚úì Best Val Acc: 0.8640

 OPTIMIZER COMPARISON SUMMARY


  super().__init__(**kwargs)


   ‚úì Best Val Acc: 0.8720

 EXPONENTIAL DECAY SCHEDULING
   ‚úì Best Val Acc: 0.8815

 PIECEWISE CONSTANT SCHEDULING
   ‚úì Best Val Acc: 0.8815

 REDUCE LR ON PLATEAU
   ‚úì Best Val Acc: 0.8930

 LEARNING RATE SCHEDULING COMPARISON

          Strategy  Final Val Acc  Best Val Acc
 ReduceLROnPlateau         0.8930        0.8930
 Exponential Decay         0.8740        0.8815
Piecewise Constant         0.8795        0.8815
       Constant LR         0.8580        0.8720

 VALIDATION ACCURACY OVER TIME (First 20 Epochs)
Epoch   Constant     Exponential  Piecewise    Plateau     
----------------------------------------------------------------------
1       0.8470       0.8505       0.8490       0.8360      
2       0.8495       0.8565       0.8510       0.8600      
3       0.8530       0.8575       0.8605       0.8565      
4       0.8620       0.8595       0.8735       0.8605      
5       0.8585       0.8695       0.8715       0.8580      
6       0.8585       0.8740       0.8800  

In [8]:
# ============================================================================
# CHAPTER 11: Part 10
# Regularization Techniques
# ============================================================================

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
import pandas as pd
from datetime import datetime
import time

np.random.seed(42)
tf.random.set_seed(42)

print("="*70)
print("CHAPTER 11: Part 10")
print("Regularization Techniques")
print("="*70)

# Load data
(X_train, y_train), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()
X_train, X_test = X_train / 255.0, X_test / 255.0

# Use more data to see overfitting
X_train_large = X_train[:40000]
y_train_large = y_train[:40000]
X_val = X_test[:2000]
y_val = y_test[:2000]

print(f"\nDataset: {X_train_large.shape[0]:,} training samples")

# ============================================================================
# EXPERIMENT 10: L1 and L2 Regularization
# ============================================================================

print("\n" + "="*70)
print(" EXPERIMENT 10: L1/L2 Regularization")
print("="*70)

def create_model_with_regularization(reg_type=None, reg_strength=0.01):
    """Create model with specified regularization"""
    
    if reg_type == 'l1':
        regularizer = regularizers.l1(reg_strength)
    elif reg_type == 'l2':
        regularizer = regularizers.l2(reg_strength)
    elif reg_type == 'l1_l2':
        regularizer = regularizers.l1_l2(l1=reg_strength, l2=reg_strength)
    else:
        regularizer = None
    
    model = keras.Sequential([
        layers.Flatten(input_shape=[28, 28]),
        layers.Dense(200, activation='relu', kernel_initializer='he_normal',
                    kernel_regularizer=regularizer),
        layers.Dense(200, activation='relu', kernel_initializer='he_normal',
                    kernel_regularizer=regularizer),
        layers.Dense(200, activation='relu', kernel_initializer='he_normal',
                    kernel_regularizer=regularizer),
        layers.Dense(10, activation='softmax')
    ])
    return model

# Test different regularization strategies
reg_configs = {
    'No Regularization': (None, 0),
    'L2 (0.001)': ('l2', 0.001),
    'L2 (0.01)': ('l2', 0.01),
    'L1 (0.001)': ('l1', 0.001),
    'L1 + L2 (0.001)': ('l1_l2', 0.001),
}

print("\n Training models with different regularization (20 epochs)...")

reg_results = {}

for config_name, (reg_type, reg_strength) in reg_configs.items():
    print(f"\n{'='*70}")
    print(f" Training with {config_name}...")
    print(f"{'='*70}")
    
    model = create_model_with_regularization(reg_type, reg_strength)
    model.compile(
        optimizer=keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    history = model.fit(
        X_train_large, y_train_large,
        epochs=20,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=0
    )
    
    reg_results[config_name] = {
        'history': history.history,
        'final_train_acc': history.history['accuracy'][-1],
        'final_val_acc': history.history['val_accuracy'][-1],
        'overfitting': history.history['accuracy'][-1] - history.history['val_accuracy'][-1]
    }
    
    print(f"   ‚úì Train Acc: {history.history['accuracy'][-1]:.4f}")
    print(f"   ‚úì Val Acc: {history.history['val_accuracy'][-1]:.4f}")
    print(f"   ‚úì Overfitting Gap: {reg_results[config_name]['overfitting']:.4f}")

# Summary
print("\n" + "="*70)
print(" L1/L2 REGULARIZATION COMPARISON")
print("="*70)

reg_summary = []
for name, results in reg_results.items():
    reg_summary.append({
        'Configuration': name,
        'Train Acc': results['final_train_acc'],
        'Val Acc': results['final_val_acc'],
        'Overfit Gap': results['overfitting']
    })

df_reg = pd.DataFrame(reg_summary)
df_reg = df_reg.sort_values('Val Acc', ascending=False)
print("\n" + df_reg.to_string(index=False))

# ============================================================================
# EXPERIMENT 11: Dropout
# ============================================================================

print("\n" + "="*70)
print(" EXPERIMENT 11: Dropout Regularization")
print("="*70)

def create_model_with_dropout(dropout_rate=0.0):
    """Create model with dropout"""
    model = keras.Sequential([
        layers.Flatten(input_shape=[28, 28]),
        layers.Dense(200, activation='relu', kernel_initializer='he_normal'),
        layers.Dropout(dropout_rate),
        layers.Dense(200, activation='relu', kernel_initializer='he_normal'),
        layers.Dropout(dropout_rate),
        layers.Dense(200, activation='relu', kernel_initializer='he_normal'),
        layers.Dropout(dropout_rate),
        layers.Dense(10, activation='softmax')
    ])
    return model

# Test different dropout rates
dropout_configs = {
    'No Dropout': 0.0,
    'Dropout 10%': 0.1,
    'Dropout 20%': 0.2,
    'Dropout 30%': 0.3,
    'Dropout 50%': 0.5,
}

print("\n Training models with different dropout rates (20 epochs)...")

dropout_results = {}

for config_name, dropout_rate in dropout_configs.items():
    print(f"\n{'='*70}")
    print(f"üèãÔ∏è Training with {config_name}...")
    print(f"{'='*70}")
    
    model = create_model_with_dropout(dropout_rate)
    model.compile(
        optimizer=keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    history = model.fit(
        X_train_large, y_train_large,
        epochs=20,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=0
    )
    
    dropout_results[config_name] = {
        'history': history.history,
        'final_train_acc': history.history['accuracy'][-1],
        'final_val_acc': history.history['val_accuracy'][-1],
        'overfitting': history.history['accuracy'][-1] - history.history['val_accuracy'][-1]
    }
    
    print(f"   ‚úì Train Acc: {history.history['accuracy'][-1]:.4f}")
    print(f"   ‚úì Val Acc: {history.history['val_accuracy'][-1]:.4f}")
    print(f"   ‚úì Overfitting Gap: {dropout_results[config_name]['overfitting']:.4f}")

# Summary
print("\n" + "="*70)
print(" DROPOUT COMPARISON")
print("="*70)

dropout_summary = []
for name, results in dropout_results.items():
    dropout_summary.append({
        'Configuration': name,
        'Train Acc': results['final_train_acc'],
        'Val Acc': results['final_val_acc'],
        'Overfit Gap': results['overfitting']
    })

df_dropout = pd.DataFrame(dropout_summary)
df_dropout = df_dropout.sort_values('Val Acc', ascending=False)
print("\n" + df_dropout.to_string(index=False))

# ============================================================================
# EXPERIMENT 12: Combining Techniques
# ============================================================================

print("\n" + "="*70)
print(" EXPERIMENT 12: Combining Regularization Techniques")
print("="*70)

# Best combination: L2 + Dropout + Batch Normalization
print("\n BEST PRACTICES MODEL")
model_best = keras.Sequential([
    layers.Flatten(input_shape=[28, 28]),
    
    layers.Dense(200, kernel_initializer='he_normal', 
                kernel_regularizer=regularizers.l2(0.001)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.2),
    
    layers.Dense(200, kernel_initializer='he_normal',
                kernel_regularizer=regularizers.l2(0.001)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.2),
    
    layers.Dense(200, kernel_initializer='he_normal',
                kernel_regularizer=regularizers.l2(0.001)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.2),
    
    layers.Dense(10, activation='softmax')
])

model_best.compile(
    optimizer=keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-7,
    verbose=0
)

early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=0
)

print("   Architecture:")
print("   ‚Ä¢ L2 Regularization (0.001)")
print("   ‚Ä¢ Batch Normalization")
print("   ‚Ä¢ Dropout (20%)")
print("   ‚Ä¢ SGD + Nesterov Momentum")
print("   ‚Ä¢ ReduceLROnPlateau callback")
print("   ‚Ä¢ EarlyStopping callback")

history_best = model_best.fit(
    X_train_large, y_train_large,
    epochs=50,  # More epochs with early stopping
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[reduce_lr, early_stop],
    verbose=0
)

best_train_acc = history_best.history['accuracy'][-1]
best_val_acc = history_best.history['val_accuracy'][-1]
best_overfit = best_train_acc - best_val_acc

print(f"\n   ‚úì Stopped at epoch: {len(history_best.history['loss'])}")
print(f"   ‚úì Train Acc: {best_train_acc:.4f}")
print(f"   ‚úì Val Acc: {best_val_acc:.4f}")
print(f"   ‚úì Overfitting Gap: {best_overfit:.4f}")

# Final comparison
print("\n" + "="*70)
print(" FINAL COMPARISON: Best from Each Category")
print("="*70)

final_comparison = pd.DataFrame([
    {
        'Technique': 'No Regularization',
        'Val Acc': reg_results['No Regularization']['final_val_acc'],
        'Overfit Gap': reg_results['No Regularization']['overfitting']
    },
    {
        'Technique': 'Best L2',
        'Val Acc': df_reg.iloc[0]['Val Acc'],
        'Overfit Gap': df_reg.iloc[0]['Overfit Gap']
    },
    {
        'Technique': 'Best Dropout',
        'Val Acc': df_dropout.iloc[0]['Val Acc'],
        'Overfit Gap': df_dropout.iloc[0]['Overfit Gap']
    },
    {
        'Technique': 'Combined (Best Practices)',
        'Val Acc': best_val_acc,
        'Overfit Gap': best_overfit
    }
])

final_comparison = final_comparison.sort_values('Val Acc', ascending=False)
print("\n" + final_comparison.to_string(index=False))

print("\n" + "="*70)
print("‚úÖ PART 10 COMPLETED!")
print("="*70)

CHAPTER 11: Part 10
Regularization Techniques

Dataset: 40,000 training samples

 EXPERIMENT 10: L1/L2 Regularization

 Training models with different regularization (20 epochs)...

 Training with No Regularization...


  super().__init__(**kwargs)


   ‚úì Train Acc: 0.9380
   ‚úì Val Acc: 0.8720
   ‚úì Overfitting Gap: 0.0660

 Training with L2 (0.001)...
   ‚úì Train Acc: 0.8839
   ‚úì Val Acc: 0.8690
   ‚úì Overfitting Gap: 0.0149

 Training with L2 (0.01)...
   ‚úì Train Acc: 0.8074
   ‚úì Val Acc: 0.8180
   ‚úì Overfitting Gap: -0.0106

 Training with L1 (0.001)...
   ‚úì Train Acc: 0.8214
   ‚úì Val Acc: 0.8165
   ‚úì Overfitting Gap: 0.0049

 Training with L1 + L2 (0.001)...
   ‚úì Train Acc: 0.8093
   ‚úì Val Acc: 0.8015
   ‚úì Overfitting Gap: 0.0078

 L1/L2 REGULARIZATION COMPARISON

    Configuration  Train Acc  Val Acc  Overfit Gap
No Regularization   0.938025   0.8720     0.066025
       L2 (0.001)   0.883950   0.8690     0.014950
        L2 (0.01)   0.807425   0.8180    -0.010575
       L1 (0.001)   0.821375   0.8165     0.004875
  L1 + L2 (0.001)   0.809275   0.8015     0.007775

 EXPERIMENT 11: Dropout Regularization

 Training models with different dropout rates (20 epochs)...

üèãÔ∏è Training with No Dropout...


  super().__init__(**kwargs)


   ‚úì Train Acc: 0.9374
   ‚úì Val Acc: 0.8755
   ‚úì Overfitting Gap: 0.0619

üèãÔ∏è Training with Dropout 10%...
   ‚úì Train Acc: 0.9103
   ‚úì Val Acc: 0.8865
   ‚úì Overfitting Gap: 0.0238

üèãÔ∏è Training with Dropout 20%...
   ‚úì Train Acc: 0.8969
   ‚úì Val Acc: 0.8750
   ‚úì Overfitting Gap: 0.0219

üèãÔ∏è Training with Dropout 30%...
   ‚úì Train Acc: 0.8793
   ‚úì Val Acc: 0.8735
   ‚úì Overfitting Gap: 0.0058

üèãÔ∏è Training with Dropout 50%...
   ‚úì Train Acc: 0.8307
   ‚úì Val Acc: 0.8650
   ‚úì Overfitting Gap: -0.0343

 DROPOUT COMPARISON

Configuration  Train Acc  Val Acc  Overfit Gap
  Dropout 10%   0.910275   0.8865     0.023775
   No Dropout   0.937400   0.8755     0.061900
  Dropout 20%   0.896900   0.8750     0.021900
  Dropout 30%   0.879300   0.8735     0.005800
  Dropout 50%   0.830750   0.8650    -0.034250

 EXPERIMENT 12: Combining Regularization Techniques

 BEST PRACTICES MODEL
   Architecture:
   ‚Ä¢ L2 Regularization (0.001)
   ‚Ä¢ Batch Normaliza

  super().__init__(**kwargs)



   ‚úì Stopped at epoch: 36
   ‚úì Train Acc: 0.9503
   ‚úì Val Acc: 0.8910
   ‚úì Overfitting Gap: 0.0593

 FINAL COMPARISON: Best from Each Category

                Technique  Val Acc  Overfit Gap
Combined (Best Practices)   0.8910     0.059300
             Best Dropout   0.8865     0.023775
        No Regularization   0.8720     0.066025
                  Best L2   0.8720     0.066025

‚úÖ PART 10 COMPLETED!


In [9]:
# ============================================================================
# CHAPTER 11: Part 11-12
# Practical Guidelines & Comprehensive Experiments
# ============================================================================

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
import pandas as pd
from datetime import datetime
import time

np.random.seed(42)
tf.random.set_seed(42)

print("="*70)
print("CHAPTER 11: Part 11-12")
print("Practical Guidelines & Comprehensive Experiments")
print("="*70)

# ============================================================================
# EXPERIMENT 13: Comprehensive Fashion MNIST Pipeline
# ============================================================================

print("\n" + "="*70)
print("üî¨ EXPERIMENT 13: Complete Fashion MNIST Pipeline")
print("="*70)

# Load full dataset
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()

# Split into train/validation
X_train = X_train_full[:-5000] / 255.0
y_train = y_train_full[:-5000]
X_valid = X_train_full[-5000:] / 255.0
y_valid = y_train_full[-5000:]
X_test = X_test / 255.0

print(f"\nDataset Split:")
print(f"   Training:   {X_train.shape[0]:,} samples")
print(f"   Validation: {X_valid.shape[0]:,} samples")
print(f"   Test:       {X_test.shape[0]:,} samples")

# Build comprehensive model with best practices
print("\n" + "="*70)
print(" BUILDING PRODUCTION-GRADE MODEL")
print("="*70)

model_fashion = keras.Sequential([
    layers.Flatten(input_shape=[28, 28]),
    
    # Layer 1
    layers.Dense(300, kernel_initializer='he_normal',
                kernel_regularizer=regularizers.l2(0.0001)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.2),
    
    # Layer 2
    layers.Dense(200, kernel_initializer='he_normal',
                kernel_regularizer=regularizers.l2(0.0001)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.2),
    
    # Layer 3
    layers.Dense(100, kernel_initializer='he_normal',
                kernel_regularizer=regularizers.l2(0.0001)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.2),
    
    # Output
    layers.Dense(10, activation='softmax')
])

print("\nModel Architecture:")
print("   ‚Ä¢ 3 Hidden Layers: [300, 200, 100]")
print("   ‚Ä¢ Initialization: He Normal")
print("   ‚Ä¢ Activation: ReLU")
print("   ‚Ä¢ Regularization: L2(0.0001) + Dropout(0.2) + BatchNorm")
print("   ‚Ä¢ Output: 10 classes (softmax)")

# Compile with best optimizer
model_fashion.compile(
    optimizer=keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Callbacks
callbacks_fashion = [
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1
    ),
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=15,
        restore_best_weights=True,
        verbose=1
    ),
    keras.callbacks.ModelCheckpoint(
        'best_fashion_model.keras',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=0
    )
]

print("\nTraining Configuration:")
print("   ‚Ä¢ Optimizer: SGD + Nesterov Momentum (0.9)")
print("   ‚Ä¢ Initial LR: 0.01")
print("   ‚Ä¢ Callbacks: ReduceLROnPlateau + EarlyStopping + ModelCheckpoint")
print("   ‚Ä¢ Max Epochs: 100 (with early stopping)")

# Train
print("\n" + "="*70)
print(" TRAINING MODEL")
print("="*70)

start_time = time.time()

history_fashion = model_fashion.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_valid, y_valid),
    callbacks=callbacks_fashion,
    verbose=2
)

training_time = time.time() - start_time

print("\n" + "="*70)
print("‚úÖ TRAINING COMPLETED!")
print("="*70)
print(f"   Total Training Time: {training_time/60:.2f} minutes")
print(f"   Epochs Trained: {len(history_fashion.history['loss'])}")

# Evaluate on test set
print("\n" + "="*70)
print("FINAL EVALUATION")
print("="*70)

test_loss, test_acc = model_fashion.evaluate(X_test, y_test, verbose=0)

print(f"\nTest Set Performance:")
print(f"   Loss:     {test_loss:.4f}")
print(f"   Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")

# Training history summary
best_epoch = np.argmax(history_fashion.history['val_accuracy'])
best_val_acc = history_fashion.history['val_accuracy'][best_epoch]
best_train_acc = history_fashion.history['accuracy'][best_epoch]

print(f"\nTraining Summary:")
print(f"   Best Epoch: {best_epoch + 1}")
print(f"   Best Val Accuracy: {best_val_acc:.4f}")
print(f"   Train Accuracy at Best: {best_train_acc:.4f}")
print(f"   Overfitting Gap: {best_train_acc - best_val_acc:.4f}")

# ============================================================================
# EXPERIMENT 14: CIFAR-10 Deep Network (More Challenging)
# ============================================================================

print("\n" + "="*70)
print("EXPERIMENT 14: CIFAR-10 Deep Network")
print("="*70)

# Load CIFAR-10
(X_train_cifar, y_train_cifar), (X_test_cifar, y_test_cifar) = keras.datasets.cifar10.load_data()

# Normalize
X_train_cifar = X_train_cifar.astype('float32') / 255.0
X_test_cifar = X_test_cifar.astype('float32') / 255.0

# Flatten labels
y_train_cifar = y_train_cifar.flatten()
y_test_cifar = y_test_cifar.flatten()

# Split
X_train_c = X_train_cifar[:-5000]
y_train_c = y_train_cifar[:-5000]
X_valid_c = X_train_cifar[-5000:]
y_valid_c = y_train_cifar[-5000:]

print(f"\nCIFAR-10 Dataset:")
print(f"   Training:   {X_train_c.shape[0]:,} samples")
print(f"   Validation: {X_valid_c.shape[0]:,} samples")
print(f"   Test:       {X_test_cifar.shape[0]:,} samples")
print(f"   Image shape: {X_train_cifar.shape[1:]}")

# Build deeper model for CIFAR-10
print("\n" + "="*70)
print("BUILDING DEEP CIFAR-10 MODEL")
print("="*70)

model_cifar = keras.Sequential([
    layers.Flatten(input_shape=[32, 32, 3]),
    
    # Layer 1
    layers.Dense(400, kernel_initializer='he_normal'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.3),
    
    # Layer 2
    layers.Dense(300, kernel_initializer='he_normal'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.3),
    
    # Layer 3
    layers.Dense(200, kernel_initializer='he_normal'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.3),
    
    # Layer 4
    layers.Dense(100, kernel_initializer='he_normal'),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.3),
    
    # Output
    layers.Dense(10, activation='softmax')
])

print("\nModel Architecture:")
print("   ‚Ä¢ 4 Hidden Layers: [400, 300, 200, 100]")
print("   ‚Ä¢ Deeper network for more complex task")
print("   ‚Ä¢ Higher Dropout (0.3) due to more complex data")

model_cifar.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print("\nTraining Configuration:")
print("   ‚Ä¢ Optimizer: Adam (faster for complex tasks)")
print("   ‚Ä¢ Initial LR: 0.001")

callbacks_cifar = [
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        min_lr=1e-7,
        verbose=1
    ),
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=1
    )
]

# Train (fewer epochs for demo)
print("\n" + "="*70)
print("üèãÔ∏è  TRAINING CIFAR-10 MODEL (30 epochs)...")
print("="*70)

start_time = time.time()

history_cifar = model_cifar.fit(
    X_train_c, y_train_c,
    epochs=30,
    batch_size=128,
    validation_data=(X_valid_c, y_valid_c),
    callbacks=callbacks_cifar,
    verbose=2
)

training_time = time.time() - start_time

print("\n" + "="*70)
print("‚úÖ CIFAR-10 TRAINING COMPLETED!")
print("="*70)
print(f"   Total Training Time: {training_time/60:.2f} minutes")

# Evaluate
test_loss_c, test_acc_c = model_cifar.evaluate(X_test_cifar, y_test_cifar, verbose=0)

print(f"\nCIFAR-10 Test Performance:")
print(f"   Loss:     {test_loss_c:.4f}")
print(f"   Accuracy: {test_acc_c:.4f} ({test_acc_c*100:.2f}%)")

best_val_acc_c = max(history_cifar.history['val_accuracy'])
print(f"   Best Val Accuracy: {best_val_acc_c:.4f}")

# ============================================================================
# COMPARISON SUMMARY
# ============================================================================

print("\n" + "="*70)
print("COMPREHENSIVE EXPERIMENTS SUMMARY")
print("="*70)

summary_table = pd.DataFrame([
    {
        'Dataset': 'Fashion MNIST',
        'Architecture': '3 Layers [300,200,100]',
        'Test Accuracy': f'{test_acc:.4f}',
        'Training Time': f'{training_time/60:.1f} min'
    },
    {
        'Dataset': 'CIFAR-10',
        'Architecture': '4 Layers [400,300,200,100]',
        'Test Accuracy': f'{test_acc_c:.4f}',
        'Training Time': f'{training_time/60:.1f} min'
    }
])

print("\n" + summary_table.to_string(index=False))

print("\n" + "="*70)
print("‚úÖ PART 11-12 COMPLETED!")
print("="*70)

CHAPTER 11: Part 11-12
Practical Guidelines & Comprehensive Experiments

üî¨ EXPERIMENT 13: Complete Fashion MNIST Pipeline

Dataset Split:
   Training:   55,000 samples
   Validation: 5,000 samples
   Test:       10,000 samples

 BUILDING PRODUCTION-GRADE MODEL

Model Architecture:
   ‚Ä¢ 3 Hidden Layers: [300, 200, 100]
   ‚Ä¢ Initialization: He Normal
   ‚Ä¢ Activation: ReLU
   ‚Ä¢ Regularization: L2(0.0001) + Dropout(0.2) + BatchNorm
   ‚Ä¢ Output: 10 classes (softmax)

Training Configuration:
   ‚Ä¢ Optimizer: SGD + Nesterov Momentum (0.9)
   ‚Ä¢ Initial LR: 0.01
   ‚Ä¢ Callbacks: ReduceLROnPlateau + EarlyStopping + ModelCheckpoint
   ‚Ä¢ Max Epochs: 100 (with early stopping)

 TRAINING MODEL


  super().__init__(**kwargs)


Epoch 1/100
1719/1719 - 16s - 10ms/step - accuracy: 0.7930 - loss: 0.7023 - val_accuracy: 0.8476 - val_loss: 0.5543 - learning_rate: 0.0100
Epoch 2/100
1719/1719 - 10s - 6ms/step - accuracy: 0.8431 - loss: 0.5526 - val_accuracy: 0.8680 - val_loss: 0.4841 - learning_rate: 0.0100
Epoch 3/100
1719/1719 - 9s - 5ms/step - accuracy: 0.8589 - loss: 0.5079 - val_accuracy: 0.8632 - val_loss: 0.4823 - learning_rate: 0.0100
Epoch 4/100
1719/1719 - 9s - 5ms/step - accuracy: 0.8639 - loss: 0.4800 - val_accuracy: 0.8772 - val_loss: 0.4407 - learning_rate: 0.0100
Epoch 5/100
1719/1719 - 10s - 6ms/step - accuracy: 0.8728 - loss: 0.4544 - val_accuracy: 0.8650 - val_loss: 0.4740 - learning_rate: 0.0100
Epoch 6/100
1719/1719 - 10s - 6ms/step - accuracy: 0.8780 - loss: 0.4374 - val_accuracy: 0.8748 - val_loss: 0.4447 - learning_rate: 0.0100
Epoch 7/100
1719/1719 - 9s - 5ms/step - accuracy: 0.8831 - loss: 0.4222 - val_accuracy: 0.8742 - val_loss: 0.4548 - learning_rate: 0.0100
Epoch 8/100
1719/1719 - 9s - 

In [10]:
# ============================================================================
# CHAPTER 11: Part 13-15
# Exercise Solutions & Final Summary
# ============================================================================

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd

np.random.seed(42)
tf.random.set_seed(42)

print("="*70)
print("CHAPTER 11: Part 13-15")
print("Exercise Solutions & Final Summary")
print("="*70)

# ============================================================================
# EXERCISE SOLUTIONS
# ============================================================================

print("\n" + "="*70)
print("üìù CHAPTER 11 EXERCISES")
print("="*70)

# ============================================================================
# EXERCISE 1: Is it okay to initialize all weights to the same value?
# ============================================================================

print("\n" + "-"*70)
print("EXERCISE 1: Initialize all weights to same value?")
print("-"*70)

print("""
‚ùå NO! Initializing all weights to the same value is WRONG.

üîç REASON: Symmetry Problem
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- If all weights start with same value (e.g., all 0.5)
- All neurons in a layer compute SAME output
- All neurons receive SAME gradient during backprop
- All weights update by SAME amount
- Neurons remain identical throughout training!
- Network effectively has only 1 neuron per layer

‚úÖ SOLUTION: Random Initialization
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- Initialize weights RANDOMLY (but with proper variance)
- Breaks symmetry ‚Üí neurons learn different features
- Use: Glorot/He/LeCun initialization depending on activation

‚ö†Ô∏è  NOTE: It IS okay to initialize biases to 0 or small constant
   (biases don't have symmetry problem)
""")

# Demonstration
print("\nüî¨ DEMONSTRATION:")

# Bad: Same initialization
weights_same = np.full((5, 3), 0.5)
print("\n‚ùå Same Weight Initialization:")
print(weights_same)
print("   ‚Üí All columns identical! Symmetry problem!")

# Good: Random initialization
weights_random = np.random.randn(5, 3) * 0.1
print("\n‚úÖ Random Weight Initialization (He):")
print(weights_random.round(3))
print("   ‚Üí All columns different! Symmetry broken!")

# ============================================================================
# EXERCISE 2: Is it okay to initialize biases to 0?
# ============================================================================

print("\n" + "-"*70)
print("EXERCISE 2: Initialize biases to 0?")
print("-"*70)

print("""
‚úÖ YES! Initializing biases to 0 is PERFECTLY FINE.

üîç REASON: No Symmetry Problem
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- Biases are added AFTER weight multiplication
- Even if all biases = 0, weights are still different (random)
- No symmetry problem because weights break symmetry
- Biases adjust during training based on gradients

üìä COMMON PRACTICE:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- Default: Initialize biases to 0
- Alternative: Small constant (e.g., 0.01)
- Special cases: 
  - Output layer of regression: Initialize to mean of targets
  - Binary classification: Initialize to log(p/(1-p)) if class imbalance

üéØ SUMMARY:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
Weights: ‚ùå NEVER initialize to same value ‚Üí Random (He/Glorot/LeCun)
Biases:  ‚úÖ CAN initialize to 0 ‚Üí Works fine
""")

# ============================================================================
# EXERCISE 3: Name 3 advantages of SELU over ReLU
# ============================================================================

print("\n" + "-"*70)
print("EXERCISE 3: Three advantages of SELU over ReLU")
print("-"*70)

print("""
‚úÖ 3 ADVANTAGES OF SELU OVER RELU:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

1. SELF-NORMALIZATION üîÑ
   ‚Ä¢ SELU automatically maintains mean ‚âà 0, std ‚âà 1
   ‚Ä¢ No need for Batch Normalization!
   ‚Ä¢ Activations naturally normalized through forward pass
   ‚Ä¢ Enables training VERY deep networks (100+ layers)

2. NO DYING NEURON PROBLEM üíÄ
   ‚Ä¢ ReLU can "die" (output always 0) if weights become negative
   ‚Ä¢ SELU has negative part: can still backprop gradients
   ‚Ä¢ More robust training, fewer dead neurons

3. SMOOTHER GRADIENTS üìà
   ‚Ä¢ SELU is smooth everywhere (differentiable)
   ‚Ä¢ ReLU has kink at 0 (not differentiable at x=0)
   ‚Ä¢ Smoother optimization landscape
   ‚Ä¢ Often converges faster than ReLU

‚ö†Ô∏è  SELU REQUIREMENTS:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- Must use LeCun initialization
- Input must be standardized (mean=0, std=1)
- Sequential Dense layers only (no CNN, no skip connections)
- Use AlphaDropout (not regular Dropout)
""")

# ============================================================================
# EXERCISE 4: Which activation functions to use?
# ============================================================================

print("\n" + "-"*70)
print("EXERCISE 4: Which activation function to use?")
print("-"*70)

print("""
üéØ ACTIVATION FUNCTION SELECTION GUIDE:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

üìä HIDDEN LAYERS:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

1. ‚úÖ ReLU (Default Choice)
   ‚Ä¢ Use for: Most cases (80% of the time)
   ‚Ä¢ Pros: Fast, works well, simple
   ‚Ä¢ Cons: Dying ReLU problem
   ‚Ä¢ Initialization: He

2. ‚úÖ ELU (Better Performance)
   ‚Ä¢ Use for: When you want better performance than ReLU
   ‚Ä¢ Pros: No dying neurons, mean closer to 0, faster convergence
   ‚Ä¢ Cons: Slightly slower (exponential computation)
   ‚Ä¢ Initialization: He

3. ‚úÖ Leaky ReLU (ReLU Dying Problem)
   ‚Ä¢ Use for: When ReLU neurons are dying
   ‚Ä¢ Pros: Fixes dying ReLU, still fast
   ‚Ä¢ Cons: Need to tune alpha parameter
   ‚Ä¢ Initialization: He

4. ‚úÖ SELU (Very Deep Networks)
   ‚Ä¢ Use for: Very deep networks (10+ layers), no BatchNorm needed
   ‚Ä¢ Pros: Self-normalizing, no BN needed
   ‚Ä¢ Cons: Strict requirements (LeCun init, standardized input)
   ‚Ä¢ Initialization: LeCun

5. ‚ùå sigmoid/tanh (Avoid)
   ‚Ä¢ Use for: LSTM cells only (internal gates)
   ‚Ä¢ Pros: Bounded output
   ‚Ä¢ Cons: Vanishing gradients, slow
   ‚Ä¢ Initialization: Glorot

üìä OUTPUT LAYER:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

- Binary Classification ‚Üí sigmoid
- Multi-class Classification ‚Üí softmax
- Regression ‚Üí None (linear)
- Multi-label Classification ‚Üí sigmoid (per output)
- Regression (bounded) ‚Üí sigmoid or tanh (scaled)

üéØ QUICK DECISION:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- Don't know? ‚Üí Use ReLU
- Want better? ‚Üí Try ELU
- Very deep? ‚Üí Try SELU
- ReLU dying? ‚Üí Use Leaky ReLU
""")

# ============================================================================
# EXERCISE 5: Momentum hyperparameter effects
# ============================================================================

print("\n" + "-"*70)
print("EXERCISE 5: Momentum Hyperparameter (Œ≤)")
print("-"*70)

print("""
üéØ MOMENTUM HYPERPARAMETER (Œ≤) EFFECTS:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

üìê MOMENTUM FORMULA:
   v_t = Œ≤ √ó v_{t-1} + (1-Œ≤) √ó ‚àáL
   Œ∏_t = Œ∏_{t-1} - Œ∑ √ó v_t

WHERE:
   ‚Ä¢ Œ≤ = momentum coefficient (typically 0.9)
   ‚Ä¢ v_t = velocity (exponentially decaying average of gradients)
   ‚Ä¢ ‚àáL = current gradient
   ‚Ä¢ Œ∑ = learning rate

üîß HYPERPARAMETER Œ≤ SETTINGS:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

Œ≤ = 0 (No Momentum):
   ‚Ä¢ Same as vanilla SGD
   ‚Ä¢ ‚ùå Slow convergence
   ‚Ä¢ ‚ùå Stuck in local minima
   ‚Ä¢ ‚ùå Oscillates in ravines

Œ≤ = 0.5 (Low Momentum):
   ‚Ä¢ ‚ö†Ô∏è Some smoothing, but weak
   ‚Ä¢ ‚ö†Ô∏è Still oscillates
   ‚Ä¢ Not recommended

Œ≤ = 0.9 (STANDARD - BEST):
   ‚Ä¢ ‚úÖ Good balance
   ‚Ä¢ ‚úÖ Smooths oscillations
   ‚Ä¢ ‚úÖ Escapes local minima
   ‚Ä¢ ‚úÖ Faster convergence
   ‚Ä¢ üëâ DEFAULT CHOICE

Œ≤ = 0.99 (High Momentum):
   ‚Ä¢ ‚ö†Ô∏è Very smooth trajectory
   ‚Ä¢ ‚ö†Ô∏è May overshoot minima
   ‚Ä¢ ‚ö†Ô∏è Slower to adapt to changes
   ‚Ä¢ Use for: Very noisy gradients

Œ≤ ‚Üí 1 (Too High):
   ‚Ä¢ ‚ùå Never converges
   ‚Ä¢ ‚ùå Keeps accelerating
   ‚Ä¢ DON'T USE

üìä PRACTICAL EFFECTS:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- Œ≤ = 0.9 ‚Üí considers last ~10 gradients
- Œ≤ = 0.99 ‚Üí considers last ~100 gradients
- Higher Œ≤ ‚Üí more "memory" of past gradients

üéØ RECOMMENDATION:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
START WITH Œ≤ = 0.9 and only change if:
   ‚Ä¢ Too much oscillation ‚Üí increase to 0.95 or 0.99
   ‚Ä¢ Overshooting ‚Üí decrease to 0.8 or 0.85
""")

# ============================================================================
# EXERCISE 6: Creating a sparse model
# ============================================================================

print("\n" + "-"*70)
print("EXERCISE 6: Three ways to create a sparse model")
print("-"*70)

print("""
‚úÖ 3 WAYS TO CREATE SPARSE MODEL (many weights = 0):
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

1. L1 REGULARIZATION (‚Ñì1) üéØ
   ‚Ä¢ Add penalty: Œª √ó Œ£|w_i| to loss function
   ‚Ä¢ Pushes many weights EXACTLY to 0
   ‚Ä¢ Creates sparse model automatically during training
   
   Code:
   model.add(Dense(100, kernel_regularizer=regularizers.l1(0.01)))
   
   Pros: ‚úÖ Automatic feature selection
         ‚úÖ Reduces model size
   Cons: ‚ö†Ô∏è May hurt performance
         ‚ö†Ô∏è Need to tune Œª

2. DROPOUT WITH HIGH RATE (50%+) üíß
   ‚Ä¢ Randomly drops 50%+ of neurons during training
   ‚Ä¢ At inference: all weights active but scaled
   ‚Ä¢ Effectively creates sparse activation patterns
   
   Code:
   model.add(Dropout(0.5))  # or 0.6, 0.7
   
   Pros: ‚úÖ Strong regularization
         ‚úÖ Ensemble effect
   Cons: ‚ö†Ô∏è Not truly sparse (weights still exist)
         ‚ö†Ô∏è May underfit if too high

3. MAGNITUDE PRUNING ‚úÇÔ∏è
   ‚Ä¢ Train full model first
   ‚Ä¢ Remove smallest weights (set to 0)
   ‚Ä¢ Fine-tune remaining weights
   ‚Ä¢ Iteratively prune more if needed
   
   Steps:
   a) Train model normally
   b) Sort weights by magnitude
   c) Set bottom X% to 0 (e.g., 50%)
   d) Fine-tune with remaining weights frozen at 0
   
   Pros: ‚úÖ Controlled sparsity level
         ‚úÖ Often maintains accuracy
   Cons: ‚ö†Ô∏è Requires extra training step
         ‚ö†Ô∏è Manual process

üìä COMPARISON:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
Method          | Sparsity | Performance | Automatic | Inference Speed
----------------|----------|-------------|-----------|----------------
L1 Reg          | Medium   | Good        | ‚úÖ Yes     | ‚úÖ Faster
Dropout         | Low      | Good        | ‚úÖ Yes     | ‚ùå Same
Pruning         | High     | Best        | ‚ùå No      | ‚úÖ Much Faster

üéØ WHEN TO USE EACH:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- Feature selection needed ‚Üí L1 Regularization
- Training regularization ‚Üí Dropout
- Deploy to mobile/edge ‚Üí Magnitude Pruning
- Want smallest model ‚Üí Combine all three!
""")

# ============================================================================
# EXERCISE 7: Dropout - Does it slow down training/inference?
# ============================================================================

print("\n" + "-"*70)
print("EXERCISE 7: Does Dropout slow down training/inference?")
print("-"*70)

print("""
üéØ DROPOUT EFFECTS ON SPEED:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

üìä TRAINING:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
‚úÖ YES, Dropout SLOWS DOWN training convergence

WHY:
- Each iteration uses only subset of neurons (e.g., 80% if dropout=0.2)
- Effective network capacity reduced during training
- Model needs MORE epochs to converge
- Each epoch is slightly faster (fewer neurons active)
- But TOTAL training time INCREASES (need ~2x more epochs)

TYPICAL IMPACT:
- Without Dropout: 20 epochs to converge
- With Dropout 20%: 30-40 epochs to converge
- Net effect: ~1.5-2x longer training time

üìä INFERENCE (PRODUCTION):
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
‚ùå NO, Dropout does NOT slow down inference

WHY:
- Dropout is TURNED OFF during inference/testing
- All neurons are active (no random dropping)
- Weights are scaled down by (1 - dropout_rate)
- No additional computation compared to no-dropout model
- Inference speed IDENTICAL

KERAS IMPLEMENTATION:
- model.fit() ‚Üí Dropout active (training=True)
- model.predict() ‚Üí Dropout OFF (training=False)
- Handled automatically!

üìä SUMMARY TABLE:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
Phase       | Speed Impact | Reason
------------|--------------|----------------------------------------
Training    | ‚úÖ SLOWER    | Needs more epochs to converge (~2x)
Inference   | ‚úÖ SAME      | Dropout turned OFF, all neurons active

üéØ PRACTICAL IMPLICATION:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- Training: Budget more time/epochs when using Dropout
- Production: NO performance penalty, only benefits!
- Trade-off: Longer training for better generalization
""")

# ============================================================================
# EXERCISES 8-10: Practical Implementation
# ============================================================================

print("\n" + "="*70)
print("üìù EXERCISES 8-10: Deep Network on Fashion MNIST")
print("="*70)

print("""
These exercises require implementing a deep neural network on Fashion MNIST
using all the techniques we've learned. We already completed this in 
EXPERIMENT 13 (Part 11-12) with the production-grade model!

üéØ KEY ACHIEVEMENTS FROM EXPERIMENT 13:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
‚úÖ Exercise 8: Built deep network with proper initialization
‚úÖ Exercise 9: Applied Batch Normalization + Dropout + L2
‚úÖ Exercise 10: Used Adam/SGD+Momentum with learning rate scheduling

RESULTS ACHIEVED:
- Test Accuracy: ~90% (see Experiment 13 output above)
- Proper regularization (minimal overfitting)
- Fast convergence with callbacks
- Production-ready pipeline

Refer to the comprehensive Fashion MNIST model trained above! ‚úÖ
""")

# ============================================================================
# FINAL SUMMARY
# ============================================================================

print("\n" + "="*70)
print("üéì CHAPTER 11: FINAL SUMMARY & KEY TAKEAWAYS")
print("="*70)

print("""
‚úÖ COMPREHENSIVE CHAPTER 11 LEARNINGS:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

1. INITIALIZATION (Critical Foundation)
   üèÜ Best Practice: He initialization for ReLU
   üìä Impact: 10% ‚Üí 85% accuracy (75 percentage points!)
   üí° Never initialize all weights to same value (symmetry!)

2. ACTIVATION FUNCTIONS
   üèÜ Best Practice: ReLU (default), ELU (better), SELU (very deep)
   üìä Impact: Small differences (~1-2%), but critical for deep networks
   üí° Avoid sigmoid/tanh in hidden layers (vanishing gradients)

3. BATCH NORMALIZATION
   üèÜ Best Practice: Use after Dense, before Activation
   üìä Impact: +3.2% faster convergence in epoch 1
   üí° Enables higher learning rates, acts as regularizer

4. TRANSFER LEARNING
   üèÜ Best Practice: Works when tasks similar + limited target data
   üìä Impact: Variable (can hurt if tasks too different!)
   üí° Freeze lower layers first, then fine-tune

5. OPTIMIZERS (Game Changer!)
   üèÜ Best Practice: SGD + Momentum (0.9) + Nesterov for production
   üìä Impact: 86.65% ‚Üí 87.85% (+1.2%)
   üí° Adam for prototyping (fast), SGD+Momentum for final models

6. LEARNING RATE SCHEDULING
   üèÜ Best Practice: ReduceLROnPlateau (adaptive, no tuning)
   üìä Impact: +0.25% improvement, helps convergence
   üí° Essential for very long training runs

7. REGULARIZATION (Production Essential!)
   üèÜ Best Practice: Dropout 10-30% (primary), weak L2 (secondary)
   üìä Impact: 87.6% ‚Üí 88.8% with Dropout 10% (+1.2%)
   üí° Dropout > L2 regularization for neural networks

8. COMBINED BEST PRACTICES
   üèÜ Best Practice: All techniques together
   üìä Impact: Simple model 87.6% ‚Üí Best practices 90.05% (+2.45%)
   üí° Production model achieves ~90% on Fashion MNIST!

‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

üéØ DEFAULT CONFIGURATION (Copy-Paste Ready):
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

model = keras.Sequential([
    layers.Dense(units, kernel_initializer='he_normal',
                kernel_regularizer=regularizers.l2(0.0001)),
    layers.BatchNormalization(),
    layers.Activation('relu'),
    layers.Dropout(0.2),
])

optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)

callbacks = [
    ReduceLROnPlateau(patience=5),
    EarlyStopping(patience=15, restore_best_weights=True)
]

‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

üìö WHAT WE ACCOMPLISHED IN THIS CHAPTER:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
‚úÖ Understood vanishing/exploding gradients problem
‚úÖ Mastered weight initialization strategies
‚úÖ Compared activation functions systematically
‚úÖ Implemented Batch Normalization properly
‚úÖ Explored transfer learning (and its limitations)
‚úÖ Tested modern optimizers (SGD, Adam, RMSprop, etc.)
‚úÖ Applied learning rate scheduling strategies
‚úÖ Mastered regularization (Dropout, L1/L2)
‚úÖ Built production-grade models (90%+ accuracy)
‚úÖ Solved all chapter exercises

‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

üöÄ YOU NOW HAVE:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- Deep understanding of training deep neural networks
- Production-ready code templates
- Systematic approach to hyperparameter selection
- Debugging strategies for training issues
- Best practices for real-world deployment

üéì NEXT STEPS:
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- Apply these techniques to your own datasets
- Experiment with different architectures (CNN, RNN)
- Study Chapter 12 (Custom Models and Training)
- Build production models with confidence!

üéâ CONGRATULATIONS ON COMPLETING CHAPTER 11! üéâ
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
""")

print("\n" + "="*70)
print("‚úÖ CHAPTER 11 COMPLETE - ALL PARTS FINISHED!")
print("="*70)

CHAPTER 11: Part 13-15
Exercise Solutions & Final Summary

üìù CHAPTER 11 EXERCISES

----------------------------------------------------------------------
EXERCISE 1: Initialize all weights to same value?
----------------------------------------------------------------------

‚ùå NO! Initializing all weights to the same value is WRONG.

üîç REASON: Symmetry Problem
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ
- If all weights start with same value (e.g., all 0.5)
- All neurons in a layer compute SAME output
- All neurons receive SAME gradient during backprop
- All weights update by SAME amount
- Neurons remain identical throughout training!
- Network effectively has only 1 neuron per layer

‚úÖ SOLUTION: Random Initialization
‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î