In [17]:
import torch
import torch.nn as nn

In [18]:
# 1. Simulating Vanishing Gradient
# Each layer multiplies the incoming gradient by its (small) weight, so the
# value shrinks geometrically with depth.
weights = torch.tensor([0.1])
gradient = torch.tensor([1.0])

remaining_layers = 50
while remaining_layers:
    gradient = weights * gradient
    remaining_layers -= 1

# 0.1 ** 50 is below the smallest float32 value, so this prints exactly 0.0
print(f"Vanishing Gradient after 50 layers: {gradient.item()}") # Very close to 0

Vanishing Gradient after 50 layers: 0.0


In [19]:
# 2. Simulating Exploding Gradient
# Each layer multiplies the incoming gradient by a weight > 1, so the value
# doubles per layer until float32 overflows to +inf.
weights_large = torch.tensor([2.0])
gradient_large = torch.tensor([1.0])

exploded = False
for i in range(150):
    gradient_large = weights_large * gradient_large
    exploded = bool(torch.isinf(gradient_large))
    if exploded:
        # `i` is the zero-based index of the layer that produced the overflow
        print(f"Gradient Exploded at layer {i}!")
        break

if not exploded:
    print("Gradient not Exploded")

Gradient Exploded at layer 127!


In [20]:
# Gradient clipping for gradient explosion
def clip_by_norm(grad, max_norm):
    """Return a copy of ``grad`` whose L2 norm does not exceed ``max_norm``.

    Args:
        grad: Tensor holding the gradient values.
        max_norm: Upper bound allowed for the L2 norm of the result.

    Returns:
        A new tensor. If ``grad.norm()`` is greater than ``max_norm`` the
        gradient is rescaled to have norm ``max_norm`` (direction preserved);
        otherwise an unchanged copy of ``grad`` is returned.
    """
    grad_norm = grad.norm()  # computed once and reused below
    if grad_norm > max_norm:
        return (grad / grad_norm) * max_norm
    # Already within the limit: nothing to scale, but still return a result so
    # callers never end up with an undefined value.
    return grad.clone()


# Simulating a very large gradient
gradient = torch.tensor([1000.0])
max_norm = 1.0 # Our safety limit

# Logic: If gradient > max_norm, scale it down; otherwise keep it as is
clipped_gradient = clip_by_norm(gradient, max_norm)

# In PyTorch, we use this simple line:
# torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

print(f"Original: {gradient.item()} | Clipped: {clipped_gradient.item()}")

Original: 1000.0 | Clipped: 1.0


In [22]:
# Batch normalization for gradient explosion

# One neural-network block: linear map -> batch norm -> ReLU activation
layer = nn.Sequential(*(
    nn.Linear(in_features=10, out_features=10),
    nn.BatchNorm1d(num_features=10),  # normalizes the linear layer's outputs
    nn.ReLU(),
))

In [23]:
# Weight initialization for gradient explosion

# Logic: scale the random initial weights by the layer's input size
input_dim = 100
output_dim = 50

# He initialization for ReLU layers: std = sqrt(2 / input_dim)
std = torch.tensor(2.0 / input_dim).sqrt()
weights = std * torch.randn(input_dim, output_dim)