In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Define a feed-forward network (by default: 10 layers of constant width 2).
class FeedForwardNN(nn.Module):
    """Fully-connected ReLU network that returns raw logits.

    With the default arguments this builds ten ``nn.Linear(2, 2)`` layers,
    created in order, with ReLU applied after every layer except the last.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_features: Width of every hidden layer.
        out_features: Number of logits produced by the final layer.
        num_layers: Total number of linear layers; must be at least 1.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """

    def __init__(self, in_features=2, hidden_features=2, out_features=2, num_layers=10):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")
        # Layer i maps widths[i] -> widths[i + 1]; only the two ends may differ
        # from the hidden width.
        widths = [in_features] + [hidden_features] * (num_layers - 1) + [out_features]
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])]
        )

    def forward(self, x):
        """Run ``x`` through the stack and return the final layer's logits."""
        # ReLU follows every layer except the last, which stays linear so the
        # output can be fed to a logit-based loss.
        for layer in self.layers[:-1]:
            x = torch.relu(layer(x))
        return self.layers[-1](x)

# Helper shared by the "initial" and "final" debug dumps in train_model.
def _print_param_norms(model, header):
    """Print ``header`` followed by the norm of every named parameter of ``model``."""
    print(header)
    for name, param in model.named_parameters():
        # detach() reads the current values without recording the op in autograd.
        print(f"{name}: {param.detach().norm().item():.4f}")


# Training loop using a DataLoader with added debugging prints.
def train_model(model, dataloader, epochs=100, lr=0.01):
    """Train ``model`` with Adam and cross-entropy loss, printing debug info.

    Prints the parameter norms before and after training, the gradient norm of
    every parameter for the first batch of each epoch, and the average
    per-sample loss every 20th epoch.

    Args:
        model: Module whose forward pass returns class logits.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        epochs: Number of passes over ``dataloader``.
        lr: Learning rate handed to ``optim.Adam``.

    Returns:
        None. The model is updated in place.
    """
    criterion = nn.CrossEntropyLoss()  # Labels are class indices.
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Debug: Print initial parameter norms.
    _print_param_norms(model, "Initial parameter norms:")

    for epoch in range(epochs):
        running_loss = 0.0
        samples_seen = 0
        for i, (inputs, labels) in enumerate(dataloader):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()

            # Debug: Print gradient norms for the first batch in each epoch.
            if i == 0:
                for name, param in model.named_parameters():
                    if param.grad is not None:
                        print(f"Epoch {epoch+1} {name} grad norm: {param.grad.norm().item():.4f}")

            optimizer.step()

            # The criterion returns a batch mean, so weight it by the batch size
            # to accumulate a per-sample sum.
            batch_size = inputs.size(0)
            running_loss += loss.item() * batch_size
            samples_seen += batch_size

        # Normalise by the samples actually processed, not len(dataset): the two
        # differ when the loader drops or subsamples data, and some datasets
        # have no length at all. An empty epoch reports NaN instead of raising.
        epoch_loss = running_loss / samples_seen if samples_seen else float("nan")
        if (epoch + 1) % 20 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

    # Debug: Print final parameter norms.
    _print_param_norms(model, "\nFinal parameter norms:")

# Synthetic two-class dataset used to build the example DataLoader.
def generate_data(n_samples=1000):
    """Draw random 2-D points and label them by the sign of their coordinate sum.

    Args:
        n_samples: Number of points to draw.

    Returns:
        A ``(features, labels)`` tuple: ``features`` is an ``(n_samples, 2)``
        tensor from ``torch.randn`` and ``labels`` is an ``(n_samples,)`` long
        tensor holding 1 where the two coordinates sum to >= 0, else 0.
    """
    features = torch.randn(n_samples, 2)
    coordinate_sum = features.sum(dim=1)
    labels = torch.ge(coordinate_sum, 0).to(torch.long)
    return features, labels

# Driver: seed the RNG, build the model and the synthetic dataset, then train.
# The names bound here (notably `dataloader`) are module-level and are read by
# the next cell, so they must keep these exact names.
if __name__ == "__main__":
    # Seed first: the data generation below draws from torch's global RNG, so
    # the order of the following statements affects the generated values.
    torch.manual_seed(42)
    
    # Create model and data.
    model = FeedForwardNN()
    X, y = generate_data(n_samples=1000)
    dataset = TensorDataset(X, y)
    # Mini-batches of 32, reshuffled on every pass over the data.
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    
    # Train the model.
    train_model(model, dataloader, epochs=200, lr=0.01)


Initial parameter norms:
layers.0.weight: 1.0422
layers.0.bias: 0.2106
layers.1.weight: 0.9740
layers.1.bias: 0.6287
layers.2.weight: 0.6390
layers.2.bias: 0.5550
layers.3.weight: 0.5045
layers.3.bias: 0.5500
layers.4.weight: 0.7990
layers.4.bias: 0.7016
layers.5.weight: 1.0395
layers.5.bias: 0.4936
layers.6.weight: 0.6279
layers.6.bias: 0.2699
layers.7.weight: 0.8659
layers.7.bias: 0.4273
layers.8.weight: 0.9377
layers.8.bias: 0.7942
layers.9.weight: 0.4212
layers.9.bias: 0.7474
Epoch 1 layers.0.weight grad norm: 0.0003
Epoch 1 layers.0.bias grad norm: 0.0002
Epoch 1 layers.1.weight grad norm: 0.0005
Epoch 1 layers.1.bias grad norm: 0.0004
Epoch 1 layers.2.weight grad norm: 0.0005
Epoch 1 layers.2.bias grad norm: 0.0007
Epoch 1 layers.3.weight grad norm: 0.0021
Epoch 1 layers.3.bias grad norm: 0.0022
Epoch 1 layers.4.weight grad norm: 0.0010
Epoch 1 layers.4.bias grad norm: 0.0066
Epoch 1 layers.5.weight grad norm: 0.0002
Epoch 1 layers.5.bias grad norm: 0.0121
Epoch 1 layers.6.weight

In [2]:
# Peek at a single mini-batch to sanity-check what the loader yields.
# A fresh iterator is created, so with shuffle=True each run shows a new batch.
data = next(iter(dataloader), None)
if data is not None:
    print(data)

[tensor([[-0.9773, -1.5335],
        [ 0.3004,  1.6395],
        [-2.2940,  0.7744],
        [ 0.0089, -1.5551],
        [-2.0043,  0.0055],
        [-1.3776, -0.4410],
        [ 0.9364,  0.7122],
        [-0.7420,  0.1556],
        [-0.2590,  0.4479],
        [-0.6864,  0.0041],
        [-0.7917,  0.4702],
        [-0.0252,  0.3789],
        [ 0.3120,  0.7174],
        [-0.7939,  0.3752],
        [ 0.1070, -0.1015],
        [ 0.3488,  0.9676],
        [ 0.6588, -0.2967],
        [ 0.5058, -1.1371],
        [-0.3637, -1.2479],
        [-0.4828, -1.0801],
        [-0.1933,  0.6526],
        [-0.1722,  0.5238],
        [-1.6535,  0.6814],
        [ 1.3695, -0.2519],
        [-0.8704,  0.0305],
        [-0.1718, -0.6867],
        [-0.1727,  0.6042],
        [-0.1341, -1.0408],
        [-0.1693,  0.2332],
        [ 1.8890,  0.5935],
        [-0.1260, -0.6126],
        [-0.7054,  1.1545]]), tensor([0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 1, 0, 1