# 06_Batch_Normalization
* https://www.youtube.com/watch?v=1U5nOKh9OLQ&list=PLjy4p-07OYzuy_lHcRW8lPTLPTTOmUpmi&index=20
* https://github.com/jeffheaton/app_deep_learning/blob/main/t81_558_class_04_4_batch_norm.ipynb

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
import copy
import torch

# Detect whether this notebook is running inside Google Colab by probing for
# the `google.colab` package, which is only importable there.
try:
    import google.colab  # noqa: F401  (imported only as a probe)
except ImportError:
    # Catch only the "package missing" case so unrelated errors (including
    # KeyboardInterrupt) are not silently swallowed.
    print("Note: not using Google CoLab")
    COLAB = False
else:
    COLAB = True
    print("Note: using Google CoLab")

# Make use of a GPU or MPS (Apple) if one is available.  (see module 3.2)
import torch

# is_available() checks that an MPS device can actually be used on this
# machine; is_built() only says the PyTorch binary was compiled with MPS
# support, which would select "mps" on hardware that cannot run it.
has_mps = torch.backends.mps.is_available()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Note: using Google CoLab
Using device: cpu


In [3]:
# Early Stopping
class EarlyStopping:
    """Signal when training should stop because validation loss stalled.

    Call the instance once per epoch with the model and its validation loss.
    It returns ``True`` once ``patience`` consecutive calls have failed to
    beat the best loss by more than ``min_delta``; otherwise ``False``.

    Attributes:
        patience: Number of consecutive non-improving calls that triggers a stop.
        min_delta: Margin by which the loss must drop to count as an improvement.
        restore_best_weights: If true, reload the best snapshot when stopping.
        best_model: Deep copy of the ``state_dict`` taken at the best loss.
        best_loss: Lowest validation loss seen so far (``None`` before any call).
        counter: Current run of consecutive non-improving calls.
        status: Human-readable description of the latest decision.
    """

    def __init__(self, patience=5, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_model = None
        self.best_loss = None
        self.counter = 0
        self.status = ""

    def _remember(self, model, val_loss):
        """Record ``val_loss`` as the best so far and snapshot the weights."""
        self.best_loss = val_loss
        # Deep copy so later training steps cannot mutate the snapshot.
        self.best_model = copy.deepcopy(model.state_dict())

    def __call__(self, model, val_loss):
        """Update the tracker with this epoch's loss; return True to stop."""
        # Very first call: there is nothing to compare against yet.
        if self.best_loss is None:
            self._remember(model, val_loss)
            return False

        improved = self.best_loss - val_loss > self.min_delta
        if improved:
            self._remember(model, val_loss)
            self.counter = 0
            self.status = f"Improvement found, counter reset to {self.counter}"
            return False

        self.counter += 1
        if self.counter < self.patience:
            self.status = f"No improvement in the last {self.counter} epochs"
            return False

        # Patience exhausted: optionally roll the model back, then stop.
        self.status = f"Early stopping after {self.counter} epochs"
        if self.restore_best_weights:
            model.load_state_dict(self.best_model)
        return True

The goal of batch normalization is to accelerate learning and stabilize the learning process. We use the same dataset used previously in this chapter. To implement batch normalization we will modify both the preprocessing and the neural network setup. The original preprocessing method used a z-score standardization to make the data more suited for the training process. The model itself was a simple feed-forward neural network built with PyTorch's nn.Sequential API.

<br>

In the revised code, we've made two major changes. The first change was in data preprocessing. Since batch normalization can reduce the impact of input distribution changes (referred to as **internal covariate(共変量) shift**), we can remove the step of z-score normalization. Batch normalization tends to make the network less sensitive to the scale and distribution of its inputs, thereby minimizing the need for manual, meticulous(几帳面な) data normalization.

<br>

The second, and most crucial, change was made in the architecture of the neural network itself. We've inserted batch normalization layers into our model by using the **nn.BatchNorm1d()** function. It's important to note that the batch normalization layers are typically <u>added after linear (or convolutional for ConvNets) layers but before the activation function</u>. In our case, the sequence is: Linear -> BatchNorm -> ReLU.

<br>


The batch normalization layers normalize the activations and gradients propagating through a neural network, making the model training more efficient. This can even have a slight regularization effect, somewhat akin(~と同じ) to Dropout.

<br>

Remember that the use of batch normalization may require some additional computational resources due to additional complexity of the model, but it often results in a significant performance boost that more than compensates for the extra computation time.

<br>

In summary, the introduction of batch normalization in our neural network model simplifies preprocessing and can potentially improve model training speed, stability, and overall performance.




In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold

# Read the data set, treating 'NA' and '?' cells as missing values
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=["NA", "?"],
)

# One-hot encode each categorical column in turn: append its integer dummy
# columns (prefixed with the column name), then drop the original column.
for categorical in ("job", "area", "product"):
    one_hot = pd.get_dummies(df[categorical], prefix=categorical, dtype=int)
    df = pd.concat([df, one_hot], axis=1)
    df.drop(categorical, axis=1, inplace=True)

# Missing values for income are replaced by the column median
med = df["income"].median()
df["income"] = df["income"].fillna(med)

# Convert to PyTorch Tensors: every column except the target ('age') and
# 'id' is a feature; the target becomes a single-column tensor.
x_columns = df.columns.drop(["age", "id"])
x = torch.tensor(df[x_columns].values, dtype=torch.float32, device=device)
y = torch.tensor(df["age"].values, dtype=torch.float32, device=device).view(-1, 1)

df.head()

Unnamed: 0,id,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,...,area_b,area_c,area_d,product_a,product_b,product_c,product_d,product_e,product_f,product_g
0,1,50876.0,13.1,1,9.017895,35,11.738935,49,0.885827,0.492126,...,0,1,0,0,1,0,0,0,0,0
1,2,60369.0,18.625,2,7.766643,59,6.805396,51,0.874016,0.34252,...,0,1,0,0,0,1,0,0,0,0
2,3,55126.0,34.766667,1,3.632069,6,13.671772,44,0.944882,0.724409,...,0,1,0,0,1,0,0,0,0,0
3,4,51690.0,15.808333,1,5.372942,16,4.333286,50,0.889764,0.444882,...,0,1,0,0,1,0,0,0,0,0
4,5,28347.0,40.941667,3,3.822477,20,5.967121,38,0.744094,0.661417,...,0,0,1,1,0,0,0,0,0,0


In [7]:
# Set random seed for reproducibility
torch.manual_seed(42)

# Cross-Validate
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Early stopping parameters
patience = 10

# Upper bound on the number of epochs trained per fold.
EPOCHS = 500

for fold, (train_idx, test_idx) in enumerate(kf.split(x), start=1):
    print(f"Fold {fold}")

    x_train, x_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # PyTorch DataLoader
    train_dataset = TensorDataset(x_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Create a fresh model and optimizer for every fold.
    # Each hidden block is Linear -> BatchNorm -> ReLU.
    # The model is moved to `device` because x and y were created there;
    # without this, a CUDA/MPS run fails with a device-mismatch error.
    model = nn.Sequential(
        nn.Linear(x.shape[1], 20),
        nn.BatchNorm1d(20),
        nn.ReLU(),
        nn.Linear(20, 10),
        nn.BatchNorm1d(10),
        nn.ReLU(),
        nn.Linear(10, 1)
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.01)
    loss_fn = nn.MSELoss()

    # Training loop: stops at EPOCHS or as soon as early stopping fires.
    es = EarlyStopping(patience=patience)
    for epoch in range(1, EPOCHS + 1):
        # train() makes BatchNorm use per-batch statistics.
        model.train()
        for x_batch, y_batch in train_loader:
            optimizer.zero_grad()
            output = model(x_batch)
            loss = loss_fn(output, y_batch)
            loss.backward()
            optimizer.step()

        # Validation on the held-out fold; eval() switches BatchNorm to its
        # running statistics.
        model.eval()
        with torch.no_grad():
            val_output = model(x_test)
            val_loss = loss_fn(val_output, y_test)

        # Early stopping
        if es(model, val_loss):
            break

    print(f"Epoch {epoch}/{EPOCHS}, Validation Loss: "
      f"{val_loss.item()}, {es.status}")


Fold 1
Epoch 14/500, Validation Loss: 164.2581024169922, Early stopping after 10 epochs
Fold 2
Epoch 18/500, Validation Loss: 38.37013244628906, Early stopping after 10 epochs
Fold 3
Epoch 16/500, Validation Loss: 62.91709518432617, Early stopping after 10 epochs
Fold 4
Epoch 19/500, Validation Loss: 29.32093048095703, Early stopping after 10 epochs
Fold 5
Epoch 21/500, Validation Loss: 342.0139465332031, Early stopping after 10 epochs


In [9]:
# Final evaluation
# Scores whatever `model`, `x_test` and `y_test` currently hold (the last
# fold when run right after the cross-validation cell). Runs in eval mode
# with gradient tracking disabled, and reports the square root of the loss.
model.eval()
with torch.no_grad():
    oos_pred = model(x_test)
    oos_loss = loss_fn(oos_pred, y_test)
score = torch.sqrt(oos_loss).item()
print(f"Fold score (RMSE): {score}")

Fold score (RMSE): 3.636986494064331


Batch normalization can often improve the performance of a model, but it's not a guarantee. It can depend on many factors such as the specific problem being addressed, the data, the architecture of the model, and the specific training regime.

* Dataset: Batch normalization is particularly useful when dealing with high dimensional data and tends to be more effective with larger datasets. If your dataset is small or simple, batch normalization may not make a significant difference and might even cause the performance to decrease slightly.
* Model Complexity: In simpler models, batch normalization may not lead to a significant performance boost, and might even degrade the performance slightly, as it introduces extra complexity and computation.
* Training Parameters: The improvement due to batch normalization also depends on the other hyperparameters you're using, like the learning rate and the batch size. If the original model was already well-optimized with these parameters, the addition of batch normalization may not help much, and could potentially even harm performance.


In other words, while batch normalization is generally a useful technique for improving model performance and stability, it's not a silver bullet and it won't always lead to an improvement. It's always a good idea to experiment with different architectures and techniques to see what works best for your specific problem.

# Internal covariate shift

**Internal covariate shift** is a concept in deep learning that refers to the problem when the distribution of inputs to each layer of a neural network changes during training. This issue can make the training process slower and less stable, because each layer needs to continuously adapt to new distributions of input data.

<br>

Here's how it happens: as you train a network, the weights and biases of earlier layers change. Because the output of one layer is the input to the next, these changes can lead to significant variations in the input distributions received by deeper layers. This forces the layers to constantly readjust to new distributions, which can slow down the learning process and make it harder to converge to an optimal set of weights.

<br>

To mitigate this problem, a technique called Batch Normalization was introduced. It normalizes the inputs of each layer to have zero mean and unit variance, effectively stabilizing the distributions of inputs that each layer receives. This helps to accelerate the training process and improve the performance of deep neural networks.

# Explanation by GPT4
Batch normalization is a technique used to improve the training of deep neural networks by stabilizing the distributions of layer inputs. Here's a breakdown of how it works and how it can be implemented:

## How Batch Normalization Works
1. **Normalization**: For each mini-batch during training, batch normalization normalizes the activations of the previous layer for each neuron. This is achieved by subtracting the mini-batch mean and dividing by the mini-batch standard deviation. Mathematically, if $x$ is an input to a neuron, its normalized value $\hat{x}$ would be:

    $
    \hat{x} = \frac{x - \mu_{\text{B}}}{\sqrt{\sigma_{\text{B}}^2 + \epsilon}}
    $

   where $\mu_{\text{B}}$ and $ \sigma_{\text{B}}^2 $ are the mean and variance of the mini-batch, and $\epsilon$ is a small constant added for numerical stability.
   
2. **Scaling and Shifting**: After normalization, batch normalization introduces two learnable parameters, gamma (γ) and beta (β), which allow the model to scale and shift the normalized values. This helps the model to retain the representational power of the network, even after normalization. The output of batch normalization is:

    $
    y = \gamma \hat{x} + \beta
    $

    where $y$ is the output that goes into the next layer.

3. **During Inference**: When the model is used for inference (i.e., making predictions on new data), the mean and variance used for normalization are not computed based on the individual batch. Instead, they are estimated from the entire training dataset, typically using an exponential moving average accumulated during the training phase.


## Benefits of Batch Normalization
* **Improves Gradient Flow**: Helps mitigate the problem of vanishing or exploding gradients.
* **Allows Higher Learning Rates**: Stabilizing the distribution of inputs allows for using higher learning rates without the risk of divergence.
* **Reduces Overfitting**: Acts as a form of regularization, slightly reducing the need for other regularization techniques like dropout.

## Implementation in PyTorch
In PyTorch, batch normalization is straightforward to implement using the **torch.nn.BatchNorm1d** (for 1D data) or **torch.nn.BatchNorm2d** (for images) modules.

In [None]:
import torch
import torch.nn as nn

class SimpleNet(nn.Module):
    """Minimal example network: Linear(10 -> 20), BatchNorm1d(20), ReLU."""

    def __init__(self):
        super().__init__()
        in_features, hidden = 10, 20
        self.layer1 = nn.Linear(in_features, hidden)
        self.bn1 = nn.BatchNorm1d(hidden)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the linear layer, then batch norm, then the ReLU activation."""
        # Batch norm sits between the linear layer and the activation.
        normalized = self.bn1(self.layer1(x))
        return self.relu(normalized)

# Example usage: push one random batch through the network.
model = SimpleNet()
batch_shape = (5, 10)  # batch size of 5, input features 10
input_tensor = torch.randn(batch_shape)
output = model(input_tensor)


In this example, the **BatchNorm1d** layer normalizes the output of the first linear layer <u>before applying the ReLU activation function</u>. This pattern is typical when using batch normalization in deep learning architectures.

## Effect of batch normalization
To see how batch normalization affects training in practice, let's go through a simple experiment using PyTorch. We will train two small neural networks on a dataset: one with batch normalization and one without. This will allow us to observe the differences in terms of training speed, stability, and final performance.

<br>

We can use a standard dataset like MNIST, which is common for demonstrating basic neural network functionalities. MNIST is a dataset of handwritten digits, each image being 28*28 pixels. We'll create two models:
1. **Model without Batch Normalization**
2. **Model with Batch Normalization**

### Step1: Prepare the Dataset

In [11]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Per-image preprocessing, applied in order: convert the image to a PyTorch
# tensor, then normalize its single channel with mean 0.5 and std 0.5.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
]
transform = transforms.Compose(preprocessing_steps)

# Load the MNIST training split (downloaded into ./data when missing) and
# serve it in shuffled mini-batches of 64.
train_dataset = datasets.MNIST(
    root="./data",
    train=True,
    download=True,
    transform=transform,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 76013455.01it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 32589640.52it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz





Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 55956077.48it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 2435506.11it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






### Step2: Define the Networks

In [12]:
class NetWithoutBN(nn.Module):
    """Fully connected classifier (784 -> 100 -> 50 -> 10) without batch norm."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Flatten the input to rows of 784 values and return 10 raw scores per row."""
        flat = x.view(-1, 28 * 28)
        hidden = self.relu(self.fc1(flat))
        hidden = self.relu(self.fc2(hidden))
        # No activation on the output layer.
        return self.fc3(hidden)

class NetWithBN(nn.Module):
    """Fully connected classifier (784 -> 100 -> 50 -> 10) with batch norm.

    Each hidden layer is followed by BatchNorm1d and then ReLU; the output
    layer has neither.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 100)
        self.bn1 = nn.BatchNorm1d(100)
        self.fc2 = nn.Linear(100, 50)
        self.bn2 = nn.BatchNorm1d(50)
        self.fc3 = nn.Linear(50, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Flatten the input to rows of 784 values and return 10 raw scores per row."""
        flat = x.view(-1, 28 * 28)
        # Hidden block 1: Linear -> BatchNorm -> ReLU
        hidden = self.relu(self.bn1(self.fc1(flat)))
        # Hidden block 2: Linear -> BatchNorm -> ReLU
        hidden = self.relu(self.bn2(self.fc2(hidden)))
        return self.fc3(hidden)


### Step3: Train the Models
We'll train both models using the same settings for a fair comparison.

In [13]:
def train_model(model, train_loader, epochs=10, lr=0.001):
    """Train ``model`` with Adam and cross-entropy loss, printing each epoch's mean loss.

    Args:
        model: Module mapping a batch of inputs to per-class scores. It is
            put in training mode and its parameters are updated in place.
        train_loader: Iterable yielding ``(inputs, labels)`` batches; it is
            iterated once per epoch.
        epochs: Number of passes over ``train_loader``. Defaults to 10.
        lr: Learning rate passed to Adam. Defaults to 0.001.

    Raises:
        ValueError: If ``train_loader`` yields no batches, so no mean loss
            can be computed.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.train()

    for epoch in range(epochs):
        running_loss = 0.0
        # Count batches while iterating so loaders without __len__ also work.
        num_batches = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader yielded no batches")

        print(f"Epoch {epoch+1}, Loss: {running_loss/num_batches}")

# Create model instances (both are built before any training starts)
model_without_bn = NetWithoutBN()
model_with_bn = NetWithBN()

# Train models one after the other on the same loader, announcing each run
training_runs = (
    ("Training model without Batch Normalization:", model_without_bn),
    ("\nTraining model with Batch Normalization:", model_with_bn),
)
for banner, network in training_runs:
    print(banner)
    train_model(network, train_loader)

Training model without Batch Normalization:
Epoch 1, Loss: 0.4218110325875313
Epoch 2, Loss: 0.20883379394470503
Epoch 3, Loss: 0.15682862677982748
Epoch 4, Loss: 0.1269275237014815
Epoch 5, Loss: 0.10774067386206407
Epoch 6, Loss: 0.09516789344921787
Epoch 7, Loss: 0.0838137591527755
Epoch 8, Loss: 0.07666492778490157
Epoch 9, Loss: 0.06920000928159216
Epoch 10, Loss: 0.06457089920896096

Training model with Batch Normalization:
Epoch 1, Loss: 0.28356621894778916
Epoch 2, Loss: 0.1024788323480056
Epoch 3, Loss: 0.0748192254465812
Epoch 4, Loss: 0.05807305201898013
Epoch 5, Loss: 0.04942067677322934
Epoch 6, Loss: 0.041073954941432816
Epoch 7, Loss: 0.03470213371075825
Epoch 8, Loss: 0.032978087577673204
Epoch 9, Loss: 0.029559690331077592
Epoch 10, Loss: 0.027585424574850095
