In [1]:
import numpy as np

In [2]:
class BatchNormalization:
    """Batch normalization over axis 0 with a learnable scale and shift.

    In training mode every batch is normalised with *its own* mean and
    variance (the statistics the backward formula assumes), while running
    averages of those statistics are accumulated in ``self.mean`` and
    ``self.var`` for use in inference mode.

    Parameters
    ----------
    epsilon : float
        Constant added to the variance before taking the square root, to
        avoid division by zero.
    momentum : float
        Weight kept from the previous running statistics on each training
        step: ``running = momentum * running + (1 - momentum) * batch``.
    learning_rate : float
        Step size applied to ``gamma`` and ``beta`` inside ``backward``.
        The default of 1.0 reproduces a plain ``param -= grad`` update.

    Attributes
    ----------
    gamma, beta : np.ndarray or None
        Scale and shift, created lazily with shape ``x.shape[1:]`` on the
        first ``forward`` call.
    mean, var : np.ndarray or None
        Running statistics used when ``training=False``.
    x_normalized : np.ndarray or None
        Normalised activations cached by the last training-mode forward pass.
    """

    def __init__(self, epsilon=1e-8, momentum=0.9, learning_rate=1.0):
        self.epsilon = epsilon
        self.momentum = momentum
        self.learning_rate = learning_rate
        self.gamma = None
        self.beta = None
        self.mean = None
        self.var = None
        self.x_normalized = None
        # Standard deviation of the last training batch, cached for backward().
        self._batch_std = None

    @staticmethod
    def _as_float(values):
        """Return ``values`` as an ndarray, promoting non-inexact dtypes to float64."""
        arr = np.asarray(values)
        if not np.issubdtype(arr.dtype, np.inexact):
            # Integer/bool inputs would otherwise create integer parameters
            # that cannot absorb float gradient updates in place.
            arr = arr.astype(np.float64)
        return arr

    def forward(self, x, training=True):
        """Normalise ``x`` along axis 0, then apply ``gamma`` and ``beta``.

        Parameters
        ----------
        x : array_like
            Input batch; statistics are taken over axis 0.
        training : bool
            If True, use the statistics of ``x`` itself, cache what
            ``backward`` needs, and update the running statistics.
            If False, use the running statistics.

        Returns
        -------
        np.ndarray
            ``gamma * x_normalized + beta``, same shape as ``x``.
        """
        x = self._as_float(x)

        # Create the parameters in either mode so that an inference call
        # made before any training call does not fail on gamma being None.
        if self.gamma is None:
            self.gamma = np.ones(x.shape[1:], dtype=x.dtype)
            self.beta = np.zeros(x.shape[1:], dtype=x.dtype)

        if training:
            batch_mean = np.mean(x, axis=0)
            batch_var = np.var(x, axis=0)
            batch_std = np.sqrt(batch_var + self.epsilon)

            x_normalized = (x - batch_mean) / batch_std
            self.x_normalized = x_normalized
            self._batch_std = batch_std

            if self.mean is None:
                # First batch seeds the running statistics.
                self.mean = batch_mean
                self.var = batch_var
            else:
                keep = self.momentum
                self.mean = keep * self.mean + (1.0 - keep) * batch_mean
                self.var = keep * self.var + (1.0 - keep) * batch_var
        else:
            if self.mean is None:
                # No training batch seen yet: fall back to the statistics of
                # this input so inference still produces a result.
                self.mean = np.mean(x, axis=0)
                self.var = np.var(x, axis=0)
            x_normalized = (x - self.mean) / np.sqrt(self.var + self.epsilon)

        return self.gamma * x_normalized + self.beta

    def backward(self, dout):
        """Backpropagate through the last training-mode forward pass.

        Also applies a gradient step to ``gamma`` and ``beta`` in place,
        scaled by ``learning_rate``.

        Parameters
        ----------
        dout : array_like
            Gradient of the loss with respect to the forward output; must
            have the same shape as the batch passed to ``forward``.

        Returns
        -------
        np.ndarray
            Gradient of the loss with respect to the forward input.

        Raises
        ------
        RuntimeError
            If no training-mode forward pass has been run yet.
        ValueError
            If ``dout`` does not match the shape of the cached batch.
        """
        if self.x_normalized is None or self._batch_std is None:
            raise RuntimeError(
                "backward() requires a prior forward(x, training=True) call")

        dout = self._as_float(dout)
        if dout.shape != self.x_normalized.shape:
            raise ValueError(
                "dout has shape %s but the cached batch has shape %s"
                % (dout.shape, self.x_normalized.shape))

        batch_size = dout.shape[0]
        dx_normalized = dout * self.gamma

        # Gradient through the normalisation, accounting for the dependence
        # of the batch mean and variance on every sample in the batch.
        dx = (1.0 / batch_size) * (1.0 / self._batch_std) * (
                batch_size * dx_normalized - np.sum(dx_normalized, axis=0)
                - self.x_normalized * np.sum(dx_normalized * self.x_normalized, axis=0))

        dgamma = np.sum(dout * self.x_normalized, axis=0)
        dbeta = np.sum(dout, axis=0)

        self.gamma -= self.learning_rate * dgamma
        self.beta -= self.learning_rate * dbeta

        return dx

In [3]:
# Seed a local generator so Restart Kernel -> Run All reproduces the same numbers.
SEED = 42
rng = np.random.default_rng(SEED)

# Create an instance of BatchNormalization
bn = BatchNormalization()

# Example input batch and the gradient arriving from the subsequent layer
x = rng.standard_normal((100, 10))
dout = rng.standard_normal((100, 10))

# Forward pass
out = bn.forward(x, training=True)

# Backward pass (this also updates bn.gamma and bn.beta in place)
dx = bn.backward(dout)

# Sanity checks: both passes must preserve the input shape.
assert out.shape == x.shape
assert dx.shape == x.shape