Simple Python Notebook: Vanilla RNN from Scratch (Without Classes)

This notebook demonstrates a basic implementation of a Vanilla Recurrent Neural Network (RNN)
from scratch using Python and NumPy, without using a class structure. It focuses on the
core concepts of RNNs, including forward and backward passes, weight sharing, and
gradient updates.

Note: This implementation is for educational purposes; it is not optimized for
performance and does not handle all edge cases. For practical applications, use an
established deep learning library such as TensorFlow or PyTorch.

## Libraries

In [92]:
import numpy as np

# --- 1. Initialization ---

In [93]:
# Layer dimensions of the network
input_size: int = 5      # length of each input vector
hidden_size: int = 10    # length of the hidden state vector
output_size: int = 3     # number of output classes
# Step size used when the gradients are applied to the parameters
learning_rate: float = 0.01

# Initialize weights with small random values

In [94]:
# Each matrix is drawn from a standard normal and scaled down by 0.01.
U = 0.01 * np.random.randn(hidden_size, input_size)   # input -> hidden, shape (hidden_size, input_size)
W = 0.01 * np.random.randn(hidden_size, hidden_size)  # hidden -> hidden (recurrent), shape (hidden_size, hidden_size)
V = 0.01 * np.random.randn(output_size, hidden_size)  # hidden -> output, shape (output_size, hidden_size)

# Initialize biases with zeros

In [95]:
# Biases are column vectors so they add directly to the (n, 1) activations.
b = np.zeros(shape=(hidden_size, 1))  # hidden-layer bias
c = np.zeros(shape=(output_size, 1))  # output-layer bias

# --- 2. Forward Pass ---

In [96]:
def softmax(x, axis=None):
    """Applies the softmax activation function.

    Args:
        x (array-like): Input scores (logits).
        axis (int or None): Axis along which the probabilities are normalized.
            The default, None, normalizes over all elements of ``x``, which is
            what a single column vector of shape (n, 1) needs. Pass ``axis=0``
            to normalize every column of an (n, batch) matrix independently.

    Returns:
        numpy array: Array with the same shape as ``x`` whose entries along
        ``axis`` are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the maximum does not change the result mathematically but
    # keeps np.exp from overflowing for large scores.
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    # keepdims=True lets the normalizer broadcast back against exp_x.
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

In [97]:
def forward(inputs, U, W, V, b, c):
    """
    Performs the forward pass of the RNN.

    Args:
        inputs (list of numpy arrays): A list of input vectors, where each vector
                                      has the shape (input_size, 1).
        U (numpy array): Input to hidden weights.
        W (numpy array): Hidden to hidden weights.
        V (numpy array): Hidden to output weights.
        b (numpy array): Hidden bias.
        c (numpy array): Output bias.

    Returns:
        tuple: A tuple containing:
            - outputs (list of numpy arrays): The output probabilities at each time step.
            - hidden_states (numpy array): Array of shape (T + 1, hidden_size, 1).
              Index 0 is the all-zero initial state; index t + 1 is the state
              after consuming inputs[t].
    """
    T = len(inputs)
    # Derive the hidden dimension from the weights that were passed in, so the
    # function does not depend on a module-level variable and works for any
    # parameter set.
    n_hidden = W.shape[0]

    # np.zeros already provides the zero initial hidden state at index 0.
    hidden_states = np.zeros((T + 1, n_hidden, 1))
    outputs = []

    for t, x_t in enumerate(inputs):
        # Next hidden state from the current input and the previous hidden state
        hidden_states[t + 1] = np.tanh(np.dot(U, x_t) + np.dot(W, hidden_states[t]) + b)

        # Class probabilities for this time step
        outputs.append(softmax(np.dot(V, hidden_states[t + 1]) + c))

    return outputs, hidden_states

# --- 3. Backward Pass ---

In [98]:
def backward(inputs, outputs, hidden_states, targets, U, W, V, b, c):
    """
    Computes parameter gradients by walking the sequence from the last time
    step back to the first.

    Args:
        inputs (list of numpy arrays): The input sequence.
        outputs (list of numpy arrays): The output probabilities from the forward pass.
        hidden_states (numpy array): The hidden states from the forward pass;
            index 0 is the initial state and index t + 1 belongs to inputs[t].
        targets (list of int): The target class indices for each time step.
        U (numpy array): Input to hidden weights.
        W (numpy array): Hidden to hidden weights.
        V (numpy array): Hidden to output weights.
        b (numpy array): Hidden bias (only its shape is used).
        c (numpy array): Output bias (only its shape is used).

    Returns:
        tuple: (dU, dW, dV, db, dc), the gradients summed over all time steps,
        each with the same shape as the corresponding parameter.
    """
    # One zero-filled accumulator per parameter, in the order they are returned.
    dU, dW, dV, db, dc = (np.zeros_like(param) for param in (U, W, V, b, c))

    # Gradient reaching a hidden state from the following time step; nothing
    # flows in after the final step.
    carried = np.zeros_like(hidden_states[0])

    for step in range(len(inputs) - 1, -1, -1):
        h_prev = hidden_states[step]
        h_curr = hidden_states[step + 1]

        # Output error: predicted probabilities minus the one-hot target
        one_hot = np.zeros_like(outputs[step])
        one_hot[targets[step]] = 1
        out_err = outputs[step] - one_hot

        dV += np.dot(out_err, h_curr.T)
        dc += out_err

        # The hidden state receives gradient from this step's output and from
        # the next time step.
        grad_h = np.dot(V.T, out_err) + carried

        # Back through tanh: d tanh(z) / dz = 1 - tanh(z)^2
        grad_pre = (1 - h_curr ** 2) * grad_h

        db += grad_pre
        dU += np.dot(grad_pre, inputs[step].T)
        dW += np.dot(grad_pre, h_prev.T)

        # Hand the gradient on to the previous time step
        carried = np.dot(W.T, grad_pre)

    return dU, dW, dV, db, dc

# --- 4. Training Step ---

In [99]:
def train_step(inputs, targets, U, W, V, b, c, learning_rate):
    """
    Performs one step of training (forward and backward pass, followed by weight update).

    Args:
        inputs (list of numpy arrays): The input sequence.
        targets (list of int): The target class indices for each time step.
        U (numpy array): Input to hidden weights.
        W (numpy array): Hidden to hidden weights.
        V (numpy array): Hidden to output weights.
        b (numpy array): Hidden bias.
        c (numpy array): Output bias.
        learning_rate (float): The learning rate for gradient descent.

    Returns:
        tuple: The updated weights and biases (U, W, V, b, c) followed by the
        average cross-entropy loss per time step. The loss is measured on the
        outputs produced before the update. For an empty sequence the loss
        is 0.0.
    """
    outputs, hidden_states = forward(inputs, U, W, V, b, c)
    dU, dW, dV, db, dc = backward(inputs, outputs, hidden_states, targets, U, W, V, b, c)

    # Gradient-descent update; new arrays are built, the arguments are not
    # modified in place.
    U_new = U - learning_rate * dU
    W_new = W - learning_rate * dW
    V_new = V - learning_rate * dV
    b_new = b - learning_rate * db
    c_new = c - learning_rate * dc

    num_steps = len(targets)
    if num_steps == 0:
        # Nothing to average over; avoid dividing by zero.
        return U_new, W_new, V_new, b_new, c_new, 0.0

    # Cross-entropy loss: negative log-probability assigned to the correct class
    loss = 0
    for t, target in enumerate(targets):
        correct_prob = outputs[t][target, 0]
        loss -= np.log(correct_prob + 1e-8)  # Small epsilon avoids log(0)
    return U_new, W_new, V_new, b_new, c_new, loss / num_steps

# --- 5. Example Usage ---

In [100]:
if __name__ == "__main__":
    # Number of full passes over the toy dataset
    num_epochs = 100

    # Dummy training data: one random input sequence per label sequence.
    # The sequences deliberately differ in length.
    label_sequences = [
        [0, 1, 2],
        [1, 2, 0, 1, 2],
        [2, 0],
        [0, 0, 1, 1],
    ]
    training_data = [
        (np.random.randn(len(labels), input_size, 1), labels)
        for labels in label_sequences
    ]

    # Training loop: one parameter update per sequence
    for epoch in range(num_epochs):
        sequence_losses = []
        for inputs, targets in training_data:
            # train_step returns fresh parameter arrays, so rebind them here
            U, W, V, b, c, loss = train_step(list(inputs), targets, U, W, V, b, c, learning_rate)
            sequence_losses.append(loss)
        avg_loss = sum(sequence_losses) / len(training_data)

        # Report every tenth epoch
        is_report_epoch = (epoch + 1) % 10 == 0
        if is_report_epoch:
            print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

    # Run the trained parameters on a fresh random sequence (forward pass only)
    test_input = np.random.randn(4, input_size, 1)
    outputs, _ = forward(list(test_input), U, W, V, b, c)
    print("\nTest Input:")
    print(np.squeeze(test_input))
    print("Predicted Output Probabilities (for each time step):")
    for step_probs in outputs:
        print(step_probs.flatten())

Epoch 10/100, Average Loss: 1.0982
Epoch 20/100, Average Loss: 1.0971
Epoch 30/100, Average Loss: 1.0948
Epoch 40/100, Average Loss: 1.0880
Epoch 50/100, Average Loss: 1.0696
Epoch 60/100, Average Loss: 1.0285
Epoch 70/100, Average Loss: 0.9619
Epoch 80/100, Average Loss: 0.8846
Epoch 90/100, Average Loss: 0.8087
Epoch 100/100, Average Loss: 0.7354

Test Input:
[[ 0.18079297  0.5110085  -1.18064333  0.34018705 -0.41573644]
 [-0.01421383 -0.04545861 -0.79667118  0.81290003 -0.49295355]
 [ 0.75182443  1.37439125  0.68229239  0.27691023 -0.93631094]
 [ 0.20859064 -0.27829881 -0.50461821 -0.79048444 -0.70402334]]
Predicted Output Probabilities (for each time step):
[0.33013468 0.50250206 0.16736326]
[0.42897521 0.31272043 0.25830436]
[0.30548786 0.4024478  0.29206434]
[0.33512743 0.1553718  0.50950077]
