In [14]:
import numpy as np  # We use NumPy to do matrix math easily

# -----------------------------
# STEP 1: Prepare Dataset
# -----------------------------

data = "I am Parth Gala My friend is Rachit"  # This is the text we want our RNN to learn
chars = list(set(data))  # Get all unique characters, like ['h', 'e', 'l', 'o']
vocab_size = len(chars)  # Number of unique characters

# Map characters to numbers (like 'h' → 0, 'e' → 1, etc.)
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for ch, i in char_to_idx.items()}

# Convert a character to one-hot vector
def one_hot_encode(char, char_to_idx):
    vec = np.zeros((len(char_to_idx), 1))  # Create a column of zeros
    vec[char_to_idx[char]] = 1  # Set the position of the character to 1
    return vec

# Softmax function to turn scores into probabilities
def softmax(x):
    e_x = np.exp(x - np.max(x))  # Stability trick
    return e_x / np.sum(e_x, axis=0)

# -----------------------------
# STEP 2: Define the RNN Class
# -----------------------------
class RNN:
    def __init__(self, input_size, hidden_size, output_size):
        self.hidden_size = hidden_size  # How many neurons in the hidden layer

        # Weight matrices and biases (randomly initialized small numbers)
        self.U = np.random.randn(hidden_size, input_size) * 0.01  # input → hidden
        self.W = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden → hidden (previous step)
        self.V = np.random.randn(output_size, hidden_size) * 0.01  # hidden → output

        self.b = np.zeros((hidden_size, 1))  # bias for hidden layer
        self.c = np.zeros((output_size, 1))  # bias for output layer

    # Forward pass: go through all characters in the input
    def forward(self, inputs, h_prev):
        xs, hs, os, ps = {}, {}, {}, {}  # Store data at each time step
        hs[-1] = np.copy(h_prev)  # Set initial hidden state

        for t in range(len(inputs)):
            xs[t] = inputs[t]  # Store input
            hs[t] = np.tanh(np.dot(self.U, xs[t]) + np.dot(self.W, hs[t - 1]) + self.b)  # Hidden state
            os[t] = np.dot(self.V, hs[t]) + self.c  # Raw scores (logits)
            ps[t] = softmax(os[t])  # Probabilities
        return xs, hs, os, ps

    # Backpropagation Through Time: calculate loss and gradients
    def loss_and_gradients(self, inputs, targets, h_prev):
        xs, hs, os, ps = self.forward(inputs, h_prev)  # Forward pass first
        loss = 0  # Total loss
        dU, dW, dV = np.zeros_like(self.U), np.zeros_like(self.W), np.zeros_like(self.V)  # Gradients
        db, dc = np.zeros_like(self.b), np.zeros_like(self.c)
        dh_next = np.zeros_like(hs[0])  # For passing gradient backward

        # Calculate loss
        for t in range(len(inputs)):
            loss += -np.log(ps[t][targets[t], 0])  # Cross-entropy loss

        # Backward pass (back through time)
        for t in reversed(range(len(inputs))):
            dy = np.copy(ps[t])
            dy[targets[t]] -= 1  # Gradient of softmax loss

            dV += np.dot(dy, hs[t].T)  # Gradient wrt V
            dc += dy  # Gradient wrt output bias

            dh = np.dot(self.V.T, dy) + dh_next  # Gradient of hidden layer
            dh_raw = (1 - hs[t] ** 2) * dh  # tanh derivative

            db += dh_raw
            dU += np.dot(dh_raw, xs[t].T)
            dW += np.dot(dh_raw, hs[t - 1].T)

            dh_next = np.dot(self.W.T, dh_raw)  # Pass gradient to previous time

        return loss, dU, dW, dV, db, dc, hs[len(inputs) - 1]  # Return everything we need

    # Update weights using simple gradient descent
    def update(self, grads, learning_rate=0.1):
        dU, dW, dV, db, dc = grads
        for param, dparam in zip([self.U, self.W, self.V, self.b, self.c],
                                 [dU, dW, dV, db, dc]):
            param -= learning_rate * dparam  # Move weights in opposite direction of gradient

# -----------------------------
# STEP 3: Train the RNN
# -----------------------------
rnn = RNN(input_size=vocab_size, hidden_size=10, output_size=vocab_size)  # Create RNN
h_prev = np.zeros((10, 1))  # Start with all hidden states as 0

for epoch in range(100):  # Train for 100 loops
    # Prepare input and target
    inputs = [one_hot_encode(ch, char_to_idx) for ch in data[:-1]]  # "hell"
    targets = [char_to_idx[ch] for ch in data[1:]]  # "ello"

    # Compute loss and gradients
    loss, dU, dW, dV, db, dc, h_prev = rnn.loss_and_gradients(inputs, targets, h_prev)

    # Update weights using gradients
    rnn.update((dU, dW, dV, db, dc), learning_rate=0.1)

    # Print progress every 10 steps
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

# -----------------------------
# STEP 4: Generate Text (Sampling)
# -----------------------------
def sample(rnn, seed_char, char_to_idx, idx_to_char, n=5):
    x = one_hot_encode(seed_char, char_to_idx)  # Convert starting char to one-hot
    h = np.zeros((rnn.hidden_size, 1))  # Start with blank hidden state
    output = seed_char  # Start output with the seed char

    for _ in range(n):
        _, hs, _, ps = rnn.forward([x], h)  # Forward pass for one character
        h = hs[0]  # Get updated hidden state
        idx = np.random.choice(range(len(char_to_idx)), p=ps[0].ravel())  # Pick next char by probability
        x = one_hot_encode(idx_to_char[idx], char_to_idx)  # New input
        output += idx_to_char[idx]  # Add predicted char to output
    return output

# Try generating text!
print("Sampled text:", sample(rnn, seed_char="h", char_to_idx=char_to_idx, idx_to_char=idx_to_char, n=10))

Epoch 0, Loss: 101.8543
Epoch 10, Loss: 90.7653
Epoch 20, Loss: 69.1502
Epoch 30, Loss: 91.6093
Epoch 40, Loss: 118.6902
Epoch 50, Loss: 108.2234
Epoch 60, Loss: 111.1921
Epoch 70, Loss: 155.4534
Epoch 80, Loss: 185.8189
Epoch 90, Loss: 113.8598
Sampled text: hhMhPr hhh 
