In [1]:
import numpy as np

# Define the activation functions
def sigma_1(u):
    """ReLU activation: element-wise maximum of 0 and ``u``."""
    floor = 0  # negative entries are clamped up to this value
    return np.maximum(floor, u)

def sigma_2(u):
    """Softmax activation: exponentiate ``u`` and normalise so the result sums to 1.

    The maximum over all entries of ``u`` is subtracted before exponentiating
    so that large inputs do not overflow; this shift leaves the result unchanged.
    """
    shifted = u - np.max(u)
    weights = np.exp(shifted)
    total = np.sum(weights)
    return weights / total

def forward_pass(x, W1, W2, y):
    """Evaluate the two-layer network on one sample and score it against ``y``.

    The input goes through the linear map ``W1``, the ``sigma_1`` activation,
    the linear map ``W2`` and the ``sigma_2`` activation; the result is then
    compared with the target ``y`` using cross-entropy.

    Returns:
        Tuple ``(h, u2, u1, loss)``: the network output, the hidden
        activations, the hidden pre-activations and the cross-entropy loss.
    """
    # Layer 1: linear map followed by sigma_1.
    hidden_pre = np.dot(W1, x)
    hidden_act = sigma_1(hidden_pre)

    # Layer 2: linear map followed by sigma_2.
    logits = np.dot(W2, hidden_act)
    probs = sigma_2(logits)

    # The 1e-10 offset keeps log() finite when a probability is exactly zero.
    log_probs = np.log(probs + 1e-10)
    loss = -np.sum(y * log_probs)

    return probs, hidden_act, hidden_pre, loss

# Toy problem: a single two-feature sample and its target vector.
x, y = np.array([1, 1]), np.array([1, 0])

# Fixed integer weights for the first (W1) and second (W2) layers.
W1 = np.array([
    [0, 1],
    [0, -1],
])
W2 = np.array([
    [1, 2],
    [2, -1],
])

# forward_pass returns the tuple (h, u2, u1, loss); it is printed as a whole.
output = forward_pass(x, W1, W2, y)
print("Output of the forward pass:", output)


Output of the forward pass: (array([0.26894142, 0.73105858]), array([1, 0]), array([ 1, -1]), np.float64(1.3132616871463947))


In [2]:
import numpy as np

# Define the activation functions
def sigma_1(u):
    """ReLU activation: element-wise maximum of 0 and ``u``."""
    floor = 0  # negative entries are clamped up to this value
    return np.maximum(floor, u)

def sigma_2(u):
    """Softmax activation: exponentiate ``u`` and normalise so the result sums to 1.

    The maximum over all entries of ``u`` is subtracted before exponentiating
    so that large inputs do not overflow; this shift leaves the result unchanged.
    """
    shifted = u - np.max(u)
    weights = np.exp(shifted)
    total = np.sum(weights)
    return weights / total

def forward_pass(x, W1, W2, y):
    """Evaluate the two-layer network on one sample and score it against ``y``.

    The input goes through the linear map ``W1``, the ``sigma_1`` activation,
    the linear map ``W2`` and the ``sigma_2`` activation; the result is then
    compared with the target ``y`` using cross-entropy.

    Returns:
        Tuple ``(h, u2, u1, loss)``: the network output, the hidden
        activations, the hidden pre-activations and the cross-entropy loss.
    """
    # Layer 1: linear map followed by sigma_1.
    hidden_pre = np.dot(W1, x)
    hidden_act = sigma_1(hidden_pre)

    # Layer 2: linear map followed by sigma_2.
    logits = np.dot(W2, hidden_act)
    probs = sigma_2(logits)

    # The 1e-10 offset keeps log() finite when a probability is exactly zero.
    log_probs = np.log(probs + 1e-10)
    loss = -np.sum(y * log_probs)

    return probs, hidden_act, hidden_pre, loss


def backward_pass(x, y, output, u2, u1, W1, W2, eta=0.01):
    """Perform one gradient-descent step for the two-layer ReLU/softmax network.

    Backpropagates the cross-entropy loss through the output layer and the
    ReLU hidden layer, then returns the updated weight matrices.

    All arithmetic is done in floating point regardless of the dtype of the
    inputs, so integer-valued weight arrays are accepted and gradients are
    never truncated. The arrays passed in are left unmodified.

    Args:
        x: Input vector, shape (n_inputs,).
        y: Target vector, shape (n_classes,).
        output: Network output from the forward pass, shape (n_classes,).
        u2: Hidden activations (after ReLU), shape (n_hidden,).
        u1: Hidden pre-activations (before ReLU), shape (n_hidden,).
        W1: First-layer weights, shape (n_hidden, n_inputs).
        W2: Second-layer weights, shape (n_classes, n_hidden).
        eta: Learning rate.

    Returns:
        Tuple ``(W1, W2)`` of updated weight matrices as new float arrays.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    output = np.asarray(output, dtype=float)
    u2 = np.asarray(u2, dtype=float)
    u1 = np.asarray(u1, dtype=float)
    W1 = np.asarray(W1, dtype=float)
    W2 = np.asarray(W2, dtype=float)

    # Step 1: error signal at the output layer. Assuming `output` is the
    # softmax of the second-layer logits and `y` sums to 1, this is the
    # gradient of the cross-entropy loss with respect to those logits.
    delta2 = output - y

    # Step 2: gradient for W2, grad_W2[i, j] = delta2[i] * u2[j].
    grad_W2 = np.outer(delta2, u2)

    # Step 3: propagate the error to the hidden layer. The ReLU derivative is
    # 1 where the pre-activation is positive and 0 elsewhere.
    relu_grad = u1 > 0
    delta1 = (W2.T @ delta2) * relu_grad

    # Step 4: gradient for W1, grad_W1[j, k] = delta1[j] * x[k].
    grad_W1 = np.outer(delta1, x)

    # Step 5: gradient-descent update, written out-of-place so the caller's
    # arrays are not modified. delta1 above was computed with the pre-update W2.
    W2 = W2 - eta * grad_W2
    W1 = W1 - eta * grad_W1

    return W1, W2


# Single training example with a one-hot target, plus float initial weights.
x, y = np.array([1, 1]), np.array([1, 0])
W1 = np.array([[0.0, 1.0],
               [0.0, -1.0]])
W2 = np.array([[1.0, 2.0],
               [2.0, -1.0]])
eta = 0.01  # learning rate

# Training loop (a single gradient step).
for iteration in range(1):
    # Forward pass with the current weights.
    output, u2, u1, loss = forward_pass(x, W1, W2, y)

    print(f"Iteration {iteration + 1}:")
    print("Output after forward pass:", output)
    print("Cross-entropy loss before update:", loss)

    # Backward pass: one gradient-descent step on both weight matrices.
    W1, W2 = backward_pass(x, y, output, u2, u1, W1, W2, eta)

    # Re-run the forward pass to see the effect of the update on the loss.
    output, u2, u1, loss = forward_pass(x, W1, W2, y)
    print("Cross-entropy loss after update:", loss)
    print("Updated weights W1:\n", W1)
    print("Updated weights W2:\n", W2)


Iteration 1:
Output after forward pass: [0.26894142 0.73105858]
Cross-entropy loss before update: 1.3132616871463947
Cross-entropy loss after update: 1.292123313673786
Updated weights W1:
 [[-0.00731059  0.99268941]
 [ 0.         -1.        ]]
Updated weights W2:
 [[ 1.00731059  2.        ]
 [ 1.99268941 -1.        ]]
