# Chapter 1: Introduction to Neural Networks

Welcome to the first chapter! In this notebook, we'll build neural networks from scratch and understand every component in depth.

## 📚 Table of Contents
1. [What are Neural Networks?](#what-are-neural-networks)
2. [The Perceptron: Building Block](#perceptron)
3. [Activation Functions](#activation-functions)
4. [Multi-Layer Perceptrons (MLPs)](#mlp)
5. [Forward Propagation](#forward-prop)
6. [Loss Functions](#loss-functions)
7. [Backpropagation](#backprop)
8. [Optimization Algorithms](#optimization)
9. [Training Neural Networks](#training)
10. [Biology Application: Gene Expression Classification](#biology-app)

---

In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import make_classification, make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Plot appearance: dark-grid style sheet first, then the 'husl' colour palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Seed NumPy and PyTorch with the same value so reruns are reproducible
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

print("Libraries imported successfully!")

## 1. What are Neural Networks? <a id="what-are-neural-networks"></a>

### The Biological Inspiration

Neural networks are inspired by biological neurons in the brain:
- **Dendrites** receive signals (inputs)
- **Cell body** processes signals
- **Axon** sends output to other neurons

### Artificial Neurons

An artificial neuron:
1. Receives multiple inputs: $x_1, x_2, ..., x_n$
2. Multiplies each by a weight: $w_1, w_2, ..., w_n$
3. Sums them up with a bias: $z = \sum_{i=1}^{n} w_i x_i + b$
4. Applies an activation function: $a = f(z)$

Let's visualize this:

In [None]:
def plot_neuron_diagram():
    """Draw and display a schematic of a single artificial neuron.

    The diagram shows three example inputs, each connected to a summation
    node by a line labelled with a weight, plus a bias node, the activation
    box ``f(z)`` and the output node. The figure is shown with
    ``plt.show()``; nothing is returned.

    Note:
        The weight labels are drawn with ``np.random.uniform``, so every
        call advances NumPy's global random state.
    """
    _, ax = plt.subplots(figsize=(12, 6))

    # Input nodes, stacked top to bottom
    inputs = [0.2, 0.5, 0.8]
    for i, val in enumerate(inputs):
        y_pos = 3 - i
        circle = plt.Circle((1, y_pos), 0.15, color='lightblue', ec='black', linewidth=2)
        ax.add_patch(circle)
        ax.text(0.5, y_pos, f'$x_{i+1}$={val}', fontsize=12, ha='right', va='center')

        # Connection from this input to the neuron, labelled with an
        # illustrative random weight placed just above the line's midpoint
        ax.plot([1.15, 2.85], [y_pos, 2], 'k-', linewidth=1.5, alpha=0.5)
        weight = np.random.uniform(0.5, 1.5)
        ax.text(1.8, y_pos + (2 - y_pos) / 2 + 0.1, f'$w_{i+1}$={weight:.2f}',
                fontsize=10, color='red')

    # Neuron (summation node)
    circle = plt.Circle((3, 2), 0.3, color='orange', ec='black', linewidth=2)
    ax.add_patch(circle)
    ax.text(3, 2, 'Σ', fontsize=20, ha='center', va='center', weight='bold')

    # Bias node, wired into the bottom of the neuron
    circle = plt.Circle((3, 0.5), 0.15, color='lightgreen', ec='black', linewidth=2)
    ax.add_patch(circle)
    ax.text(2.5, 0.5, 'bias=$b$', fontsize=10, ha='right', va='center')
    ax.plot([3.15, 3], [0.65, 1.7], 'k-', linewidth=1.5, alpha=0.5)

    # Pre-activation value z flowing from the neuron to the activation box;
    # \sum is used so the label is a plain-ASCII mathtext string
    ax.plot([3.3, 4.2], [2, 2], 'k-', linewidth=2)
    ax.text(3.75, 2.3, r'$z = \sum w_i x_i + b$', fontsize=11, ha='center')

    # Activation function box
    rect = plt.Rectangle((4.2, 1.7), 0.8, 0.6, color='yellow', ec='black', linewidth=2)
    ax.add_patch(rect)
    ax.text(4.6, 2, '$f(z)$', fontsize=12, ha='center', va='center', weight='bold')

    # Output node
    ax.plot([5, 5.8], [2, 2], 'k-', linewidth=2)
    circle = plt.Circle((6, 2), 0.15, color='lightcoral', ec='black', linewidth=2)
    ax.add_patch(circle)
    ax.text(6.5, 2, 'output', fontsize=12, ha='left', va='center')

    # Fixed canvas with the axes hidden: this is a diagram, not a data plot
    ax.set_xlim(0, 7)
    ax.set_ylim(0, 4)
    ax.axis('off')
    ax.set_title('Anatomy of an Artificial Neuron', fontsize=16, weight='bold', pad=20)
    plt.tight_layout()
    plt.show()

plot_neuron_diagram()

## 2. The Perceptron: Building Block <a id="perceptron"></a>

The **perceptron** is the simplest neural network - a single neuron!

### Mathematical Formulation

For input vector $\mathbf{x} = [x_1, x_2, ..., x_n]$ and weights $\mathbf{w} = [w_1, w_2, ..., w_n]$:

$$z = \mathbf{w}^T \mathbf{x} + b = \sum_{i=1}^{n} w_i x_i + b$$

$$\hat{y} = f(z)$$

where $f$ is an activation function.

### Implementation from Scratch

In [None]:
class Perceptron:
    """Single-neuron binary classifier trained with the perceptron rule."""

    def __init__(self, n_inputs, learning_rate=0.01):
        """
        Set up a perceptron with small random weights and a zero bias.

        Args:
            n_inputs: Number of input features (length of the weight vector)
            learning_rate: Multiplier applied to every weight/bias correction
        """
        self.weights = np.random.randn(n_inputs) * 0.01
        self.bias = 0.0
        self.lr = learning_rate

    def activation(self, z):
        """Hard threshold: 1 where z is non-negative, 0 elsewhere."""
        return np.where(z >= 0, 1, 0)

    def predict(self, X):
        """Return the thresholded weighted sum (plus bias) for input X."""
        weighted_sum = np.dot(X, self.weights) + self.bias
        return self.activation(weighted_sum)

    def train(self, X, y, epochs=100):
        """
        Fit weights and bias one sample at a time.

        Args:
            X: Iterable of feature vectors
            y: Iterable of targets, paired with X in order
            epochs: Number of full passes over the data

        Returns:
            List with one entry per epoch: the summed absolute error
            (target minus prediction) over that pass.
        """
        history = []

        for _ in range(epochs):
            epoch_error = 0

            for sample, label in zip(X, y):
                # Predict with the current parameters
                guess = self.predict(sample.reshape(1, -1))[0]

                # Signed mistake; zero when the prediction is already right
                delta = label - guess
                epoch_error += abs(delta)

                # Nudge the parameters in the direction of the mistake
                step = self.lr * delta
                self.weights += step * sample
                self.bias += step

            history.append(epoch_error)

        return history

# AND gate demo: a linearly separable problem the perceptron can learn
print("Testing Perceptron on AND gate:")
X_and = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_and = np.array([0, 0, 0, 1])

perceptron = Perceptron(n_inputs=2, learning_rate=0.1)
errors = perceptron.train(X_and, y_and, epochs=100)

print("\nTrained weights:", perceptron.weights)
print("Trained bias:", perceptron.bias)
print("\nPredictions:")
for xi, yi in zip(X_and, y_and):
    # predict() is fed a (1, n_features) row, so take the single result
    row = xi.reshape(1, -1)
    pred = perceptron.predict(row)[0]
    print(f"Input: {xi} | True: {yi} | Predicted: {pred}")

# Per-epoch error curve
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(errors, linewidth=2)
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('Total Error', fontsize=12)
ax.set_title('Perceptron Training Error Over Time', fontsize=14, weight='bold')
ax.grid(True, alpha=0.3)
plt.show()

### Limitations of the Perceptron

The perceptron can only solve **linearly separable** problems. Let's see what happens with XOR (not linearly separable):

In [None]:
# Try XOR problem (will fail!): no single straight line separates the classes
print("Testing Perceptron on XOR gate (will fail):")
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([0, 1, 1, 0])

perceptron_xor = Perceptron(n_inputs=2, learning_rate=0.1)
errors_xor = perceptron_xor.train(X_xor, y_xor, epochs=1000)

print("\nPredictions:")
for xi, yi in zip(X_xor, y_xor):
    pred = perceptron_xor.predict(xi.reshape(1, -1))[0]
    mark = '✗' if pred != yi else '✓'
    print(f"Input: {xi} | True: {yi} | Predicted: {pred} {mark}")

# Visualize the problem: AND on the left, XOR on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

gate_panels = [
    (axes[0], X_and, y_and, 'AND Gate - Linearly Separable ✓'),
    (axes[1], X_xor, y_xor, 'XOR Gate - NOT Linearly Separable ✗'),
]

for ax, X_gate, y_gate, title in gate_panels:
    # One scatter call per class so each class gets its own legend entry
    for label, color in ((0, 'blue'), (1, 'red')):
        members = X_gate[y_gate == label]
        ax.scatter(members[:, 0], members[:, 1], c=color, s=200,
                   label=f'Class {label}', edgecolors='black', linewidth=2)
    ax.set_xlabel('$x_1$', fontsize=12)
    ax.set_ylabel('$x_2$', fontsize=12)
    ax.set_title(title, fontsize=13, weight='bold')
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n💡 Key Insight: We need multiple layers to solve XOR!")

## 3. Activation Functions <a id="activation-functions"></a>

Activation functions introduce **non-linearity**, allowing networks to learn complex patterns.

### Common Activation Functions

1. **Sigmoid**: $\sigma(z) = \frac{1}{1 + e^{-z}}$
   - Output range: (0, 1)
   - Use case: Binary classification output, gates in LSTMs

2. **Tanh**: $\tanh(z) = \frac{e^z - e^{-z}}{e^z + e^{-z}}$
   - Output range: (-1, 1)
   - Use case: Hidden layers (zero-centered)

3. **ReLU**: $\text{ReLU}(z) = \max(0, z)$
   - Output range: [0, ∞)
   - Use case: Most common in hidden layers

4. **Leaky ReLU**: $\text{LeakyReLU}(z) = \max(0.01z, z)$
   - Output range: (-∞, ∞)
   - Use case: Fixes "dying ReLU" problem

In [None]:
# Define activation functions
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + exp(-z)), evaluated without overflow.

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        Values in [0, 1] with the same shape as ``z`` (a NumPy scalar for
        scalar input).
    """
    z = np.asarray(z)
    if z.dtype.kind != 'f':
        # Integer/bool input: promote so the division below is floating point
        z = z.astype(float)

    # exp is only ever taken of a non-positive number, so it cannot overflow.
    # For z >= 0 the value is 1 / (1 + exp(-z)); for z < 0 the algebraically
    # equal form exp(z) / (1 + exp(z)) is used instead.
    decay = np.exp(-np.abs(z))
    out = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

    # Indexing with () turns a 0-d result back into a scalar and leaves
    # arrays of one or more dimensions as arrays
    return out[()]

def tanh(z):
    """Hyperbolic tangent of z, computed element-wise by NumPy."""
    squashed = np.tanh(z)
    return squashed

def relu(z):
    """Rectified linear unit: the element-wise maximum of 0 and z."""
    rectified = np.maximum(0, z)
    return rectified

def leaky_relu(z, alpha=0.01):
    """Leaky ReLU: z where z is positive, alpha * z everywhere else."""
    scaled = alpha * z
    is_positive = z > 0
    return np.where(is_positive, z, scaled)

# Visualize activation functions on a common input range
z = np.linspace(-5, 5, 100)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

# (function, panel title, mathtext formula). \mathrm is used for the function
# names rather than \text, which only newer Matplotlib mathtext parsers accept.
activations = [
    (sigmoid, 'Sigmoid', r'$\sigma(z) = \frac{1}{1 + e^{-z}}$'),
    (tanh, 'Tanh', r'$\tanh(z) = \frac{e^z - e^{-z}}{e^z + e^{-z}}$'),
    (relu, 'ReLU', r'$\mathrm{ReLU}(z) = \max(0, z)$'),
    (leaky_relu, 'Leaky ReLU', r'$\mathrm{LeakyReLU}(z) = \max(0.01z, z)$')
]

for ax, (func, name, formula) in zip(axes, activations):
    y = func(z)
    ax.plot(z, y, linewidth=3, color='blue')
    # Dashed reference lines through the origin
    ax.axhline(y=0, color='black', linestyle='--', alpha=0.3)
    ax.axvline(x=0, color='black', linestyle='--', alpha=0.3)
    ax.grid(True, alpha=0.3)
    ax.set_xlabel('Input (z)', fontsize=11)
    ax.set_ylabel('Output', fontsize=11)
    ax.set_title(f'{name} Activation Function', fontsize=13, weight='bold')
    # Formula box pinned to the top-left corner in axes coordinates
    ax.text(0.05, 0.95, formula, transform=ax.transAxes,
            fontsize=11, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.show()

# Pros (✓) and cons (✗) of each activation, printed as a short summary
activation_properties = [
    ("Sigmoid", [
        "✓ Smooth gradient",
        "✗ Vanishing gradient problem",
        "✗ Not zero-centered",
    ]),
    ("Tanh", [
        "✓ Zero-centered",
        "✗ Vanishing gradient problem",
    ]),
    ("ReLU", [
        "✓ No vanishing gradient for positive values",
        "✓ Computationally efficient",
        "✗ Dying ReLU problem (neurons can die)",
    ]),
    ("Leaky ReLU", [
        "✓ Fixes dying ReLU",
        "✓ Small gradient for negative values",
    ]),
]

print("\n📊 Activation Function Properties:")
for name, notes in activation_properties:
    print(f"\n{name}:")
    for note in notes:
        print(f"  {note}")

## 4. Multi-Layer Perceptrons (MLPs) <a id="mlp"></a>

An MLP consists of:
- **Input layer**: Receives features
- **Hidden layer(s)**: Processes information
- **Output layer**: Produces predictions

### Architecture

```
Input (n features) → Hidden Layer 1 (h1 neurons) → Hidden Layer 2 (h2 neurons) → Output (m classes)
```

Each connection has a weight, and each neuron has a bias.

### Building an MLP with PyTorch

In [None]:
class SimpleMLP(nn.Module):
    """
    Feed-forward network with two ReLU-activated hidden layers.

    Layout: Input -> Linear + ReLU (hidden_size1, default 128)
                  -> Linear + ReLU (hidden_size2, default 64)
                  -> Linear (output_size)
    """

    def __init__(self, input_size, hidden_size1=128, hidden_size2=64, output_size=2):
        super().__init__()

        # Hidden block 1: input features -> hidden_size1 units
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()

        # Hidden block 2: hidden_size1 -> hidden_size2 units
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()

        # Output projection: hidden_size2 -> output_size raw scores
        self.fc3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        """
        Run a batch through both hidden blocks and the output projection.

        Args:
            x: Input tensor of shape (batch_size, input_size)

        Returns:
            Output tensor of shape (batch_size, output_size)
        """
        hidden1 = self.relu1(self.fc1(x))
        hidden2 = self.relu2(self.fc2(hidden1))

        # Raw scores are returned as-is; the softmax is left to the loss
        return self.fc3(hidden2)

# Build a demo network: 10 input features, two hidden layers, 2 outputs
model = SimpleMLP(input_size=10, hidden_size1=128, hidden_size2=64, output_size=2)

# Show the layer layout
print("MLP Architecture:")
print(model)

# Tally parameter elements, and separately those that receive gradients
total_params = 0
trainable_params = 0
for param in model.parameters():
    total_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()

print(f"\nTotal parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Push one random batch through the network to check the shapes
batch_size = 5
test_input = torch.randn(batch_size, 10)
output = model(test_input)
print(f"\nInput shape: {test_input.shape}")
print(f"Output shape: {output.shape}")

### Solving XOR with MLP

Now let's solve the XOR problem that defeated the single perceptron!

In [None]:
# Prepare XOR data: float features for the linear layers, integer class
# indices for CrossEntropyLoss
X_xor_tensor = torch.tensor(X_xor, dtype=torch.float32)
y_xor_tensor = torch.tensor(y_xor, dtype=torch.long)

# Create small MLP for XOR
xor_model = SimpleMLP(input_size=2, hidden_size1=4, hidden_size2=4, output_size=2)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(xor_model.parameters(), lr=0.01)

# Training loop (full batch: all four XOR points every step)
losses = []
epochs = 1000

for epoch in range(epochs):
    # Forward pass
    outputs = xor_model(X_xor_tensor)
    loss = criterion(outputs, y_xor_tensor)

    # Backward pass and optimization; gradients are cleared first because
    # backward() accumulates into .grad
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    losses.append(loss.item())

    if (epoch + 1) % 200 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Test the model
xor_model.eval()
with torch.no_grad():
    predictions = xor_model(X_xor_tensor)
    predicted_classes = torch.argmax(predictions, dim=1)

# Only claim success if every point is actually classified correctly; a
# network this small is not guaranteed to fit XOR on every run
if torch.equal(predicted_classes, y_xor_tensor):
    print("\n🎉 MLP successfully solves XOR!")
else:
    print("\nMLP did not classify every XOR point correctly on this run - "
          "try more epochs or a different seed.")

print("\nPredictions:")
for xi, yi, pred in zip(X_xor, y_xor, predicted_classes):
    mark = '✓' if pred.item() == yi else '✗'
    print(f"Input: {xi} | True: {yi} | Predicted: {pred.item()} {mark}")

# Plot training loss
plt.figure(figsize=(10, 4))
plt.plot(losses, linewidth=2)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Loss', fontsize=12)
plt.title('MLP Training Loss on XOR Problem', fontsize=14, weight='bold')
plt.grid(True, alpha=0.3)
plt.show()