# Tutorial 15: Advanced Model Architectures

This tutorial explores advanced neural network architectures: Graph Neural Networks (GCN and GAT), Vision Transformers, EfficientNet building blocks, Neural ODEs, and a simple multimodal transformer.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
from typing import Optional, Tuple, Union
import matplotlib.pyplot as plt
import networkx as nx
from einops import rearrange, repeat
from einops.layers.torch import Rearrange

# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 1. Graph Neural Networks (GNNs)

Graph Neural Networks are designed to work with graph-structured data, where relationships between entities are as important as the entities themselves.

In [None]:
class GraphConvolutionLayer(nn.Module):
    """Simple graph convolution layer computing ``adj @ (x @ W) + b``.

    Args:
        in_features: Size of each input node feature vector.
        out_features: Size of each output node feature vector.
    """
    def __init__(self, in_features, out_features):
        super().__init__()
        # Allocate without sampling: the Xavier initialisation below overwrites
        # every entry, so drawing random values first would be wasted work.
        self.weight = nn.Parameter(torch.empty(in_features, out_features))
        self.bias = nn.Parameter(torch.zeros(out_features))

        # Initialize weights
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x, adj):
        """Propagate node features over the graph.

        Args:
            x: Node features, ``[num_nodes, in_features]``.
            adj: Adjacency matrix, ``[num_nodes, num_nodes]``; either a sparse
                COO tensor or an ordinary dense tensor.

        Returns:
            Tensor of shape ``[num_nodes, out_features]``.
        """
        support = torch.mm(x, self.weight)
        # Pick the product that matches the adjacency layout so that callers
        # may pass a dense matrix as well as a sparse one.
        if adj.is_sparse:
            output = torch.sparse.mm(adj, support)
        else:
            output = torch.mm(adj, support)
        return output + self.bias

class GCN(nn.Module):
    """Graph Convolutional Network: a stack of graph-convolution layers.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Width of every hidden layer.
        output_dim: Number of output classes.
        num_layers: Total number of graph-convolution layers. An input layer
            and an output layer are always created, so values below 2 still
            produce a two-layer network.
        dropout: Dropout probability applied after every layer except the last.
    """
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, dropout=0.5):
        super().__init__()
        # Input and output layers always exist; only the hidden count varies.
        num_hidden = max(num_layers - 2, 0)
        dims = [input_dim] + [hidden_dim] * (num_hidden + 1) + [output_dim]
        self.layers = nn.ModuleList([
            GraphConvolutionLayer(d_in, d_out)
            for d_in, d_out in zip(dims[:-1], dims[1:])
        ])

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, adj):
        """Return per-node log-probabilities of shape ``[num_nodes, output_dim]``.

        Args:
            x: Node features, ``[num_nodes, input_dim]``.
            adj: Adjacency matrix, ``[num_nodes, num_nodes]``.
        """
        # Every layer but the last is followed by ReLU and dropout.
        for layer in self.layers[:-1]:
            x = self.dropout(F.relu(layer(x, adj)))

        x = self.layers[-1](x, adj)
        return F.log_softmax(x, dim=1)

In [None]:
# Create and visualize a simple graph
num_nodes = 20
num_features = 16
num_classes = 4

# Create a random graph
G = nx.erdos_renyi_graph(num_nodes, 0.3)
# Build the dense float32 adjacency directly. This avoids going through a
# SciPy sparse matrix / np.matrix and the legacy torch.FloatTensor constructor.
# NOTE: this is the raw adjacency (no self-loops, no degree normalisation).
adj_matrix = nx.to_numpy_array(G, dtype=np.float32)
adj_tensor = torch.from_numpy(adj_matrix).to_sparse()

# Visualize the graph
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G)
nx.draw(G, pos, with_labels=True, node_color='lightblue',
        node_size=500, font_size=10, edge_color='gray')
plt.title("Example Graph Structure")
plt.show()

# Random node features
x = torch.randn(num_nodes, num_features)

# Create and test GCN
gcn = GCN(num_features, 32, num_classes)
output = gcn(x, adj_tensor)
print(f"GCN output shape: {output.shape}")

## 2. Graph Attention Networks (GAT)

GATs use attention mechanisms to weigh the importance of neighboring nodes.

In [None]:
class GraphAttentionLayer(nn.Module):
    """Single-head graph attention layer.

    Args:
        in_features: Size of each input node feature vector.
        out_features: Size of each output node feature vector.
        dropout: Dropout probability applied to the attention weights.
        alpha: Negative slope of the LeakyReLU applied to the raw scores.
    """
    def __init__(self, in_features, out_features, dropout=0.6, alpha=0.2):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.dropout = dropout
        self.alpha = alpha

        # Allocated uninitialised; Xavier init below fills every entry.
        self.W = nn.Parameter(torch.empty(in_features, out_features))
        self.a = nn.Parameter(torch.empty(2 * out_features, 1))

        self.leakyrelu = nn.LeakyReLU(self.alpha)

        # Initialize weights
        nn.init.xavier_uniform_(self.W)
        nn.init.xavier_uniform_(self.a)

    def forward(self, x, adj):
        """Attend over each node's neighbours.

        Args:
            x: Node features, ``[N, in_features]``.
            adj: Adjacency matrix, ``[N, N]``, sparse or dense; entries ``> 0``
                mark the node pairs that may attend to each other.

        Returns:
            Tuple ``(h_prime, attention)``: the ELU-activated output features
            ``[N, out_features]`` and the attention matrix ``[N, N]`` (after
            dropout when the module is in training mode).
        """
        h = torch.mm(x, self.W)  # [N, out_features]

        # a^T [h_i || h_j] splits into a_src^T h_i + a_dst^T h_j, so the
        # [N, N, 2F] tensor of concatenated pairs never has to be built.
        src_score = torch.mm(h, self.a[:self.out_features])   # [N, 1]
        dst_score = torch.mm(h, self.a[self.out_features:])   # [N, 1]
        e = self.leakyrelu(src_score + dst_score.t())          # [N, N]

        # Mask attention scores
        dense_adj = adj if adj.layout == torch.strided else adj.to_dense()
        # A large finite negative (not -inf) keeps a row with no neighbours
        # from turning into NaN; such a row becomes a uniform distribution.
        attention = torch.where(dense_adj > 0, e, torch.full_like(e, -9e15))
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)

        h_prime = torch.matmul(attention, h)
        return F.elu(h_prime), attention

class GAT(nn.Module):
    """Graph Attention Network: parallel attention heads plus an output layer.

    Args:
        input_dim: Size of each input node feature vector.
        hidden_dim: Output size of each attention head.
        output_dim: Number of output classes.
        num_heads: Number of parallel attention heads in the first layer.
        dropout: Dropout probability used inside every attention layer and on
            the concatenated head outputs.
    """
    def __init__(self, input_dim, hidden_dim, output_dim, num_heads=8, dropout=0.6):
        super().__init__()
        # One rate shared by the layers and the between-layer dropout, instead
        # of a literal that has to be kept in sync with the layer default.
        self.dropout = dropout
        self.attention_heads = nn.ModuleList([
            GraphAttentionLayer(input_dim, hidden_dim, dropout=dropout)
            for _ in range(num_heads)
        ])
        self.out_att = GraphAttentionLayer(hidden_dim * num_heads, output_dim, dropout=dropout)

    def forward(self, x, adj):
        """Run the network.

        Args:
            x: Node features, ``[N, input_dim]``.
            adj: Adjacency matrix, ``[N, N]``.

        Returns:
            Tuple ``(log_probs, attentions)``: per-node log-probabilities
            ``[N, output_dim]`` and a list holding the attention matrix of each
            first-layer head.
        """
        # Multi-head attention
        head_results = [head(x, adj) for head in self.attention_heads]
        att_outputs = [features for features, _ in head_results]
        attentions = [weights for _, weights in head_results]

        x = torch.cat(att_outputs, dim=1)
        x = F.dropout(x, self.dropout, training=self.training)
        # The output layer's attention matrix is not part of the return value.
        x, _ = self.out_att(x, adj)

        return F.log_softmax(x, dim=1), attentions

In [None]:
# Test GAT and visualize attention
gat = GAT(num_features, 8, num_classes, num_heads=4)
# Switch to inference mode: in training mode the attention layer applies
# dropout to the attention matrix, so the heatmap would show randomly zeroed
# and rescaled weights instead of the actual attention distribution.
gat.eval()
with torch.no_grad():
    output, attentions = gat(x, adj_tensor)
print(f"GAT output shape: {output.shape}")

# Visualize attention weights for one head
if attentions:
    attn = attentions[0].detach().cpu().numpy()
    plt.figure(figsize=(10, 8))
    plt.imshow(attn, cmap='hot', interpolation='nearest')
    plt.colorbar()
    plt.title('Graph Attention Weights (Head 1)')
    plt.xlabel('Node')
    plt.ylabel('Node')
    plt.show()

## 3. Vision Transformer (ViT)

Vision Transformers apply the transformer architecture directly to sequences of image patches.

In [None]:
class PatchEmbedding(nn.Module):
    """Split an image into square patches and linearly embed each one.

    Args:
        img_size: Side length of the (square) input image, used to derive
            ``num_patches``.
        patch_size: Side length of each square patch.
        in_channels: Number of image channels.
        embed_dim: Size of the embedding produced for every patch.
    """
    def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        patches_per_side = img_size // patch_size
        self.num_patches = patches_per_side ** 2

        # Each patch is flattened to p1 * p2 * c values before the projection.
        flat_patch_dim = patch_size * patch_size * in_channels
        split_into_patches = Rearrange(
            'b c (h p1) (w p2) -> b (h w) (p1 p2 c)',
            p1=patch_size, p2=patch_size,
        )
        embed_patches = nn.Linear(flat_patch_dim, embed_dim)
        self.projection = nn.Sequential(split_into_patches, embed_patches)

    def forward(self, x):
        """Map images ``[b, c, H, W]`` to patch embeddings ``[b, patches, embed_dim]``."""
        patch_tokens = self.projection(x)
        return patch_tokens

class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        embed_dim: Size of each token embedding.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        dropout: Dropout probability for the attention weights and the output
            projection.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embed_dim``.
    """
    def __init__(self, embed_dim, num_heads=8, dropout=0.1):
        super().__init__()
        # Validate before deriving head_dim, and with a real exception:
        # assert statements are stripped when Python runs with -O.
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # Scaling factor 1 / sqrt(head_dim) applied to the attention scores.
        self.scale = self.head_dim ** -0.5

        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.attention_dropout = nn.Dropout(dropout)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.proj_dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply self-attention.

        Args:
            x: Token embeddings, ``[B, N, embed_dim]``.

        Returns:
            Tuple ``(out, attn)``: the projected output ``[B, N, embed_dim]``
            and the attention weights ``[B, num_heads, N, N]`` (after dropout
            when the module is in training mode).
        """
        B, N, C = x.shape
        # [B, N, 3*C] -> [3, B, heads, N, head_dim]
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attention_dropout(attn)

        # Merge the heads back into a single embedding per token.
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_dropout(x)
        return x, attn

In [None]:
class TransformerBlock(nn.Module):
    """Transformer encoder block: self-attention and an MLP, each applied to a
    layer-normalised input and added back through a residual connection.

    Args:
        embed_dim: Size of each token embedding.
        num_heads: Number of attention heads.
        mlp_ratio: Hidden width of the MLP as a multiple of ``embed_dim``.
        dropout: Dropout probability used in the attention and the MLP.
    """
    def __init__(self, embed_dim, num_heads, mlp_ratio=4.0, dropout=0.1):
        super().__init__()
        hidden_width = int(embed_dim * mlp_ratio)

        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)

        feed_forward = [
            nn.Linear(embed_dim, hidden_width),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_width, embed_dim),
            nn.Dropout(dropout),
        ]
        self.mlp = nn.Sequential(*feed_forward)

    def forward(self, x):
        """Return ``(tokens, attention_weights)`` for input tokens ``x``."""
        # Attention sub-layer with residual connection.
        attended, attn_weights = self.attn(self.norm1(x))
        after_attention = x + attended
        # Feed-forward sub-layer with residual connection.
        after_mlp = after_attention + self.mlp(self.norm2(after_attention))
        return after_mlp, attn_weights

class VisionTransformer(nn.Module):
    """Vision Transformer classifier built from patch embeddings, a learnable
    CLS token, learnable position embeddings and a stack of transformer blocks.

    Args:
        img_size: Side length of the (square) input image.
        patch_size: Side length of each square patch.
        in_channels: Number of image channels.
        num_classes: Number of output classes.
        embed_dim: Size of each token embedding.
        depth: Number of transformer blocks.
        num_heads: Number of attention heads per block.
    """
    def __init__(self, img_size=224, patch_size=16, in_channels=3,
                 num_classes=1000, embed_dim=768, depth=12, num_heads=12):
        super().__init__()
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        # One position per patch plus one for the CLS token.
        sequence_length = self.patch_embed.num_patches + 1

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, sequence_length, embed_dim))
        self.pos_drop = nn.Dropout(0.1)

        encoder_layers = [TransformerBlock(embed_dim, num_heads) for _ in range(depth)]
        self.blocks = nn.ModuleList(encoder_layers)

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

        # Initialize the learnable embeddings
        for learned_embedding in (self.pos_embed, self.cls_token):
            nn.init.trunc_normal_(learned_embedding, std=0.02)

    def forward(self, x):
        """Classify a batch of images.

        Returns:
            Tuple ``(logits, attention_maps)``: class logits
            ``[batch, num_classes]`` and a list with the attention weights
            returned by each transformer block.
        """
        batch_size = x.shape[0]
        patch_tokens = self.patch_embed(x)

        # Prepend the CLS token, then add position information.
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        tokens = torch.cat((cls_tokens, patch_tokens), dim=1) + self.pos_embed
        tokens = self.pos_drop(tokens)

        attention_maps = []
        for block in self.blocks:
            tokens, layer_attention = block(tokens)
            attention_maps.append(layer_attention)

        # Only the CLS token (position 0) feeds the classification head.
        cls_representation = self.norm(tokens)[:, 0]
        logits = self.head(cls_representation)
        return logits, attention_maps

In [None]:
# Create a small ViT and visualize patches
# Image and patch sizes are defined once so the model, the test image and the
# patch-grid overlay below cannot drift apart.
img_size = 32
patch_size = 4
vit = VisionTransformer(
    img_size=img_size,
    patch_size=patch_size,
    in_channels=3,
    num_classes=10,
    embed_dim=192,
    depth=6,
    num_heads=6
).to(device)

# Test with random image
img = torch.randn(1, 3, img_size, img_size).to(device)
output, attention_maps = vit(img)
print(f"ViT output shape: {output.shape}")
print(f"ViT parameters: {sum(p.numel() for p in vit.parameters()) / 1e6:.2f}M")

# Visualize how image is divided into patches
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Original image (using random data for visualization)
img_vis = img[0].cpu().permute(1, 2, 0).numpy()
# Rescale to [0, 1] so imshow can display the random values as colours.
img_vis = (img_vis - img_vis.min()) / (img_vis.max() - img_vis.min())
axes[0].imshow(img_vis)
axes[0].set_title('Original Image')
axes[0].axis('off')

# Show patch grid
axes[1].imshow(img_vis)
# imshow centres pixel k on coordinate k, so the edge between pixels k-1 and k
# is at k - 0.5; drawing at k would cut through the middle of a pixel.
for boundary in range(0, img_size, patch_size):
    axes[1].axhline(boundary - 0.5, color='red', linewidth=0.5)
    axes[1].axvline(boundary - 0.5, color='red', linewidth=0.5)
axes[1].set_title(f'Image Patches ({patch_size}x{patch_size})')
axes[1].axis('off')

plt.tight_layout()
plt.show()

## 4. EfficientNet Architecture

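EfficientNet uses compound scaling: instead of tuning width, depth, and input resolution independently, it scales all three together with a fixed set of ratios, which yields better accuracy for a given compute budget.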

In [None]:
class SqueezeExcite(nn.Module):
    """Squeeze-and-Excitation block: rescales each channel by a learned gate.

    Args:
        in_channels: Number of channels of the input feature map.
        squeeze_ratio: Fraction of ``in_channels`` used for the bottleneck
            (at least one channel is always kept).
    """
    def __init__(self, in_channels, squeeze_ratio=0.25):
        super().__init__()
        bottleneck_channels = int(in_channels * squeeze_ratio)
        if bottleneck_channels < 1:
            bottleneck_channels = 1
        # 1x1 convolutions act as per-channel fully connected layers.
        self.fc1 = nn.Conv2d(in_channels, bottleneck_channels, 1)
        self.fc2 = nn.Conv2d(bottleneck_channels, in_channels, 1)

    def forward(self, x):
        """Return ``x`` with every channel multiplied by its gate in (0, 1)."""
        # Squeeze: average over the two spatial dimensions.
        pooled = x.mean((2, 3), keepdim=True)
        # Excite: bottleneck MLP followed by a sigmoid gate.
        gate = torch.sigmoid(self.fc2(F.relu(self.fc1(pooled))))
        return x * gate

class MBConvBlock(nn.Module):
    """Mobile inverted bottleneck block: optional 1x1 expansion, depthwise
    convolution with squeeze-and-excitation, then a 1x1 projection.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        expand_ratio: Multiplier giving the hidden channel count; the expansion
            convolution is skipped when it equals 1.
        stride: Stride of the depthwise convolution.
        kernel_size: Kernel size of the depthwise convolution.
    """
    def __init__(self, in_channels, out_channels, expand_ratio, stride, kernel_size=3):
        super().__init__()
        self.stride = stride
        # The skip connection needs matching input and output shapes.
        self.use_residual = in_channels == out_channels and stride == 1

        hidden_channels = in_channels * expand_ratio

        # Expansion (pointwise) stage
        expansion = []
        if expand_ratio != 1:
            expansion = [
                nn.Conv2d(in_channels, hidden_channels, 1, bias=False),
                nn.BatchNorm2d(hidden_channels),
                nn.SiLU(),
            ]

        # Depthwise stage: groups == channels gives one filter per channel.
        depthwise = [
            nn.Conv2d(hidden_channels, hidden_channels, kernel_size,
                      stride=stride, padding=kernel_size // 2,
                      groups=hidden_channels, bias=False),
            nn.BatchNorm2d(hidden_channels),
            nn.SiLU(),
            SqueezeExcite(hidden_channels),
        ]

        # Output projection (no activation afterwards)
        projection = [
            nn.Conv2d(hidden_channels, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
        ]

        self.conv = nn.Sequential(*expansion, *depthwise, *projection)

    def forward(self, x):
        """Apply the block, adding the input back when the residual is enabled."""
        transformed = self.conv(x)
        return x + transformed if self.use_residual else transformed

In [None]:
# Demonstrate compound scaling
def calculate_scaled_params(width_mult, depth_mult, resolution_mult):
    """Estimate the relative cost of a compound-scaled network.

    The estimates are simplified proportionalities, not exact counts:
    convolution weights grow with ``depth * width**2`` and do not depend on the
    input resolution, while compute additionally grows with the number of
    pixels, i.e. ``depth * width**2 * resolution**2``.

    Args:
        width_mult: Channel-width multiplier relative to the baseline.
        depth_mult: Layer-count multiplier relative to the baseline.
        resolution_mult: Input-resolution multiplier relative to the baseline
            resolution of 224.

    Returns:
        Dict with keys ``'width'``, ``'depth'``, ``'resolution'`` (integer
        pixels), ``'relative_params'`` and ``'relative_flops'``, where the two
        relative values are 1.0 for the baseline.
    """
    base_width = 1.0
    base_depth = 1.0
    base_resolution = 224

    scaled_width = base_width * width_mult
    scaled_depth = base_depth * depth_mult
    scaled_resolution = int(base_resolution * resolution_mult)

    # Each conv layer has (channels_in * channels_out) weights per kernel tap,
    # hence the quadratic dependence on width; resolution adds no weights.
    relative_params = scaled_depth * scaled_width ** 2
    # Every weight is applied once per output pixel, so compute also scales
    # with the squared resolution ratio (using the truncated pixel size).
    relative_flops = relative_params * (scaled_resolution / base_resolution) ** 2

    return {
        'width': scaled_width,
        'depth': scaled_depth,
        'resolution': scaled_resolution,
        'relative_params': relative_params,
        'relative_flops': relative_flops
    }

# (width, depth, resolution) multipliers for each EfficientNet variant
# NOTE(review): the resolution multipliers look like approximations; the
# published input sizes are 224/240/260/300/380/456/528/600 - confirm.
efficientnet_configs = dict(
    B0=(1.0, 1.0, 1.0),
    B1=(1.0, 1.1, 1.15),
    B2=(1.1, 1.2, 1.3),
    B3=(1.2, 1.4, 1.5),
    B4=(1.4, 1.8, 1.8),
    B5=(1.6, 2.2, 2.1),
    B6=(1.8, 2.6, 2.4),
    B7=(2.0, 3.1, 2.7),
)

print("EfficientNet Compound Scaling:")
print("-" * 60)
# One summary line per variant.
for name, multipliers in efficientnet_configs.items():
    config = calculate_scaled_params(*multipliers)
    summary = ", ".join([
        f"Width={config['width']:.1f}x",
        f"Depth={config['depth']:.1f}x",
        f"Resolution={config['resolution']}",
        f"RelativeParams={config['relative_params']:.1f}x",
    ])
    print(f"{name}: {summary}")

## 5. Neural Ordinary Differential Equations (Neural ODEs)

Neural ODEs parameterize the continuous dynamics of hidden states using neural networks.

In [None]:
class ODEFunc(nn.Module):
    """Learned right-hand side ``dx/dt = f(x)`` of a Neural ODE.

    Args:
        dim: Size of the state vector; the hidden layer is twice as wide.
    """
    def __init__(self, dim):
        super().__init__()
        hidden_dim = dim * 2
        stages = [
            nn.Linear(dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, dim),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, t, x):
        """Return the state derivative at ``x``.

        ``t`` is accepted to match the solver's calling convention but is not
        used: the dynamics do not depend on time.
        """
        derivative = self.net(x)
        return derivative

class NeuralODE(nn.Module):
    """Neural ODE block that integrates ``dx/dt = func(t, x)`` with a
    fixed-step Euler or 4th-order Runge-Kutta solver.

    Args:
        func: Callable ``func(t, x)`` returning the state derivative.
        solver: ``'euler'`` or ``'rk4'``.
        step_size: Maximum integration step; the actual step is shrunk so that
            a whole number of steps covers the requested time span exactly.

    Raises:
        ValueError: If ``step_size`` is not positive.
    """
    def __init__(self, func, solver='euler', step_size=0.1):
        super().__init__()
        if step_size <= 0:
            raise ValueError(f"step_size must be positive, got {step_size}")
        self.func = func
        self.solver = solver
        self.step_size = step_size

    def _step_plan(self, t_span):
        """Return ``(t0, h, num_steps)`` such that ``t0 + num_steps * h == t1``."""
        t0, t1 = t_span
        span = t1 - t0
        if span == 0:
            return t0, 0.0, 0
        # Round up so a trailing partial step is not dropped. The small
        # tolerance stops float noise such as 9.999999999999998 or
        # 10.000000000000002 from changing the step count.
        num_steps = max(1, math.ceil(abs(span) / self.step_size - 1e-9))
        # A signed step also lets t1 < t0 integrate backwards in time.
        return t0, span / num_steps, num_steps

    def euler_solve(self, x, t_span):
        """Integrate ``x`` over ``t_span = (t0, t1)`` with the explicit Euler method."""
        t0, h, num_steps = self._step_plan(t_span)
        for i in range(num_steps):
            # Recompute t from the step index instead of accumulating t += h,
            # which would let rounding error drift over many steps.
            t = t0 + i * h
            x = x + h * self.func(t, x)

        return x

    def rk4_solve(self, x, t_span):
        """Integrate ``x`` over ``t_span = (t0, t1)`` with classic 4th-order Runge-Kutta."""
        t0, h, num_steps = self._step_plan(t_span)
        for i in range(num_steps):
            t = t0 + i * h
            k1 = self.func(t, x)
            k2 = self.func(t + h / 2, x + h * k1 / 2)
            k3 = self.func(t + h / 2, x + h * k2 / 2)
            k4 = self.func(t + h, x + h * k3)

            x = x + h * (k1 + 2 * k2 + 2 * k3 + k4) / 6

        return x

    def forward(self, x, t_span):
        """Integrate ``x`` from ``t_span[0]`` to ``t_span[1]`` with the configured solver.

        Raises:
            ValueError: If ``self.solver`` is not ``'euler'`` or ``'rk4'``.
        """
        if self.solver == 'euler':
            return self.euler_solve(x, t_span)
        if self.solver == 'rk4':
            return self.rk4_solve(x, t_span)
        raise ValueError(f"Unknown solver: {self.solver}")

In [None]:
# Visualize Neural ODE dynamics
dim = 2
ode_func = ODEFunc(dim)
neural_ode = NeuralODE(ode_func, solver='rk4', step_size=0.01)

# Create a grid of initial points
n_points = 20
x_range = torch.linspace(-2, 2, n_points)
y_range = torch.linspace(-2, 2, n_points)
# Pass the indexing mode explicitly: relying on the implicit default emits a
# deprecation warning in recent PyTorch releases.
grid_x, grid_y = torch.meshgrid(x_range, y_range, indexing='ij')
initial_points = torch.stack([grid_x.flatten(), grid_y.flatten()], dim=1)

# Solve the ODE for all points at once. The solver works on a whole batch, so
# there is no need for one solve per point, and no_grad avoids recording an
# autograd graph that this visualization never uses.
t_span = (0.0, 1.0)
num_vis_steps = 10  # number of trajectory segments recorded for plotting
t_start, t_end = t_span
states = [initial_points]
with torch.no_grad():
    for j in range(num_vis_steps):
        t0 = t_start + (t_end - t_start) * j / num_vis_steps
        t1 = t_start + (t_end - t_start) * (j + 1) / num_vis_steps
        states.append(neural_ode(states[-1], (t0, t1)))

# One [num_vis_steps + 1, dim] trajectory per initial point
trajectories = list(torch.stack(states, dim=1))

# Plot the trajectories
plt.figure(figsize=(10, 8))

for traj in trajectories[::20]:  # Plot every 20th trajectory for clarity
    traj_np = traj.detach().numpy()
    plt.plot(traj_np[:, 0], traj_np[:, 1], 'b-', alpha=0.5, linewidth=0.5)
    plt.plot(traj_np[0, 0], traj_np[0, 1], 'go', markersize=3)  # Start
    plt.plot(traj_np[-1, 0], traj_np[-1, 1], 'ro', markersize=3)  # End

plt.title('Neural ODE Dynamics')
plt.xlabel('x1')
plt.ylabel('x2')
plt.grid(True, alpha=0.3)
plt.axis('equal')
plt.show()

## 6. Multimodal Architecture

Modern AI systems often need to process multiple modalities (vision, language, audio) together.

In [None]:
class MultimodalTransformer(nn.Module):
    """Simple multimodal transformer for vision and language.

    Image patches and text tokens are embedded separately, tagged with a
    learned modality-type embedding, concatenated into one sequence and
    processed by a shared stack of transformer blocks.

    Args:
        vocab_size: Size of the text vocabulary.
        max_seq_len: Maximum number of text tokens per sample.
        img_size: Side length of the (square) input image.
        patch_size: Side length of each square image patch.
        embed_dim: Size of each token embedding.
        depth: Number of shared transformer blocks.
        num_heads: Number of attention heads per block.
        num_classes: Number of output classes.
    """
    def __init__(self, vocab_size, max_seq_len, img_size=224, patch_size=16,
                 embed_dim=512, depth=6, num_heads=8, num_classes=1000):
        super().__init__()
        self.max_seq_len = max_seq_len

        # Vision encoder
        # NOTE: patches receive only the modality-type embedding below; no
        # positional embedding is added to them.
        self.vision_embed = PatchEmbedding(img_size, patch_size, 3, embed_dim)

        # Language encoder
        self.text_embed = nn.Embedding(vocab_size, embed_dim)
        self.text_pos_embed = nn.Parameter(torch.zeros(1, max_seq_len, embed_dim))

        # Modality embeddings
        self.vision_type_embed = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.text_type_embed = nn.Parameter(torch.zeros(1, 1, embed_dim))

        # Shared transformer
        self.blocks = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads) for _ in range(depth)
        ])

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

        # Initialize the learned embeddings the same way VisionTransformer
        # does; left at zero they carry no position or modality information
        # until training has updated them.
        for embedding in (self.text_pos_embed, self.vision_type_embed, self.text_type_embed):
            nn.init.trunc_normal_(embedding, std=0.02)

    def forward(self, images=None, text_ids=None, return_embeddings=False):
        """Encode images and/or text.

        Args:
            images: Optional image batch, ``[B, 3, H, W]``.
            text_ids: Optional token-id batch, ``[B, seq_len]`` with
                ``seq_len <= max_seq_len``.
            return_embeddings: If true, return the per-token embeddings and
                attention maps instead of class logits.

        Returns:
            Class logits ``[B, num_classes]``, or, when ``return_embeddings``
            is true, a tuple ``(embeddings, attention_maps)`` where
            ``embeddings`` is ``[B, tokens, embed_dim]`` and ``attention_maps``
            holds the attention weights returned by each block.

        Raises:
            ValueError: If neither input is given, or if the text is longer
                than ``max_seq_len``.
        """
        if images is None and text_ids is None:
            raise ValueError("At least one of `images` or `text_ids` must be provided")

        embeddings = []

        # Process vision input
        if images is not None:
            vision_embeds = self.vision_embed(images)
            vision_embeds = vision_embeds + self.vision_type_embed
            embeddings.append(vision_embeds)

        # Process text input
        if text_ids is not None:
            text_embeds = self.text_embed(text_ids)
            seq_len = text_embeds.size(1)
            # Fail with a clear message instead of an opaque broadcast error
            # when the sequence outgrows the positional table.
            if seq_len > self.max_seq_len:
                raise ValueError(
                    f"Text length {seq_len} exceeds max_seq_len {self.max_seq_len}"
                )
            text_embeds = text_embeds + self.text_pos_embed[:, :seq_len]
            text_embeds = text_embeds + self.text_type_embed
            embeddings.append(text_embeds)

        # Concatenate modalities along the token dimension (vision first)
        x = torch.cat(embeddings, dim=1)

        # Apply transformer
        attention_maps = []
        for block in self.blocks:
            x, attn = block(x)
            attention_maps.append(attn)

        x = self.norm(x)

        if return_embeddings:
            return x, attention_maps

        x = x.mean(dim=1)  # Global average pooling over all tokens
        x = self.head(x)

        return x

In [None]:
# Test multimodal model and visualize cross-modal attention
multimodal = MultimodalTransformer(
    vocab_size=10000,
    max_seq_len=128,
    img_size=32,
    patch_size=4,
    embed_dim=256,
    depth=4,
    num_classes=100
)
# Inference mode: in training mode dropout is applied to the attention
# weights, which would randomly zero entries in the heatmaps below.
multimodal.eval()

# Test with both modalities
images = torch.randn(2, 3, 32, 32)
text_ids = torch.randint(0, 10000, (2, 20))
with torch.no_grad():
    embeddings, attention_maps = multimodal(images=images, text_ids=text_ids, return_embeddings=True)

# Derive the modality split from the model and the inputs rather than
# repeating the numbers by hand.
n_patches = multimodal.vision_embed.num_patches
n_text = text_ids.shape[1]

print(f"Multimodal embeddings shape: {embeddings.shape}")
print(f"Number of patches: {n_patches}")
print(f"Number of text tokens: {n_text}")
print(f"Total sequence length: {embeddings.shape[1]}")

# Visualize cross-modal attention
if attention_maps:
    # Get attention from last layer, first sample, first head
    attn = attention_maps[-1][0, 0].detach().cpu().numpy()

    # Vision tokens come first in the sequence, text tokens follow.
    vision = slice(0, n_patches)
    text = slice(n_patches, None)

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    # (axis, query rows, key columns, title, x label, y label)
    quadrants = [
        (axes[0, 0], vision, vision, 'Vision → Vision', 'Vision Patches', 'Vision Patches'),
        (axes[0, 1], vision, text, 'Vision → Text', 'Text Tokens', 'Vision Patches'),
        (axes[1, 0], text, vision, 'Text → Vision', 'Vision Patches', 'Text Tokens'),
        (axes[1, 1], text, text, 'Text → Text', 'Text Tokens', 'Text Tokens'),
    ]
    for ax, rows, cols, title, xlabel, ylabel in quadrants:
        ax.imshow(attn[rows, cols], cmap='hot')
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

    plt.tight_layout()
    plt.show()

## Model Architecture Comparison

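Let's compare the parameter counts of the models we instantiated above. EfficientNet is not included because only its building blocks were defined, not a complete model.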

In [None]:
# Compare model sizes and typical use cases
architectures = {
    "GCN": (gcn, "Graph-structured data"),
    "GAT": (gat, "Graph data with attention"),
    "Vision Transformer": (vit, "Image classification"),
    "Neural ODE": (neural_ode, "Continuous dynamics"),
    "Multimodal": (multimodal, "Multi-input processing")
}

# Count the parameters of each model once; reused by the table and the chart.
param_counts = {
    name: sum(p.numel() for p in model.parameters())
    for name, (model, _) in architectures.items()
}

print("Model Architecture Comparison")
print("=" * 70)
print(f"{'Architecture':<20} {'Parameters':<15} {'Best Use Case':<35}")
print("-" * 70)

for name, (_, use_case) in architectures.items():
    print(f"{name:<20} {param_counts[name]/1e6:>8.2f}M      {use_case:<35}")

# Plot parameter counts (in millions)
plt.figure(figsize=(10, 6))
names = list(architectures)
params = [param_counts[name] / 1e6 for name in names]

bar_colors = ['skyblue', 'lightgreen', 'lightcoral', 'lightyellow', 'lightpink']
bars = plt.bar(names, params, color=bar_colors)
plt.ylabel('Parameters (Millions)')
plt.title('Model Size Comparison')
plt.xticks(rotation=45)

# Add value labels just above each bar
for bar, millions in zip(bars, params):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 0.01
    plt.text(label_x, label_y, f'{millions:.2f}M', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## Summary

In this tutorial, we explored several advanced model architectures:

1. **Graph Neural Networks (GCN, GAT)**: Process graph-structured data by aggregating information from neighboring nodes
2. **Vision Transformers (ViT)**: Apply self-attention to image patches for powerful visual understanding
3. **EfficientNet**: Use compound scaling to balance model size and accuracy
4. **Neural ODEs**: Model continuous dynamics with neural networks
5. **Multimodal Architectures**: Process multiple input modalities simultaneously

### Key Takeaways:
- Different architectures excel at different tasks
- Attention mechanisms are powerful across domains
- Efficient scaling is crucial for practical deployment
- Multimodal learning is increasingly important
- Choose architecture based on your data structure and task requirements

These architectures represent the cutting edge of deep learning and continue to push the boundaries of what's possible with neural networks!