# Tutorial 19: Neural Architecture Search

This tutorial explores Neural Architecture Search (NAS) techniques for automatically designing optimal neural network architectures.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from typing import List, Dict, Tuple, Optional
import random
import copy
import time
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
from sklearn.manifold import TSNE
import networkx as nx

# Reproducibility: the searches below draw from Python's `random`, NumPy's
# global RNG and PyTorch (weight init, torch.randn), so seed all three once.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

## 1. Search Space Design

The first step in NAS is defining the search space: the set of candidate operations, and therefore the set of architectures that can be built from them.

In [None]:
# Define basic operations for search space
class ConvBlock(nn.Module):
    """Conv2d followed by BatchNorm2d and ReLU.

    Args:
        in_channels: number of channels in the input feature map.
        out_channels: number of channels produced by the convolution.
        kernel_size: side of the square kernel; padding is ``kernel_size // 2``.
        stride: stride of the convolution (default 1).
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=kernel_size // 2,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply convolution, then batch norm, then ReLU."""
        out = self.conv(x)
        out = self.bn(out)
        return self.relu(out)

class DepthwiseSeparableConv(nn.Module):
    """Depthwise conv + 1x1 pointwise conv, each followed by BatchNorm and ReLU.

    Args:
        in_channels: channels of the input; also the number of depthwise groups.
        out_channels: channels produced by the pointwise convolution.
        kernel_size: side of the square depthwise kernel; padding is
            ``kernel_size // 2``.
        stride: stride of the depthwise convolution (default 1).
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1):
        super().__init__()
        # One filter per input channel (groups == in_channels).
        self.depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size,
            stride=stride,
            padding=kernel_size // 2,
            groups=in_channels,
        )
        # 1x1 convolution mixes information across channels.
        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the depthwise stage, then the pointwise stage."""
        spatial = self.relu(self.bn1(self.depthwise(x)))
        mixed = self.bn2(self.pointwise(spatial))
        return self.relu(mixed)

class IdentityBlock(nn.Module):
    """Skip connection: returns its input unchanged.

    ``channels`` is stored on the instance but not used by ``forward``.
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels

    def forward(self, x):
        """Return ``x`` itself (no copy, no computation)."""
        return x

# Define search space
# Maps an operation name to a factory that takes a channel count `c` and
# returns a module. Every factory keeps the channel count at `c` and uses
# stride 1, so operations can be chained in any order.
# The insertion order matters: DARTSCell.get_genotype indexes
# list(PRIMITIVES.keys()) with positions that come from PRIMITIVES.values().
PRIMITIVES = {
    'conv3x3': lambda c: ConvBlock(c, c, 3),
    'conv5x5': lambda c: ConvBlock(c, c, 5),
    'dw_conv3x3': lambda c: DepthwiseSeparableConv(c, c, 3),
    'dw_conv5x5': lambda c: DepthwiseSeparableConv(c, c, 5),
    'max_pool3x3': lambda c: nn.MaxPool2d(3, stride=1, padding=1),
    'avg_pool3x3': lambda c: nn.AvgPool2d(3, stride=1, padding=1),
    'identity': lambda c: IdentityBlock(c)
}

In [None]:
# Visualize search space
# Scatter plot of every primitive by relative parameter count vs relative FLOPs.
fig, ax = plt.subplots(figsize=(10, 6))

# Operation properties
# Hand-written relative costs used only for this plot; they are not measured
# from the modules built by PRIMITIVES.
op_properties = {
    'conv3x3': {'params': 9, 'flops': 9, 'type': 'conv'},
    'conv5x5': {'params': 25, 'flops': 25, 'type': 'conv'},
    'dw_conv3x3': {'params': 3, 'flops': 3, 'type': 'conv'},
    'dw_conv5x5': {'params': 5, 'flops': 5, 'type': 'conv'},
    'max_pool3x3': {'params': 0, 'flops': 1, 'type': 'pool'},
    'avg_pool3x3': {'params': 0, 'flops': 1, 'type': 'pool'},
    'identity': {'params': 0, 'flops': 0, 'type': 'skip'}
}

# Keep the three lists aligned with the key order of PRIMITIVES.
ops = list(PRIMITIVES.keys())
params = [op_properties[op]['params'] for op in ops]
flops = [op_properties[op]['flops'] for op in ops]
# Blue = convolution, green = pooling, red = anything else (here: skip).
colors = ['#3498db' if op_properties[op]['type'] == 'conv' else 
          '#2ecc71' if op_properties[op]['type'] == 'pool' else 
          '#e74c3c' for op in ops]

scatter = ax.scatter(params, flops, s=200, c=colors, alpha=0.7, edgecolors='black', linewidth=2)

# Label each point with its operation name.
# NOTE(review): max_pool3x3 and avg_pool3x3 share the coordinates (0, 1), so
# their markers and labels are drawn on top of each other.
for i, op in enumerate(ops):
    ax.annotate(op, (params[i], flops[i]), xytext=(5, 5), 
                textcoords='offset points', fontsize=10)

ax.set_xlabel('Relative Parameters', fontsize=12)
ax.set_ylabel('Relative FLOPs', fontsize=12)
ax.set_title('Search Space Operations', fontsize=14)
ax.grid(True, alpha=0.3)

# Add legend
# NOTE(review): this import lives mid-notebook rather than in the import cell.
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor='#3498db', label='Convolution'),
                   Patch(facecolor='#2ecc71', label='Pooling'),
                   Patch(facecolor='#e74c3c', label='Skip')]
ax.legend(handles=legend_elements, loc='upper right')

plt.tight_layout()
plt.show()

## 2. Architecture Representation

We need a way to represent and manipulate architectures.

In [None]:
class Architecture:
    """A sequential architecture described by an ordered list of operation names.

    The names are expected to be keys of ``PRIMITIVES``; ``mutate`` draws its
    replacement operations from there.
    """

    def __init__(self, layers: List[str]):
        # Copy so that later edits to the caller's sequence cannot silently
        # change this architecture (this also lets callers pass a tuple).
        self.layers = list(layers)

    def __repr__(self):
        return f"Architecture({' -> '.join(self.layers)})"

    def mutate(self, mutation_prob=0.1):
        """Return a mutated copy for evolutionary search.

        Each layer is independently replaced, with probability
        ``mutation_prob``, by an operation drawn uniformly from ``PRIMITIVES``.
        The draw may pick the operation that is already there.

        Returns:
            A new ``Architecture``; ``self`` is left untouched.
        """
        choices = list(PRIMITIVES.keys())
        new_layers = self.layers.copy()
        for i in range(len(new_layers)):
            if random.random() < mutation_prob:
                new_layers[i] = random.choice(choices)
        return Architecture(new_layers)

    def crossover(self, other: 'Architecture'):
        """Return a single-point crossover child of ``self`` and ``other``.

        The child takes ``self``'s layers before a random cut point and
        ``other``'s layers from the cut point onwards. The cut point is drawn
        from ``1 .. len(self.layers) - 1``, so both parents contribute.

        An architecture with fewer than two layers has no interior cut point
        (``random.randint(1, 0)`` would raise ``ValueError``), so in that case
        the child is simply a copy of ``self``.
        """
        if len(self.layers) < 2:
            return Architecture(self.layers)
        crossover_point = random.randint(1, len(self.layers) - 1)
        new_layers = self.layers[:crossover_point] + other.layers[crossover_point:]
        return Architecture(new_layers)

    def visualize(self):
        """Draw the architecture as a chain graph: Input -> layers -> Output."""
        G = nx.DiGraph()

        # Add nodes (insertion order is the chain order).
        G.add_node('input', label='Input')
        for i, layer in enumerate(self.layers):
            G.add_node(f'layer_{i}', label=layer)
        G.add_node('output', label='Output')

        # Connect consecutive nodes. Building the edges from the node list,
        # rather than hard-coding 'layer_0', keeps an empty architecture valid
        # (Input -> Output) instead of creating an unlabeled phantom node.
        chain = list(G.nodes())
        G.add_edges_from(zip(chain, chain[1:]))

        # Plot
        plt.figure(figsize=(12, 4))
        pos = nx.spring_layout(G, k=2, iterations=50)

        # Layer nodes are blue, the input/output terminals green.
        node_colors = ['lightblue' if 'layer' in node else 'lightgreen' for node in G.nodes()]
        nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=1500)
        nx.draw_networkx_edges(G, pos, edge_color='gray', arrows=True, arrowsize=20)

        # Draw the human-readable labels instead of the internal node ids.
        labels = {node: data['label'] for node, data in G.nodes(data=True)}
        nx.draw_networkx_labels(G, pos, labels, font_size=10)

        plt.title('Architecture Visualization')
        plt.axis('off')
        plt.tight_layout()
        plt.show()

In [None]:
# Example architectures
# Two hand-picked four-layer architectures used to demonstrate the API.
arch1 = Architecture(['conv3x3', 'max_pool3x3', 'conv5x5', 'identity'])
arch2 = Architecture(['dw_conv3x3', 'avg_pool3x3', 'dw_conv5x5', 'conv3x3'])

print("Architecture 1:")
print(arch1)
arch1.visualize()

print("\nMutation and Crossover Examples:")
# A high mutation probability (0.5) makes a visible change likely in the demo.
mutated = arch1.mutate(0.5)
print(f"Mutated: {mutated}")

# Child takes a prefix of arch1 and the matching suffix of arch2.
crossed = arch1.crossover(arch2)
print(f"Crossover: {crossed}")

## 3. Random Search

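The simplest search strategy is random search: sample architectures uniformly at random from the search space, evaluate each one, and keep the best. To keep the tutorial fast, the evaluation below is simulated with a noisy heuristic rather than real training.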

In [None]:
class NASModel(nn.Module):
    """Classifier assembled from an ``Architecture`` description.

    Layout: a 3x3 ``ConvBlock`` stem, then one ``PRIMITIVES`` operation per
    entry of ``architecture.layers``, then global average pooling and a linear
    classifier.

    Args:
        architecture: the architecture whose ``layers`` name the operations.
        input_channels: channels of the input (default 3).
        num_classes: size of the classifier output (default 10).
    """

    def __init__(self, architecture: Architecture, input_channels=3, num_classes=10):
        super().__init__()
        self.architecture = architecture

        # Width of every feature map after the stem; all primitives keep it.
        channels = 32

        # Stem first, then the searched operations in the order given.
        stem = ConvBlock(input_channels, channels, 3)
        searched = [PRIMITIVES[op_name](channels) for op_name in architecture.layers]

        self.features = nn.Sequential(stem, *searched)
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Linear(channels, num_classes)

    def forward(self, x):
        """Return class logits for a batch of inputs."""
        feats = self.features(x)
        pooled = self.global_pool(feats)
        # Collapse the 1x1 spatial dimensions left by the global pool.
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(flat)

def evaluate_architecture(architecture: Architecture, num_epochs=5):
    """Score an architecture with a cheap simulated evaluation.

    No training happens: the model is built only to count its parameters, and
    the accuracy is a heuristic plus Gaussian noise. ``num_epochs`` is accepted
    but not used.

    Returns:
        dict with keys ``'accuracy'`` (clipped to [0, 1]), ``'params'``
        (parameter count), ``'latency'`` (parameter count in millions scaled by
        a random factor in [0.8, 1.2)) and ``'architecture'`` (the input).
    """
    model = NASModel(architecture).to(device)
    num_params = sum(p.numel() for p in model.parameters())

    # Heuristic score: start from a base and reward individual operations.
    # 'conv' is matched as a substring, so the dw_conv* operations count too.
    score = 0.7
    for op in architecture.layers:
        bonus = 0.02 if 'conv' in op else (0.01 if op == 'identity' else 0.0)
        score += bonus

    # Models above one million parameters are penalised.
    if num_params > 1e6:
        score -= 0.1

    # Noise is drawn in this order: accuracy first, then latency.
    noisy_score = score + np.random.normal(0, 0.05)
    accuracy = np.clip(noisy_score, 0, 1)
    latency = num_params / 1e6 * np.random.uniform(0.8, 1.2)

    return {
        'accuracy': accuracy,
        'params': num_params,
        'latency': latency,
        'architecture': architecture
    }

In [None]:
# Random search
def random_search(num_architectures=50, layers_per_arch=4):
    """Sample and evaluate random architectures.

    Args:
        num_architectures: how many architectures to sample.
        layers_per_arch: number of operations in each sampled architecture.

    Returns:
        The ``evaluate_architecture`` result dicts, best accuracy first.
    """
    op_names = list(PRIMITIVES.keys())
    results = []

    print("Running Random Search...")
    for done in range(1, num_architectures + 1):
        # Each layer is drawn independently and uniformly from the primitives.
        sampled = [random.choice(op_names) for _ in range(layers_per_arch)]
        results.append(evaluate_architecture(Architecture(sampled)))

        # Progress report every ten evaluations.
        if done % 10 == 0:
            print(f"Evaluated {done}/{num_architectures} architectures")

    return sorted(results, key=lambda r: r['accuracy'], reverse=True)

# Run random search
# random_results is already sorted by accuracy (descending) by random_search.
random_results = random_search(num_architectures=50)

print("\nTop 5 Architectures (Random Search):")
for i, result in enumerate(random_results[:5]):
    # Parameter counts are shown in millions.
    print(f"{i+1}. Accuracy: {result['accuracy']:.3f}, "
          f"Params: {result['params']/1e6:.2f}M, "
          f"Arch: {result['architecture'].layers}")

## 4. Evolutionary Search

Evolutionary algorithms can be more efficient than random search.

In [None]:
class EvolutionarySearch:
    """Generational evolutionary algorithm for NAS.

    Args:
        population_size: number of architectures per generation.
        mutation_prob: per-layer mutation probability passed to
            ``Architecture.mutate``.
        layers_per_arch: number of operations in each initial architecture.
    """

    def __init__(self, population_size=20, mutation_prob=0.1, layers_per_arch=4):
        self.population_size = population_size
        self.mutation_prob = mutation_prob
        self.layers_per_arch = layers_per_arch

    def initialize_population(self):
        """Create the initial population of uniformly random architectures."""
        op_names = list(PRIMITIVES.keys())
        population = []
        for _ in range(self.population_size):
            layers = [random.choice(op_names) for _ in range(self.layers_per_arch)]
            population.append(Architecture(layers))
        return population

    def evaluate_population(self, population):
        """Return one ``evaluate_architecture`` result per architecture, in order."""
        return [evaluate_architecture(arch) for arch in population]

    def select_parents(self, population_results, num_parents, tournament_size=3):
        """Pick parents by tournament selection.

        For each parent, ``tournament_size`` distinct results are sampled and
        the one with the highest ``'accuracy'`` wins. The same architecture may
        win several tournaments.

        Args:
            population_results: result dicts with ``'accuracy'`` and
                ``'architecture'`` keys.
            num_parents: number of tournaments to run.
            tournament_size: contenders per tournament (default 3). It is
                capped at the number of results, because ``random.sample``
                raises ``ValueError`` when asked for more items than exist.

        Returns:
            List of ``num_parents`` winning architectures.
        """
        contenders = min(tournament_size, len(population_results))
        parents = []
        for _ in range(num_parents):
            tournament = random.sample(population_results, contenders)
            winner = max(tournament, key=lambda x: x['accuracy'])
            parents.append(winner['architecture'])
        return parents

    def evolve(self, num_generations=30):
        """Run the evolutionary search.

        Each generation is evaluated, summarised, and then fully replaced by
        offspring (no elitism). The offspring produced after the final
        generation are not evaluated.

        Returns:
            ``(all_results, history)``: every evaluation result from every
            generation, and a dict of per-generation ``'best_accuracy'``,
            ``'avg_accuracy'`` and ``'diversity'`` lists.
        """
        # Initialize
        population = self.initialize_population()
        history = {'best_accuracy': [], 'avg_accuracy': [], 'diversity': []}
        all_results = []

        print("Running Evolutionary Search...")
        for generation in range(num_generations):
            # Evaluate current population
            results = self.evaluate_population(population)
            all_results.extend(results)

            # Track statistics
            accuracies = [r['accuracy'] for r in results]
            history['best_accuracy'].append(max(accuracies))
            history['avg_accuracy'].append(np.mean(accuracies))

            # Diversity = fraction of distinct layer sequences in the population.
            unique_archs = len(set(str(r['architecture'].layers) for r in results))
            history['diversity'].append(unique_archs / len(results))

            # Select parents. Keep at least one so that a population of size 1
            # can still reproduce (by mutation) instead of failing on an empty
            # parent list.
            num_parents = max(1, self.population_size // 2)
            parents = self.select_parents(results, num_parents)

            # Generate offspring
            offspring = []
            while len(offspring) < self.population_size:
                if random.random() < 0.5 and len(parents) >= 2:
                    # Crossover
                    p1, p2 = random.sample(parents, 2)
                    child = p1.crossover(p2)
                else:
                    # Mutation
                    parent = random.choice(parents)
                    child = parent.mutate(self.mutation_prob)
                offspring.append(child)

            # Replace population
            population = offspring

            if (generation + 1) % 5 == 0:
                print(f"Generation {generation + 1}: "
                      f"Best Acc = {history['best_accuracy'][-1]:.3f}, "
                      f"Avg Acc = {history['avg_accuracy'][-1]:.3f}, "
                      f"Diversity = {history['diversity'][-1]:.2f}")

        return all_results, history

In [None]:
# Run evolutionary search
# 30 architectures per generation over 25 generations are evaluated in total.
evo_search = EvolutionarySearch(population_size=30, mutation_prob=0.2)
evo_results, evo_history = evo_search.evolve(num_generations=25)

# Sort results
# In-place sort: after this line evo_results is ordered by accuracy
# (descending), no longer by evaluation order.
evo_results.sort(key=lambda x: x['accuracy'], reverse=True)

print("\nTop 5 Architectures (Evolutionary Search):")
for i, result in enumerate(evo_results[:5]):
    print(f"{i+1}. Accuracy: {result['accuracy']:.3f}, "
          f"Params: {result['params']/1e6:.2f}M")

In [None]:
# Visualize evolutionary search progress
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Accuracy over generations; the shaded band spans average to best.
axes[0].plot(evo_history['best_accuracy'], label='Best', linewidth=2)
axes[0].plot(evo_history['avg_accuracy'], label='Average', linewidth=2)
axes[0].fill_between(range(len(evo_history['best_accuracy'])),
                     evo_history['avg_accuracy'],
                     evo_history['best_accuracy'],
                     alpha=0.3)
axes[0].set_xlabel('Generation')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Evolutionary Search Progress')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Diversity
axes[1].plot(evo_history['diversity'], linewidth=2, color='green')
axes[1].set_xlabel('Generation')
axes[1].set_ylabel('Population Diversity')
axes[1].set_title('Genetic Diversity Over Time')
axes[1].grid(True, alpha=0.3)

# Compare random vs evolutionary
# Use every evaluated architecture from both searches. evo_results is sorted
# by accuracy, so slicing it to len(random_results) would compare only the best
# evolutionary results against all random ones. The two searches evaluated
# different numbers of architectures, so plot densities rather than raw counts.
random_accs = [r['accuracy'] for r in random_results]
evo_accs = [r['accuracy'] for r in evo_results]

axes[2].hist([random_accs, evo_accs], bins=15, label=['Random', 'Evolutionary'],
             alpha=0.7, density=True)
axes[2].set_xlabel('Accuracy')
axes[2].set_ylabel('Density')
axes[2].set_title('Search Strategy Comparison')
axes[2].legend()
axes[2].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 5. Differentiable Architecture Search (DARTS)

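DARTS makes architecture search differentiable through a continuous relaxation: instead of picking one operation per edge, each edge computes a softmax-weighted mixture of all candidate operations, so the architecture weights can be learned by gradient descent.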

In [None]:
class MixedOperation(nn.Module):
    """Weighted mixture of every primitive operation (one DARTS edge).

    Holds one instance of each ``PRIMITIVES`` operation, in dictionary order.
    """

    def __init__(self, channels):
        super().__init__()
        candidates = [build(channels) for build in PRIMITIVES.values()]
        self.ops = nn.ModuleList(candidates)

    def forward(self, x, weights):
        """Return ``sum_k weights[k] * ops[k](x)``.

        ``weights`` holds one weight per operation; the caller in this file
        passes a softmax over the operations.
        """
        mixed = 0
        for w, op in zip(weights, self.ops):
            mixed = mixed + w * op(x)
        return mixed

class DARTSCell(nn.Module):
    """DARTS cell whose edge operations are chosen by learnable weights.

    Node 0 is the cell input. Every later node ``i`` sums one mixed operation
    applied to each earlier node ``j < i``. Edges are stored flat, ordered by
    ``i`` and then ``j``.

    Args:
        channels: channel count passed to every ``MixedOperation``.
        num_nodes: number of nodes including the input node (default 4).
    """

    def __init__(self, channels, num_nodes=4):
        super().__init__()
        self.num_nodes = num_nodes

        # One mixed operation per (j -> i) edge with j < i.
        edge_ops = [
            MixedOperation(channels)
            for i in range(num_nodes)
            for j in range(i)
        ]
        self.ops = nn.ModuleList(edge_ops)

        # Architecture parameters: one row of logits per edge, one column per
        # primitive, initialised with small random values.
        self.arch_params = nn.Parameter(
            torch.randn(len(self.ops), len(PRIMITIVES)) / 10
        )

    def forward(self, x):
        """Run the cell and concatenate all non-input nodes along dim 1."""
        states = [x]
        edge = 0

        for _node in range(1, self.num_nodes):
            # At this point `states` holds exactly the earlier nodes.
            incoming = []
            for source in states:
                alpha = F.softmax(self.arch_params[edge], dim=0)
                incoming.append(self.ops[edge](source, alpha))
                edge += 1
            states.append(sum(incoming))

        return torch.cat(states[1:], dim=1)

    def get_genotype(self):
        """Extract a discrete architecture from the current weights.

        For each non-input node, every incoming edge is reduced to its
        highest-weight operation, and the two edges with the largest such
        weight are kept.

        Returns:
            List of ``(operation_name, source_node_index)`` tuples.
        """
        op_names = list(PRIMITIVES.keys())
        gene = []
        edge = 0

        for node in range(1, self.num_nodes):
            candidates = []
            for source in range(node):
                probs = F.softmax(self.arch_params[edge], dim=0)
                best = probs.argmax().item()
                candidates.append((op_names[best], source, probs[best].item()))
                edge += 1

            # Keep the two strongest incoming edges for this node.
            strongest = sorted(candidates, key=lambda c: c[2], reverse=True)[:2]
            gene.extend((name, source) for name, source, _ in strongest)

        return gene

In [None]:
# Visualize DARTS architecture weights
# The cell is freshly constructed and never trained here, so the weights shown
# are just the random initialisation passed through a softmax.
darts_cell = DARTSCell(32, num_nodes=4)

# Get architecture weights
# Softmax over dim=1 normalises across operations for each edge.
arch_weights = F.softmax(darts_cell.arch_params, dim=1).detach().numpy()

# Create heatmap
# Transposed so that rows are operations and columns are edges.
plt.figure(figsize=(10, 6))
sns.heatmap(arch_weights.T, 
            xticklabels=[f'Edge {i}' for i in range(arch_weights.shape[0])],
            yticklabels=list(PRIMITIVES.keys()),
            cmap='YlOrRd',
            annot=True,
            fmt='.2f',
            cbar_kws={'label': 'Operation Weight'})

plt.xlabel('Cell Edges')
plt.ylabel('Operations')
plt.title('DARTS Architecture Weights (Before Training)')
plt.tight_layout()
plt.show()

# Extract genotype
# Each entry is (operation name, index of the source node).
# NOTE: the "Edge i" printed below is the position in the genotype list, which
# is not the same numbering as the heatmap columns above.
genotype = darts_cell.get_genotype()
print("\nExtracted Genotype:")
for i, (op, node) in enumerate(genotype):
    print(f"Edge {i}: {op} from node {node}")

## 6. Multi-Objective NAS

Often we need to optimize for multiple objectives like accuracy, latency, and model size.

In [None]:
def pareto_frontier(results, objectives=('accuracy', 'latency'), maximize=('accuracy',)):
    """Return the non-dominated entries of ``results``.

    A candidate is dominated when some other entry is at least as good on every
    objective and strictly better on at least one.

    Args:
        results: sequence of dict-like records holding a comparable value under
            each objective key.
        objectives: keys to compare on.
        maximize: objectives for which larger is better; every other objective
            is minimised. The default matches the original behaviour
            (maximise ``'accuracy'``, minimise everything else).

    Returns:
        List of the non-dominated records, in their original order. Records
        that tie on every objective do not dominate each other and are all
        kept.
    """
    objectives = tuple(objectives)
    larger_is_better = frozenset(maximize)

    def dominates(other, candidate):
        # "No worse everywhere" and "strictly better somewhere".
        no_worse = all(
            other[obj] >= candidate[obj] if obj in larger_is_better
            else other[obj] <= candidate[obj]
            for obj in objectives
        )
        if not no_worse:
            return False
        return any(
            other[obj] > candidate[obj] if obj in larger_is_better
            else other[obj] < candidate[obj]
            for obj in objectives
        )

    pareto_front = []
    for candidate in results:
        # Skip the candidate itself by identity. Dict equality would deep-compare
        # every field, which is slow and raises for array-valued fields.
        dominated = any(
            other is not candidate and dominates(other, candidate)
            for other in results
        )
        if not dominated:
            pareto_front.append(candidate)

    return pareto_front

# Combine all results
# Pool the evaluations from both searches into one candidate set.
all_results = random_results + evo_results

# Find Pareto optimal architectures
# Uses the function's default objectives: accuracy and latency.
pareto_architectures = pareto_frontier(all_results)

In [None]:
# Visualize multi-objective optimization
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Accuracy vs Latency
# 'latency' is the simulated value produced by evaluate_architecture, not a
# measurement.
all_acc = [r['accuracy'] for r in all_results]
all_lat = [r['latency'] for r in all_results]
pareto_acc = [r['accuracy'] for r in pareto_architectures]
pareto_lat = [r['latency'] for r in pareto_architectures]

axes[0].scatter(all_lat, all_acc, alpha=0.4, label='All architectures', s=50)
axes[0].scatter(pareto_lat, pareto_acc, color='red', s=100, 
               label='Pareto optimal', edgecolors='black', linewidth=2)

# Connect Pareto front
# Sort by latency so the dashed line is drawn left to right.
pareto_sorted = sorted(zip(pareto_lat, pareto_acc))
if pareto_sorted:
    pareto_lat_sorted, pareto_acc_sorted = zip(*pareto_sorted)
    axes[0].plot(pareto_lat_sorted, pareto_acc_sorted, 'r--', alpha=0.5)

axes[0].set_xlabel('Latency (ms)')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Accuracy vs Latency Trade-off')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Accuracy vs Parameters
# NOTE(review): the highlighted points are the accuracy/latency Pareto set
# computed earlier; no separate front is computed for accuracy/parameters.
all_params = [r['params']/1e6 for r in all_results]
pareto_params = [r['params']/1e6 for r in pareto_architectures]

axes[1].scatter(all_params, all_acc, alpha=0.4, label='All architectures', s=50)
axes[1].scatter(pareto_params, pareto_acc, color='red', s=100, 
               label='Pareto optimal', edgecolors='black', linewidth=2)

axes[1].set_xlabel('Parameters (M)')
axes[1].set_ylabel('Accuracy')
axes[1].set_title('Accuracy vs Model Size Trade-off')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Found {len(pareto_architectures)} Pareto optimal architectures out of {len(all_results)} total")
print("\nTop Pareto Architectures:")
# Shows the first five entries in list order; the list is not re-sorted here.
for i, arch in enumerate(pareto_architectures[:5]):
    print(f"{i+1}. Acc: {arch['accuracy']:.3f}, Latency: {arch['latency']:.2f}ms, "
          f"Params: {arch['params']/1e6:.2f}M")

## 7. Architecture Analysis

Let's analyze what makes good architectures.

In [None]:
# Analyze operation frequency in top architectures
top_n = 20
top_architectures = sorted(all_results, key=lambda x: x['accuracy'], reverse=True)[:top_n]

# Count operations: how often each one appears and at which layer positions.
op_counts = defaultdict(int)
op_positions = defaultdict(list)

for result in top_architectures:
    for pos, op in enumerate(result['architecture'].layers):
        op_counts[op] += 1
        op_positions[op].append(pos)

# Visualize operation analysis
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Operation frequency
ops = list(op_counts.keys())
counts = list(op_counts.values())
colors = plt.cm.viridis(np.linspace(0, 1, len(ops)))

bars = axes[0].bar(ops, counts, color=colors)
axes[0].set_xlabel('Operation')
axes[0].set_ylabel('Frequency in Top Architectures')
axes[0].set_title(f'Operation Popularity (Top {top_n} Architectures)')
axes[0].tick_params(axis='x', rotation=45)

# Add value labels just above each bar
for bar, count in zip(bars, counts):
    axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                str(count), ha='center', va='bottom')

# Operation position preference
# Every op in `ops` was counted at least once, so it has a non-empty position
# list; building the data directly from `ops` keeps boxes and labels aligned.
position_data = [op_positions[op] for op in ops]

# Set the tick labels separately instead of passing `labels=` to boxplot: that
# keyword is deprecated since Matplotlib 3.9 (renamed `tick_labels`), whereas
# set_xticklabels works on both older and newer versions.
axes[1].boxplot(position_data)
axes[1].set_xticklabels(ops)
axes[1].set_xlabel('Operation')
axes[1].set_ylabel('Position in Architecture')
axes[1].set_title('Operation Position Preferences')
axes[1].tick_params(axis='x', rotation=45)
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 8. Hardware-Aware NAS

Real-world NAS often needs to consider specific hardware constraints.

In [None]:
# Hardware performance lookup table (simulated)
# HARDWARE_LATENCY[target][operation] -> latency contribution of one layer.
# Both targets list exactly the operation names defined in PRIMITIVES.
# The numbers are illustrative constants, not measurements.
HARDWARE_LATENCY = {
    'mobile': {
        'conv3x3': 2.0,
        'conv5x5': 5.0,
        'dw_conv3x3': 1.0,
        'dw_conv5x5': 2.0,
        'max_pool3x3': 0.5,
        'avg_pool3x3': 0.5,
        'identity': 0.1
    },
    'gpu': {
        'conv3x3': 0.5,
        'conv5x5': 0.8,
        'dw_conv3x3': 0.6,
        'dw_conv5x5': 1.0,
        'max_pool3x3': 0.2,
        'avg_pool3x3': 0.2,
        'identity': 0.05
    }
}

def hardware_aware_evaluate(architecture, target_hardware='mobile'):
    """Evaluate an architecture and attach a hardware-specific latency.

    Args:
        architecture: architecture to evaluate.
        target_hardware: key of ``HARDWARE_LATENCY`` (default ``'mobile'``).

    Returns:
        The ``evaluate_architecture`` result dict extended with
        ``'hardware_latency'`` (sum of the per-operation table entries) and
        ``'hardware'`` (the target name). The base evaluation is recomputed on
        every call.
    """
    result = evaluate_architecture(architecture)

    # Total latency is the sum of the table entries for each layer.
    result['hardware_latency'] = sum(
        HARDWARE_LATENCY[target_hardware][op] for op in architecture.layers
    )
    result['hardware'] = target_hardware

    return result

# Evaluate top architectures on different hardware
hardware_results = {'mobile': [], 'gpu': []}

for result in top_architectures[:10]:
    arch = result['architecture']
    for hardware in ['mobile', 'gpu']:
        hw_result = hardware_aware_evaluate(arch, hardware)
        hardware_results[hardware].append(hw_result)

# Visualize hardware comparison
fig, ax = plt.subplots(figsize=(10, 6))

mobile_latencies = [r['hardware_latency'] for r in hardware_results['mobile']]
gpu_latencies = [r['hardware_latency'] for r in hardware_results['gpu']]
# Accuracy comes from the 'mobile' evaluation runs; each call to
# hardware_aware_evaluate re-evaluates the architecture.
accuracies = [r['accuracy'] for r in hardware_results['mobile']]

x = np.arange(len(mobile_latencies))
width = 0.35

# Side-by-side bars per architecture: mobile on the left, GPU on the right.
bars1 = ax.bar(x - width/2, mobile_latencies, width, label='Mobile', alpha=0.8)
bars2 = ax.bar(x + width/2, gpu_latencies, width, label='GPU', alpha=0.8)

# Add accuracy as line on a secondary y-axis
ax2 = ax.twinx()
ax2.plot(x, accuracies, 'ro-', linewidth=2, markersize=8, label='Accuracy')
ax2.set_ylabel('Accuracy', color='red')
ax2.tick_params(axis='y', labelcolor='red')

ax.set_xlabel('Architecture Index')
ax.set_ylabel('Latency (ms)')
ax.set_title('Hardware-Aware Architecture Evaluation')
ax.set_xticks(x)
# ax.legend() alone only sees artists on `ax`, so merge in the handles from
# the twin axis; otherwise the labelled accuracy line is missing from the legend.
bar_handles, bar_labels = ax.get_legend_handles_labels()
acc_handles, acc_labels = ax2.get_legend_handles_labels()
ax.legend(bar_handles + acc_handles, bar_labels + acc_labels, loc='upper left')
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print("Hardware-Specific Insights:")
print(f"Average mobile latency: {np.mean(mobile_latencies):.2f}ms")
print(f"Average GPU latency: {np.mean(gpu_latencies):.2f}ms")
print(f"Mobile/GPU latency ratio: {np.mean(mobile_latencies)/np.mean(gpu_latencies):.2f}x")

## Summary

Let's summarize the key concepts and best practices for Neural Architecture Search.

In [None]:
# Create summary visualization
# A text-only figure: axes are hidden and everything is placed in axes
# coordinates (0..1).
fig, ax = plt.subplots(figsize=(12, 8))
ax.axis('off')

# Title
ax.text(0.5, 0.95, 'Neural Architecture Search Summary', 
        ha='center', fontsize=20, fontweight='bold')

# Method comparison table
# First row is the header; the ratings are the author's qualitative summary.
# 'Predictor-based' is listed for comparison but not implemented in this notebook.
methods_data = [
    ['Method', 'Search Efficiency', 'Quality', 'Hardware Aware', 'Complexity'],
    ['Random Search', '⭐', '⭐⭐', '✓', '⭐'],
    ['Evolutionary', '⭐⭐', '⭐⭐⭐', '✓', '⭐⭐'],
    ['DARTS', '⭐⭐⭐⭐', '⭐⭐⭐⭐', '✗', '⭐⭐⭐⭐'],
    ['Predictor-based', '⭐⭐⭐⭐⭐', '⭐⭐⭐', '✓', '⭐⭐⭐']
]

# Create table
table = ax.table(cellText=methods_data[1:], colLabels=methods_data[0],
                cellLoc='center', loc='center',
                bbox=[0.1, 0.5, 0.8, 0.35])
table.auto_set_font_size(False)
table.set_fontsize(11)
table.scale(1, 2)

# Style header
# Row 0 of the table holds the column labels.
for i in range(len(methods_data[0])):
    table[(0, i)].set_facecolor('#3498db')
    table[(0, i)].set_text_props(weight='bold', color='white')

# Key insights
insights = [
    "• Start with random search as baseline",
    "• Use evolutionary algorithms for discrete search spaces",
    "• DARTS is efficient but memory-intensive",
    "• Consider hardware constraints early",
    "• Multi-objective optimization is often necessary",
    "• Use performance predictors to reduce search cost"
]

# One line of text per insight, stacked downward below the heading.
ax.text(0.1, 0.35, 'Key Insights:', fontsize=14, fontweight='bold')
for i, insight in enumerate(insights):
    ax.text(0.1, 0.30 - i*0.04, insight, fontsize=11)

plt.tight_layout()
plt.show()

print("NAS Best Practices:")
print("1. Define clear objectives (accuracy, latency, size)")
print("2. Design appropriate search space for your problem")
print("3. Use early stopping and performance prediction")
print("4. Consider hardware constraints from the beginning")
print("5. Validate discovered architectures thoroughly")
print("\nNAS is a powerful tool for automating deep learning design!")