# Tutorial 17: Model Optimization Techniques

This tutorial covers advanced model optimization techniques including quantization, pruning, knowledge distillation, and model compression.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.quantization import quantize_dynamic, quantize_fx
import torch.nn.utils.prune as prune
import numpy as np
import time
import copy
from typing import Tuple, List, Dict
import matplotlib.pyplot as plt
import seaborn as sns

# Give every figure in the notebook the same look
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Pick the GPU when CUDA reports one, otherwise stay on the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

## 1. Model Quantization

Quantization reduces the numerical precision of weights and activations, significantly reducing model size and improving inference speed.

In [None]:
# Define a simple model for demonstration
class SimpleModel(nn.Module):
    """Three-layer fully connected network used as the optimization baseline.

    Layer widths are ``input_size -> hidden_size -> hidden_size // 2 ->
    num_classes``; a ReLU follows each of the first two linear layers and the
    last layer's output is returned as-is.
    """

    def __init__(self, input_size=784, hidden_size=256, num_classes=10):
        super().__init__()
        half_hidden = hidden_size // 2
        # Attribute names and creation order are kept stable so state_dict
        # keys and seeded initialisation do not change.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, half_hidden)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(half_hidden, num_classes)

    def forward(self, x):
        """Run ``x`` through fc1 -> ReLU -> fc2 -> ReLU -> fc3."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)

In [None]:
# Helper function to evaluate model
def evaluate_model(model, data_loader=None, input_shape=(100, 784),
                   num_iterations=100, warmup_iterations=10):
    """Measure a model's serialized size and its average inference latency.

    Args:
        model: Module to benchmark. It is switched to eval mode as a side effect.
        data_loader: Unused; kept only so existing calls keep working. This
            helper does not compute accuracy.
        input_shape: Shape of the random batch fed to the model. The batch is
            created on the CPU.
        num_iterations: Number of timed forward passes (must be >= 1).
        warmup_iterations: Untimed forward passes executed before timing.

    Returns:
        ``(model_size, inference_time)`` where ``model_size`` is the size of
        the serialized ``state_dict`` in MB and ``inference_time`` is the mean
        time per forward pass in milliseconds.

    Raises:
        ValueError: If ``num_iterations`` is smaller than 1.
    """
    # Imported locally so the helper keeps working even if notebook code has
    # rebound the global name `time` (for example as a loop variable).
    import io
    from time import perf_counter

    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")

    model.eval()

    # Model size: serialize the state_dict instead of summing parameters() and
    # buffers(). Quantized layers keep their weights in packed objects that
    # appear in neither iterator, so the summed size would come out as 0 MB.
    serialized = io.BytesIO()
    torch.save(model.state_dict(), serialized)
    model_size = serialized.getbuffer().nbytes / 1024 / 1024  # MB

    # Speed test on a random batch
    dummy_input = torch.randn(*input_shape)

    with torch.no_grad():
        # Warmup passes are excluded from the measurement
        for _ in range(warmup_iterations):
            model(dummy_input)

        start_time = perf_counter()
        for _ in range(num_iterations):
            model(dummy_input)
        elapsed = perf_counter() - start_time

    inference_time = elapsed / num_iterations * 1000  # ms

    return model_size, inference_time

In [None]:
# Compare different quantization methods
original_model = SimpleModel()
original_size, original_time = evaluate_model(original_model)

print("Model Quantization Comparison")
print("=" * 50)
print("Original Model:")
print(f"  Size: {original_size:.2f} MB")
print(f"  Inference: {original_time:.2f} ms")

# Dynamic Quantization
# A deep copy is quantized so `original_model` stays available in float form.
dynamic_quantized_model = quantize_dynamic(
    copy.deepcopy(original_model),
    {nn.Linear},
    dtype=torch.qint8
)
dq_size, dq_time = evaluate_model(dynamic_quantized_model)

# Guard the ratios: a measured size of 0 MB is possible when the size is
# derived from parameters()/buffers(), which quantized layers do not populate,
# and must not abort the report with a ZeroDivisionError.
dq_size_note = f"{original_size/dq_size:.2f}x smaller" if dq_size > 0 else "ratio unavailable"
dq_time_note = f"{original_time/dq_time:.2f}x speedup" if dq_time > 0 else "ratio unavailable"

print("\nDynamic Quantization:")
print(f"  Size: {dq_size:.2f} MB ({dq_size_note})")
print(f"  Inference: {dq_time:.2f} ms ({dq_time_note})")

In [None]:
# Static Quantization (with calibration)
class QuantizableModel(nn.Module):
    """Same layer stack as ``SimpleModel``, wrapped in quant/dequant stubs.

    The input passes through a ``QuantStub`` first and the output through a
    ``DeQuantStub`` last, which marks the region to be converted by static
    quantization.
    """

    def __init__(self, input_size=784, hidden_size=256, num_classes=10):
        super().__init__()
        half_hidden = hidden_size // 2
        # Creation order is preserved so module ordering and seeded
        # initialisation stay the same.
        self.quant = torch.quantization.QuantStub()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, half_hidden)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(half_hidden, num_classes)
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        """Apply quant -> fc1 -> ReLU -> fc2 -> ReLU -> fc3 -> dequant."""
        out = self.quant(x)
        out = self.relu1(self.fc1(out))
        out = self.relu2(self.fc2(out))
        out = self.fc3(out)
        return self.dequant(out)

# Prepare for static quantization
static_model = QuantizableModel()
static_model.eval()

# Set quantization config
static_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(static_model, inplace=True)

# Calibrate with representative data
# (random inputs here; batches of 100 are pushed through the prepared model
# so its observers can record activation ranges)
calibration_data = torch.randn(1000, 784)
with torch.no_grad():
    for i in range(0, len(calibration_data), 100):
        static_model(calibration_data[i:i+100])

# Convert to quantized model
torch.quantization.convert(static_model, inplace=True)

sq_size, sq_time = evaluate_model(static_model)

# Guard the ratios: a measured size of 0 MB is possible when the size is
# derived from parameters()/buffers(), which quantized layers do not populate,
# and must not abort the report with a ZeroDivisionError.
sq_size_note = f"{original_size/sq_size:.2f}x smaller" if sq_size > 0 else "ratio unavailable"
sq_time_note = f"{original_time/sq_time:.2f}x speedup" if sq_time > 0 else "ratio unavailable"

print("\nStatic Quantization:")
print(f"  Size: {sq_size:.2f} MB ({sq_size_note})")
print(f"  Inference: {sq_time:.2f} ms ({sq_time_note})")

In [None]:
# Visualize quantization comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Model sizes
models = ['Original', 'Dynamic\nQuantized', 'Static\nQuantized']
sizes = [original_size, dq_size, sq_size]
colors = ['#3498db', '#2ecc71', '#e74c3c']

bars1 = ax1.bar(models, sizes, color=colors)
ax1.set_ylabel('Model Size (MB)')
ax1.set_title('Model Size Comparison')
ax1.set_ylim(0, max(sizes) * 1.2)

# Add value labels just above each bar
for bar, size in zip(bars1, sizes):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f'{size:.2f} MB', ha='center', va='bottom')

# Inference times
times = [original_time, dq_time, sq_time]
bars2 = ax2.bar(models, times, color=colors)
ax2.set_ylabel('Inference Time (ms)')
ax2.set_title('Inference Speed Comparison')
ax2.set_ylim(0, max(times) * 1.2)

# Add value labels just above each bar.
# The loop variable must not be called `time`: at notebook scope that would
# rebind the imported `time` module and break every later `time.time()` call.
for bar, elapsed_ms in zip(bars2, times):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f'{elapsed_ms:.2f} ms', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 2. Network Pruning

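Pruning removes unnecessary connections or neurons from the network, creating sparse models. Note that `torch.nn.utils.prune` implements pruning by masking: pruned weights are set to zero, but the tensors keep their original shape, so the stored model does not become smaller unless the zeros are later exploited by a sparse format or removed structurally.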

In [None]:
def count_parameters(model):
    """Count total and effectively non-zero parameters.

    Layers pruned with ``torch.nn.utils.prune`` expose the unmasked tensor as
    the parameter ``<name>_orig`` and keep the binary mask in the buffer
    ``<name>_mask``. The mask is applied here so that pruned entries are
    counted as zeros; without it a pruned model would always report 0%
    sparsity.

    Args:
        model: Module whose parameters are counted.

    Returns:
        ``(total_params, nonzero_params)`` as Python ints.
    """
    masks = {
        name: buffer
        for name, buffer in model.named_buffers()
        if name.endswith('_mask')
    }

    total_params = 0
    nonzero_params = 0

    for name, param in model.named_parameters():
        total_params += param.numel()

        values = param.detach()
        if name.endswith('_orig'):
            # Pair "<prefix>_orig" with "<prefix>_mask" when the mask exists
            mask = masks.get(name[:-len('_orig')] + '_mask')
            if mask is not None:
                values = values * mask

        nonzero_params += torch.count_nonzero(values).item()

    return total_params, nonzero_params

def visualize_weight_distribution(model, title="Weight Distribution"):
    """Plot a histogram of all weight values of the model.

    Every parameter whose name contains ``'weight'`` is included. For layers
    pruned with ``torch.nn.utils.prune`` the parameter is ``<name>_orig``
    (the unmasked tensor), so the matching ``<name>_mask`` buffer is applied
    first; otherwise the plot of a pruned model would still show the
    unpruned values.

    Args:
        model: Module whose weights are plotted.
        title: Figure title.
    """
    masks = {
        name: buffer
        for name, buffer in model.named_buffers()
        if name.endswith('_mask')
    }

    chunks = []
    for name, param in model.named_parameters():
        if 'weight' not in name:
            continue
        values = param.detach()
        if name.endswith('_orig'):
            mask = masks.get(name[:-len('_orig')] + '_mask')
            if mask is not None:
                values = values * mask
        chunks.append(values.cpu().numpy().ravel())

    # One concatenation instead of extending a Python list value by value
    weights = np.concatenate(chunks) if chunks else np.array([])

    plt.figure(figsize=(10, 6))
    plt.hist(weights, bins=100, alpha=0.7, color='blue', edgecolor='black')
    plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
    plt.xlabel('Weight Value')
    plt.ylabel('Count')
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.show()

In [None]:
# Unstructured Pruning
pruning_model = SimpleModel()

print("Unstructured Pruning")
print("=" * 50)

# Statistics of the freshly initialised model
total_params, nonzero_params = count_parameters(pruning_model)
sparsity_pct = (1 - nonzero_params/total_params)*100
print("Before pruning:")
print(f"  Total parameters: {total_params:,}")
print(f"  Non-zero parameters: {nonzero_params:,}")
print(f"  Sparsity: {sparsity_pct:.1f}%")

# Visualize original weights
visualize_weight_distribution(pruning_model, "Original Weight Distribution")

# Apply L1 unstructured pruning to the weight of every linear layer
linear_layers = [m for m in pruning_model.modules() if isinstance(m, nn.Linear)]
for module in linear_layers:
    prune.l1_unstructured(module, name='weight', amount=0.5)

# Same statistics after pruning
total_params, nonzero_params = count_parameters(pruning_model)
sparsity_pct = (1 - nonzero_params/total_params)*100
print("\nAfter 50% pruning:")
print(f"  Total parameters: {total_params:,}")
print(f"  Non-zero parameters: {nonzero_params:,}")
print(f"  Sparsity: {sparsity_pct:.1f}%")

# Visualize pruned weights
visualize_weight_distribution(pruning_model, "Pruned Weight Distribution (50% sparsity)")

In [None]:
# Different pruning methods comparison
# torch.nn.utils.prune provides no unstructured L2 method (there is no
# `prune.L2Unstructured`). The L2 criterion is only available for structured
# pruning through `ln_structured(n=2)`, which zeroes whole output rows, so
# that is what the L2 entry uses.
pruning_methods = {
    'L1': lambda module, amount: prune.l1_unstructured(
        module, name='weight', amount=amount),
    'L2 (structured)': lambda module, amount: prune.ln_structured(
        module, name='weight', amount=amount, n=2, dim=0),
    'Random': lambda module, amount: prune.random_unstructured(
        module, name='weight', amount=amount),
}

pruning_amounts = [0.1, 0.3, 0.5, 0.7, 0.9]
results = {method: [] for method in pruning_methods}

for method_name, apply_method in pruning_methods.items():
    for amount in pruning_amounts:
        # Create fresh model so pruning amounts do not accumulate
        model = SimpleModel()

        # Apply pruning to the weight of every linear layer
        for name, module in model.named_modules():
            if isinstance(module, nn.Linear):
                apply_method(module, amount)

        # Count parameters
        total, nonzero = count_parameters(model)
        sparsity = 1 - (nonzero / total)
        results[method_name].append(sparsity)

# Plot comparison
plt.figure(figsize=(10, 6))
for method_name, sparsities in results.items():
    plt.plot([a*100 for a in pruning_amounts],
             [s*100 for s in sparsities],
             marker='o', linewidth=2, label=method_name)

plt.xlabel('Pruning Amount (%)')
plt.ylabel('Actual Sparsity (%)')
plt.title('Pruning Methods Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Structured Pruning (channel/filter pruning)
structured_model = SimpleModel()

print("\nStructured Pruning")
print("=" * 50)

linear_layers = [
    (name, module)
    for name, module in structured_model.named_modules()
    if isinstance(module, nn.Linear)
]

# Get original dimensions
for name, module in linear_layers:
    print(f"{name}: {module.weight.shape}")

# Apply structured pruning
for name, module in linear_layers:
    # Prune output channels (neurons): rows with the smallest L2 norm
    prune.ln_structured(module, name='weight', amount=0.3, n=2, dim=0)

print("\nAfter structured pruning (30% of neurons):")
# Note: Structured pruning doesn't change tensor dimensions,
# it zeros out entire rows/columns. Report how many rows were zeroed so the
# effect is actually visible in the output.
for name, module in linear_layers:
    zeroed_rows = int((module.weight.abs().sum(dim=1) == 0).sum())
    print(f"{name}: {module.weight.shape} "
          f"({zeroed_rows}/{module.weight.shape[0]} output rows zeroed)")

## 3. Knowledge Distillation

Knowledge distillation transfers knowledge from a large teacher model to a smaller student model.

In [None]:
class TeacherModel(nn.Module):
    """Larger four-layer network that plays the teacher in distillation.

    Widths are ``input_size -> hidden_size -> hidden_size -> hidden_size // 2
    -> num_classes``; ReLU follows every layer except the last.
    """

    def __init__(self, input_size=784, hidden_size=512, num_classes=10):
        super().__init__()
        half_hidden = hidden_size // 2
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, half_hidden)
        self.fc4 = nn.Linear(half_hidden, num_classes)

    def forward(self, x):
        """Return the output of fc4 after three ReLU-activated hidden layers."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden_layer(x))
        return self.fc4(x)

class StudentModel(nn.Module):
    """Compact two-layer network that plays the student in distillation.

    Widths are ``input_size -> hidden_size -> num_classes`` with a single ReLU
    between the two linear layers.
    """

    def __init__(self, input_size=784, hidden_size=128, num_classes=10):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the output of fc2 applied to the ReLU-activated fc1 output."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

In [None]:
def distillation_loss(student_outputs, teacher_outputs, labels, temperature=4.0, alpha=0.7):
    """Combine a soft-target and a hard-target loss for distillation.

    Args:
        student_outputs: Student logits; softmax is taken over dim 1.
        teacher_outputs: Teacher logits; softmax is taken over dim 1.
        labels: Target class indices passed to ``F.cross_entropy``.
        temperature: Divisor applied to both sets of logits for the soft loss.
        alpha: Weight of the soft loss; the hard loss gets ``1 - alpha``.

    Returns:
        ``(combined_loss, soft_loss, hard_loss)``.
    """
    # Soft part: KL divergence between the temperature-softened distributions,
    # multiplied by temperature ** 2.
    student_log_probs = F.log_softmax(student_outputs / temperature, dim=1)
    teacher_probs = F.softmax(teacher_outputs / temperature, dim=1)
    kl_term = F.kl_div(student_log_probs, teacher_probs, reduction='batchmean')
    soft_loss = kl_term * (temperature ** 2)

    # Hard part: ordinary cross-entropy on the unscaled student logits
    hard_loss = F.cross_entropy(student_outputs, labels)

    combined = alpha * soft_loss + (1 - alpha) * hard_loss
    return combined, soft_loss, hard_loss

# Create teacher and student models (teacher first, then student)
teacher, student = TeacherModel(), StudentModel()

# Parameter counts of both networks
teacher_params = sum(map(torch.numel, teacher.parameters()))
student_params = sum(map(torch.numel, student.parameters()))
size_ratio = teacher_params/student_params

print("Knowledge Distillation")
print("=" * 50)
print(f"Teacher parameters: {teacher_params:,}")
print(f"Student parameters: {student_params:,} ({size_ratio:.1f}x smaller)")
print(f"Compression ratio: {size_ratio:.1f}x")

In [None]:
# Simulate distillation training
def train_with_distillation(teacher, student, num_epochs=10, temperature=4.0, alpha=0.7):
    """Run a simulated distillation loop on random data.

    Each "epoch" is a single optimisation step on one freshly generated batch
    of 128 random inputs (784 features) with random labels in ``[0, 10)``.
    The teacher is put in eval mode and only queried under ``no_grad``; the
    student is put in train mode and updated with Adam (lr=0.001).

    Args:
        teacher: Model providing the soft targets.
        student: Model being trained.
        num_epochs: Number of optimisation steps.
        temperature: Passed through to ``distillation_loss``.
        alpha: Passed through to ``distillation_loss``.

    Returns:
        Dict with keys ``'total'``, ``'soft'`` and ``'hard'``, each a list of
        per-step loss values as Python floats.
    """
    batch_size = 128
    history = {'total': [], 'soft': [], 'hard': []}

    teacher.eval()  # Teacher in eval mode
    student.train()
    optimizer = optim.Adam(student.parameters(), lr=0.001)

    for _ in range(num_epochs):
        # Dummy batch: inputs first, then labels (keeps RNG order stable)
        x = torch.randn(batch_size, 784)
        labels = torch.randint(0, 10, (batch_size,))

        # The teacher only supplies targets, so no gradients are tracked
        with torch.no_grad():
            teacher_outputs = teacher(x)

        student_outputs = student(x)
        loss, soft_loss, hard_loss = distillation_loss(
            student_outputs, teacher_outputs, labels, temperature, alpha
        )

        # Update student
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step_losses = (('total', loss), ('soft', soft_loss), ('hard', hard_loss))
        for key, value in step_losses:
            history[key].append(value.item())

    return history

# Train with different temperatures
# A fresh teacher/student pair is created for every temperature so the runs
# do not influence each other.
temperatures = [1, 4, 10, 20]
all_losses = dict()

for temp in temperatures:
    teacher_model, student_model = TeacherModel(), StudentModel()
    losses = train_with_distillation(
        teacher_model, student_model, temperature=temp, num_epochs=50
    )
    all_losses[temp] = losses

In [None]:
# Visualize distillation results: one panel per temperature
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

# (history key, legend label, extra line style) for each curve in a panel
loss_curves = [
    ('total', 'Total Loss', {}),
    ('soft', 'Soft Loss', {'linestyle': '--'}),
    ('hard', 'Hard Loss', {'linestyle': ':'}),
]

for idx, (temp, losses) in enumerate(all_losses.items()):
    ax = axes[idx]
    epochs = range(len(losses['total']))

    for key, label, extra_style in loss_curves:
        ax.plot(epochs, losses[key], label=label, linewidth=2, **extra_style)

    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.set_title(f'Temperature = {temp}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Effect of temperature on soft targets: the same logits are pushed through
# softmax at several temperatures and plotted together
plt.figure(figsize=(10, 6))
logits = torch.tensor([2.0, 1.0, 0.1, -1.0, -2.0])
temps = [0.5, 1, 2, 5, 10, 20]

for temp in temps:
    scaled_logits = logits / temp
    probs = F.softmax(scaled_logits, dim=0)
    plt.plot(probs.numpy(), marker='o', label=f'T={temp}')

plt.xlabel('Class')
plt.ylabel('Probability')
plt.title('Effect of Temperature on Softmax Distribution')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 4. Advanced Compression Techniques

Let's explore more advanced compression techniques like low-rank factorization and weight clustering.

In [None]:
# Low-rank factorization
class LowRankLinear(nn.Module):
    """Linear layer expressed as two smaller linear maps through ``rank`` units.

    ``U`` maps ``in_features -> rank`` without bias and ``V`` maps
    ``rank -> out_features`` with bias.
    """

    def __init__(self, in_features, out_features, rank):
        super().__init__()
        self.U = nn.Linear(in_features, rank, bias=False)
        self.V = nn.Linear(rank, out_features, bias=True)

    def forward(self, x):
        """Apply ``U`` and then ``V`` to ``x``."""
        projected = self.U(x)
        return self.V(projected)

# Compare original vs low-rank
def compare_lowrank_compression(in_features, out_features, ranks):
    """Compare parameter counts of a dense layer and its low-rank variants.

    Args:
        in_features: Input width of the layer.
        out_features: Output width of the layer.
        ranks: Iterable of ranks to evaluate.

    Returns:
        List of ``(label, parameter_count, compression)`` tuples. The first
        entry is ``('Original', <dense count>, 1.0)``; each following entry is
        ``('Rank-<r>', <count>, dense_count / count)``.
    """
    def param_count(layer):
        return sum(p.numel() for p in layer.parameters())

    # Dense reference layer
    orig_params = param_count(nn.Linear(in_features, out_features))
    rows = [('Original', orig_params, 1.0)]

    # One factorised layer per requested rank
    for rank in ranks:
        lr_params = param_count(LowRankLinear(in_features, out_features, rank))
        rows.append((f'Rank-{rank}', lr_params, orig_params / lr_params))

    return rows

# Test different configurations
in_features, out_features = 512, 512
ranks = [64, 128, 256]
results = compare_lowrank_compression(in_features, out_features, ranks)

print("Low-Rank Factorization")
print("=" * 60)
print(f"Original: {in_features} × {out_features} = {in_features * out_features:,} weights")
print()
print(f"{'Method':<15} {'Parameters':<15} {'Compression':<15}")
print("-" * 45)
for method, params, compression in results:
    # Attach the unit before padding; padding the bare number first would
    # print the "x" a full column away from its value.
    compression_text = f"{compression:.1f}x"
    print(f"{method:<15} {params:<15,} {compression_text:<15}")

In [None]:
# Weight clustering visualization
from sklearn.cluster import KMeans

def apply_weight_clustering(weight, num_clusters=16):
    """Replace every weight by the centre of its k-means cluster.

    Args:
        weight: Tensor of weights. Tensors that require grad (for example
            ``nn.Parameter``) are accepted; they are detached before being
            converted to NumPy.
        num_clusters: Number of k-means clusters, i.e. the maximum number of
            distinct values in the result.

    Returns:
        ``(clustered_weight, kmeans)`` where ``clustered_weight`` has the same
        shape, dtype and device as ``weight`` and ``kmeans`` is the fitted
        ``KMeans`` object.
    """
    # Flatten weights into a single-feature column; detach() is required
    # because numpy() refuses tensors that track gradients.
    w_column = weight.detach().cpu().numpy().reshape(-1, 1)

    # Cluster weights
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    kmeans.fit(w_column)

    # Replace weights with cluster centers
    clustered = kmeans.cluster_centers_[kmeans.labels_].reshape(tuple(weight.shape))

    # Return the result in the input's dtype and on the input's device rather
    # than whatever dtype the clustering produced on the CPU.
    clustered_weight = torch.as_tensor(
        clustered, dtype=weight.dtype, device=weight.device
    )

    return clustered_weight, kmeans

# Demonstrate weight clustering on a random weight matrix
example_weight = torch.randn(256, 128)
num_clusters_list = [4, 8, 16, 32]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# First panel: original weights
original_ax = axes[0]
original_unique = len(torch.unique(example_weight))
original_ax.hist(example_weight.flatten().numpy(), bins=50, alpha=0.7)
original_ax.set_title(f'Original Weights\n{original_unique} unique values')
original_ax.set_xlabel('Weight Value')
original_ax.set_ylabel('Count')

# Following panels: one per cluster count
for panel, num_clusters in enumerate(num_clusters_list, start=1):
    clustered_weight, kmeans = apply_weight_clustering(example_weight, num_clusters)
    unique_count = len(torch.unique(clustered_weight))

    ax = axes[panel]
    ax.hist(clustered_weight.flatten().numpy(), bins=50, alpha=0.7)
    ax.set_title(f'{num_clusters} Clusters\n{unique_count} unique values')
    ax.set_xlabel('Weight Value')
    ax.set_ylabel('Count')

    # Mark cluster centers with dashed vertical lines
    for center in kmeans.cluster_centers_:
        ax.axvline(x=center[0], color='red', linestyle='--', alpha=0.5)

# The 2x3 grid has six panels but only five are used; drop the last one
axes[-1].remove()

plt.tight_layout()
plt.show()

## 5. Complete Optimization Pipeline

Let's create a complete optimization pipeline that combines multiple techniques.

In [None]:
class OptimizationPipeline:
    """Prune-then-quantize pipeline that records metrics after each stage.

    The model passed in is deep-copied twice: an untouched reference copy
    (``original_model``) and a working copy (``optimized_model``) that pruning
    and quantization are applied to. Metrics are stored in ``results``, keyed
    by stage name (``'original'``, ``'pruned'``, ``'quantized'``).
    """

    # Bar colours, one per stage in insertion order
    _STAGE_COLORS = ['#3498db', '#e74c3c', '#2ecc71']

    def __init__(self, model):
        self.original_model = copy.deepcopy(model)
        self.optimized_model = copy.deepcopy(model)
        self.results = {}

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return ``numerator / denominator``, or NaN for a zero denominator."""
        return numerator / denominator if denominator else float('nan')

    def evaluate(self, model, stage_name):
        """Evaluate ``model`` and store its metrics under ``stage_name``.

        Stores size (MB), inference time (ms), total and non-zero parameter
        counts and the resulting sparsity. Sparsity is NaN when the model
        exposes no parameters at all, which can happen for quantized models
        whose weights are not held as ``nn.Parameter``.
        """
        size_mb, time_ms = evaluate_model(model)
        total_params, nonzero_params = count_parameters(model)

        self.results[stage_name] = {
            'size_mb': size_mb,
            'time_ms': time_ms,
            'total_params': total_params,
            'nonzero_params': nonzero_params,
            'sparsity': 1 - self._safe_ratio(nonzero_params, total_params)
        }

    def apply_pruning(self, amount=0.5):
        """Apply L1 unstructured pruning to every linear layer's weight.

        Args:
            amount: Fraction of each weight tensor to prune.
        """
        for name, module in self.optimized_model.named_modules():
            if isinstance(module, nn.Linear):
                prune.l1_unstructured(module, name='weight', amount=amount)
        self.evaluate(self.optimized_model, 'pruned')

    def apply_quantization(self):
        """Make pruning permanent, then apply dynamic int8 quantization."""
        # Fold the masks into the weights and drop the pruning hooks first.
        # Otherwise the hooks are carried over to the quantized layers, which
        # have no `weight_orig` / `weight_mask` for them to read.
        for module in self.optimized_model.modules():
            if isinstance(module, nn.Linear) and hasattr(module, 'weight_orig'):
                prune.remove(module, 'weight')

        self.optimized_model = quantize_dynamic(
            self.optimized_model,
            {nn.Linear},
            dtype=torch.qint8
        )
        self.evaluate(self.optimized_model, 'quantized')

    def run_pipeline(self, prune_amount=0.5):
        """Run complete optimization pipeline and return the results dict.

        Args:
            prune_amount: Fraction of weights pruned in the pruning stage.
        """
        # Evaluate original
        self.evaluate(self.original_model, 'original')

        # Apply pruning
        self.apply_pruning(prune_amount)

        # Apply quantization
        self.apply_quantization()

        return self.results

    def visualize_results(self):
        """Plot size, time and non-zero parameters per stage plus a summary."""
        stages = list(self.results.keys())
        palette = self._STAGE_COLORS

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))

        # Model size
        sizes = [self.results[s]['size_mb'] for s in stages]
        axes[0, 0].bar(stages, sizes, color=palette)
        axes[0, 0].set_ylabel('Size (MB)')
        axes[0, 0].set_title('Model Size')

        # Inference time
        times = [self.results[s]['time_ms'] for s in stages]
        axes[0, 1].bar(stages, times, color=palette)
        axes[0, 1].set_ylabel('Time (ms)')
        axes[0, 1].set_title('Inference Time')

        # Parameter count
        params = [self.results[s]['nonzero_params'] for s in stages]
        axes[1, 0].bar(stages, params, color=palette)
        axes[1, 0].set_ylabel('Parameters')
        axes[1, 0].set_title('Non-zero Parameters')

        # Compression summary (text only; every stage is compared to 'original')
        ax = axes[1, 1]
        ax.text(0.1, 0.9, 'Optimization Summary:', fontsize=14, fontweight='bold',
                transform=ax.transAxes)

        baseline = self.results['original'] if stages else {}
        y_pos = 0.7
        for stage in stages[1:]:
            metrics = self.results[stage]
            # Ratios are NaN instead of raising when a stage reports 0
            size_reduction = self._safe_ratio(baseline['size_mb'], metrics['size_mb'])
            speed_up = self._safe_ratio(baseline['time_ms'], metrics['time_ms'])
            sparsity = metrics['sparsity'] * 100

            text = f"{stage.capitalize()}:\n"
            text += f"  Size reduction: {size_reduction:.2f}x\n"
            text += f"  Speed up: {speed_up:.2f}x\n"
            text += f"  Sparsity: {sparsity:.1f}%"

            ax.text(0.1, y_pos, text, transform=ax.transAxes, fontsize=11)
            y_pos -= 0.3

        ax.axis('off')

        plt.tight_layout()
        plt.show()

In [None]:
# Run complete optimization pipeline
model = SimpleModel()
pipeline = OptimizationPipeline(model)
results = pipeline.run_pipeline(prune_amount=0.6)

print("Optimization Pipeline Results")
print("=" * 70)
print(f"{'Stage':<15} {'Size (MB)':<12} {'Time (ms)':<12} {'Parameters':<15} {'Sparsity':<10}")
print("-" * 70)

for stage, metrics in results.items():
    # Attach the percent sign before padding; padding the bare number first
    # would print the "%" a full column away from its value.
    sparsity_text = f"{metrics['sparsity']*100:.1f}%"
    print(f"{stage:<15} {metrics['size_mb']:<12.2f} {metrics['time_ms']:<12.2f} "
          f"{metrics['nonzero_params']:<15,} {sparsity_text:<10}")

pipeline.visualize_results()

## 6. Best Practices and Guidelines

Let's summarize the best practices for model optimization.

In [None]:
# Create optimization technique comparison
# Illustrative figures: size reduction and speed-up are multipliers, accuracy
# loss and implementation difficulty are scored on a 0-5 scale.
techniques_data = {
    'Technique': ['Dynamic Quantization', 'Static Quantization', 'Unstructured Pruning',
                  'Structured Pruning', 'Knowledge Distillation', 'Low-Rank Factorization'],
    'Size Reduction': [2, 4, 10, 5, 20, 3],
    'Speed Up': [2, 3, 2, 3, 10, 2],
    'Accuracy Loss': [0.5, 1, 3, 2, 1, 2],
    'Implementation Difficulty': [1, 3, 2, 3, 4, 2]
}

# Create radar chart
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(projection='polar'))

# Categories
categories = ['Size\nReduction', 'Speed\nUp', 'Low Accuracy\nLoss', 'Easy\nImplementation']
num_vars = len(categories)

# Angles for each category
angles = [n / float(num_vars) * 2 * np.pi for n in range(num_vars)]
angles += angles[:1]

# All four axes share one radial scale of 0..max_score. The raw multipliers go
# up to 20x, which would run past the radial limit, so they are rescaled
# relative to the best technique on each axis.
max_score = 5
best_size_reduction = max(techniques_data['Size Reduction'])
best_speed_up = max(techniques_data['Speed Up'])

# Plot each technique
colors = plt.cm.Set3(np.linspace(0, 1, len(techniques_data['Technique'])))

for idx, technique in enumerate(techniques_data['Technique']):
    values = [
        max_score * techniques_data['Size Reduction'][idx] / best_size_reduction,
        max_score * techniques_data['Speed Up'][idx] / best_speed_up,
        max_score - techniques_data['Accuracy Loss'][idx],  # Invert for "low loss is good"
        max_score - techniques_data['Implementation Difficulty'][idx]  # Invert for "easy is good"
    ]
    # Repeat the first value so the polygon closes
    values += values[:1]

    ax.plot(angles, values, 'o-', linewidth=2, label=technique, color=colors[idx])
    ax.fill(angles, values, alpha=0.15, color=colors[idx])

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories)
ax.set_ylim(0, max_score)
ax.set_title('Model Optimization Techniques Comparison', size=16, y=1.1)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Decision tree for optimization techniques, printed line by line.
# Empty strings produce the blank separator lines.
guide_lines = [
    "Model Optimization Decision Guide",
    "=" * 50,
    "",
    "1. What is your primary constraint?",
    "   ├─ Model Size → Try Pruning or Distillation",
    "   ├─ Inference Speed → Try Quantization or TorchScript",
    "   └─ Both → Combine Pruning + Quantization",
    "",
    "2. What is your deployment target?",
    "   ├─ Mobile/Edge → Use INT8 Quantization + Pruning",
    "   ├─ Server CPU → Use Dynamic Quantization",
    "   └─ Server GPU → Use Mixed Precision (FP16)",
    "",
    "3. How much accuracy loss is acceptable?",
    "   ├─ < 1% → Use Quantization only",
    "   ├─ 1-3% → Add moderate Pruning",
    "   └─ > 3% → Consider Knowledge Distillation",
    "",
    "Recommended Pipeline:",
    "1. Profile and identify bottlenecks",
    "2. Apply quantization (easy win)",
    "3. Add pruning if needed",
    "4. Fine-tune after optimization",
    "5. Validate accuracy thoroughly",
]

for guide_line in guide_lines:
    print(guide_line)

## Summary

In this tutorial, we covered comprehensive model optimization techniques:

1. **Quantization**: Reducing numerical precision for smaller, faster models
2. **Pruning**: Removing unnecessary connections to create sparse networks
3. **Knowledge Distillation**: Transferring knowledge from large to small models
4. **Advanced Compression**: Low-rank factorization and weight clustering
5. **Optimization Pipelines**: Combining techniques for maximum compression

### Key Takeaways:
- Start with quantization for easy performance gains
- Combine multiple techniques for best results
- Always validate accuracy after optimization
- Consider your deployment constraints
- Use profiling to guide optimization efforts

Model optimization is crucial for deploying deep learning in production!