# 🚀 GPU + CPU Pipelining Demo
## Understanding How CPU and GPU Work Together in Deep Learning

### Setup Instructions:
1. **Runtime → Change runtime type → T4 GPU**
2. Run all cells in order
3. Watch the timing comparisons!

---

In [None]:
# ============================================
# CELL 1: Setup and Imports
# ============================================

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import time
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

# Report the PyTorch build and whether a CUDA device is visible to this
# runtime, so a missing GPU is noticed before the timing experiments run.
print("=" * 60)
print("SETUP CHECK")
print("=" * 60)
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Device 0 is the only one queried here.
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    # total_memory is in bytes; divide by 1e9 to print (decimal) gigabytes.
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠️  WARNING: GPU not detected! Please change runtime to T4 GPU")
print("=" * 60)

In [None]:
# ============================================
# CELL 2: Create Synthetic Dataset
# ============================================

class SyntheticImageDataset(Dataset):
    """
    Map-style dataset that fabricates a random RGB image on every access.

    This stands in for loading JPEGs from disk + CPU preprocessing: each
    ``__getitem__`` call generates fresh random pixels, wraps them in a PIL
    image and runs the optional ``transform`` on it.

    Args:
        num_images: Number of samples the dataset reports via ``len()``.
        img_size: Side length, in pixels, of the square image generated.
        transform: Optional callable applied to the PIL image before it is
            returned.
    """
    def __init__(self, num_images=1000, img_size=224, transform=None):
        self.num_images = num_images
        self.img_size = img_size
        self.transform = transform

    def __len__(self):
        return self.num_images

    def __getitem__(self, idx):
        """
        Return ``(image, label)`` for position ``idx``.

        Negative indices count from the end, as for a sequence.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        # Without this bound check, plain iteration (`for x in dataset`)
        # would never stop: Python's index-based iteration fallback relies
        # on IndexError to detect the end.
        if not -self.num_images <= idx < self.num_images:
            raise IndexError(
                f"index {idx} is out of range for dataset of size {self.num_images}"
            )
        if idx < 0:
            idx += self.num_images

        # Simulate reading & decoding image from disk (CPU work).
        # randint's upper bound is exclusive, so 256 covers the full 0..255
        # range of a uint8 pixel.
        pixels = np.random.randint(
            0, 256, (self.img_size, self.img_size, 3), dtype=np.uint8
        )
        img = Image.fromarray(pixels)

        # Deterministic label: the index cycled over 10 classes.
        label = idx % 10

        # Apply transforms (CPU preprocessing!)
        if self.transform is not None:
            img = self.transform(img)

        return img, label

print("✓ Dataset class created!")
print("  This simulates: Disk → CPU (load, decode, preprocess)")

In [None]:
# ============================================
# CELL 3: Define a Simple CNN Model
# ============================================

class SimpleCNN(nn.Module):
    """
    Simple CNN for demonstration: two conv/ReLU/max-pool stages followed by
    a two-layer fully connected classifier.

    Args:
        num_classes: Number of output logits.
        img_size: Side length of the square input images. The default of 224
            reproduces the original fixed ``64 * 56 * 56`` classifier input.
    """
    def __init__(self, num_classes=10, img_size=224):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        # The padded 3x3 convolutions keep the spatial size; each 2x2 pool
        # halves it (rounding down), so the feature map side is
        # img_size // 2 // 2.
        pooled_side = img_size // 2 // 2
        self.classifier = nn.Sequential(
            nn.Linear(64 * pooled_side * pooled_side, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Map a batch of images ``x`` to per-class logits."""
        x = self.features(x)
        # flatten (unlike .view) also accepts non-contiguous activations,
        # e.g. when the input batch uses the channels_last memory format.
        x = torch.flatten(x, 1)
        return self.classifier(x)

print("✓ CNN Model defined!")
print("  This will run on GPU for training")

In [None]:
# ============================================
# CELL 4: Training Function with Detailed Timing
# ============================================

def train_with_monitoring(model, dataloader, criterion, optimizer, device, num_batches=20):
    """
    Train for up to ``num_batches`` batches and time each stage of every batch.

    For each batch, three consecutive intervals are measured (milliseconds):

    * ``data_loading``     - time spent waiting for the dataloader to hand
      over the batch (the first interval also covers creating the iterator).
    * ``cpu_to_gpu``       - moving images and labels to ``device``.
    * ``forward_backward`` - zero_grad, forward, loss, backward, optimizer step.

    ``total_batch`` spans all three, i.e. it runs from the moment the batch
    was requested until the training step finished. Including the wait for
    data is what makes totals comparable between loader configurations.

    Args:
        model: Module to train; switched to training mode.
        dataloader: Iterable yielding ``(images, labels)`` tensor pairs.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        device: Target device (``torch.device`` or device string).
        num_batches: Maximum number of batches to consume.

    Returns:
        dict mapping ``'data_loading'``, ``'cpu_to_gpu'``,
        ``'forward_backward'`` and ``'total_batch'`` to lists of per-batch
        times in milliseconds.
    """
    from itertools import islice

    # CUDA work is asynchronous, so the clock is only meaningful after a
    # synchronize. On any other device there is nothing to wait for, and
    # calling torch.cuda.synchronize() without CUDA would raise.
    on_cuda = torch.device(device).type == 'cuda'

    def wait_for_device():
        if on_cuda:
            torch.cuda.synchronize()

    model.train()

    times = {
        'data_loading': [],
        'cpu_to_gpu': [],
        'forward_backward': [],
        'total_batch': []
    }

    print(f"\n{'='*70}")
    print(f"Training {num_batches} batches")
    print(f"{'='*70}")
    print(f"{'Batch':<8} {'Load(ms)':<12} {'Transfer(ms)':<15} {'Train(ms)':<12} {'Total(ms)':<12}")
    print(f"{'-'*70}")

    # perf_counter is monotonic and high-resolution, which suits short
    # intervals better than wall-clock time.time().
    fetch_start = time.perf_counter()

    # islice stops after num_batches items without requesting one more
    # batch from the dataloader just to throw it away.
    limited = islice(dataloader, max(num_batches, 0))
    for batch_idx, (images, labels) in enumerate(limited):
        # Time 1: Data loading (how long we waited for this batch)
        fetch_end = time.perf_counter()
        data_load_time = (fetch_end - fetch_start) * 1000

        # Time 2: Transfer to the device
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        wait_for_device()
        transfer_end = time.perf_counter()
        transfer_time = (transfer_end - fetch_end) * 1000

        # Time 3: Training step
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        wait_for_device()
        train_end = time.perf_counter()
        train_time = (train_end - transfer_end) * 1000

        # Whole batch, from requesting the data to finishing the step.
        total_time = (train_end - fetch_start) * 1000

        times['data_loading'].append(data_load_time)
        times['cpu_to_gpu'].append(transfer_time)
        times['forward_backward'].append(train_time)
        times['total_batch'].append(total_time)

        print(f"{batch_idx:<8} {data_load_time:>10.2f}  {transfer_time:>13.2f}  {train_time:>10.2f}  {total_time:>10.2f}")

        # Restart the clock after printing so console output is not
        # charged to the next batch's loading time.
        fetch_start = time.perf_counter()

    return times

print("✓ Training function created!")

---
## 🟢 Experiment 1: WITH Pipelining (The Good Way)

**Setup:** `num_workers=2` → CPU workers prepare batches ahead

**Expected:** GPU is always busy, minimal waiting

---

In [None]:
# ============================================
# EXPERIMENT 1: WITH PIPELINING
# ============================================

print("="*70)
print("EXPERIMENT 1: WITH PIPELINING (num_workers=2)")
print("="*70)
print("✅ CPU workers prepare batches while GPU trains")
print()

# Pick the device first so the loader only pins host memory when there is
# actually a GPU to transfer to.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Transforms (all CPU work!)
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Create dataset
dataset_good = SyntheticImageDataset(num_images=500, transform=transform)

# DataLoader WITH workers (PIPELINING!)
dataloader_good = DataLoader(
    dataset_good,
    batch_size=32,
    shuffle=True,
    num_workers=2,                       # ← CPU workers!
    pin_memory=(device.type == 'cuda'),  # ← Fast GPU transfer (GPU only)
    prefetch_factor=2,                   # ← Prepare 2 batches ahead (per worker)
    persistent_workers=True              # ← Keep workers alive between iterations
)

# Create model, loss and optimizer
model_good = SimpleCNN(num_classes=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_good.parameters(), lr=0.001)

# Train!
times_good = train_with_monitoring(
    model_good, dataloader_good, criterion, optimizer, device, num_batches=20
)

# Per-stage averages over the measured batches (milliseconds).
print(f"\n{'='*70}")
print("EXPERIMENT 1 RESULTS")
print(f"{'='*70}")
print(f"Average Data Loading: {np.mean(times_good['data_loading']):.2f} ms")
print(f"Average GPU Transfer:  {np.mean(times_good['cpu_to_gpu']):.2f} ms")
print(f"Average GPU Training:  {np.mean(times_good['forward_backward']):.2f} ms")
print(f"Average Total/Batch:   {np.mean(times_good['total_batch']):.2f} ms")
print(f"{'='*70}")

---
## 🔴 Experiment 2: WITHOUT Pipelining (The Bad Way)

**Setup:** `num_workers=0` → No background workers

**Expected:** GPU waits for CPU, much slower!

---

In [None]:
# ============================================
# EXPERIMENT 2: WITHOUT PIPELINING
# ============================================

print("="*70)
print("EXPERIMENT 2: WITHOUT PIPELINING (num_workers=0)")
print("="*70)
print("❌ Main thread does everything - GPU waits!")
print()

# New dataset instance with the same size and the same transform pipeline
# as Experiment 1 (reuses `transform` from that cell).
dataset_bad = SyntheticImageDataset(num_images=500, transform=transform)

# DataLoader WITHOUT workers (NO PIPELINING!)
dataloader_bad = DataLoader(
    dataset_bad,
    batch_size=32,
    shuffle=True,
    num_workers=0,  # ← No workers! Sequential!
    pin_memory=False
)

# Fresh model and optimizer; `device` and `criterion` come from Experiment 1.
model_bad = SimpleCNN(num_classes=10).to(device)
optimizer = optim.Adam(model_bad.parameters(), lr=0.001)

# Train!
times_bad = train_with_monitoring(
    model_bad, dataloader_bad, criterion, optimizer, device, num_batches=20
)

# Per-stage averages over the measured batches (milliseconds).
print(f"\n{'='*70}")
print("EXPERIMENT 2 RESULTS")
print(f"{'='*70}")
print(f"Average Data Loading: {np.mean(times_bad['data_loading']):.2f} ms")
print(f"Average GPU Transfer:  {np.mean(times_bad['cpu_to_gpu']):.2f} ms")
print(f"Average GPU Training:  {np.mean(times_bad['forward_backward']):.2f} ms")
print(f"Average Total/Batch:   {np.mean(times_bad['total_batch']):.2f} ms")
print(f"{'='*70}")

---
## 📊 Comparison & Visualization
---

In [None]:
# ============================================
# COMPARISON & VISUALIZATION
# ============================================

print("\n" + "="*70)
print("🎯 FINAL COMPARISON")
print("="*70)

# Headline numbers: mean time per batch (ms) in each experiment.
avg_time_good = np.mean(times_good['total_batch'])
avg_time_bad = np.mean(times_bad['total_batch'])
speedup = avg_time_bad / avg_time_good

# Extrapolate the per-batch saving (ms) to 1000 batches, reported in seconds.
saved_ms_per_batch = avg_time_bad - avg_time_good
extrapolated_batches = 1000
ms_per_second = 1000

print(f"\nWITH Pipelining (num_workers=2):    {avg_time_good:.2f} ms/batch")
print(f"WITHOUT Pipelining (num_workers=0): {avg_time_bad:.2f} ms/batch")
print(f"\n🚀 SPEEDUP: {speedup:.2f}x FASTER with pipelining!")
print(f"💾 Time saved: {saved_ms_per_batch:.2f} ms per batch")
print(f"📊 For 1000 batches: {saved_ms_per_batch * extrapolated_batches / ms_per_second:.1f} seconds saved!")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Breakdown (one bar pair per timing stage, in `categories` order)
categories = ['Data Load', 'GPU Transfer', 'GPU Train', 'Total']
stage_keys = ['data_loading', 'cpu_to_gpu', 'forward_backward', 'total_batch']
good_times = [np.mean(times_good[key]) for key in stage_keys]
bad_times = [np.mean(times_bad[key]) for key in stage_keys]

x = np.arange(len(categories))
width = 0.35

bars1 = axes[0].bar(x - width/2, good_times, width, label='WITH Pipelining', color='green', alpha=0.7)
bars2 = axes[0].bar(x + width/2, bad_times, width, label='WITHOUT Pipelining', color='red', alpha=0.7)

axes[0].set_xlabel('Stage', fontsize=12)
axes[0].set_ylabel('Time (ms)', fontsize=12)
axes[0].set_title('Timing Breakdown: WITH vs WITHOUT Pipelining', fontsize=14, fontweight='bold')
axes[0].set_xticks(x)
axes[0].set_xticklabels(categories)
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Label every bar with its value, centred just above the bar.
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        axes[0].text(bar.get_x() + bar.get_width()/2., height,
                     f'{height:.1f}',
                     ha='center', va='bottom', fontsize=9)

# Plot 2: Speed comparison
axes[1].barh(['WITHOUT\nPipelining', 'WITH\nPipelining'],
             [avg_time_bad, avg_time_good],
             color=['red', 'green'], alpha=0.7, height=0.6)
axes[1].set_xlabel('Time per Batch (ms)', fontsize=12)
axes[1].set_title(f'Speed Comparison: {speedup:.2f}x Faster!', fontsize=14, fontweight='bold')
axes[1].grid(True, alpha=0.3, axis='x')

# Value labels placed slightly to the right of each horizontal bar's end.
for i, v in enumerate([avg_time_bad, avg_time_good]):
    axes[1].text(v + 2, i, f'{v:.1f} ms', va='center', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

print("\n✓ Visualization complete!")

---
## 🔍 Check GPU Utilization
---

In [None]:
# Check GPU status.
# NOTE: the leading `!` is IPython/Jupyter shell-escape syntax, so this line
# only works inside a notebook cell, not as plain Python.
!nvidia-smi

---
## 🎓 Summary: Key Learnings

### ✅ What We Learned:

1. **Images DO go through CPU first** (load, decode, preprocess)
2. **BUT GPU is NOT idle** - it works on previous batch!
3. **Pipelining = Parallelism**: CPU and GPU work simultaneously on different batches
4. **DataLoader workers** enable this pipelining automatically
5. **Result**: Significant speedup (typically 1.5x - 3x faster!)

### 🔧 Best Practices:

```python
DataLoader(
    dataset,
    num_workers=4,          # ← Use 4-8 workers
    pin_memory=True,        # ← Faster GPU transfer
    prefetch_factor=2,      # ← Prepare batches ahead
    persistent_workers=True # ← Keep workers alive
)
```

### 📊 The Pipeline:

```
Worker 1: Loading Batch N+1    }
Worker 2: Decoding Batch N+2   } All happening
Worker 3: Augmenting Batch N+3 } simultaneously!
GPU:      Training Batch N     }
```

---

### 🎯 Remember:

**Images MUST go through CPU, but with smart pipelining, the GPU never waits!**

---