# Tutorial 13: Custom Extensions (C++ and CUDA)

This tutorial demonstrates how to create custom C++ and CUDA extensions for PyTorch to achieve better performance for specialized operations.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import time
import os
import sys
from torch.utils.cpp_extension import load_inline
import matplotlib.pyplot as plt

## Why Custom Extensions?

Custom extensions are useful when:
1. **Performance**: C++/CUDA can be much faster than Python
2. **Memory efficiency**: Better control over memory allocation
3. **Novel operations**: Implement operations not available in PyTorch
4. **Hardware optimization**: Leverage specific hardware features

## Example 1: Simple C++ Extension

Let's create a custom ReLU implementation in C++.

In [None]:
# C++ source code for a custom ReLU implementation.
# The PYBIND11_MODULE at the bottom exposes the two C++ functions to Python
# under the names `forward` and `backward`.
cpp_source = '''
#include <torch/extension.h>
#include <vector>

// Forward pass
torch::Tensor custom_relu_forward(torch::Tensor input) {
    auto output = torch::zeros_like(input);
    output = torch::where(input > 0, input, output);
    return output;
}

// Backward pass
torch::Tensor custom_relu_backward(torch::Tensor grad_output, torch::Tensor input) {
    auto grad_input = torch::zeros_like(grad_output);
    grad_input = torch::where(input > 0, grad_output, grad_input);
    return grad_input;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &custom_relu_forward, "Custom ReLU forward");
    m.def("backward", &custom_relu_backward, "Custom ReLU backward");
}
'''

# load_inline writes its generated sources into build_directory and does not
# create a caller-supplied directory, so create it up front. A dedicated
# subdirectory per extension keeps extensions from overwriting each other's
# generated main.cpp.
custom_relu_build_dir = os.path.join('./cpp_build', 'custom_relu_cpp')
os.makedirs(custom_relu_build_dir, exist_ok=True)

# Load the extension.
# `functions=` is intentionally NOT passed: when it is given, load_inline
# generates its own PYBIND11_MODULE binding C++ functions with exactly those
# names, which would clash with the module defined in cpp_source (and there
# are no C++ functions literally called `forward`/`backward`).
custom_relu_cpp = load_inline(
    name='custom_relu_cpp',
    cpp_sources=[cpp_source],
    verbose=True,
    build_directory=custom_relu_build_dir,
)

In [None]:
# Create a custom autograd Function
class CustomReLUFunction(torch.autograd.Function):
    """Autograd bridge to the compiled ``custom_relu_cpp`` extension.

    Both passes delegate to the extension; the forward input is stored on
    ``ctx`` so the backward pass can hand it back to the extension.
    """

    @staticmethod
    def forward(ctx, input):
        """Call the extension's ``forward`` on ``input`` and save ``input``."""
        activated = custom_relu_cpp.forward(input)
        ctx.save_for_backward(input)
        return activated

    @staticmethod
    def backward(ctx, grad_output):
        """Call the extension's ``backward`` with ``grad_output`` and the saved input."""
        (saved_input,) = ctx.saved_tensors
        grad_input = custom_relu_cpp.backward(grad_output, saved_input)
        return grad_input

# Wrap it in a module
class CustomReLU(nn.Module):
    """Parameter-free module that applies ``CustomReLUFunction`` to its input."""

    def forward(self, input):
        """Return ``CustomReLUFunction.apply(input)``."""
        activated = CustomReLUFunction.apply(input)
        return activated

In [None]:
# Exercise the custom ReLU end to end: forward pass, scalar loss, backward pass
x = torch.randn(10, 10, requires_grad=True)
custom_relu = CustomReLU()
y = custom_relu(x)
loss = y.sum()
loss.backward()

print(f"Input shape: {x.shape}")
print(f"Output shape: {y.shape}")
print(f"Gradient computed: {x.grad is not None}")

# Evaluate the activation on an evenly spaced grid and draw the curve
x_test = torch.linspace(-2, 2, 100)
y_test = custom_relu(x_test)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(x_test.detach().numpy(), y_test.detach().numpy(), label='Custom ReLU')
ax.grid(True)
ax.set_xlabel('Input')
ax.set_ylabel('Output')
ax.set_title('Custom ReLU Function')
ax.legend()
plt.show()

## Example 2: Fused Linear Layer

Let's create a fused linear layer that combines linear transformation, bias addition, and ReLU activation in a single operation.

In [None]:
# C++ code for fused linear layer (matmul + bias + ReLU).
# The PYBIND11_MODULE at the bottom exposes the two C++ functions to Python
# under the names `forward` and `backward`.
fused_cpp_source = '''
#include <torch/extension.h>
#include <vector>

torch::Tensor fused_linear_relu_forward(
    torch::Tensor input,
    torch::Tensor weight,
    torch::Tensor bias) {
    
    // Perform linear transformation
    auto output = torch::matmul(input, weight.t());
    
    // Add bias, then apply ReLU (clamp_min at 0)
    output = torch::clamp_min(output + bias, 0);
    
    return output;
}

std::vector<torch::Tensor> fused_linear_relu_backward(
    torch::Tensor grad_output,
    torch::Tensor input,
    torch::Tensor weight,
    torch::Tensor output) {
    
    // ReLU backward
    auto relu_grad = torch::where(output > 0, grad_output, torch::zeros_like(grad_output));
    
    // Linear backward
    auto grad_input = torch::matmul(relu_grad, weight);
    auto grad_weight = torch::matmul(relu_grad.t(), input);
    auto grad_bias = relu_grad.sum(0);
    
    return {grad_input, grad_weight, grad_bias};
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &fused_linear_relu_forward, "Fused Linear-ReLU forward");
    m.def("backward", &fused_linear_relu_backward, "Fused Linear-ReLU backward");
}
'''

# load_inline writes its generated sources into build_directory and does not
# create a caller-supplied directory, so create it up front. A dedicated
# subdirectory per extension keeps extensions from overwriting each other's
# generated main.cpp.
fused_build_dir = os.path.join('./cpp_build', 'fused_linear_relu')
os.makedirs(fused_build_dir, exist_ok=True)

# Load the fused operation.
# `functions=` is intentionally NOT passed: when it is given, load_inline
# generates its own PYBIND11_MODULE binding C++ functions with exactly those
# names, which would clash with the module defined in fused_cpp_source (and
# there are no C++ functions literally called `forward`/`backward`).
fused_linear_relu = load_inline(
    name='fused_linear_relu',
    cpp_sources=[fused_cpp_source],
    verbose=True,
    build_directory=fused_build_dir,
)

In [None]:
class FusedLinearReLUFunction(torch.autograd.Function):
    """Autograd bridge to the compiled ``fused_linear_relu`` extension.

    ``forward`` takes ``(input, weight, bias)``; ``backward`` returns one
    gradient per forward argument, in the same order.
    """

    @staticmethod
    def forward(ctx, input, weight, bias):
        """Run the extension's forward and save what backward needs."""
        activated = fused_linear_relu.forward(input, weight, bias)
        # The output is saved too: backward passes it to the extension
        # together with the input and the weight.
        ctx.save_for_backward(input, weight, activated)
        return activated

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad_input, grad_weight, grad_bias)`` from the extension."""
        saved_input, saved_weight, saved_output = ctx.saved_tensors
        grads = fused_linear_relu.backward(
            grad_output, saved_input, saved_weight, saved_output
        )
        grad_input, grad_weight, grad_bias = grads
        return grad_input, grad_weight, grad_bias

class FusedLinearReLU(nn.Module):
    """Layer that owns a weight and a bias and applies ``FusedLinearReLUFunction``.

    The weight has shape ``(out_features, in_features)`` and is drawn from a
    standard normal distribution; the bias has shape ``(out_features,)`` and
    starts at zero.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        initial_weight = torch.randn(out_features, in_features)
        initial_bias = torch.zeros(out_features)
        self.weight = nn.Parameter(initial_weight)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, input):
        """Apply the fused function to ``input`` with this layer's parameters."""
        layer_params = (self.weight, self.bias)
        return FusedLinearReLUFunction.apply(input, *layer_params)

In [None]:
# Smoke-test the fused layer on a random batch
batch_size, in_features, out_features = 32, 100, 50
fused_layer = FusedLinearReLU(in_features, out_features)
x = torch.randn(batch_size, in_features)
y = fused_layer(x)
print(f"Fused layer output shape: {y.shape}")

# Backpropagate a scalar loss and confirm both parameters received gradients
loss = y.sum()
loss.backward()
print(f"Weight gradient shape: {fused_layer.weight.grad.shape}")
print(f"Bias gradient shape: {fused_layer.bias.grad.shape}")

## Example 3: Custom Optimizer in C++

Let's implement a custom SGD optimizer in C++ for better performance.

In [None]:
# C++ source for a single in-place SGD update (optional weight decay and
# momentum). The PYBIND11_MODULE at the bottom exposes it to Python as `step`.
custom_optimizer_source = '''
#include <torch/extension.h>
#include <vector>

void custom_sgd_step(
    torch::Tensor param,
    torch::Tensor grad,
    torch::Tensor momentum_buffer,
    float lr,
    float momentum,
    float weight_decay) {
    
    if (weight_decay != 0) {
        grad = grad + weight_decay * param;
    }
    
    if (momentum != 0) {
        momentum_buffer.mul_(momentum).add_(grad);
        param.add_(momentum_buffer, -lr);
    } else {
        param.add_(grad, -lr);
    }
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("step", &custom_sgd_step, "Custom SGD step");
}
'''

# load_inline writes its generated sources into build_directory and does not
# create a caller-supplied directory, so create it up front. A dedicated
# subdirectory per extension keeps extensions from overwriting each other's
# generated main.cpp.
custom_sgd_build_dir = os.path.join('./cpp_build', 'custom_sgd')
os.makedirs(custom_sgd_build_dir, exist_ok=True)

# `functions=` is intentionally NOT passed: when it is given, load_inline
# generates its own PYBIND11_MODULE binding a C++ function with exactly that
# name, which would clash with the module defined in the source (and there is
# no C++ function literally called `step`).
custom_sgd = load_inline(
    name='custom_sgd',
    cpp_sources=[custom_optimizer_source],
    verbose=True,
    build_directory=custom_sgd_build_dir,
)

In [None]:
class CustomSGD:
    """Minimal SGD optimizer whose update rule runs in the ``custom_sgd`` extension.

    Args:
        params: Iterable of tensors to optimize; it is materialized into a
            list, so a generator such as ``model.parameters()`` is fine.
        lr: Learning rate. Must be non-negative.
        momentum: Momentum factor. Must be non-negative.
        weight_decay: Weight-decay coefficient. Must be non-negative.

    Raises:
        ValueError: If ``lr``, ``momentum`` or ``weight_decay`` is negative.
    """

    def __init__(self, params, lr=0.01, momentum=0.9, weight_decay=0):
        # Reject obviously invalid hyperparameters early instead of silently
        # training with them.
        for label, value in (
            ("learning rate", lr),
            ("momentum", momentum),
            ("weight_decay", weight_decay),
        ):
            if value < 0:
                raise ValueError(f"Invalid {label}: {value}")

        self.params = list(params)
        self.lr = lr
        self.momentum = momentum
        self.weight_decay = weight_decay
        # One zero-initialized buffer per parameter, keyed by the parameter.
        self.momentum_buffers = {p: torch.zeros_like(p) for p in self.params}

    def step(self):
        """Apply one in-place update to every parameter that has a gradient."""
        # The extension mutates the parameters in place; running it under
        # no_grad keeps autograd from tracking the update, without going
        # through the discouraged `.data` attribute.
        with torch.no_grad():
            for p in self.params:
                if p.grad is None:
                    continue
                custom_sgd.step(
                    p,
                    p.grad,
                    self.momentum_buffers[p],
                    self.lr,
                    self.momentum,
                    self.weight_decay,
                )

    def zero_grad(self):
        """Zero the gradient of every parameter that currently has one, in place."""
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()

In [None]:
# Fit a small linear model on random data to exercise the custom optimizer
model = nn.Linear(10, 1)
optimizer = CustomSGD(model.parameters(), lr=0.1)

# Ten optimization steps, each on a freshly sampled batch
for i in range(10):
    x = torch.randn(32, 10)
    target = torch.randn(32, 1)

    # Clear gradients left over from the previous step before backpropagating
    optimizer.zero_grad()

    output = model(x)
    loss = F.mse_loss(output, target)
    loss.backward()
    optimizer.step()

    # Report on even iterations only
    if i % 2:
        continue
    print(f"Iteration {i}, Loss: {loss.item():.4f}")

## Performance Comparison

Let's compare the performance of our custom operations with PyTorch's built-in operations.

In [None]:
def benchmark_operation(name, func, *args, num_runs=1000):
    """Time ``func(*args)`` and return its mean latency in milliseconds.

    Args:
        name: Label for the operation. Accepted so call sites are
            self-describing; it does not affect the measurement.
        func: Callable to benchmark.
        *args: Positional arguments forwarded to ``func`` on every call.
        num_runs: Number of timed invocations. Must be at least 1.

    Returns:
        A tuple ``(avg_time_ms, result)`` where ``result`` is the return
        value of the last timed call.

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    if num_runs < 1:
        raise ValueError(f"num_runs must be at least 1, got {num_runs}")

    use_cuda = torch.cuda.is_available()

    # Warmup: untimed calls so one-off setup costs don't skew the timing
    for _ in range(10):
        func(*args)

    # Drain pending GPU work so it is not attributed to the timed region
    if use_cuda:
        torch.cuda.synchronize()

    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    result = None
    for _ in range(num_runs):
        result = func(*args)

    # Wait for asynchronously launched GPU work before stopping the clock
    if use_cuda:
        torch.cuda.synchronize()

    elapsed = time.perf_counter() - start_time
    avg_time = elapsed / num_runs * 1000  # Convert to ms

    return avg_time, result

In [None]:
# Benchmark PyTorch's ReLU against the custom ReLU on square inputs of growing size
sizes = [100, 500, 1000, 2000]
pytorch_times, custom_times = [], []

for size in sizes:
    x = torch.randn(size, size)
    pytorch_relu, custom_relu = nn.ReLU(), CustomReLU()

    # benchmark_operation returns (avg_ms, last_result); only the timing is kept
    pytorch_time = benchmark_operation("PyTorch ReLU", pytorch_relu, x, num_runs=100)[0]
    custom_time = benchmark_operation("Custom ReLU", custom_relu, x, num_runs=100)[0]

    pytorch_times.append(pytorch_time)
    custom_times.append(custom_time)

    print(f"Size {size}x{size}: PyTorch {pytorch_time:.4f}ms, Custom {custom_time:.4f}ms")

In [None]:
# Draw both timing series on one set of axes
fig, ax = plt.subplots(figsize=(10, 6))
timing_series = (
    (pytorch_times, 'o-', 'PyTorch ReLU'),
    (custom_times, 's-', 'Custom ReLU'),
)
for timings, marker_style, series_label in timing_series:
    ax.plot(sizes, timings, marker_style, label=series_label, linewidth=2)
ax.set_xlabel('Tensor Size (NxN)')
ax.set_ylabel('Time (ms)')
ax.set_title('Performance Comparison: PyTorch vs Custom ReLU')
ax.legend()
ax.grid(True)
plt.show()

## CUDA Extension Example (Pseudo-code)

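Here's an example of what a CUDA kernel for an extension looks like. The cell below only prints the kernel source; actually compiling it requires a CUDA toolkit and a properly configured build (for example via `load_inline(..., cuda_sources=[...])` or `CUDAExtension`).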

In [None]:
# CUDA kernel source kept as a plain string for display only.
# Nothing in this cell compiles or launches the kernel.
cuda_kernel_example = '''
// custom_matmul_kernel.cu
#include <cuda.h>
#include <cuda_runtime.h>

template <typename scalar_t>
__global__ void custom_matmul_kernel(
    const scalar_t* __restrict__ a,
    const scalar_t* __restrict__ b,
    scalar_t* __restrict__ c,
    int m, int n, int k) {
    
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    
    if (row < m && col < n) {
        scalar_t sum = 0;
        for (int i = 0; i < k; i++) {
            sum += a[row * k + i] * b[i * n + col];
        }
        c[row * n + col] = sum;
    }
}
'''

# Heading first, then the kernel source on the following lines
print("CUDA Kernel Example:", cuda_kernel_example, sep="\n")

## Building Extensions with setuptools

For production use, you'll want to build extensions using setuptools instead of JIT compilation.

In [None]:
# Example setup.py kept as a string for display; this cell only prints it.
# The example imports torch itself (it calls torch.cuda.is_available()) and
# appends the CUDA extension conditionally, so ext_modules never contains None.
setup_py_example = '''
# setup.py
import torch
from setuptools import setup
from torch.utils import cpp_extension

ext_modules = [
    cpp_extension.CppExtension(
        'custom_ops',
        ['custom_ops.cpp'],
        extra_compile_args=['-O3']
    ),
]

# Only build the CUDA extension when CUDA is available
if torch.cuda.is_available():
    ext_modules.append(
        cpp_extension.CUDAExtension(
            'custom_cuda_ops',
            ['custom_cuda_ops.cpp', 'custom_cuda_ops_kernel.cu'],
            extra_compile_args={'cxx': ['-O3'],
                                'nvcc': ['-O3', '--use_fast_math']}
        )
    )

setup(
    name='custom_ops',
    ext_modules=ext_modules,
    cmdclass={
        'build_ext': cpp_extension.BuildExtension
    }
)
'''

print("Example setup.py:")
print(setup_py_example)
print("\nTo build: python setup.py install")

## Best Practices and Debugging Tips

In [None]:
# Example: Gradient checking for custom operations
def test_custom_operation():
    """Run ``gradcheck`` on both custom autograd functions and print the outcome.

    Double-precision inputs are used because gradcheck compares analytical
    gradients against finite differences. Each check reports PASSED or
    FAILED independently of the other.
    """
    from torch.autograd import gradcheck

    x = torch.randn(10, 10, requires_grad=True, dtype=torch.double)

    # raise_exception=False makes gradcheck return False on a mismatch instead
    # of raising, so the FAILED branch is reachable and a failure in the first
    # check does not prevent the second from running.
    relu_ok = gradcheck(
        CustomReLUFunction.apply, (x,), eps=1e-6, atol=1e-4, raise_exception=False
    )
    print(f"Custom ReLU gradient check: {'PASSED' if relu_ok else 'FAILED'}")

    # Test fused linear layer
    weight = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    bias = torch.randn(5, requires_grad=True, dtype=torch.double)
    fused_ok = gradcheck(
        FusedLinearReLUFunction.apply,
        (x, weight, bias),
        eps=1e-6,
        atol=1e-4,
        raise_exception=False,
    )
    print(f"Fused Linear ReLU gradient check: {'PASSED' if fused_ok else 'FAILED'}")

test_custom_operation()

## Summary

In this tutorial, we've covered:

1. **Writing C++ Extensions**: How to create custom operations in C++
2. **Autograd Integration**: Making custom ops work with PyTorch's autograd
3. **Performance Optimization**: Fusing operations for better performance
4. **Custom Optimizers**: Implementing optimizers in C++
5. **CUDA Extensions**: Basics of GPU-accelerated custom operations
6. **Building and Packaging**: How to properly build and distribute extensions

### Key Takeaways:
- Only use custom extensions when necessary
- Always verify gradients with gradcheck
- Profile before and after to ensure performance gains
- Consider memory layout and tensor contiguity
- Test on different platforms and configurations

Custom extensions are powerful but should be used judiciously. PyTorch's built-in operations are highly optimized and sufficient for most use cases!