## Setup

In [6]:
import torch
import torch.nn.functional as F
from torch import nn
import time
import matplotlib.pyplot as plt

## Max Pooling Operation

In [7]:
def maxpool2d_simple(input, kernel_size=2, stride=2):

    input_height, input_width = input.shape

    output_height = (input_height - kernel_size) // stride + 1
    output_width = (input_width - kernel_size) // stride + 1

    output = torch.zeros(output_height, output_width)

    for i in range(output_height):
        for j in range(output_width):
            output[i, j] = torch.max(
                input[
                    i * stride : i * stride + kernel_size,
                    j * stride : j * stride + kernel_size,
                ]
            )

    return output

In [8]:
input = torch.tensor([
    [1, 3, 2, 4],
    [5, 6, 8, 8],
    [9, 7, 5, 6],
    [8, 4, 3, 2]
])

print("Input Matrix:")
print(maxpool2d_simple(input))

Input Matrix:
tensor([[6., 8.],
        [9., 6.]])


## Extended Implementation

In [9]:
def maxpool2d(input, kernel_size=2, stride=2, padding=0):
    batch_size, in_channels, in_height, in_width = input.shape
    
    # Calculate the output height and width
    output_height = (in_height + 2 * padding - kernel_size) // stride + 1
    output_width = (in_width + 2 * padding - kernel_size) // stride + 1
    
    # Pad the input
    input_padded = F.pad(input, (padding, padding, padding, padding))
    
    # Initialize the unfolded tensor for all patches
    unfolded_tensor = torch.zeros((batch_size, in_channels, kernel_size * kernel_size, output_height * output_width))
    
    # Extract patches
    for i in range(output_height):
        for j in range(output_width):
            start_i = i * stride
            start_j = j * stride
            end_i = start_i + kernel_size
            end_j = start_j + kernel_size
            patches = input_padded[:, :, start_i:end_i, start_j:end_j]
            unfolded_tensor[:, :, :, i * output_width + j] = patches.reshape(batch_size, in_channels, -1)
    
    # Perform max pooling
    pooled_output = unfolded_tensor.max(dim=2)[0]
    
    # Reshape the pooled output to the expected output shape
    pooled_output = pooled_output.view(batch_size, in_channels, output_height, output_width)
    
    return pooled_output

## Checking the implementations

In [10]:
# Generate a test input tensor
input = torch.randn(64, 3, 128, 128)  # Example shape: batch_size=1, channels=3, height=32, width=32

# Apply Vectorized Custom Max Pooling
start_vectorized = time.time()
custom_output = maxpool2d(input, kernel_size=2, stride=2)
end_vectorized = time.time()

# Apply PyTorch Max Pooling
maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
start_pytorch = time.time()
pytorch_output = maxpool(input)
end_pytorch = time.time()

# Compare Outputs
are_close = torch.allclose(custom_output, pytorch_output, atol=1e-6)
print(f"Are the outputs close? {are_close}")

# Measure and Compare Performance
print(f"Vectorized Max Pooling Time: {end_vectorized - start_vectorized:.6f} seconds")
print(f"PyTorch Max Pooling Time: {end_pytorch - start_pytorch:.6f} seconds")

Are the outputs close? True
Vectorized Max Pooling Time: 0.217464 seconds
PyTorch Max Pooling Time: 0.004998 seconds
