In [2]:
from torchvision.models import mobilenet_v3_large
import torch

# Random single-image batch (3 channels, 224x224) used only to drive one forward pass.
x = torch.randn(1, 3, 224, 224)
# MobileNetV3-Large built with its default constructor arguments.
model = mobilenet_v3_large()

def hook(module, args, output):
    """Forward hook that prints the shape of the hooked module's first input.

    The parameters follow the signature PyTorch passes to forward hooks:
    ``module`` is the hooked layer, ``args`` is the tuple of positional
    inputs, and ``output`` is the layer's result. Only ``args`` is read.
    """
    first_input = args[0]
    print(first_input.shape)

# Attach the shape-printing hook to every depthwise convolution, i.e. every
# Conv2d whose group count equals its number of input channels. The handles
# are kept so the hooks can be detached again afterwards.
handles = [
    mod.register_forward_hook(hook)
    for mod in model.modules()
    if isinstance(mod, torch.nn.Conv2d) and mod.groups == mod.in_channels
]

try:
    # Only the printed input shapes matter here, so skip building an autograd graph.
    with torch.no_grad():
        model(x)
finally:
    # Detach the hooks so re-running this cell (or calling the model later)
    # does not print every shape multiple times.
    for handle in handles:
        handle.remove()

torch.Size([1, 16, 112, 112])
torch.Size([1, 64, 112, 112])
torch.Size([1, 72, 56, 56])
torch.Size([1, 72, 56, 56])
torch.Size([1, 120, 28, 28])
torch.Size([1, 120, 28, 28])
torch.Size([1, 240, 28, 28])
torch.Size([1, 200, 14, 14])
torch.Size([1, 184, 14, 14])
torch.Size([1, 184, 14, 14])
torch.Size([1, 480, 14, 14])
torch.Size([1, 672, 14, 14])
torch.Size([1, 672, 14, 14])
torch.Size([1, 960, 7, 7])
torch.Size([1, 960, 7, 7])


In [4]:
import torch
from torch import nn

In [5]:
# Depthwise 3x3 convolution (groups equals the channel count) followed by a
# 1x1 convolution that maps 32 channels to 64. Neither layer has a bias.
model = nn.Sequential(
    nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, padding=1, groups=32, bias=False),
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=1, bias=False),
)
model = model.cuda()

# Random input batch, created on the host and then copied to the GPU.
# It does not track gradients (the default for torch.randn).
x = torch.randn(1, 32, 64, 64).cuda()

In [6]:
# Weight shape of the first (depthwise) layer: (out_channels, in_channels // groups, kH, kW).
model[0].weight.shape

torch.Size([32, 1, 3, 3])

In [7]:
# Weight shape of the second (1x1) layer: (out_channels, in_channels // groups, kH, kW).
model[1].weight.shape

torch.Size([64, 32, 1, 1])

In [8]:
# Clear gradients left over from a previous run of this cell so the backward
# pass below does not accumulate on top of them. Module.zero_grad() covers
# every parameter and avoids writing through the `.data` back door.
model.zero_grad()

# Forward pass, then backpropagate the mean of the output to populate `.grad`
# on each parameter.
y = model(x)
y.mean().backward()

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


In [29]:
import torch
from torch import nn
from monarch_cuda import conv2d_forward

# Benchmark problem size: batch, channels, height, width. Exactly one
# configuration is active; the others are alternatives kept for quick switching.
# n, num_channels, h, w = 32, 16, 112, 112
# n, num_channels, h, w = 16, 960, 7, 7
n, num_channels, h, w = 32, 240, 28, 28
# n, num_channels, h, w = 4, 512, 64, 64

# Depthwise 3x3 convolution (groups equals the channel count, no bias) on the GPU.
depthwise_conv2d = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1, groups=num_channels, bias=False).cuda()
# Random input batch, created on the host and then copied to the GPU.
x = torch.randn([n, num_channels, h, w], requires_grad=False).cuda()

def run():
    """Apply ``depthwise_conv2d`` to ``x`` with gradient tracking disabled.

    Calls ``torch.cuda.synchronize()`` before returning, so a caller timing
    this function measures the finished GPU work rather than just the launch.

    Returns:
        The convolution output tensor.
    """
    with torch.no_grad():
        reference = depthwise_conv2d(x)
        torch.cuda.synchronize()
    return reference

# Reference result from the nn.Conv2d layer, compared against the custom kernel below.
y = run()

def run_my():
    """Apply ``conv2d_forward`` from ``monarch_cuda`` to ``x`` using the weights of ``depthwise_conv2d``.

    Runs with gradient tracking disabled and calls ``torch.cuda.synchronize()``
    before returning, so a caller timing this function measures the finished
    GPU work rather than just the launch.

    Returns:
        The tensor produced by ``conv2d_forward``.
    """
    with torch.no_grad():
        weight = depthwise_conv2d.weight.contiguous()
        # NOTE(review): the trailing 1 is presumably the padding (the layer
        # uses padding=1) — confirm against conv2d_forward's signature.
        result = conv2d_forward(x, weight, 1)
        torch.cuda.synchronize()
    return result

# Output of the custom kernel for the same input and weights.
z = run_my()

# Check that both implementations agree within torch.allclose's default tolerances.
outputs_match = torch.allclose(y, z)
print(outputs_match)

True


In [30]:
# Time the custom kernel; run_my() synchronizes the device before returning.
%timeit -n 1000 run_my()

1.17 ms ± 61.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [31]:
# Time the nn.Conv2d baseline; run() synchronizes the device before returning.
%timeit -n 1000 run()

1.72 ms ± 8.97 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [3]:
# Shapes of the reference output, the custom-kernel output, and the depthwise weight.
# NOTE(review): the saved output for this cell appears to come from an earlier run
# with a smaller configuration than the one set in the benchmark cell — re-run to refresh.
y.shape, z.shape, depthwise_conv2d.weight.shape

(torch.Size([1, 2, 32, 32]),
 torch.Size([1, 2, 32, 32]),
 torch.Size([2, 1, 3, 3]))

In [23]:
# Gradient tensor of the first parameter yielded by model.parameters().
next(iter(model.parameters())).grad.data

tensor([[[[-1.6789e-04, -2.5931e-04, -1.9989e-04],
          [-1.5090e-04, -2.3825e-04, -1.7833e-04],
          [-1.2129e-04, -2.0130e-04, -1.4132e-04]]],


        [[[-1.0133e-04, -9.8267e-05, -1.1907e-04],
          [-1.1414e-04, -1.1293e-04, -1.3485e-04],
          [-1.3128e-04, -1.3289e-04, -1.5608e-04]]],


        [[[-1.8379e-04, -1.7843e-04, -1.6717e-04],
          [-1.5581e-04, -1.5098e-04, -1.3873e-04],
          [-1.5948e-04, -1.5850e-04, -1.4639e-04]]],


        [[[ 6.4354e-05,  6.8785e-05,  8.9805e-05],
          [ 6.5355e-05,  6.5105e-05,  8.5797e-05],
          [ 5.0063e-05,  4.8304e-05,  6.6207e-05]]],


        [[[-1.2251e-04, -6.9850e-05, -5.1950e-05],
          [-1.2614e-04, -7.2298e-05, -4.9813e-05],
          [-5.1994e-05, -1.2274e-06,  1.7884e-05]]],


        [[[-1.0487e-04, -2.2397e-04, -4.0430e-04],
          [-5.0163e-05, -1.5823e-04, -3.2890e-04],
          [-1.5510e-05, -1.2504e-04, -3.0435e-04]]],


        [[[ 8.6366e-05,  8.0099e-05,  9.6377e-05],
       