In [1]:
import sys
sys.path.append('/NDDepth/src')

In [2]:
from transformers import Swinv2Config, UperNetConfig, UperNetForSemanticSegmentation, AutoImageProcessor
from PIL import Image
from torch import nn
import requests
import matplotlib.pyplot as plt
from newcrf_layers import NewCRF

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

In [3]:
version = "large07"

if version[:-2] == 'base':
    embed_dim = 128
    depths = [2, 2, 18, 2]
    num_heads = [4, 8, 16, 32]
    in_channels = [128, 256, 512, 1024]
elif version[:-2] == 'large':
    embed_dim = 192
    depths = [2, 2, 18, 2]
    num_heads = [6, 12, 24, 48]
    in_channels = [192, 384, 768, 1536]
elif version[:-2] == 'tiny':
    embed_dim = 96
    depths = [2, 2, 6, 2]
    num_heads = [3, 6, 12, 24]
    in_channels = [96, 192, 384, 768]
    
win = 7
crf_dims = [128, 256, 512, 1024]
v_dims = [64, 128, 256, 512]

In [4]:
processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-tiny")

backbone_config = Swinv2Config(
                embed_dim=embed_dim,
                depths=depths,
                num_heads=num_heads,
                out_features=["stage1", "stage2", "stage3", "stage4"]
                )
config = UperNetConfig(backbone_config=backbone_config)
model = UperNetForSemanticSegmentation(config)
model.eval() # Necessary to allow evaluation as some layer requies mean??

UperNetForSemanticSegmentation(
  (backbone): Swinv2Backbone(
    (embeddings): Swinv2Embeddings(
      (patch_embeddings): Swinv2PatchEmbeddings(
        (projection): Conv2d(3, 192, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): Swinv2Encoder(
      (layers): ModuleList(
        (0): Swinv2Stage(
          (blocks): ModuleList(
            (0-1): 2 x Swinv2Layer(
              (attention): Swinv2Attention(
                (self): Swinv2SelfAttention(
                  (continuous_position_bias_mlp): Sequential(
                    (0): Linear(in_features=2, out_features=512, bias=True)
                    (1): ReLU(inplace=True)
                    (2): Linear(in_features=512, out_features=6, bias=False)
                  )
                  (query): Linear(in_features=192, out_features=192, bias=True)
                  (key): Linear(in_features=192, out

In [5]:
inputs = processor(images=image, return_tensors="pt")
outputs = model.backbone.forward_with_filtered_kwargs(**inputs)
features = outputs.feature_maps
for i in range(len(features)):
    print(features[i].shape)
logits = model.decode_head(features)
logits = nn.functional.interpolate(logits, size=inputs.pixel_values.shape[2:], mode="bilinear", align_corners=False)
print(logits.shape)

torch.Size([1, 192, 128, 128])
torch.Size([1, 384, 64, 64])
torch.Size([1, 768, 32, 32])
torch.Size([1, 1536, 16, 16])
torch.Size([1, 2, 512, 512])


In [6]:
crf3 = NewCRF(input_dim=in_channels[3], embed_dim=crf_dims[3], window_size=win, v_dim=v_dims[3], num_heads=32)
crf2 = NewCRF(input_dim=in_channels[2], embed_dim=crf_dims[2], window_size=win, v_dim=v_dims[2], num_heads=16)
crf1 = NewCRF(input_dim=in_channels[1], embed_dim=crf_dims[1], window_size=win, v_dim=v_dims[1], num_heads=8)
crf0 = NewCRF(input_dim=in_channels[0], embed_dim=crf_dims[0], window_size=win, v_dim=v_dims[0], num_heads=4)

In [7]:
e3 = crf3(features[3], logits) # DX: This is the GRU tuning process
e3 = nn.PixelShuffle(2)(e3)
e2 = crf2(features[2], e3)
e2 = nn.PixelShuffle(2)(e2)
e1 = crf1(features[1], e2)
e1 = nn.PixelShuffle(2)(e1)
e0 = crf0(features[0], e1)

RuntimeError: Given groups=1, weight of size [1024, 512, 3, 3], expected input[1, 2, 512, 512] to have 512 channels, but got 2 channels instead