In [1]:
from transformers import CvtConfig, CvtModel

# Initializing a Cvt msft/cvt style configuration
configuration = CvtConfig()

# Initializing a model (with random weights) from the msft/cvt style configuration
model = CvtModel(configuration)

# Accessing the model configuration
configuration = model.config

In [2]:
configuration

CvtConfig {
  "attention_drop_rate": [
    0.0,
    0.0,
    0.0
  ],
  "cls_token": [
    false,
    false,
    true
  ],
  "depth": [
    1,
    2,
    10
  ],
  "drop_path_rate": [
    0.0,
    0.0,
    0.1
  ],
  "drop_rate": [
    0.0,
    0.0,
    0.0
  ],
  "embed_dim": [
    64,
    192,
    384
  ],
  "initializer_range": 0.02,
  "kernel_qkv": [
    3,
    3,
    3
  ],
  "layer_norm_eps": 1e-12,
  "mlp_ratio": [
    4.0,
    4.0,
    4.0
  ],
  "model_type": "cvt",
  "num_channels": 3,
  "num_heads": [
    1,
    3,
    6
  ],
  "padding_kv": [
    1,
    1,
    1
  ],
  "padding_q": [
    1,
    1,
    1
  ],
  "patch_padding": [
    2,
    1,
    1
  ],
  "patch_sizes": [
    7,
    3,
    3
  ],
  "patch_stride": [
    4,
    2,
    2
  ],
  "qkv_bias": [
    true,
    true,
    true
  ],
  "qkv_projection_method": [
    "dw_bn",
    "dw_bn",
    "dw_bn"
  ],
  "stride_kv": [
    2,
    2,
    2
  ],
  "stride_q": [
    1,
    1,
    1
  ],
  "transformers_version": "4.26.

In [45]:
from transformers import AutoImageProcessor, CvtModel
import torch
from datasets import load_dataset
from PIL import Image


image = Image.open("ChestX-ray14/images/00000001_000.png").convert('RGB')

image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
model = CvtModel.from_pretrained("microsoft/cvt-13")

inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of the model checkpoint at microsoft/cvt-13 were not used when initializing CvtModel: ['layernorm.bias', 'classifier.bias', 'classifier.weight', 'layernorm.weight']
- This IS expected if you are initializing CvtModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CvtModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[1, 384, 14, 14]

In [12]:
inputs["pixel_values"].shape

torch.Size([1, 3, 224, 224])

In [13]:
from transformers import AutoImageProcessor, CvtForImageClassification
import torch
from datasets import load_dataset

image = Image.open("ChestX-ray14/images/00000001_000.png").convert('RGB')

image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
model = CvtForImageClassification.from_pretrained("microsoft/cvt-13")

inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# model predicts one of the 1000 ImageNet classes
# predicted_label = logits.argmax(-1).item()
# print(model.config.id2label[predicted_label])

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [15]:
logits.shape

torch.Size([1, 1000])

In [18]:
from torchsummary import summary

summary(model, (3, 224, 224))

Layer (type:depth-idx)                   Output Shape              Param #
├─CvtModel: 1-1                          [[-1, 1, 384]]            --
|    └─CvtEncoder: 2-1                   [[-1, 1, 384]]            --
├─LayerNorm: 1-2                         [-1, 1, 384]              768
├─Linear: 1-3                            [-1, 1000]                385,000
Total params: 385,768
Trainable params: 385,768
Non-trainable params: 0
Total mult-adds (M): 59.03
Input size (MB): 0.57
Forward/backward pass size (MB): 0.01
Params size (MB): 1.47
Estimated Total Size (MB): 2.06


Layer (type:depth-idx)                   Output Shape              Param #
├─CvtModel: 1-1                          [[-1, 1, 384]]            --
|    └─CvtEncoder: 2-1                   [[-1, 1, 384]]            --
├─LayerNorm: 1-2                         [-1, 1, 384]              768
├─Linear: 1-3                            [-1, 1000]                385,000
Total params: 385,768
Trainable params: 385,768
Non-trainable params: 0
Total mult-adds (M): 59.03
Input size (MB): 0.57
Forward/backward pass size (MB): 0.01
Params size (MB): 1.47
Estimated Total Size (MB): 2.06

In [20]:
x = torch.randn(4, 4)
x.size()
y = x.view(16)
y.size()
z = x.view(-1, 8)  # the size -1 is inferred from other dimensions
z.size()

a = torch.randn(1, 2, 3, 4)
a.size()
b = a.transpose(1, 2)  # Swaps 2nd and 3rd dimension
b.size()
c = a.view(1, 3, 2, 4)  # Does not change tensor layout in memory
c.size()
torch.equal(b, c)

False

In [21]:
x = torch.randn(128, 3, 28, 28)

x1 = x.view(128, 3, 784).permute(0, 2, 1)

x1.shape

torch.Size([128, 784, 3])

In [23]:
bs, hs, c = x.shape

ValueError: too many values to unpack (expected 3)

In [36]:
x1.shape

torch.Size([128, 784, 3])

In [37]:
x1.permute(0,2,1).view(128, 3, 28, 28).shape

torch.Size([128, 3, 28, 28])

In [48]:
import torch.nn as nn

In [49]:
layernorm = nn.LayerNorm(382)

In [52]:
encoder_output = torch.randn(128, 382, 28, 28)

batch_size, num_channels, height, width = encoder_output.shape
print(f"encoder_output shape: {encoder_output.shape}")

# rearrange "b c h w -> b (h w) c"
encoder_output = encoder_output.view(batch_size, num_channels, height * width).permute(0, 2, 1)
print(f"encoder_output shape: {encoder_output.shape}")

encoder_output = layernorm(encoder_output)
print(f"encoder_output shape: {encoder_output.shape}")


encoder_output_mean = encoder_output.mean(dim=1)
print(f"encoder_output_mean shape: {encoder_output_mean.shape}")


encoder_output shape: torch.Size([128, 382, 28, 28])
encoder_output shape: torch.Size([128, 784, 382])
encoder_output shape: torch.Size([128, 784, 382])
encoder_output_mean shape: torch.Size([128, 382])


In [None]:
import torchvision.transforms as transforms

In [150]:
# import modeling_xvt
import modeling_lightweight_xvt
import configuration_xvt
import importlib
# importlib.reload(modeling_xvt)
importlib.reload(modeling_lightweight_xvt)
importlib.reload(configuration_xvt)

# from modeling_xvt import XvtForImageClassification
from modeling_lightweight_xvt import XvtForImageClassification
from configuration_xvt import XvtConfig

In [151]:
config = XvtConfig()
model = XvtForImageClassification(config)

In [152]:
pixel_values = torch.randn(1, 3, 224, 224)

with torch.no_grad():
    logits = model(pixel_values=pixel_values)

In [153]:
logits.shape

torch.Size([1, 14])

In [154]:
from torchsummary import summary

summary(model, (3, 224, 224))

Layer (type:depth-idx)                        Output Shape              Param #
├─XvtConvEmbeddings: 1-1                      [-1, 32, 56, 56]          --
|    └─Conv2d: 2-1                            [-1, 32, 56, 56]          4,736
|    └─LayerNorm: 2-2                         [-1, 3136, 32]            64
├─XvtEncoder: 1-2                             [-1, 32, 56, 56]          --
|    └─Sequential: 2                          []                        --
|    |    └─XvtLayer: 3-1                     [-1, 3136, 32]            13,760
├─LayerNorm: 1-3                              [-1, 3136, 32]            64
├─Linear: 1-4                                 [-1, 14]                  462
Total params: 19,086
Trainable params: 19,086
Non-trainable params: 0
Total mult-adds (M): 14.80
Input size (MB): 0.57
Forward/backward pass size (MB): 3.83
Params size (MB): 0.07
Estimated Total Size (MB): 4.48


Layer (type:depth-idx)                        Output Shape              Param #
├─XvtConvEmbeddings: 1-1                      [-1, 32, 56, 56]          --
|    └─Conv2d: 2-1                            [-1, 32, 56, 56]          4,736
|    └─LayerNorm: 2-2                         [-1, 3136, 32]            64
├─XvtEncoder: 1-2                             [-1, 32, 56, 56]          --
|    └─Sequential: 2                          []                        --
|    |    └─XvtLayer: 3-1                     [-1, 3136, 32]            13,760
├─LayerNorm: 1-3                              [-1, 3136, 32]            64
├─Linear: 1-4                                 [-1, 14]                  462
Total params: 19,086
Trainable params: 19,086
Non-trainable params: 0
Total mult-adds (M): 14.80
Input size (MB): 0.57
Forward/backward pass size (MB): 3.83
Params size (MB): 0.07
Estimated Total Size (MB): 4.48

In [144]:
m = nn.Linear(20, 30)
input = torch.randn(1, 128, 20)
output = m(input)
print(output.size())

torch.Size([1, 128, 30])


In [155]:
args = {"one": 1, "two":2}

In [156]:
args.two

AttributeError: 'dict' object has no attribute 'two'

In [157]:
from configuration_xvt import XvtScheduler

ImportError: cannot import name 'XvtScheduler' from 'configuration_xvt' (/Users/vernontoh/SUTD/Term6/Theory and Practice of Deep Learning/ChestXRay/configuration_xvt.py)