In [1]:
import torch
from torchvision.io import read_image

import voltron
from voltron import instantiate_extractor, load

In [9]:
# List the model identifiers known to the library; the `load(...)` calls below use ids from this list.
voltron.available_models()

['v-cond',
 'v-dual',
 'v-gen',
 'v-cond-base',
 'r-mvp',
 'r-r3m-vit',
 'r-r3m-rn50']

In [2]:
# Sample image fed to every model in the sections below.
# NOTE(review): absolute, machine-specific path — point this at your own checkout before running.
image_path = "/root/code/BC-IB/third_party/methods/voltron-robotics/examples/verification/img/peel-carrot-initial.png"

### R3M

In [3]:
# Load a frozen R3M model
#   => `load` returns a pair: the model and the preprocessing callable applied to images below.
#   => NOTE(review): `load_path` is an absolute, machine-specific checkpoint location.
load_path = "/root/model/r3m/r3m-small"
r3m_vit_model, r3m_vit_preprocess = load("r-r3m-vit", device="cuda", freeze=True, load_path=load_path)
# Alternative model id (`r-r3m-rn50`) kept for reference; disabled.
# r3m_vit_model, r3m_vit_preprocess = load("r-r3m-rn50", device="cuda", freeze=True, load_path=load_path)
# Trailing expression: its return value (the model) is what the cell displays as the module summary.
r3m_vit_model.to("cuda")

VR3M(
  (patch2embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
  )
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (pre_norm_attn): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (pre_norm_mlp): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Sequential(
        (0): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): GELU(approximate='none')
        )
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=1536, out_features=384, bias=True)
      )
    )
  )
  (norm): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
  (language_reward): Sequential(
    (0): Linear(in_features=1536, out_features=1024, bias=True)
    (1): ReLU()


In [4]:
# Obtain & preprocess an image =>> can be from a dataset, a camera on a robot, etc.
#   => `[None, ...]` prepends a batch dimension before the tensor is moved to the GPU.
#   => No language is used in this section; only the image is passed to the R3M model below.
img = r3m_vit_preprocess(read_image(image_path))[None, ...].to("cuda")
print(img.shape)
# Dumps the tensor values (elided by torch) for inspection; this is verbose output.
print(img)

torch.Size([1, 3, 224, 224])
tensor([[[[ 90., 114., 139.,  ...,  32.,  20.,  15.],
          [116., 152., 132.,  ...,  23.,  21.,  37.],
          [129., 152., 142.,  ...,  41.,  48.,  36.],
          ...,
          [ 10.,  10.,  10.,  ...,  26.,  27.,  23.],
          [ 10.,   9.,   9.,  ...,  19.,  26.,  25.],
          [  9.,   9.,   8.,  ...,  15.,  16.,  25.]],

         [[103., 128., 157.,  ...,  42.,  30.,  23.],
          [129., 164., 151.,  ...,  31.,  29.,  45.],
          [142., 167., 162.,  ...,  49.,  56.,  44.],
          ...,
          [ 10.,  10.,  10.,  ...,  25.,  26.,  21.],
          [  9.,   9.,   9.,  ...,  18.,  25.,  24.],
          [  8.,   8.,   8.,  ...,  15.,  15.,  24.]],

         [[130., 153., 180.,  ...,  38.,  25.,  19.],
          [153., 187., 171.,  ...,  27.,  25.,  41.],
          [163., 187., 179.,  ...,  45.,  52.,  40.],
          ...,
          [  9.,   9.,   8.,  ...,  21.,  22.,  18.],
          [  8.,   8.,   8.,  ...,  15.,  22.,  20.],
    

In [5]:
# Extract R3M embeddings for a batch of two images (the same preprocessed image stacked twice).
#   => The batch is bound to a new name so `img` is left untouched; re-running this cell
#      therefore always yields a batch of two instead of doubling the batch on every run.
img_batch = torch.cat((img, img), dim=0)
r3m_vit_embeddings = r3m_vit_model.get_representations(img_batch)
print(r3m_vit_embeddings.shape)

torch.Size([2, 1, 384])


In [8]:
# Build a vector extractor for the R3M model and move it to the GPU in one step, then
# use it to turn the embeddings into a dense vector representation for downstream applications.
#   => Pass this representation to the model of your choice (object detector, control policy, etc.)
r3m_vit_vector_extractor = instantiate_extractor(r3m_vit_model)().to("cuda")
r3m_vit_representation = r3m_vit_vector_extractor(r3m_vit_embeddings)
print(r3m_vit_representation.shape)

torch.Size([1, 384])


### MVP

In [16]:
# Load a frozen MVP model
#   => Rebinds `load_path` (previously the R3M checkpoint location) to the MVP checkpoint.
#   => `load` returns a pair: the model and the preprocessing callable applied to images below.
load_path = "/root/model/mvp/mvp-small"
mvp_model, mvp_preprocess = load("r-mvp", device="cuda", freeze=True, load_path=load_path)

In [17]:
# Preprocess the same sample image with the MVP transform, prepend a batch dimension, move to the GPU.
#   => Rebinds `img`, replacing the tensor produced in the R3M section.
img = mvp_preprocess(read_image(image_path))[None, ...].to("cuda")
# Dumps the tensor values (elided by torch) for inspection; this is verbose output.
print(img)

tensor([[[[-0.5767, -0.1657,  0.2624,  ..., -1.5699, -1.7754, -1.8610],
          [-0.1314,  0.4851,  0.1426,  ..., -1.7240, -1.7583, -1.4843],
          [ 0.0912,  0.4851,  0.3138,  ..., -1.4158, -1.2959, -1.5014],
          ...,
          [-1.9467, -1.9467, -1.9467,  ..., -1.6727, -1.6555, -1.7240],
          [-1.9467, -1.9638, -1.9638,  ..., -1.7925, -1.6727, -1.6898],
          [-1.9638, -1.9638, -1.9809,  ..., -1.8610, -1.8439, -1.6898]],

         [[-0.2325,  0.2052,  0.7129,  ..., -1.3004, -1.5105, -1.6331],
          [ 0.2227,  0.8354,  0.6078,  ..., -1.4930, -1.5280, -1.2479],
          [ 0.4503,  0.8880,  0.8004,  ..., -1.1779, -1.0553, -1.2654],
          ...,
          [-1.8606, -1.8606, -1.8606,  ..., -1.5980, -1.5805, -1.6681],
          [-1.8782, -1.8782, -1.8782,  ..., -1.7206, -1.5980, -1.6155],
          [-1.8957, -1.8957, -1.8957,  ..., -1.7731, -1.7731, -1.6155]],

         [[ 0.4614,  0.8622,  1.3328,  ..., -1.1421, -1.3687, -1.4733],
          [ 0.8622,  1.4548,  

In [5]:
# Run the MVP model on the preprocessed image (image only, no language) and inspect the embedding shape.
mvp_embeddings = mvp_model.get_representations(img)
print(mvp_embeddings.shape)

torch.Size([1, 196, 384])


In [6]:
# Build a vector extractor for the MVP model, place it on the GPU, and
# reduce the embeddings to a dense vector representation.
mvp_vector_extractor = instantiate_extractor(mvp_model)().to("cuda")
mvp_representation = mvp_vector_extractor(mvp_embeddings)
print(mvp_representation.shape)

torch.Size([1, 384])


### Voltron

In [13]:
# Load a frozen Voltron V-Cond model
#   => Rebinds `load_path` to the V-Cond checkpoint location.
#   => `load` returns a pair: the model and the preprocessing callable applied to images below.
load_path = "/root/model/voltron/v-cond-small"
vcond_model, vcond_preprocess = load("v-cond", device="cuda", freeze=True, load_path=load_path)


In [14]:
# Preprocess the sample image with the V-Cond transform, prepend a batch dimension, move to the GPU.
#   => Rebinds `img`, replacing the tensor produced in the MVP section.
img = vcond_preprocess(read_image(image_path))[None, ...].to("cuda")
# Language input passed alongside the image in "multimodal" mode below.
#   => presumably one string per image in the batch — TODO confirm
lang = ["peeling a carrot"]

In [15]:
# Dumps the preprocessed tensor values (elided by torch) for inspection; this is verbose output.
print(img)

tensor([[[[-0.5767, -0.1657,  0.2624,  ..., -1.5699, -1.7754, -1.8610],
          [-0.1314,  0.4851,  0.1426,  ..., -1.7240, -1.7583, -1.4843],
          [ 0.0912,  0.4851,  0.3138,  ..., -1.4158, -1.2959, -1.5014],
          ...,
          [-1.9467, -1.9467, -1.9467,  ..., -1.6727, -1.6555, -1.7240],
          [-1.9467, -1.9638, -1.9638,  ..., -1.7925, -1.6727, -1.6898],
          [-1.9638, -1.9638, -1.9809,  ..., -1.8610, -1.8439, -1.6898]],

         [[-0.2325,  0.2052,  0.7129,  ..., -1.3004, -1.5105, -1.6331],
          [ 0.2227,  0.8354,  0.6078,  ..., -1.4930, -1.5280, -1.2479],
          [ 0.4503,  0.8880,  0.8004,  ..., -1.1779, -1.0553, -1.2654],
          ...,
          [-1.8606, -1.8606, -1.8606,  ..., -1.5980, -1.5805, -1.6681],
          [-1.8782, -1.8782, -1.8782,  ..., -1.7206, -1.5980, -1.6155],
          [-1.8957, -1.8957, -1.8957,  ..., -1.7731, -1.7731, -1.6155]],

         [[ 0.4614,  0.8622,  1.3328,  ..., -1.1421, -1.3687, -1.4733],
          [ 0.8622,  1.4548,  

In [7]:
# NOTE(review): the model was loaded with `device="cuda"`, so this `.to("cuda")` looks redundant — confirm.
vcond_model.to("cuda")
# "multimodal" mode: the image and the language list are passed together.
multimodal_embeddings = vcond_model.get_representations(img, lang, mode="multimodal")
# "visual" mode: only the image is passed.
visual_embeddings = vcond_model.get_representations(img, mode="visual")
# Compare the shapes of the two embedding tensors.
print('multimodal: ', multimodal_embeddings.shape)
print('visual: ', visual_embeddings.shape)

multimodal:  torch.Size([1, 216, 384])
visual:  torch.Size([1, 196, 384])


In [13]:
# Build a vector extractor for the V-Cond model and place it on the GPU; the same
# extractor instance is then applied to both the multimodal and the visual embeddings.
vcond_vector_extractor = instantiate_extractor(vcond_model)().to("cuda")
multimodal_representation = vcond_vector_extractor(multimodal_embeddings)
visual_representation = vcond_vector_extractor(visual_embeddings)
# Report the shape of each dense representation (multimodal first, then visual).
print(multimodal_representation.shape)
print(visual_representation.shape)

torch.Size([1, 384])
torch.Size([1, 384])
