In [None]:
import os

import cv2
import torch
import numpy as np

from transformers import ViTFeatureExtractor, ViTModel
from torchvision.models.feature_extraction import create_feature_extractor
from torchvision.models import efficientnet_b5
from torchvision.transforms.functional import InterpolationMode
from torchvision import transforms

In [None]:
# Pick the compute device: CUDA when torch reports a usable GPU, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Downlaod a sample image from the COCO dataset
!wget http://images.cocodataset.org/val2017/000000439715.jpg -q -O input.jpg
img = cv2.imread("input.jpg")

In [None]:
# Load feature extractors

# ViT: the image preprocessor and the backbone are loaded from the same checkpoint
VIT_CHECKPOINT = 'google/vit-base-patch32-224-in21k'
feature_extractor_VIT = ViTFeatureExtractor.from_pretrained(VIT_CHECKPOINT)
# .eval() returns the module itself, so it can be chained after .to(device)
model_vit = ViTModel.from_pretrained(VIT_CHECKPOINT).to(device).eval()

# EfficientNet-B5: tensor conversion -> bicubic resize -> center crop -> normalization
EFFNET_INPUT_SIZE = 456
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize(EFFNET_INPUT_SIZE, InterpolationMode.BICUBIC),
    transforms.CenterCrop(EFFNET_INPUT_SIZE),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
])

model_effnet = efficientnet_b5(pretrained=True).to(device).eval()
# Expose the output of the "flatten" node under the key "flatten"
return_nodes = {"flatten": "flatten"}
feature_extractor_EFF = create_feature_extractor(model_effnet, return_nodes=return_nodes)

In [None]:
# Extract features from ViT model
inputs = feature_extractor_VIT(images=[img], return_tensors="pt")
# Inference only: disable autograd so no computation graph is built or retained
with torch.no_grad():
    outputs = model_vit(**inputs.to(device))
# NOTE: despite its name, this variable holds the model's pooler_output,
# not the per-token last hidden state; the name is kept for compatibility
last_hidden_states = outputs.pooler_output
_VIT = last_hidden_states.detach().cpu().numpy()

In [None]:
# Extract features from EfficientNet model
# preprocess() yields a single 3-D image tensor; unsqueeze(0) prepends the batch axis
effnet_input = preprocess(img).unsqueeze(0).to(device)
# Inference only: disable autograd so no computation graph is built or retained
with torch.no_grad():
    res = feature_extractor_EFF(effnet_input)["flatten"]
_EFF = res.detach().cpu().numpy()

In [None]:
# Sanity-check the extracted features: each array must have exactly the expected shape
EXPECTED_EFF_SHAPE = (1, 2048)
EXPECTED_VIT_SHAPE = (1, 768)
assert _EFF.shape == EXPECTED_EFF_SHAPE
assert _VIT.shape == EXPECTED_VIT_SHAPE

In [None]:
# Remove the downloaded file
# Pure-Python removal works on any OS; the existence check keeps re-running
# this cell harmless once the file is already gone
if os.path.exists("input.jpg"):
    os.remove("input.jpg")