EVA
===

**EVA: Exploring the Limits of Masked Visual Representation Learning at Scale**

 * Paper: https://arxiv.org/abs/2211.07636

![EVA Overview](../assets/eva_overview.png)

In [1]:
from PIL import Image
import timm
import torch


# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the pretrained EVA classifier, then put it in eval mode on `device`.
# Both `eval()` and `to()` return the module itself, so the calls can be chained.
model = timm.create_model(
    'eva_large_patch14_196.in22k_ft_in22k_in1k',
    pretrained=True,
)
model = model.eval().to(device)

# Inference-time preprocessing built from the model's own data config
# (normalization, resize).
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(is_training=False, **data_config)

In [3]:
# Sample image to classify (path is relative to this notebook).
image_path = "../samples/bicycle.jpg"

# Normalize the image to 3-channel RGB before handing it to the transform.
img = Image.open(image_path).convert("RGB")

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    # unsqueeze(0) adds the leading batch dimension (a batch of one image).
    inputs = transforms(img).unsqueeze(0).to(device)
    output = model(inputs)

# softmax over the class dimension turns the raw scores into probabilities;
# the * 100 expresses them as percentages. These are NOT logits.
top5_probs, top5_class_ids = torch.topk(
    output.softmax(dim=1) * 100, k=5
)
# Legacy alias: this value used to be called `top5_logits`. Kept so any cell
# that still reads the old name continues to work.
top5_logits = top5_probs

print(top5_probs)
print(top5_class_ids)

tensor([[12.8416, 10.4233,  5.0744,  4.5568,  4.2718]], device='cuda:0')
tensor([[665, 695, 444, 637, 671]], device='cuda:0')


In [6]:
import urllib.request
from pathlib import Path

# ImageNet-1k class names, one label per line, hosted in the pytorch/hub
# GitHub repository.
url = "https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt"
filename = "imagenet_classes.txt"

# Download only when there is no local copy, so re-running the notebook does
# not hit the network again. Delete the file to force a fresh download.
if not Path(filename).exists():
    urllib.request.urlretrieve(url, filename)

# Keep every line, in file order: the list is indexed by class id further
# down, so dropping or reordering lines would shift the labels.
with open(filename, "r", encoding="utf-8") as f:
    imagenet_classes = [line.strip() for line in f]


In [7]:
# Map the top-5 class ids to their human-readable ImageNet labels.
# `.tolist()` converts the ids to plain Python ints in one step, instead of
# iterating the tensor and indexing the list with 0-d tensors one at a time.
top5_labels = [
    imagenet_classes[class_id] for class_id in top5_class_ids[0].tolist()
]
top5_labels

['moped', 'padlock', 'bicycle-built-for-two', 'mailbox', 'mountain bike']