In [2]:
import torch
import pandas as pd
import numpy as np
import torchvision.transforms as transforms
import torchvision.models as models
from tqdm import tqdm
from torch.utils.data import DataLoader
from datasets import load_dataset

In [3]:
# Report the runtime environment, one value per line:
# number of visible CUDA devices, name of CUDA device 0, and the PyTorch version.
print(
    torch.cuda.device_count(),
    torch.cuda.get_device_name(device=0),
    torch.__version__,
    sep='\n',
)

1
NVIDIA A100-SXM4-40GB
1.11.0


## Transforms

In [4]:
# Preprocessing pipeline applied to every image before it is fed to a model:
# resize to 256, center-crop to 224, convert to a tensor, then normalize each
# channel with the given mean / std (the values usually quoted for ImageNet).
standard_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

## Datasets


In [5]:
# Load the 'validation' split of the 'taesiri/imagenet-hard' dataset.
imagenet_hard_dataset = load_dataset(
    path='taesiri/imagenet-hard',
    split='validation',
)
# Bare expression: the notebook renders the dataset summary as the cell output.
imagenet_hard_dataset

Found cached dataset parquet (/home/mohammad/.cache/huggingface/datasets/taesiri___parquet/taesiri--imagenet-hard-124be08d1e33678b/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Dataset({
    features: ['image', 'label'],
    num_rows: 23845
})

In [6]:
def preprocess_batch(batch):
    """Apply `standard_transform` to every image of a batch.

    Each entry of ``batch['image']`` is converted to RGB and passed through
    `standard_transform`; the transformed images replace the original list
    under the same key.

    Args:
        batch: mapping with an ``'image'`` key holding an iterable of images
            that expose ``.convert('RGB')``.

    Returns:
        The same ``batch`` object, modified in place.
    """
    transformed = []
    for raw_image in batch['image']:
        rgb_image = raw_image.convert('RGB')
        transformed.append(standard_transform(rgb_image))
    batch['image'] = transformed
    return batch

# Register the preprocessing as the dataset's transform.
imagenet_hard_dataset.set_transform(preprocess_batch)

In [7]:
# Indices (within ImageNet's 1000 classes) of the classes that are masked out.
false_mask = [
    12, 13, 24, 333, 339, 340, 352, 354, 386, 400, 404, 430, 444, 466,
    510, 527, 630, 668, 746, 779, 802, 890, 916, 919, 954, 981, 984, 985,
]
# Boolean keep-mask of length 1000: False at every index listed in
# `false_mask`, True everywhere else.
mask = ~np.isin(np.arange(1000), false_mask)

In [8]:
# Helpers
def concat(x):
    """Concatenate a sequence of arrays along axis 0 and return the result."""
    return np.concatenate(x, axis=0)


def to_np(x):
    """Move `x`'s underlying data to the CPU and return it as a NumPy array."""
    host_copy = x.data.to('cpu')
    return host_copy.numpy()

## Benchmark

In [9]:
def run_benchmark_masked(model, bs=256):
  """Evaluate `model` on `imagenet_hard_dataset` with masked class logits.

  The model is moved to the GPU and switched to eval mode. For every batch the
  model output is restricted to the columns where the module-level `mask` is
  True, the arg-max over the remaining columns is taken as the prediction, and
  it is compared against the batch labels.

  Args:
    model: a torch module mapping an image batch to per-class scores; its
      output must have one column per entry of `mask`.
    bs: batch size used by the DataLoader. Previously this argument was
      ignored and 256 was always used; the default is now 256 so that calls
      without `bs` behave exactly as before.

  Returns:
    Percentage (0-100) of dataset samples whose prediction equals the label.
  """
  model.cuda()
  model.eval()

  loader = DataLoader(imagenet_hard_dataset, batch_size=bs, num_workers=2)

  correct_ones = 0
  # inference_mode already disables gradient tracking, so no nested
  # torch.no_grad() is required.
  with torch.inference_mode():
    for batch in tqdm(loader):
      images = batch['image'].cuda()
      target = batch['label'].cuda()

      # Keep only the columns of non-masked classes. Predictions are therefore
      # indices into the reduced class set.
      # NOTE(review): assumes the dataset labels use the same reduced indexing
      # -- confirm against the dataset card.
      model_output = model(images)[:, mask]
      pred = model_output.max(dim=1).indices
      correct_ones += pred.eq(target).sum().item()
  return 100 * correct_ones / len(imagenet_hard_dataset)

In [10]:
# Names of the torchvision model constructors to benchmark.
model_names = [
    'resnet50',
    'resnet18',
    'alexnet',
    'vgg19',
    'vit_b_32',
]

In [11]:
# Benchmark result per model, keyed by model name.
accuracy = {}

for name in model_names:
  # Look the constructor up by name in the torchvision.models namespace and
  # build the model with pretrained weights.
  model = vars(models)[name](pretrained=True)
  accuracy[name] = run_benchmark_masked(model)
  print(f'{name} accuracy: {accuracy[name]}')

100%|██████████| 94/94 [02:34<00:00,  1.64s/it]


resnet50 accuracy: 11.52023485007339


100%|██████████| 94/94 [02:31<00:00,  1.61s/it]


resnet18 accuracy: 8.651708953659048


100%|██████████| 94/94 [02:31<00:00,  1.61s/it]


alexnet accuracy: 5.63640176137555


100%|██████████| 94/94 [02:32<00:00,  1.62s/it]


vgg19 accuracy: 8.999790312434472


100%|██████████| 94/94 [02:30<00:00,  1.60s/it]

vit_b_32 accuracy: 16.674355210736003





In [12]:
# One row per model, a single 'accuracy' column, rounded to two decimals.
pd.Series(accuracy, name='accuracy').to_frame().round(2)

Unnamed: 0,accuracy
resnet50,11.52
resnet18,8.65
alexnet,5.64
vgg19,9.0
vit_b_32,16.67
