In [1]:
import os
import shutil
import pickle

import timm
import torch
from torch import cuda
from torchvision import models
from torchvision import transforms
from PIL import Image
import numpy as np
from matplotlib.pyplot import imshow
from timm.data.dataset import Dataset #if you use timm v0.4.5 this should be ImageDataset instead of Dataset
from timm.data.loader import create_loader

# Create model
(by Moritz Jakob, only a minor modification added)

The first step is to load a pretrained model.

A rough description on how to extract features can be found here:
https://rwightman.github.io/pytorch-image-models/feature_extraction/

I used the pooled output with no classifier (`num_classes=0`).

In [2]:
# list the names of the model architectures registered in the installed timm library
timm.list_models()

['adv_inception_v3',
 'cspdarknet53',
 'cspdarknet53_iabn',
 'cspresnet50',
 'cspresnet50d',
 'cspresnet50w',
 'cspresnext50',
 'cspresnext50_iabn',
 'darknet53',
 'densenet121',
 'densenet121d',
 'densenet161',
 'densenet169',
 'densenet201',
 'densenet264',
 'densenet264d_iabn',
 'densenetblur121d',
 'dla34',
 'dla46_c',
 'dla46x_c',
 'dla60',
 'dla60_res2net',
 'dla60_res2next',
 'dla60x',
 'dla60x_c',
 'dla102',
 'dla102x',
 'dla102x2',
 'dla169',
 'dpn68',
 'dpn68b',
 'dpn92',
 'dpn98',
 'dpn107',
 'dpn131',
 'eca_vovnet39b',
 'ecaresnet18',
 'ecaresnet50',
 'ecaresnet50d',
 'ecaresnet50d_pruned',
 'ecaresnet101d',
 'ecaresnet101d_pruned',
 'ecaresnetlight',
 'ecaresnext26tn_32x4d',
 'efficientnet_b0',
 'efficientnet_b1',
 'efficientnet_b1_pruned',
 'efficientnet_b2',
 'efficientnet_b2_pruned',
 'efficientnet_b2a',
 'efficientnet_b3',
 'efficientnet_b3_pruned',
 'efficientnet_b3a',
 'efficientnet_b4',
 'efficientnet_b5',
 'efficientnet_b6',
 'efficientnet_b7',
 'efficientnet_b8',


In [8]:
# Load the pretrained backbone used for feature extraction.
# Two backbones were used in this project: 'tf_efficientnet_b0' (active below) and
# 'resnet34' (replace the model name with 'resnet34' to extract the ResNet features).
model = timm.create_model(
    'tf_efficientnet_b0',
    pretrained=True,
    num_classes=0,  # 0 => the model is built without a classifier
)

In [9]:
# move the model to the gpu and switch it to inference mode:
# eval() makes the BatchNorm layers use their stored running statistics and disables
# dropout, so the extracted features are deterministic and do not depend on the batch.
# Both calls return the module itself, so the cell still displays the model.
model.to("cuda").eval()

EfficientNet(
  (conv_stem): Conv2dSame(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
  (bn1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  (act1): SiLU(inplace=True)
  (blocks): Sequential(
    (0): Sequential(
      (0): DepthwiseSeparableConv(
        (conv_dw): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
        (bn1): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (act1): SiLU(inplace=True)
        (se): SqueezeExcite(
          (conv_reduce): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
          (act1): SiLU(inplace=True)
          (conv_expand): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
        )
        (conv_pw): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn2): BatchNorm2d(16, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
        (act2): Identity()
      )
    )
    (1): Sequential(
      (0

# Extract Features Function
Stores the feature vectors in a nested dictionary.

Note: all images should be the same size (155x155); resize them if they are not!

In [10]:
def extract_features_to_dict(in_path, noun, model, max_images=25, input_size=(3, 155, 155)):
    """
    Extracts feature vectors for the images of one noun with a pretrained model and
    returns them in a nested dictionary.

    The model is run in eval mode and without gradient tracking; the training/eval
    mode it had before the call is restored afterwards.

        Arguments:
            :in_path: the path where all image files are stored; the images of a noun are
                      loaded from the sub-folder os.path.join(in_path, noun)
            :noun: the noun of which image features will be extracted
            :model: the (pretrained) model that will be used to extract features
            :max_images: the maximum number of images to process for this noun (default 25);
                         a value <= 0 yields no features
            :input_size: (channels, height, width) handed to the data loader
                         (default (3, 155, 155))

        Return:
            :out_dict: a nested dictionary where the first dimension key is the noun, second
                       dimension keys are the noun with a running count starting at 1
                       (f'{noun}_{n}'). The second dimension values are lists that represent
                       the extracted features.
    """
    # features of this noun, keyed by '<noun>_<running count>'
    noun_features = {}

    # load in the dataset
    dataset = Dataset(os.path.join(in_path, noun))

    # create dataloader
    # this uses the prefetch loader which only works on GPU enabled machines
    # args: dataset, inputsize (channels, height, width), batchsize
    loader = create_loader(dataset, input_size, 1)

    # Feature extraction is inference: run in eval mode (no batch statistics / dropout)
    # and restore the caller's mode afterwards so the function has no lasting side effect.
    was_training = model.training
    model.eval()
    try:
        # no_grad avoids building an autograd graph for every image
        with torch.no_grad():
            # zipping with the range first stops after max_images images without
            # requesting a further batch from the loader (some nouns have more images)
            for n, batch in zip(range(1, max_images + 1), loader):
                features = model(batch[0])
                # batch size is 1, so take the single row of the output
                noun_features[f'{noun}_{n}'] = features.tolist()[0]
    finally:
        model.train(was_training)

    print(f'Wrote tensors for the images of {noun} into out_dict[{noun}]')

    return {noun: noun_features}

# Call the function

The function is called once for every entry in the input directory, and the returned dictionaries are collected in a list named 'output_dict_MODEL'.

Note that both cells below run exactly the same list comprehension; only the variable name differs. Which cell you execute therefore depends on which model is currently loaded in `model`.

In [None]:
# Root folder of the image collection. It is expected to hold one sub-folder per noun:
# every entry of os.listdir(input_dir) is passed on as `noun`, and the images of that noun
# are loaded from os.path.join(input_dir, noun).
input_dir = "/work/lingai/projects/multimodality/camilla/images_small" 

In [11]:

# One feature dictionary per noun folder (the result is a list of dictionaries).
# sorted() makes the order reproducible, because os.listdir returns entries in arbitrary
# order; the isdir check skips stray files that are not noun folders.
output_dict_efnet = [
    extract_features_to_dict(input_dir, noun, model)
    for noun in sorted(os.listdir(input_dir))
    if os.path.isdir(os.path.join(input_dir, noun))
]

Wrote tensors for the images of aardvark into out_dict[aardvark]
Wrote tensors for the images of abalone into out_dict[abalone]
Wrote tensors for the images of abandon into out_dict[abandon]
Wrote tensors for the images of abandonment into out_dict[abandonment]
Wrote tensors for the images of abbey into out_dict[abbey]
Wrote tensors for the images of abduct into out_dict[abduct]
Wrote tensors for the images of abduction into out_dict[abduction]
Wrote tensors for the images of ability into out_dict[ability]
Wrote tensors for the images of ablaze into out_dict[ablaze]
Wrote tensors for the images of abnormality into out_dict[abnormality]
Wrote tensors for the images of abode into out_dict[abode]
Wrote tensors for the images of abomination into out_dict[abomination]
Wrote tensors for the images of abort into out_dict[abort]
Wrote tensors for the images of abortion into out_dict[abortion]
Wrote tensors for the images of abscess into out_dict[abscess]
Wrote tensors for the images of absolut

In [6]:
# Only run this cell while `model` holds the resnet34 model: the cell uses whatever is
# currently stored in `model`, so running it with the EfficientNet loaded would store
# EfficientNet features under the resnet name.
# One feature dictionary per noun folder (the result is a list of dictionaries).
# sorted() makes the order reproducible, because os.listdir returns entries in arbitrary
# order; the isdir check skips stray files that are not noun folders.
output_dict_resnet = [
    extract_features_to_dict(input_dir, noun, model)
    for noun in sorted(os.listdir(input_dir))
    if os.path.isdir(os.path.join(input_dir, noun))
]

Wrote tensors for the images of aardvark into out_dict[aardvark]
Wrote tensors for the images of abalone into out_dict[abalone]
Wrote tensors for the images of abandon into out_dict[abandon]
Wrote tensors for the images of abandonment into out_dict[abandonment]
Wrote tensors for the images of abbey into out_dict[abbey]
Wrote tensors for the images of abduct into out_dict[abduct]
Wrote tensors for the images of abduction into out_dict[abduction]
Wrote tensors for the images of ability into out_dict[ability]
Wrote tensors for the images of ablaze into out_dict[ablaze]
Wrote tensors for the images of abnormality into out_dict[abnormality]
Wrote tensors for the images of abode into out_dict[abode]
Wrote tensors for the images of abomination into out_dict[abomination]
Wrote tensors for the images of abort into out_dict[abort]
Wrote tensors for the images of abortion into out_dict[abortion]
Wrote tensors for the images of abscess into out_dict[abscess]
Wrote tensors for the images of absolut

# Store in pickles

In [12]:
# Serialise the EfficientNet feature list to disk.
# For the ResNet run, dump `output_dict_resnet` to 'resnet_vectors.pkl' in the same
# folder instead.
effnet_pickle_path = os.path.join(
    '/work/lingai/projects/multimodality/camilla/image_vectors',
    'effnet_vectors.pkl',
)
with open(effnet_pickle_path, 'wb') as outf:
    pickle.dump(output_dict_efnet, outf)