# Imports

In [2]:
from PIL import Image
import datetime

import torch
import torchvision
import torchvision.transforms as transforms
from torch import nn

from torchsummary import summary
from timm.models import create_model
import timm

from script.tool import *

from onnxruntime import InferenceSession
from transformers import AutoFeatureExtractor

# Initial setup

In [3]:
# Preprocessing constants: target resolution and per-channel normalisation statistics.
IMG_SIZE = (224, 224)
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Resize -> tensor -> normalise, composed into a single callable for PIL images.
trans = transforms.Compose(
    [
        transforms.Resize(IMG_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
    ]
)

In [4]:
# Passed to filter_data as the minimum number of images a class must have.
n_cv = 5
path_dataset = '/home/music/Desktop/measure_model/data/product'
# Fall back to the CPU so the notebook still runs on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
df = scan_directory(path_dataset)
df_pd, index_less_than_n, index_greater_than_or_equal_to_n = filter_data(df, minimum_data_class=n_cv)
### load image and transfer to tensor
# Force 3-channel RGB: the pipeline normalises with three per-channel statistics,
# so a grayscale / RGBA / CMYK file would otherwise fail. The context manager
# releases the file handle as soon as the pixels are decoded.
with Image.open("/home/music/Desktop/measure_model/data/image_net/n01514668_cock.JPEG") as img_file:
    img = img_file.convert("RGB")
img_tensor = trans(img).to(device).unsqueeze(0)

amount of all image : 15524
amount of image that less than 5 in that class : 116
amount of image that more than 5 in that class : 15408


# Module

## Timm

In [7]:
def report_model(model, img, device='cuda:0', report=True):
    """Run one image through a timm model and optionally print shape and latency.

    The image is preprocessed with the transform timm resolves for ``model``
    (``resolve_model_data_config`` + ``create_transform``). Only the forward
    pass is timed; preprocessing and the host-to-device copy happen before the
    clock starts.

    Args:
        model: timm model. It is NOT moved here, so the caller must already
            have placed it on ``device``.
        img: image accepted by the timm eval transform (the notebook passes a
            PIL image).
        device: device the input batch is moved to (str or ``torch.device``).
        report: when True, print the model class name, output shape and runtime.

    Returns:
        The model output for the single-image batch, computed without
        gradient tracking.
    """
    data_config = timm.data.resolve_model_data_config(model)
    # Named ``preprocess`` so the torchvision ``transforms`` module is not shadowed.
    preprocess = timm.data.create_transform(**data_config, is_training=False)
    batch = preprocess(img).unsqueeze(0).to(device)
    on_cuda = torch.device(device).type == "cuda"

    # no_grad: inference only, so avoid building an autograd graph.
    with torch.no_grad():
        # CUDA kernels launch asynchronously; synchronise on both sides of the
        # forward pass so the wall-clock delta covers the actual computation.
        if on_cuda:
            torch.cuda.synchronize(device)
        start_time_torch = datetime.datetime.now()
        output = model(batch)
        if on_cuda:
            torch.cuda.synchronize(device)
        delta_time_torch = datetime.datetime.now() - start_time_torch

    if report:
        # total_seconds() keeps whole seconds; ``.microseconds`` alone is only
        # the sub-second component and under-reports any run longer than 1 s.
        runtime_ms = round(delta_time_torch.total_seconds() * 1000, 3)
        print("model :", model.__class__.__name__)
        print("cut model")
        print(f"Output shape at layer : {output.shape}")
        print("runtime :", runtime_ms, "ms")
    return output

In [7]:
def convert_feature_timm(model, device='cuda:0', image_paths=None):
    """Extract one standardized feature row per image with a timm model.

    Args:
        model: timm model; it is switched to eval mode and moved to ``device``.
        device: device used for the model and the input batches.
        image_paths: iterable of image file paths. Defaults to the notebook
            global ``df_pd['path_img']`` so existing calls keep working.

    Returns:
        ``numpy.ndarray`` built by concatenating the per-image rows along
        axis 0 (one row per image, assuming ``standardize_feature`` keeps the
        ``(1, D)`` shape -- TODO confirm). An empty ``(0, 0)`` array is
        returned when there are no images.
    """
    if image_paths is None:
        image_paths = df_pd['path_img']

    model.eval().to(device)
    data_config = timm.data.resolve_model_data_config(model)
    # Named ``preprocess`` so the torchvision ``transforms`` module is not shadowed.
    preprocess = timm.data.create_transform(**data_config, is_training=False)

    rows = []
    # no_grad: pure inference, so no autograd graph is kept per image.
    with torch.no_grad():
        for img_path in tqdm(image_paths):
            with Image.open(img_path) as img_file:
                img = img_file.convert('RGB')
            outputs = model(preprocess(img).to(device).unsqueeze(0))
            output = outputs.flatten().unsqueeze(0)
            rows.append(standardize_feature(output).to('cpu').detach().numpy())

    if not rows:
        return np.empty((0, 0), dtype=np.float32)
    # Concatenate once at the end; concatenating inside the loop re-copies the
    # whole accumulated array on every iteration (quadratic in dataset size).
    return np.concatenate(rows)

## efficientnet

In [31]:
# EfficientNet-B1 backbone; num_classes=0 requests the model without a
# classification head, so the forward pass yields pooled features.
efficientnet_b1 = create_model("efficientnet_b1", pretrained=True, num_classes=0)
efficientnet_b1.to(device).eval()
# summary(efficientnet_b1, (3, 224, 224))
output = report_model(efficientnet_b1, img, device=device)

model : EfficientNet
cut model
Output shape at layer : torch.Size([1, 1280])
runtime : 52.003 ms


In [61]:
# Embed every dataset image with EfficientNet-B1, then hand the features and
# their class labels to save_feature.
x_trans = convert_feature_timm(efficientnet_b1, device=device)
save_feature(x_trans, df_pd['classes'], name="efficientnet_b1")

  0%|          | 0/15524 [00:00<?, ?it/s]

  0%|          | 4/15524 [00:00<09:57, 25.98it/s]


In [34]:
# EfficientNet-B5 backbone without a classification head (num_classes=0).
efficientnet_b5 = create_model("efficientnet_b5", pretrained=True, num_classes=0)
efficientnet_b5.to(device).eval()
# summary(efficientnet_b5, (3, 224, 224))
output = report_model(efficientnet_b5, img, device=device)

model : EfficientNet
cut model
Output shape at layer : torch.Size([1, 2048])
runtime : 542.966 ms


## efficientformer

In [22]:
# EfficientFormer-L3 backbone without a classification head (num_classes=0).
efficientformer_l3 = create_model(
        "efficientformer_l3",
        num_classes=0,
        pretrained=True,
    )
# The model must live on the same device report_model moves the input batch to;
# placing it on 'cpu' while passing device=device (CUDA) is a device mismatch.
efficientformer_l3.eval().to(device)
# summary(efficientformer_l3, (3, 224, 224))
output = report_model(efficientformer_l3, img, device=device)

model : EfficientFormer
cut model
Output shape at layer : torch.Size([1, 512])
runtime : 138.976 ms


In [66]:
# EfficientFormer-L1 backbone without a classification head (num_classes=0).
efficientformer_l1 = create_model(
        "efficientformer_l1",
        num_classes=0,
        pretrained=True,
    )
efficientformer_l1.eval().to(device)
# summary(efficientformer_l1, (3, 224, 224))
# Pass the device explicitly: report_model defaults to 'cuda:0', which only
# matches the model's placement while `device` happens to be cuda:0.
output = report_model(efficientformer_l1, img, device=device)

model : EfficientFormer
cut model
Output shape at layer : torch.Size([1, 448])
runtime : 23.688 ms


In [67]:
# EfficientFormerV2-L backbone without a classification head (num_classes=0).
efficientformerv2_l = create_model(
        "efficientformerv2_l",
        num_classes=0,
        pretrained=True,
    )
efficientformerv2_l.eval().to(device)
# summary(efficientformerv2_l, (3, 224, 224))
# Pass the device explicitly: report_model defaults to 'cuda:0', which only
# matches the model's placement while `device` happens to be cuda:0.
output = report_model(efficientformerv2_l, img, device=device)

model : EfficientFormerV2
cut model
Output shape at layer : torch.Size([1, 384])
runtime : 86.324 ms


## Transformer

In [54]:
def convert_feature_transformer(model, processor, layer, row=False, device='cuda:0', image_paths=None):
    """Extract one standardized feature row per image with a Hugging Face model.

    Args:
        model: model called as ``model(**inputs)``; it is switched to eval
            mode and moved to ``device``.
        processor: image processor called with ``images=`` and
            ``return_tensors="pt"``.
        layer: key of the model output to use (e.g. ``"last_hidden_state"``).
        row: ``False`` (or ``None``) keeps the whole ``outputs[layer]`` tensor;
            any other value selects ``outputs[layer][:, row]``. Note that
            ``row=0`` is a valid index and is NOT treated as "no selection".
        device: device used for the model and the inputs.
        image_paths: iterable of image file paths. Defaults to the notebook
            global ``df_pd['path_img']`` so existing calls keep working.

    Returns:
        ``numpy.ndarray`` built by concatenating the per-image rows along
        axis 0 (one row per image, assuming ``standardize_feature`` keeps the
        ``(1, D)`` shape -- TODO confirm). An empty ``(0, 0)`` array is
        returned when there are no images.
    """
    if image_paths is None:
        image_paths = df_pd['path_img']

    model.eval().to(device)
    # Identity checks distinguish the "no selection" sentinel from index 0.
    keep_all = row is False or row is None

    rows = []
    # no_grad: pure inference, so no autograd graph is kept per image.
    with torch.no_grad():
        for img_path in tqdm(image_paths):
            with Image.open(img_path) as img_file:
                img = img_file.convert('RGB')
            inputs = processor(images=img, return_tensors="pt").to(device)
            outputs = model(**inputs)
            output = outputs[layer] if keep_all else outputs[layer][:, row]
            output = output.flatten().unsqueeze(0)
            rows.append(standardize_feature(output).to('cpu').detach().numpy())

    if not rows:
        return np.empty((0, 0), dtype=np.float32)
    # Concatenate once at the end; concatenating inside the loop re-copies the
    # whole accumulated array on every iteration (quadratic in dataset size).
    return np.concatenate(rows)

In [28]:
from transformers import ViTImageProcessor, ViTModel
vit_gg = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
processor_vit_gg = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
vit_gg.eval().to(device)
inputs = processor_vit_gg(images=img, return_tensors="pt").to(device)
_vit_on_cuda = torch.device(device).type == "cuda"
# no_grad: inference only. CUDA kernels launch asynchronously, so synchronise
# on both sides of the forward pass to time the actual computation.
with torch.no_grad():
    if _vit_on_cuda:
        torch.cuda.synchronize(device)
    start_time_torch = datetime.datetime.now()
    outputs = vit_gg(**inputs)
    if _vit_on_cuda:
        torch.cuda.synchronize(device)
    delta_time_torch = datetime.datetime.now() - start_time_torch
# total_seconds() keeps whole seconds; ``.microseconds`` alone is only the
# sub-second component and under-reports any run longer than 1 s.
print("runtime :", round(delta_time_torch.total_seconds() * 1000, 3), "ms")
print(f"outputs layers : {outputs.keys()}")
print(f"shape last_hidden_state : {outputs.last_hidden_state.shape}")
print(f"shape pooler_output : {outputs.pooler_output.shape}")

runtime : 182.445 ms
outputs layers : odict_keys(['last_hidden_state', 'pooler_output'])
shape last_hidden_state : torch.Size([1, 197, 768])
shape pooler_output : torch.Size([1, 768])


In [None]:
# Embed every dataset image with the ViT; row=0 keeps index 0 on the second
# axis of last_hidden_state (the first token position).
x_trans = convert_feature_transformer(
    vit_gg,
    processor_vit_gg,
    layer="last_hidden_state",
    row=0,
    device=device,
)
save_feature(
    x_trans,
    df_pd['classes'],
    name="vit_base_patch16_224_in21k_last_hidden_state",
)

## Transformer (ONNX)

In [5]:
def convert_feature_transformer_onnx(model, processor, layer, row=False, device='cuda:0', image_paths=None):
    """Extract one standardized feature row per image with an ONNX Runtime session.

    Args:
        model: object exposing ``run(output_names=..., input_feed=...)``
            (the notebook passes an ``InferenceSession``).
        processor: image processor called with ``images=`` and
            ``return_tensors="np"``.
        layer: name of the session output to fetch.
        row: ``False`` (or ``None``) keeps the first batch element of the
            output (``outputs[0]``); any other value selects
            ``outputs[:, row]``. ``row=0`` is a valid index and is NOT treated
            as "no selection".
        device: unused; kept so the signature matches
            ``convert_feature_transformer``.
        image_paths: iterable of image file paths. Defaults to the notebook
            global ``df_pd['path_img']`` so existing calls keep working.

    Returns:
        ``numpy.ndarray`` built by concatenating the per-image rows along
        axis 0 (one row per image, assuming ``standardize_feature`` keeps the
        ``(1, D)`` shape -- TODO confirm). An empty ``(0, 0)`` array is
        returned when there are no images.
    """
    if image_paths is None:
        image_paths = df_pd['path_img']

    # Identity checks distinguish the "no selection" sentinel from index 0.
    keep_all = row is False or row is None

    rows = []
    for img_path in tqdm(image_paths):
        with Image.open(img_path) as img_file:
            img = img_file.convert('RGB')
        inputs = processor(images=img, return_tensors="np")
        # run() returns one array per requested output name; only one is requested.
        outputs = model.run(output_names=[layer], input_feed=dict(inputs))[0]
        output = outputs[0] if keep_all else outputs[:, row]
        output = output.flatten().reshape(1, -1)
        rows.append(standardize_feature(output))

    if not rows:
        return np.empty((0, 0), dtype=np.float32)
    # Concatenate once at the end; concatenating inside the loop re-copies the
    # whole accumulated array on every iteration (quadratic in dataset size).
    return np.concatenate(rows)

In [None]:
vit_gg_onnx_path = "/home/music/Desktop/measure_model/models/vit_gg/model.onnx"
# List the CPU provider as a fallback, matching the quantized session in this
# notebook, so the session can still be created when the CUDA provider is not usable.
vit_gg_onnx = InferenceSession(vit_gg_onnx_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
processor_vit_gg_onxx = AutoFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
inputs = processor_vit_gg_onxx(images=img.convert("RGB"), return_tensors="np")
start_time_torch = datetime.datetime.now()
outputs = vit_gg_onnx.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
delta_time_torch = datetime.datetime.now() - start_time_torch
# total_seconds() keeps whole seconds; ``.microseconds`` alone is only the
# sub-second component and under-reports any run longer than 1 s.
print("runtime :", round(delta_time_torch.total_seconds() * 1000, 3), "ms")

In [59]:
# Use the processor created alongside the ONNX session rather than the one
# from the PyTorch ViT cell, so this cell does not depend on that cell having run.
x_trans = convert_feature_transformer_onnx(vit_gg_onnx, processor_vit_gg_onxx, layer="last_hidden_state", row=0, device=device)
save_feature(x_trans, df_pd['classes'], name="vit_base_patch16_224_in21k_last_hidden_state_onnx")

100%|██████████| 15524/15524 [1:03:44<00:00,  4.06it/s]


In [None]:
# Quantized ViT export: the model file and the processor config are both read
# from the same directory.
vit_gg_onnx_quantized_path = "/home/music/Desktop/measure_model/models/vit_gg_onnx_quantize"
vit_gg_onnx_quantized_model_path = f"{vit_gg_onnx_quantized_path}/model_quantized.onnx"
vit_gg_onnx_quantized = InferenceSession(
    vit_gg_onnx_quantized_model_path,
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
)
processor_vit_gg_onxx = AutoFeatureExtractor.from_pretrained(vit_gg_onnx_quantized_path)

In [11]:
x_trans = convert_feature_transformer_onnx(vit_gg_onnx_quantized, processor_vit_gg_onxx, layer="last_hidden_state", row=0, device=device)
# Distinct name for the quantized run: reusing the non-quantized "..._onnx"
# name would presumably make save_feature overwrite those features
# (NOTE(review): confirm how save_feature uses `name`).
save_feature(x_trans, df_pd['classes'], name="vit_base_patch16_224_in21k_last_hidden_state_onnx_quantized")

100%|██████████| 15524/15524 [55:38<00:00,  4.65it/s] 
