https://github.com/roatienza/benchmark

In [None]:
import torch

In [None]:
from ConvNeXt.models.convnext import convnext_tiny

In [None]:
# Instantiate ConvNeXt-Tiny with pretrained=False; the weights are loaded
# from a local checkpoint file instead.
model = convnext_tiny(pretrained=False, in_22k=False)

In [None]:
# Read the checkpoint onto the CPU and keep only the entry stored under its
# 'model' key, then copy those weights into the network.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints obtained from a trusted source.
state_dict = torch.load('./convnext_tiny_1k_224_ema.pth', map_location='cpu')['model']
model.load_state_dict(state_dict)
# Switch to evaluation mode for inference.
model = model.eval()

In [None]:
# Report whether PyTorch can see a usable CUDA device (the notebook displays
# the returned boolean as the cell output).
torch.cuda.is_available()

In [None]:
# Prefer the first GPU, but fall back to the CPU when CUDA is unavailable so
# that moving models to `device` does not fail on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Move the loaded model to the selected device.
# NOTE(review): this rebinds the name `convnext_tiny`, shadowing the factory
# function imported from ConvNeXt.models.convnext. Re-running the
# model-construction cell afterwards would call the model instance instead of
# the factory.
convnext_tiny = model.to(device)

In [None]:
# Smoke test: push one random batch of 64 images through the model on the
# target device and print the output's shape and device.
image = torch.rand(64, 3, 224, 224, device=device)
# Inference only: disable autograd so no graph (and no intermediate
# activations for a backward pass) is kept alive for this batch.
with torch.no_grad():
    output = convnext_tiny(image)
print(output.shape, output.device)

In [None]:
# Drop the smoke-test tensors, then hand the cached GPU blocks back to the
# driver so the benchmark starts from a clean allocator state.
del output, image
torch.cuda.empty_cache()

# 与 ResNet 的速度对比

In [None]:
import time
import numpy as np

In [None]:
def benchmark(model, input_shape=(512, 3, 224, 224), dtype='fp32', nwarmup=50, nruns=100):
    """Measure the inference throughput of ``model`` on random input.

    A random tensor of ``input_shape`` is created once and fed to the model
    ``nwarmup`` times (untimed) and then ``nruns`` times (timed). Progress is
    printed every 10 timed iterations, followed by the input shape and the
    average throughput in images per second.

    Args:
        model: Callable to benchmark. When it exposes ``parameters()``, the
            input is placed on the device of its first parameter; otherwise
            CUDA is used when available, else the CPU.
        input_shape: Shape of the random input; ``input_shape[0]`` is taken
            as the batch size for the throughput figure.
        dtype: ``'fp16'`` converts the input to half precision. The model is
            NOT converted, so it must already be in half precision in that
            case. Any other value leaves the input as float32.
        nwarmup: Number of untimed warm-up iterations (may be 0).
        nruns: Number of timed iterations; must be at least 1.

    Raises:
        ValueError: If ``nruns`` is smaller than 1.
    """
    if nruns < 1:
        raise ValueError("nruns must be at least 1, got %r" % (nruns,))

    # Follow the model's device instead of assuming the default CUDA device.
    get_params = getattr(model, "parameters", None)
    first_param = next(iter(get_params()), None) if callable(get_params) else None
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    on_cuda = device.type == "cuda"

    def _sync():
        # CUDA kernels launch asynchronously; wait for them so the wall-clock
        # measurement covers the actual computation.
        if on_cuda:
            torch.cuda.synchronize(device)

    torch.cuda.empty_cache()
    old_value = torch.backends.cudnn.benchmark
    torch.backends.cudnn.benchmark = True
    try:
        input_data = torch.randn(input_shape).to(device)
        if dtype == 'fp16':
            input_data = input_data.half()

        print("Warm up ...")
        with torch.no_grad():
            for _ in range(nwarmup):
                # Outputs are deliberately not kept so they can be freed.
                model(input_data)
        _sync()

        print("Start timing ...")
        timings = []
        with torch.no_grad():
            for i in range(1, nruns + 1):
                # perf_counter is monotonic and high resolution, unlike
                # time.time which can jump with system clock adjustments.
                start_time = time.perf_counter()
                model(input_data)
                _sync()
                timings.append(time.perf_counter() - start_time)
                if i % 10 == 0:
                    print('Iteration %d/%d, avg batch time %.2f ms' % (i, nruns, np.mean(timings) * 1000))
        input_size = tuple(input_data.size())
        del input_data
    finally:
        # Always release cached memory and restore the global cuDNN autotuner
        # flag, even when the model raised part-way through.
        torch.cuda.empty_cache()
        torch.backends.cudnn.benchmark = old_value
    print("Input shape:", input_size)
    print('Average throughput: %.2f images/second' % (input_shape[0] / np.mean(timings)))

In [None]:
# Make sure the model is on the benchmark device, then time it with a batch
# of 384 images at 224x224.
convnext_tiny = convnext_tiny.to(device)
# 82.1 82.9  -- NOTE(review): presumably reference top-1 accuracies (%) for this model; confirm the source
benchmark(convnext_tiny, input_shape=(384, 3, 224, 224))

In [None]:
import timm

In [None]:
# Build the distilled BiT ResNetV2-50x1 with pretrained=False, switch it to
# evaluation mode and move it to the benchmark device in one chained
# expression.
resnetv2_50_distilled = (
    timm.create_model('resnetv2_50x1_bit_distilled', pretrained=False)
    .eval()
    .to(device)
)

In [None]:
# Bare expression: the notebook displays the model's repr as the cell output.
resnetv2_50_distilled

In [None]:
# Time the distilled ResNetV2-50 with the same batch shape used for ConvNeXt.
# 82.822  -- NOTE(review): presumably the reference top-1 accuracy (%) for this model; confirm the source
benchmark(resnetv2_50_distilled, input_shape=(384, 3, 224, 224))

In [None]:
# Build ResNet-50d with pretrained=False, switch it to evaluation mode and
# move it to the benchmark device in one chained expression.
resnet50d = (
    timm.create_model('resnet50d', pretrained=False)
    .eval()
    .to(device)
)

In [None]:
# Time ResNet-50d with the same batch shape used for the other models.
# 80.528  -- NOTE(review): presumably the reference top-1 accuracy (%) for this model; confirm the source
benchmark(resnet50d, input_shape=(384, 3, 224, 224))

同等规模的模型中，在 CUDA 10.2 + cuDNN 8.4 环境下，ConvNeXt 更具性价比。

## 测试 FLOPs

In [None]:
import thop

In [None]:
# Profile ConvNeXt-Tiny on the CPU with a single 224x224 image and print
# thop's operation and parameter counts in human-readable form.
convnext_tiny = convnext_tiny.to('cpu')
x = torch.randn(1, 3, 224, 224)
flops, params = thop.clever_format(thop.profile(convnext_tiny, inputs=(x,)))
print(flops, params)

In [None]:
from thop.vision.calc_func import calculate_parameters, calculate_zero_ops, calculate_conv2d_flops

def count_your_model(model, x, y):
    """Custom thop counting rule for a convolution-like layer.

    Intended to be passed to ``thop.profile`` through ``custom_ops``;
    presumably thop then invokes it as a forward hook on each matching layer.

    Args:
        model: The layer being counted. It must expose ``weight``, ``groups``
            and ``bias`` and carry the ``total_params`` / ``total_ops``
            accumulators that are updated here.
        x: Sequence of the layer's inputs; only the first element is used.
        y: The layer's output tensor.
    """
    layer_input = x[0]
    model.total_params[0] = calculate_parameters(model.parameters())
    conv_ops = calculate_conv2d_flops(
        input_size=list(layer_input.shape),
        output_size=list(y.shape),
        kernel_size=list(model.weight.shape),
        groups=model.groups,
        bias=model.bias,
    )
    model.total_ops += conv_ops

In [None]:
import timm
import torch
# Re-create the distilled ResNetV2-50 for profiling and export.
# NOTE(review): unlike the benchmark cell, this instance is neither switched
# to eval mode nor moved to a device here -- confirm that is intended.
resnetv2_50_distilled = timm.create_model('resnetv2_50x1_bit_distilled', pretrained=False)

In [None]:
# Profile the distilled ResNetV2-50 on the CPU with a single 224x224 image.
resnetv2_50_distilled = resnetv2_50_distilled.to('cpu')
x = torch.randn(1, 3, 224, 224)
# Count layers of the same class as the first stem layer with the custom
# rule (presumably because thop has no built-in rule for that conv subclass).
std_conv_type = type(resnetv2_50_distilled.stem[0])
flops, params = thop.clever_format(
    thop.profile(
        resnetv2_50_distilled,
        inputs=(x,),
        custom_ops={std_conv_type: count_your_model},
    )
)
print(flops, params)

In [None]:
# Export the model to ONNX (opset 13) using `x` as the example input; the
# first axis of both the input and the output is marked dynamic under the
# name 'batch_size'.
torch.onnx.export(
    resnetv2_50_distilled,
    x,
    'resnetv2_50_distilled.onnx',
    input_names=['input'],
    output_names=['output'],
    opset_version=13,
    dynamic_axes={
        'input': {0: 'batch_size'},
        'output': {0: 'batch_size'},
    },
)

In [None]:
import netron

# Stop any viewer that is already running (presumably left over from a
# previous run of this cell), then open the exported ONNX graph in netron.
netron.stop()
netron.start('resnetv2_50_distilled.onnx')

In [None]:
# Profile ResNet-50d on the CPU with a single 224x224 image and print thop's
# operation and parameter counts in human-readable form.
resnet50d = resnet50d.to('cpu')
x = torch.randn(1, 3, 224, 224)
flops, params = thop.clever_format(thop.profile(resnet50d, inputs=(x,)))
print(flops, params)

FLOPs 并不能直接反映实际运行时间：实际运行时间还与内存访问开销、算子的具体实现以及硬件等因素有关。不过对于同一类结构的模型，可以用 FLOPs 粗略比较它们的实际运行速度。

下一步研究
1. https://blog.csdn.net/caroline_wendy/article/details/120292130  
   https://www.cvmart.net/community/detail/4206
2. convnext训练过程，组件细节  
   cait/droppath 以及初始化/参数初始化
3. 转换为onnx后怎么压缩
4. 在tensorrt上如何加速
5. 结论