In [1]:
from torch.utils.data import DataLoader

from mmengine.dataset import DefaultSampler, pseudo_collate
from mmdet.datasets.objects365 import Objects365V2Dataset
from mmpretrain.datasets.transforms import *
from mmpretrain.models import ClsDataPreprocessor

from mmengine import Config
from mmpretrain.models import build_classifier

from projects.ma_clip.datasets import InstanceDataset, LoadInstanceImage
from projects.ma_clip.models import *
from projects.clip.models import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Take the `model` section of the ViT-B/16 ImageNet-1k config file and rewrite
# it in place into a CLIP-style classifier config.
cfg = Config.fromfile('../configs/clip/vit-base-p16_pt-64xb64_in1k.py').model
cfg.type = 'CLIPClassifier'
# Replace the backbone with a 'CLIP' wrapper config. The right-hand side is
# evaluated before the assignment, so `cfg.backbone` inside the dict is still
# the backbone config loaded from the file; it becomes the `visual` branch.
cfg.backbone = dict(
    type='CLIP',
    visual=cfg.backbone,
    # Text branch settings.
    text=dict(
        type='TextTransformer',
        context_length=77,
        vocab_size=49408,
        width=512,
        # NOTE(review): 256 here vs. output_dims=512 on the CLIP wrapper below
        # -- confirm the mismatch is intended.
        output_dims=256,
        num_heads=8,
        num_layers=12),
    output_dims=512,
    # Pretrained initialisation from a local checkpoint file (path is relative
    # to the notebook's working directory).
    init_cfg = dict(
        type='Pretrained', 
        checkpoint='../data/pretrained/clip/CLIP-ViT-B-16-laion2B-s34B-b88K/pretrain.pth'))
# Replace the classification head config with a CLIP head that uses CLIPLoss
# and has `cal_acc` enabled.
cfg.head= dict(
    type='CLIPClsHead',
    loss=dict(type='CLIPLoss'),
    cal_acc=True)
# Build the model from the edited config, then run weight initialisation
# (presumably this is what applies the `init_cfg` checkpoint above -- verify).
model = build_classifier(cfg)
model.init_weights()

  f'The value of {key} should be {value}, but it is currently '


05/25 19:35:55 - mmengine - INFO - load model from: ../data/pretrained/clip/CLIP-ViT-B-16-laion2B-s34B-b88K/pretrain.pth
05/25 19:35:55 - mmengine - INFO - Loads checkpoint by local backend from path: ../data/pretrained/clip/CLIP-ViT-B-16-laion2B-s34B-b88K/pretrain.pth
05/25 19:35:56 - mmengine - INFO - 
backbone.visual_projection - torch.Size([768, 512]): 
PretrainedInit: load from ../data/pretrained/clip/CLIP-ViT-B-16-laion2B-s34B-b88K/pretrain.pth 
 
05/25 19:35:56 - mmengine - INFO - 
backbone.positional_embedding - torch.Size([77, 512]): 
PretrainedInit: load from ../data/pretrained/clip/CLIP-ViT-B-16-laion2B-s34B-b88K/pretrain.pth 
 
05/25 19:35:56 - mmengine - INFO - 
backbone.text_projection - torch.Size([512, 512]): 
PretrainedInit: load from ../data/pretrained/clip/CLIP-ViT-B-16-laion2B-s34B-b88K/pretrain.pth 
 
05/25 19:35:56 - mmengine - INFO - 
backbone.logit_scale - torch.Size([]): 
PretrainedInit: load from ../data/pretrained/clip/CLIP-ViT-B-16-laion2B-s34B-b88K/pretrain

In [3]:
# Show the final model config so the edits made above can be inspected.
cfg

{'type': 'CLIPClassifier',
 'backbone': {'type': 'CLIP',
  'visual': {'type': 'VisionTransformer',
   'arch': 'b',
   'img_size': 224,
   'patch_size': 16,
   'drop_rate': 0.1,
   'init_cfg': [{'type': 'Kaiming',
     'layer': 'Conv2d',
     'mode': 'fan_in',
     'nonlinearity': 'linear'}],
   'pre_norm': True},
  'text': {'type': 'TextTransformer',
   'context_length': 77,
   'vocab_size': 49408,
   'width': 512,
   'output_dims': 256,
   'num_heads': 8,
   'num_layers': 12},
  'output_dims': 512,
  'init_cfg': {'type': 'Pretrained',
   'checkpoint': '../data/pretrained/clip/CLIP-ViT-B-16-laion2B-s34B-b88K/pretrain.pth'}},
 'neck': None,
 'head': {'type': 'CLIPClsHead',
  'loss': {'type': 'CLIPLoss'},
  'cal_acc': True}}

In [4]:
# Transforms handed to VisionLanguageDataset as its `pipeline`
# (presumably applied per sample in the listed order -- TODO confirm).
pipeline = [
    LoadInstanceImage(with_mask=False, exp_factor=1.2, channel_order='rgb'),
    ResizeEdge(scale=256, edge='short'),
    RandomCrop(crop_size=224),
    RandomFlip(prob=0.5, direction='horizontal'),
    PackInputs(algorithm_keys=['language'])
]
# Three nested dataset wrappers, innermost first:
#   Objects365V2Dataset   -- reads 'debug/train.json' under the Objects365 v2 root
#   InstanceDataset       -- wraps it with filter_cfg min_size=32
#   VisionLanguageDataset -- wraps that and receives the pipeline above
toy_dataset = VisionLanguageDataset(
    InstanceDataset(
        Objects365V2Dataset(
            data_root='../data/Objects365/Obj365_v2/',
            data_prefix=dict(img='train/'),
            ann_file='debug/train.json'),
        filter_cfg=dict(min_size=32)),
    pipeline=pipeline)

# Shuffled sampler over the dataset; no seed is passed here, so the sample
# order is whatever DefaultSampler's default seeding produces.
sampler = DefaultSampler(toy_dataset, shuffle=True)
# Batches of 4 samples, collated with mmengine's pseudo_collate.
train_loader = DataLoader(dataset=toy_dataset, batch_size=4, sampler=sampler, collate_fn=pseudo_collate)
# Preprocessor configured with per-channel mean/std
# (values look like 0-255 pixel scale -- TODO confirm channel order).
data_preprocessor = ClsDataPreprocessor(
    mean=[125.307, 122.961, 113.8575],
    std=[51.5865, 50.847, 51.255])



loading annotations into memory...
Done (t=0.02s)
creating index...
index created!


In [5]:
# Disabled GPU variant: these two lines, together with the two commented lines
# inside the loop, move the model, preprocessor and batch contents to CUDA.
# model = model.cuda()
# data_preprocessor = data_preprocessor.cuda()
# Training-style pass: each batch is preprocessed and run through the model in
# 'loss' mode, and the returned losses are printed. No backward pass or
# optimizer step is performed in this cell.
for data_batch in train_loader:
#     data_batch['inputs'] = [d.cuda() for d in data_batch['inputs']]
#     data_batch['data_samples'] = [d.cuda() for d in data_batch['data_samples']]

    data_batch = data_preprocessor(data_batch, training=True)
    # Dispatch on the container type returned by the preprocessor:
    # a dict is unpacked as keyword arguments, a list/tuple as positional ones.
    if isinstance(data_batch, dict):
        losses = model(**data_batch, mode='loss')
    elif isinstance(data_batch, (list, tuple)):
        losses = model(*data_batch, mode='loss')
    else:
        # Any other container type is rejected (raised without a message).
        raise TypeError()
    print(losses)

{'loss': tensor(2.5499, grad_fn=<MulBackward0>), 'accuracy_top-1': [tensor([25.])]}
{'loss': tensor(1.8627, grad_fn=<MulBackward0>), 'accuracy_top-1': [tensor([25.])]}
{'loss': tensor(2.9411, grad_fn=<MulBackward0>), 'accuracy_top-1': [tensor([50.])]}
{'loss': tensor(2.1341, grad_fn=<MulBackward0>), 'accuracy_top-1': [tensor([25.])]}
{'loss': tensor(4.3220, grad_fn=<MulBackward0>), 'accuracy_top-1': [tensor([25.])]}
{'loss': tensor(2.0964, grad_fn=<MulBackward0>), 'accuracy_top-1': [tensor([25.])]}
{'loss': tensor(2.2141, grad_fn=<MulBackward0>), 'accuracy_top-1': [tensor([25.])]}
{'loss': tensor(3.8033, grad_fn=<MulBackward0>), 'accuracy_top-1': [tensor([25.])]}
{'loss': tensor(3.0254, grad_fn=<MulBackward0>), 'accuracy_top-1': [tensor([25.])]}
{'loss': tensor(1.9332, grad_fn=<MulBackward0>), 'accuracy_top-1': [tensor([25.])]}
{'loss': tensor(2.6824, grad_fn=<MulBackward0>), 'accuracy_top-1': [tensor([25.])]}
{'loss': tensor(2.4030, grad_fn=<MulBackward0>), 'accuracy_top-1': [tensor([

In [8]:
def process_data(x, *args):
    """Double ``x``, print it alongside the extra arguments, and return both.

    Args:
        x: Value to process; it is multiplied by 2.
        *args: Extra positional arguments, passed through untouched.

    Returns:
        A 2-tuple ``(x * 2, args)`` where ``args`` is the tuple of extra
        positional arguments exactly as received.
    """
    doubled = x * 2

    # Report the processed value first, then the extra arguments unpacked so
    # that each one is printed as its own item.
    print("Processed x:", doubled)
    print("*args:", *args)

    result = (doubled, args)
    return result
