In [1]:
import os
import json

import torch
from PIL import Image

import cn_clip.clip as clip
from cn_clip.clip.model import CLIP
from cn_clip.clip import load_from_name

In [2]:
# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Fine-tuned MUGE checkpoint. The path is assembled with os.path.join so the
# separators are correct on every OS (a literal backslash path only resolves
# on Windows). To load the stock pretrained weights instead, point this at
# "./datapath/pretrained_weights/clip_cn_vit-b-16.pt".
checkpoint_path = os.path.join(
    "datapath",
    "experiments",
    "muge_finetune_vit-b-16_roberta-base_bs160",
    "checkpoints",
    "epoch_latest.pt",
)

# Load with convert_to_float16=True so the dtypes of the parameters and of the
# encoder outputs can be inspected in the following cells.
model, preprocess = load_from_name(
    checkpoint_path,
    device=device,
    vision_model_name="ViT-B-16",
    text_model_name="RoBERTa-wwm-ext-base-chinese",
    input_resolution=224,
    convert_to_float16=True,
)
# Switch to eval mode; the annotation is only there for editor type hints.
model: CLIP = model.eval()

cuda
Loading vision model config from g:\code\github\Chinese-CLIP\cn_clip\clip\model_configs\ViT-B-16.json
Loading text model config from g:\code\github\Chinese-CLIP\cn_clip\clip\model_configs\RoBERTa-wwm-ext-base-chinese.json
Model info {'embed_dim': 512, 'image_resolution': 224, 'vision_layers': 12, 'vision_width': 768, 'vision_patch_size': 16, 'vocab_size': 21128, 'text_attention_probs_dropout_prob': 0.1, 'text_hidden_act': 'gelu', 'text_hidden_dropout_prob': 0.1, 'text_hidden_size': 768, 'text_initializer_range': 0.02, 'text_intermediate_size': 3072, 'text_max_position_embeddings': 512, 'text_num_attention_heads': 12, 'text_num_hidden_layers': 12, 'text_type_vocab_size': 2}
将模型转换成 float16 精度


In [3]:
# Open the sample image in a context manager so the file handle is closed as
# soon as preprocessing has turned it into a tensor; then add a batch
# dimension and move the tensor to the target device.
with Image.open("examples/pokemon.jpeg") as pokemon_image:
    image = preprocess(pokemon_image).unsqueeze(0).to(device)

# Candidate captions (Squirtle, Bulbasaur, Charmander, Pikachu), tokenized
# together as one batch.
candidate_labels = ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]
text = clip.tokenize(candidate_labels).to(device)

In [4]:
# text_projection is a bare Parameter rather than a submodule (see the output
# below); per the original note, it does not show up in the printed model
# structure, so it is displayed explicitly here.
model.text_projection

Parameter containing:
tensor([[ 0.0641, -0.0203, -0.0554,  ...,  0.0064,  0.0111,  0.0027],
        [-0.0107,  0.0267,  0.0710,  ..., -0.0269, -0.0165, -0.0383],
        [ 0.0335, -0.0278, -0.0208,  ...,  0.1006, -0.0765, -0.0527],
        ...,
        [-0.0514, -0.0186, -0.0291,  ...,  0.0462,  0.0694,  0.0090],
        [ 0.0236, -0.0348,  0.0444,  ...,  0.0475,  0.0318, -0.0482],
        [-0.0399, -0.0189,  0.0106,  ...,  0.0303,  0.0917,  0.0359]],
       device='cuda:0', dtype=torch.float16, requires_grad=True)

In [5]:
# Report the shape and dtype of both model inputs: image first, then text.
for model_input in (image, text):
    print(model_input.shape, model_input.dtype)

torch.Size([1, 3, 224, 224]) torch.float32
torch.Size([4, 52]) torch.int64


In [6]:
# Inference only, so run both encoders with gradient tracking switched off.
with torch.no_grad():
    image_features, text_features = model.encode_image(image), model.encode_text(text)

# As expected, both feature tensors come out as float16.
for features in (image_features, text_features):
    print(features.shape, features.dtype)

torch.Size([1, 512]) torch.float16
torch.Size([4, 512]) torch.float16


# 看看怎么不用精度转换, 直接用 float32

In [7]:
# Same fine-tuned MUGE checkpoint as before, built with os.path.join so the
# path resolves on every OS (a literal backslash path only works on Windows).
# To load the stock pretrained weights instead, point this at
# "./datapath/pretrained_weights/clip_cn_vit-b-16.pt".
checkpoint_path = os.path.join(
    "datapath",
    "experiments",
    "muge_finetune_vit-b-16_roberta-base_bs160",
    "checkpoints",
    "epoch_latest.pt",
)

# Reload with convert_to_float16=False to skip the half-precision conversion.
# NOTE: this rebinds `model` and `preprocess`, replacing the float16 model
# loaded earlier in the notebook.
model, preprocess = load_from_name(
    checkpoint_path,
    device=device,
    vision_model_name="ViT-B-16",
    text_model_name="RoBERTa-wwm-ext-base-chinese",
    input_resolution=224,
    convert_to_float16=False,
)
model: CLIP = model.eval()

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

# Without the float16 conversion, both feature tensors come out as float32.
print(image_features.shape, image_features.dtype)
print(text_features.shape, text_features.dtype)

Loading vision model config from g:\code\github\Chinese-CLIP\cn_clip\clip\model_configs\ViT-B-16.json
Loading text model config from g:\code\github\Chinese-CLIP\cn_clip\clip\model_configs\RoBERTa-wwm-ext-base-chinese.json
Model info {'embed_dim': 512, 'image_resolution': 224, 'vision_layers': 12, 'vision_width': 768, 'vision_patch_size': 16, 'vocab_size': 21128, 'text_attention_probs_dropout_prob': 0.1, 'text_hidden_act': 'gelu', 'text_hidden_dropout_prob': 0.1, 'text_hidden_size': 768, 'text_initializer_range': 0.02, 'text_intermediate_size': 3072, 'text_max_position_embeddings': 512, 'text_num_attention_heads': 12, 'text_num_hidden_layers': 12, 'text_type_vocab_size': 2}
torch.Size([1, 512]) torch.float32
torch.Size([4, 512]) torch.float32


In [8]:
# Call the vision tower directly to confirm its output dtype. Wrapped in
# no_grad like the encoder calls in the earlier cells, since no gradients are
# needed just to inspect a dtype.
# NOTE(review): the second positional argument (0) is presumably a mask ratio;
# confirm against the visual module's forward signature.
with torch.no_grad():
    visual_features = model.visual(image, 0)
# Keep the dtype as the cell's last expression so it is still displayed.
visual_features.dtype

torch.float32