<a href="https://colab.research.google.com/github/yagiyuki/clip-study-playground/blob/main/CLIP_acceleration_techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CLIPで大量に画像分類するときの高速化実装の工夫

In [1]:
%%bash
# 画像を管理するパスを作成
mkdir -p data

# dataディレクトリにsample.jpegと言う名前で画像ファイルを上げる

In [2]:
%%bash
# 画像を999枚複製（合計1000枚）
for i in {1..999}; do
  cp data/sample.jpeg data/sample_${i}.jpeg
done

## ベースライン

In [5]:
%%time
import glob
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

# デバイス設定
device = 'cuda' if torch.cuda.is_available() else 'cpu'
HF_MODEL_PATH = 'line-corporation/clip-japanese-base'

# モデル読み込み
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(HF_MODEL_PATH, trust_remote_code=True).to(device)

# 単純な推論ループ
with torch.no_grad():
    text = tokenizer(["ベンチプレス", "スクワット", "デッドリフト"]).to(device)
    text_feats = model.get_text_features(**text)
    for path in glob.glob('data/*'):
        img = Image.open(path).convert("RGB")
        inputs = processor(images=[img], return_tensors="pt").to(device)

        # テキスト特徴量を計算
        img_feats  = model.get_image_features(inputs.pixel_values)
        probs = (img_feats @ text_feats.T).softmax(dim=-1)
        #print(path, probs)

CPU times: user 31.3 s, sys: 180 ms, total: 31.5 s
Wall time: 34.6 s


##【工夫ポイント1】torch.compileによるモデル最適化

In [6]:
%%time
import glob
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

# デバイス設定
device = 'cuda' if torch.cuda.is_available() else 'cpu'
HF_MODEL_PATH = 'line-corporation/clip-japanese-base'

# モデル読み込み
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(HF_MODEL_PATH, trust_remote_code=True).to(device)
# コンパイルして最適化
model = torch.compile(model)

# 単純な推論ループ
with torch.no_grad():
    text = tokenizer(["ベンチプレス", "スクワット", "デッドリフト"]).to(device)
    text_feats = model.get_text_features(**text)
    for path in glob.glob('data/*'):
        img = Image.open(path).convert("RGB")
        inputs = processor(images=[img], return_tensors="pt").to(device)

        # テキスト特徴量を計算
        img_feats  = model.get_image_features(inputs.pixel_values)
        probs = (img_feats @ text_feats.T).softmax(dim=-1)
        #print(path, probs)

CPU times: user 36.2 s, sys: 312 ms, total: 36.5 s
Wall time: 39.1 s


##【工夫ポイント2】torch.cuda.amp.autocastによる混合精度推論

In [7]:
%%time
import glob
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
HF_MODEL_PATH = 'line-corporation/clip-japanese-base'

tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(HF_MODEL_PATH, trust_remote_code=True).to(device)

with torch.no_grad():
    text = tokenizer(["ベンチプレス", "スクワット", "デッドリフト"]).to(device)
    # テキスト特徴量をFP16に変換
    text_feats = model.get_text_features(**text).half()

    for path in glob.glob('data/*'):
        img = Image.open(path).convert("RGB")
        inputs = processor(images=[img], return_tensors="pt").to(device)

        # autocastによるFP16混合精度
        with torch.cuda.amp.autocast():
            img_feats = model.get_image_features(inputs.pixel_values)

        probs = (img_feats @ text_feats.T).softmax(dim=-1)
        #print(path, probs)




CPU times: user 35.7 s, sys: 253 ms, total: 35.9 s
Wall time: 38.2 s


## 【工夫ポイント3】バッチ処理による転送・呼び出し回数の削減

In [8]:
%%time
import glob
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
HF_MODEL_PATH = 'line-corporation/clip-japanese-base'
batch_size = 50

tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(HF_MODEL_PATH, trust_remote_code=True).to(device)

with torch.no_grad():
    text = tokenizer(["ベンチプレス", "スクワット", "デッドリフト"]).to(device)
    text_feats = model.get_text_features(**text)

    image_paths = glob.glob('data/*')
    for i in range(0, len(image_paths), batch_size):
        # ここだけが追加：複数画像を一度に処理
        batch = image_paths[i:i+batch_size]
        imgs = [Image.open(p).convert("RGB") for p in batch]
        inputs = processor(images=imgs, return_tensors="pt").to(device)

        img_feats = model.get_image_features(inputs.pixel_values)

        probs = (img_feats @ text_feats.T).softmax(dim=-1)
        #for path, p in zip(batch, probs):
        #    print(path, p)

CPU times: user 17.2 s, sys: 328 ms, total: 17.5 s
Wall time: 19.6 s


## 3つの工夫をした場合

In [9]:
%%time

import io
import requests
from PIL import Image
import torch
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
import glob

# デバイス設定
device = 'cuda' if torch.cuda.is_available() else 'cpu'
HF_MODEL_PATH = 'line-corporation/clip-japanese-base'

# トークナイザ／プロセッサ／モデルの読み込み
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(HF_MODEL_PATH, trust_remote_code=True).to(device)

# ─────── 高速化の工夫ポイント1 ───────
# torch.compile でモデルをグラフ化＆最適化し、繰り返し推論のオーバーヘッドを削減
model = torch.compile(model)

# テキストをトークン化（ループ外で一度だけ行う）
text = tokenizer(
    ["ベンチプレス", "スクワット", "デッドリフト"]
).to(device)

# 画像ファイルのリストを取得
image_list = glob.glob('data/*')
batch_size = 50

with torch.no_grad():
    # ─────── 高速化の工夫ポイント2 ───────
    # テキスト特徴量をループ前に事前計算し、毎バッチ再計算を避ける
    text_features = model.get_text_features(**text)
    # ─────── 型合わせ対策 ───────
    # image_features が FP16 なので、text_features も FP16 にキャスト
    text_features = text_features.half()

    for i in range(0, len(image_list), batch_size):
        batch_image_list = image_list[i:i+batch_size]

        # ─────── 高速化の工夫ポイント4 ───────
        # バッチ処理により、CPU⇔GPU転送回数とモデル呼び出し回数を削減
        images = [Image.open(p).convert("RGB") for p in batch_image_list]
        inputs = processor(images=images, return_tensors="pt").to(device)

        # ─────── 高速化の工夫ポイント3 ───────
        # torch.cuda.amp.autocast() を使い、FP16混合精度で演算スループットを向上
        with torch.cuda.amp.autocast():
            image_features = model.get_image_features(inputs.pixel_values)

        # FP16 同士なので内積計算がエラーなく実行可能
        text_probs = (image_features @ text_features.T).softmax(dim=-1)

        # 各画像のラベル確率を表示
        #for image_path, probs in zip(batch_image_list, text_probs):
        #    print(f'Label probs of {image_path}:', probs)




CPU times: user 17.5 s, sys: 248 ms, total: 17.7 s
Wall time: 19.9 s


In [None]:
%%bash
free -h

