<a href="https://colab.research.google.com/github/yagiyuki/clip-study-playground/blob/main/CLIP_acceleration_techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CLIPで大量に画像分類するときの高速化実装の工夫

In [21]:
%%bash
# Stop at the first failing command instead of repeating the same error 999 times.
set -e

# Create the directory that holds the benchmark images.
mkdir -p data

# Prerequisite: one image must have been uploaded as data/sample.jpeg.
# Fail fast with a clear message when it is missing.
if [ ! -f data/sample.jpeg ]; then
  echo "data/sample.jpeg not found: upload an image with that name, then re-run this cell." >&2
  exit 1
fi

# Duplicate the image 999 times (1000 files in total).
for i in {1..999}; do
  cp data/sample.jpeg "data/sample_${i}.jpeg"
done

## 特に工夫なしの場合

In [40]:
%%time

import io
import requests
from PIL import Image
import torch
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
import glob

# デバイス設定（そのままFP32, コンパイルなし）
device = 'cuda' if torch.cuda.is_available() else 'cpu'
HF_MODEL_PATH = 'line-corporation/clip-japanese-base'

# トークナイザ／プロセッサ／モデルの読み込み（標準設定）
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(HF_MODEL_PATH, trust_remote_code=True).to(device)

# 画像ファイルのリストを取得
image_list = glob.glob('data/*')

# 各画像ごとに（バッチもAMPも事前計算もなしで）処理
for image_path in image_list:
    # テキストをその都度トークナイズ
    text_inputs = tokenizer(
        ["ベンチプレス", "スクワット", "デッドリフト"]
    ).to(device)

    # 画像を読み込んで前処理
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=[image], return_tensors="pt").to(device)

    with torch.no_grad():
        # テキスト特徴量を毎回計算
        text_features = model.get_text_features(**text_inputs)  # FP32

        # 画像特徴量を毎回計算
        image_features = model.get_image_features(inputs.pixel_values)  # FP32

        # 類似度計算・確率化
        text_probs = (image_features @ text_features.T).softmax(dim=-1)

    # 結果表示
    #print(f'Label probs of {image_path}:', text_probs)


CPU times: user 48.1 s, sys: 247 ms, total: 48.3 s
Wall time: 49.3 s


## 【工夫ポイント1】torch.compileによるモデル最適化

In [41]:
%%time
import glob
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

# デバイス設定
device = 'cuda' if torch.cuda.is_available() else 'cpu'
HF_MODEL_PATH = 'line-corporation/clip-japanese-base'

# モデル読み込み＋コンパイル
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(HF_MODEL_PATH, trust_remote_code=True).to(device)
model = torch.compile(model)  # ← ここだけが追加

# 単純な推論ループ
with torch.no_grad():
    text = tokenizer(["ベンチプレス", "スクワット", "デッドリフト"]).to(device)
    for path in glob.glob('data/*'):
        img = Image.open(path).convert("RGB")
        inputs = processor(images=[img], return_tensors="pt").to(device)

        # 毎回テキスト特徴量を計算
        text_feats = model.get_text_features(**text)
        img_feats  = model.get_image_features(inputs.pixel_values)
        probs = (img_feats @ text_feats.T).softmax(dim=-1)
        #print(path, probs)

CPU times: user 47.3 s, sys: 264 ms, total: 47.6 s
Wall time: 48.6 s


## 【工夫ポイント2】テキスト特徴量の事前計算（ループ外で一度だけ）

In [42]:
%%time
import glob
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

# デバイス設定
device = 'cuda' if torch.cuda.is_available() else 'cpu'
HF_MODEL_PATH = 'line-corporation/clip-japanese-base'

# モデル読み込み＋コンパイル
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(HF_MODEL_PATH, trust_remote_code=True).to(device)
model = torch.compile(model)  # ← ここだけが追加

# 単純な推論ループ
with torch.no_grad():
    text = tokenizer(["ベンチプレス", "スクワット", "デッドリフト"]).to(device)
    for path in glob.glob('data/*'):
        img = Image.open(path).convert("RGB")
        inputs = processor(images=[img], return_tensors="pt").to(device)

        # 毎回テキスト特徴量を計算
        text_feats = model.get_text_features(**text)
        img_feats  = model.get_image_features(inputs.pixel_values)
        probs = (img_feats @ text_feats.T).softmax(dim=-1)
        #print(path, probs)


CPU times: user 47.4 s, sys: 245 ms, total: 47.7 s
Wall time: 49 s


## 【工夫ポイント3】torch.cuda.amp.autocastによる混合精度推論

In [43]:
%%time
import glob
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
HF_MODEL_PATH = 'line-corporation/clip-japanese-base'

tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(HF_MODEL_PATH, trust_remote_code=True).to(device)

with torch.no_grad():
    text = tokenizer(["ベンチプレス", "スクワット", "デッドリフト"]).to(device)
    text_feats = model.get_text_features(**text).half()

    for path in glob.glob('data/*'):
        img = Image.open(path).convert("RGB")
        inputs = processor(images=[img], return_tensors="pt").to(device)

        # ここだけが追加：autocastによるFP16混合精度
        with torch.cuda.amp.autocast():
            img_feats = model.get_image_features(inputs.pixel_values)

        probs = (img_feats @ text_feats.T).softmax(dim=-1)
        #print(path, probs)




CPU times: user 42.3 s, sys: 253 ms, total: 42.5 s
Wall time: 43.4 s


## 【工夫ポイント4】バッチ処理による転送・呼び出し回数の削減

In [44]:
%%time
import glob
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
HF_MODEL_PATH = 'line-corporation/clip-japanese-base'
batch_size = 50

tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(HF_MODEL_PATH, trust_remote_code=True).to(device)

with torch.no_grad():
    text = tokenizer(["ベンチプレス", "スクワット", "デッドリフト"]).to(device)
    text_feats = model.get_text_features(**text)

    image_paths = glob.glob('data/*')
    for i in range(0, len(image_paths), batch_size):
        # ここだけが追加：複数画像を一度に処理
        batch = image_paths[i:i+batch_size]
        imgs = [Image.open(p).convert("RGB") for p in batch]
        inputs = processor(images=imgs, return_tensors="pt").to(device)

        img_feats = model.get_image_features(inputs.pixel_values)

        probs = (img_feats @ text_feats.T).softmax(dim=-1)
        #for path, p in zip(batch, probs):
        #    print(path, p)

CPU times: user 20.4 s, sys: 149 ms, total: 20.6 s
Wall time: 21.4 s


## 4つの工夫をした場合

In [45]:
%%time

import io
import requests
from PIL import Image
import torch
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
import glob

# デバイス設定
device = 'cuda' if torch.cuda.is_available() else 'cpu'
HF_MODEL_PATH = 'line-corporation/clip-japanese-base'

# トークナイザ／プロセッサ／モデルの読み込み
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
processor = AutoImageProcessor.from_pretrained(HF_MODEL_PATH, trust_remote_code=True)
model = AutoModel.from_pretrained(HF_MODEL_PATH, trust_remote_code=True).to(device)

# ─────── 高速化の工夫ポイント1 ───────
# torch.compile でモデルをグラフ化＆最適化し、繰り返し推論のオーバーヘッドを削減
model = torch.compile(model)

# テキストをトークン化（ループ外で一度だけ行う）
text = tokenizer(
    ["ベンチプレス", "スクワット", "デッドリフト"]
).to(device)

# 画像ファイルのリストを取得
image_list = glob.glob('data/*')
batch_size = 50

with torch.no_grad():
    # ─────── 高速化の工夫ポイント2 ───────
    # テキスト特徴量をループ前に事前計算し、毎バッチ再計算を避ける
    text_features = model.get_text_features(**text)
    # ─────── 型合わせ対策 ───────
    # image_features が FP16 なので、text_features も FP16 にキャスト
    text_features = text_features.half()

    for i in range(0, len(image_list), batch_size):
        batch_image_list = image_list[i:i+batch_size]

        # ─────── 高速化の工夫ポイント4 ───────
        # バッチ処理により、CPU⇔GPU転送回数とモデル呼び出し回数を削減
        images = [Image.open(p).convert("RGB") for p in batch_image_list]
        inputs = processor(images=images, return_tensors="pt").to(device)

        # ─────── 高速化の工夫ポイント3 ───────
        # torch.cuda.amp.autocast() を使い、FP16混合精度で演算スループットを向上
        with torch.cuda.amp.autocast():
            image_features = model.get_image_features(inputs.pixel_values)

        # FP16 同士なので内積計算がエラーなく実行可能
        text_probs = (image_features @ text_features.T).softmax(dim=-1)

        # 各画像のラベル確率を表示
        #for image_path, probs in zip(batch_image_list, text_probs):
        #    print(f'Label probs of {image_path}:', probs)




CPU times: user 19.9 s, sys: 159 ms, total: 20.1 s
Wall time: 21 s
