# BLIP-2 模型測試 Notebook

這個 notebook 用於測試和探索 Hugging Face 的 BLIP-2 模型功能。

In [9]:
import torch
from PIL import Image
import os
from transformers import Blip2Processor, Blip2Model
# Configuration shared by every cell in this notebook.
MODEL_NAME = "Salesforce/blip2-opt-2.7b"
CACHE_DIR = "/data/feihong/hf_cache"

# GPU index 2 is the preferred device. Fall back to GPU 0 when the host
# exposes fewer than three GPUs, and to the CPU when CUDA is unavailable,
# so that later `.to(DEVICE)` calls never target a non-existent device.
if not torch.cuda.is_available():
    DEVICE = "cpu"
elif torch.cuda.device_count() > 2:
    DEVICE = "cuda:2"
else:
    DEVICE = "cuda:0"

print(f"使用設備: {DEVICE}")
print(f"PyTorch 版本: {torch.__version__}")
print(f"CUDA 可用: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA 版本: {torch.version.cuda}")
    # Report the GPU that was actually selected rather than always device 0.
    print(f"GPU 名稱: {torch.cuda.get_device_name(DEVICE)}")

使用設備: cuda:2
PyTorch 版本: 2.9.1+cu126
CUDA 可用: True
CUDA 版本: 12.6
GPU 名稱: NVIDIA GeForce RTX 4090


## 3. 測試文本處理

## 4. 測試 Blip2Model 的文本特徵提取（get_text_features）

In [13]:

processor = Blip2Processor.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

# Load Blip2Model (rather than Blip2ForConditionalGeneration) in float16 and
# move it to the selected device; get_text_features is called on it below.
model = Blip2Model.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, cache_dir=CACHE_DIR).to(DEVICE)

text = ["a giant panda eating bamboo"]
inputs = processor(text=text, return_tensors="pt", padding=True).to(DEVICE, torch.float16)

with torch.no_grad():
    # Text-only forward pass; no gradients are needed for this inspection.
    text_features = model.get_text_features(**inputs)

print(f"Features shape: {text_features.logits.shape}") # NOTE: these are logits, not a pooled embedding; the recorded run printed [1, 7, 50304] (presumably [batch, seq_len, vocab_size]), not [1, 768]

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.14s/it]


Features shape: torch.Size([1, 7, 50304])




## 5. 測試完整的圖像-文本處理流程

In [None]:
# End-to-end image + text check: pick a real image if one is available
# (otherwise synthesise one), preprocess it together with a text prompt,
# and generate text from the pair.
import glob
import traceback

test_image_path = "/data/feihong/drone_view"  # change to a real image directory if needed
# Defined in this cell so that it does not depend on a later cell having run.
test_text = "A beautiful landscape with mountains and trees"

try:
    # glob on a missing directory simply returns an empty list, so a single
    # fallback branch covers both "no directory" and "no .jpeg files".
    # Sorting makes the chosen image deterministic.
    image_files = sorted(glob.glob(os.path.join(test_image_path, "**/*.jpeg"), recursive=True))
    if image_files:
        # convert() returns an independent in-memory image, so the file
        # handle can be closed as soon as the conversion is done.
        with Image.open(image_files[0]) as raw_image:
            test_image = raw_image.convert("RGB")
        print(f"加載圖像: {image_files[0]}")
    else:
        # No real image available: use a solid-colour dummy image.
        test_image = Image.new('RGB', (224, 224), color='red')
        print("使用虛擬圖像進行測試")

    # Preprocess image and text together. The checkpoint is loaded in
    # float16, so floating-point inputs (pixel_values) are cast to float16
    # as well; integer token ids are left untouched by this cast.
    inputs = processor(images=test_image, text=test_text, return_tensors="pt").to(DEVICE, torch.float16)

    print("\n處理後的輸入:")
    print(f"  - pixel_values shape: {inputs['pixel_values'].shape}")
    print(f"  - input_ids shape: {inputs['input_ids'].shape}")
    print(f"  - attention_mask shape: {inputs['attention_mask'].shape}")

    # NOTE(review): the notebook's shared `model` is a Blip2Model, which is
    # not the captioning class; text generation is provided by
    # Blip2ForConditionalGeneration, so a generation model is loaded from
    # the same checkpoint for this step only.
    from transformers import Blip2ForConditionalGeneration

    generation_model = Blip2ForConditionalGeneration.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float16, cache_dir=CACHE_DIR
    ).to(DEVICE)

    with torch.no_grad():
        # max_new_tokens bounds only the generated continuation, independent
        # of how many prompt/image tokens the processor produced.
        outputs = generation_model.generate(**inputs, max_new_tokens=50)
    generated_text = processor.decode(outputs[0], skip_special_tokens=True)
    print(f"\n生成的文本: {generated_text}")

    # Release the second copy of the weights so later cells keep their
    # GPU memory headroom.
    del generation_model
    torch.cuda.empty_cache()

except Exception as e:
    # Notebook-level boundary: report the failure with a full traceback
    # instead of aborting the session.
    print(f"處理圖像時出錯: {e}")
    traceback.print_exc()

## 6. 測試 Vision Model

In [None]:
# Exercise the vision encoder on its own with a synthetic image and report
# the shapes of its outputs.
import traceback

try:
    # Build a solid-colour test image and preprocess it.
    test_image = Image.new('RGB', (224, 224), color='blue')
    pixel_values = processor(images=test_image, return_tensors="pt")['pixel_values']
    # Match both the device and the dtype of the model: the checkpoint is
    # loaded in float16, and float32 pixel values would be rejected by the
    # half-precision weights.
    pixel_values = pixel_values.to(device=DEVICE, dtype=model.dtype)

    print(f"輸入 pixel_values shape: {pixel_values.shape}")

    with torch.no_grad():
        vision_outputs = model.vision_model(pixel_values)

    print("\nVision Model 輸出:")
    print(f"  - last_hidden_state shape: {vision_outputs.last_hidden_state.shape}")
    # The attribute may be absent or None; report 'N/A' in both cases
    # instead of failing on `None.shape`.
    pooler_output = getattr(vision_outputs, 'pooler_output', None)
    print(f"  - pooler_output shape: {pooler_output.shape if pooler_output is not None else 'N/A'}")

    # Break the feature tensor's shape into its three dimensions.
    B, N, D = vision_outputs.last_hidden_state.shape
    print(f"  - Batch size: {B}")
    print(f"  - Sequence length: {N}")
    print(f"  - Hidden dimension: {D}")

except Exception as e:
    # Notebook-level boundary: report the failure with a full traceback.
    print(f"測試 Vision Model 時出錯: {e}")
    traceback.print_exc()

## 7. 測試 Q-Former 與文本編碼的組合

In [None]:
# Experiment: embed a sentence with the language model's input embedding
# layer and feed those embeddings to the Q-Former as its cross-attention
# input (encoder_hidden_states).
import traceback

try:
    if not hasattr(model, 'language_model'):
        print("模型沒有 language_model 屬性")
    else:
        print("語言模型類型:", type(model.language_model))

        # Tokenise a sample sentence inside this cell so that it does not
        # rely on input_ids / attention_mask left behind by another cell.
        sample_text = "A beautiful landscape with mountains and trees"
        text_inputs = processor(text=sample_text, return_tensors="pt", padding=False, truncation=True, max_length=128)
        input_ids = text_inputs['input_ids'].to(DEVICE)
        attention_mask = text_inputs['attention_mask'].to(DEVICE)

        with torch.no_grad():
            if hasattr(model.language_model, 'get_input_embeddings'):
                # Token embeddings from the language model's embedding layer.
                text_embeddings = model.language_model.get_input_embeddings()(input_ids)
                print(f"文本嵌入 shape: {text_embeddings.shape}")

                # The learned query tokens are a parameter of the BLIP-2
                # model itself, not of model.qformer; looking them up on the
                # Q-Former made this branch silently unreachable.
                query_tokens = getattr(model, 'query_tokens', None)
                if query_tokens is None:
                    print("模型沒有 query_tokens 屬性，跳過 Q-Former 測試")
                else:
                    batch_size = input_ids.shape[0]
                    query_embeds = query_tokens.expand(batch_size, -1, -1).to(DEVICE)

                    # NOTE(review): the Q-Former cross-attention is
                    # presumably sized for the vision encoder's hidden
                    # width; if the text embedding width differs, this call
                    # raises and the error is reported by the handler below.
                    qformer_outputs = model.qformer(
                        query_embeds=query_embeds,
                        encoder_hidden_states=text_embeddings,
                        encoder_attention_mask=attention_mask
                    )
                    print("\nQ-Former 與文本編碼結合成功!")
                    print(f"  - last_hidden_state shape: {qformer_outputs.last_hidden_state.shape}")
                    print(f"  - 平均池化後 shape: {qformer_outputs.last_hidden_state.mean(dim=1).shape}")

except Exception as e:
    # Notebook-level boundary: report the failure with a full traceback.
    print(f"測試 Q-Former 與文本編碼時出錯: {e}")
    traceback.print_exc()

## 8. 檢查模型配置和參數

In [None]:
# Report the hidden sizes of the model's components.
print("模型配置:")
print(f"  - Vision model hidden size: {model.vision_model.config.hidden_size}")
print(f"  - Q-Former hidden size: {model.qformer.config.hidden_size}")
language_model = getattr(model, 'language_model', None)
if hasattr(language_model, 'config'):
    print(f"  - Language model hidden size: {language_model.config.hidden_size}")

# Show selected Q-Former configuration fields, skipping any that are absent.
print("\nQ-Former 配置詳情:")
qformer_config = model.qformer.config
config_keys = ('hidden_size', 'num_attention_heads', 'num_hidden_layers', 'intermediate_size')
for key in config_keys:
    if not hasattr(qformer_config, key):
        continue
    print(f"  - {key}: {getattr(qformer_config, key)}")

# Count all Q-Former parameters, and those that require gradients, in a
# single pass over the parameter list.
print("\nQ-Former 參數統計:")
total_params = 0
trainable_params = 0
for param in model.qformer.parameters():
    param_count = param.numel()
    total_params += param_count
    if param.requires_grad:
        trainable_params += param_count
print(f"  - 總參數數: {total_params:,}")
print(f"  - 可訓練參數數: {trainable_params:,}")

## 9. 測試不同長度的文本輸入

In [None]:
# Tokenise texts of increasing length and report the resulting tensor shapes.
test_texts = [
    "Short text",
    "This is a medium length text description for testing",
    "This is a very long text description that contains many words and should test the model's ability to handle longer sequences with proper padding and truncation mechanisms",
]

print("測試不同長度的文本輸入:\n")
for i, text in enumerate(test_texts):
    word_count = len(text.split())
    print(f"文本 {i+1} (長度: {word_count} 詞): {text}")

    # No padding, so each sequence keeps its own tokenised length
    # (truncated at 128 tokens).
    text_inputs = processor(
        text=text,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=128,
    )
    input_ids = text_inputs['input_ids'].to(DEVICE)
    attention_mask = text_inputs['attention_mask'].to(DEVICE)

    # Emit the three report lines in one call, then a blank separator line.
    report_lines = [
        f"  - input_ids shape: {input_ids.shape}",
        f"  - attention_mask shape: {attention_mask.shape}",
        f"  - 實際長度: {input_ids.shape[1]}",
    ]
    print("\n".join(report_lines))
    print()

## 10. 測試 Q-Former 的正確使用方式（針對文本編碼）

In [None]:
# Determine whether the Q-Former can be used to encode text by feeding it
# language-model token embeddings. The outcome informs how the text_forward
# method in train_blip2.py should be fixed.

test_text = "A beautiful landscape with mountains and trees"
text_inputs = processor(text=test_text, return_tensors="pt", padding=False, truncation=True, max_length=128)
input_ids = text_inputs['input_ids'].to(DEVICE)
attention_mask = text_inputs['attention_mask'].to(DEVICE)

print("測試 Q-Former 用於文本編碼的不同方法:\n")

# Tracks whether method 1 actually ran to completion, so that the summary
# below reports the observed result instead of a fixed conclusion.
method_1_succeeded = False

# Method 1: take text embeddings from the language model's input embedding
# layer and pass them to the Q-Former as encoder_hidden_states.
if hasattr(model, 'language_model') and hasattr(model.language_model, 'get_input_embeddings'):
    print("方法 1: 使用語言模型的輸入嵌入")
    try:
        with torch.no_grad():
            text_embeddings = model.language_model.get_input_embeddings()(input_ids)

            # The learned query tokens are a parameter of the BLIP-2 model
            # itself, not of model.qformer; looking them up on the Q-Former
            # made this branch silently unreachable.
            query_tokens = getattr(model, 'query_tokens', None)
            if query_tokens is None:
                print("  跳過: 模型沒有 query_tokens 屬性")
            else:
                batch_size = input_ids.shape[0]
                query_embeds = query_tokens.expand(batch_size, -1, -1).to(DEVICE)

                # NOTE(review): the Q-Former cross-attention is presumably
                # sized for the vision encoder's hidden width; a mismatch
                # with the text embedding width raises here and is reported
                # as a failure below.
                qformer_outputs = model.qformer(
                    query_embeds=query_embeds,
                    encoder_hidden_states=text_embeddings,
                    encoder_attention_mask=attention_mask
                )

                # Mean-pool over the query dimension to get one vector per text.
                text_features = qformer_outputs.last_hidden_state.mean(dim=1)
                print(f"  成功! 輸出特徵 shape: {text_features.shape}")
                print("  這是在 train_blip2.py 中應該使用的方法")
                method_1_succeeded = True
    except Exception as e:
        print(f"  失敗: {e}")

print("\n總結:")
print("Q-Former 需要 query_embeds 參數，可以從 model.query_tokens 獲取")
if method_1_succeeded:
    print("文本應該先通過語言模型的嵌入層轉換為 embeddings，然後作為 encoder_hidden_states 傳入")
else:
    print("方法 1 未成功，請勿依據此方法修改 train_blip2.py")