In [None]:
import torch
from transformers import Blip2Processor
from src.models.surroundblip import SurroundBlip

# 1) Environment setup
model_name = "Salesforce/blip2-opt-2.7b"

# Pick the accelerator that is actually present. The availability probe must
# match the backend being selected: CUDA first, then Apple-silicon MPS, and
# finally plain CPU as the fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Half precision is only used on an accelerator; CPU support for float16
# kernels is limited, so fall back to float32 there.
dtype = torch.float32 if device == "cpu" else torch.float16

processor = Blip2Processor.from_pretrained(model_name)
model = SurroundBlip.from_pretrained(model_name, torch_dtype=dtype).to(device)
model.eval()  # inference mode: disables dropout and similar train-only behavior

# 2) Create dummy image patches (B=1, P=8, C=3, H=W=224)
B, P, C, H, W = 1, 8, 3, 224, 224
# Sample in the default dtype, then cast to the model's parameter dtype so the
# input matches the weights instead of relying on the model to cast internally.
# NOTE(review): assumes SurroundBlip exposes `.dtype` like a Hugging Face
# PreTrainedModel -- confirm against src.models.surroundblip.
dummy_pixel_values = torch.randn(B, P, C, H, W, device=device).to(model.dtype)

# 3) Dummy text input
# The prompt is tokenized and padded out to a fixed length of 32 tokens.
prompt = "Question: 이 장면에 무엇이 있나요? Answer:"
text_inputs = processor(text=prompt, return_tensors="pt", padding="max_length", max_length=32)

# Move both tensors onto the same device as the model.
input_ids, attention_mask = (
    text_inputs[field].to(device) for field in ("input_ids", "attention_mask")
)

# 4) Forward pass
#    With return_dict=True the result is expected to be a ModelOutput-style
#    object (attribute access) rather than a plain tuple.
forward_kwargs = {
    "pixel_values": dummy_pixel_values,  # (1, 8, 3, 224, 224)
    "input_ids": input_ids,              # expected (1, 32) given max_length=32
    "attention_mask": attention_mask,    # same shape as input_ids
    "interpolate_pos_encoding": True,
    "use_cache": False,
    "return_dict": True,
}
# Gradients are not needed for this smoke test.
with torch.no_grad():
    outputs = model(**forward_kwargs)

# 5) Inspect the outputs
#    NOTE(review): `outputs` is presumably a
#    Blip2ForConditionalGenerationModelOutput -- confirm against SurroundBlip.
#    No labels were passed to the forward call, so loss is expected to be None.
print(f"loss:  {outputs.loss}")
print(f"logits.shape: {outputs.logits.shape}")
#   -> presumably (batch_size, seq_len, vocab_size); the exact seq_len and
#      vocab_size depend on the language model and on how SurroundBlip merges
#      query tokens with the text -- verify against the printed value.

# Vision encoder output
vision_out = outputs.vision_outputs
print(f"vision last_hidden_state.shape: {vision_out.last_hidden_state.shape}")
#   -> presumably (B*P, S_img, D) or (B, P, S_img, D), depending on whether the
#      patch axis is flattened -- TODO confirm.

# Q-Former output
qformer_out = outputs.qformer_outputs
print(f"qformer last_hidden_state.shape: {qformer_out.last_hidden_state.shape}")
#   -> presumably (B, Q, D) with Q query tokens -- TODO confirm.

# Language model logits (first element of the language model outputs)
lm_first = outputs.language_model_outputs[0]
print(f"language_model_outputs[0].shape: {lm_first.shape}")
#   -> presumably (B, seq_len_out, vocab_size)

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fetching 2 files: 100%|██████████| 2/2 [01:32<00:00, 46.27s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:23<00:00, 11.93s/it]


pixel_values torch.Size([1, 8, 3, 224, 224])
