In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch

from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images

# Run on the GPU when one is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Autocast dtype: bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+),
# older GPUs use float16. torch.cuda.get_device_capability() raises when no CUDA
# device is present, so it is only queried on the CUDA path. On CPU the value is
# unused because autocast is disabled below.
if device == "cuda" and torch.cuda.get_device_capability()[0] >= 8:
    dtype = torch.bfloat16
else:
    dtype = torch.float16

# Initialize the model and load the pretrained weights.
# This will automatically download the model weights the first time it's run, which may take a while.
model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
# This notebook only runs inference, so make evaluation mode explicit.
model.eval()

# Load and preprocess example images (flower scene from LLFF dataset)
image_names = [
    "examples/llff_flower/images/000.png",
    "examples/llff_flower/images/005.png",
    "examples/llff_flower/images/010.png",
]
images = load_and_preprocess_images(image_names).to(device)

# Mixed precision is only enabled on CUDA; on CPU the context manager is a no-op
# and the forward pass runs in the model's native precision.
with torch.no_grad(), torch.autocast(device_type="cuda", dtype=dtype, enabled=(device == "cuda")):
    # Predict attributes including cameras, depth maps, and point maps.
    predictions = model(images)

In [None]:
# Summarize what the model returned: the key names first, then the shape of each entry.
divider = "=" * 60

print(divider)
print("Predictions 包含的所有 keys:")
print(divider)
for name in predictions:
    print(f"  • {name}")

print("\n" + divider)
print("各個預測結果的形狀 (shape):")
print(divider)
for name, entry in predictions.items():
    # Tensors report their dimensions and lists report their length;
    # entries of any other type are skipped.
    if isinstance(entry, list):
        print(f"{name:20s}: list with {len(entry)} items")
    elif isinstance(entry, torch.Tensor):
        print(f"{name:20s}: {list(entry.shape)}")

# Legend for the dimension letters used in the shapes above.
print("\n" + divider)
print("說明:")
print(divider)
for legend_line in (
    "  B = Batch size (批次大小)",
    "  S = Sequence length (圖片數量)",
    "  H = Height (圖片高度)",
    "  W = Width (圖片寬度)",
):
    print(legend_line)

# VGGT Demo - 花卉場景 3D 重建

這個 notebook 展示如何使用 VGGT 模型進行 3D 場景重建。

## Predictions 字典內容說明：

- **`pose_enc`**: 相機姿態編碼 [B, S, 9] - 每張圖片 9 個數值：平移 (3) + 四元數旋轉 (4) + 視場角 FoV (2)
- **`depth`**: 深度圖 [B, S, H, W, 1] - 預測每個像素的深度
- **`depth_conf`**: 深度置信度 [B, S, H, W] - 深度預測的可信度
- **`world_points`**: 3D 世界座標 [B, S, H, W, 3] - 每個像素的 3D 位置 (x, y, z)
- **`world_points_conf`**: 點雲置信度 [B, S, H, W] - 3D 點的可信度
- **`images`**: 原始輸入圖片 [B, S, 3, H, W] - 經過預處理後送入模型的 RGB 圖片，數值範圍為 [0, 1]

其中:
- B = Batch size (批次大小，通常為 1)
- S = Sequence length (圖片數量)
- H, W = 圖片的高度和寬度

In [None]:
# Display the input images that the model returns alongside its predictions.
original_images = predictions['images'].cpu().numpy()  # [B, S, 3, H, W]

# Derive the number of views from the data instead of hard-coding 3, so the
# cell keeps working when the list of input images changes.
num_views = original_images.shape[1]

# squeeze=False keeps `axes` two-dimensional even when there is a single view.
fig, axes = plt.subplots(1, num_views, figsize=(5 * num_views, 5), squeeze=False)
fig.suptitle('原始輸入圖片 (Original Input Images)', fontsize=16, fontweight='bold')

for i in range(num_views):
    # Convert channel-first [3, H, W] to channel-last [H, W, 3] for imshow,
    # then clamp to the [0, 1] range.
    img = np.clip(original_images[0, i].transpose(1, 2, 0), 0, 1)

    ax = axes[0, i]
    ax.imshow(img)
    ax.set_title(f'Image {i}: {image_names[i].split("/")[-1]}')
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Visualize the predicted 3D world points: one row per coordinate (X, Y, Z)
# plus a final row with the per-pixel confidence, one column per input view.
world_points = predictions['world_points'].cpu().numpy()  # [B, S, H, W, 3]
world_points_conf = predictions['world_points_conf'].cpu().numpy()  # [B, S, H, W]

axis_names = ['X', 'Y', 'Z']
num_views = world_points.shape[1]
num_rows = len(axis_names) + 1  # X, Y, Z and the confidence map

# squeeze=False keeps `axes` two-dimensional even when there is a single view.
fig, axes = plt.subplots(
    num_rows, num_views, figsize=(5 * num_views, 5 * num_rows), squeeze=False
)
fig.suptitle('3D World Points - XYZ 分量', fontsize=16, fontweight='bold')

for i in range(num_views):
    wp = world_points[0, i]  # [H, W, 3]

    # One panel per coordinate. Previously only the X component was drawn even
    # though the figure is titled as showing all three.
    for j, axis_name in enumerate(axis_names):
        ax = axes[j, i]
        im = ax.imshow(wp[:, :, j], cmap='RdBu')
        ax.set_title(f'Image {i}: {axis_name} coordinate')
        ax.axis('off')
        plt.colorbar(im, ax=ax, fraction=0.046)

    # Confidence of the 3D points goes in the last row.
    ax = axes[num_rows - 1, i]
    im = ax.imshow(world_points_conf[0, i], cmap='hot')
    ax.set_title(f'Image {i}: World Points Confidence')
    ax.axis('off')
    plt.colorbar(im, ax=ax, fraction=0.046)

plt.tight_layout()
plt.show()

In [None]:
# 可視化深度圖
import matplotlib.pyplot as plt
import numpy as np

# Depth predictions and their per-pixel confidence, moved to host memory for plotting.
depth_maps = predictions['depth'].cpu().numpy()  # [B, S, H, W, 1]
depth_conf = predictions['depth_conf'].cpu().numpy()  # [B, S, H, W]

# Derive the number of views from the data instead of hard-coding 3, so the
# cell keeps working when the list of input images changes.
num_views = depth_maps.shape[1]

# Top row: depth maps. Bottom row: confidence maps. One column per view.
# squeeze=False keeps `axes` two-dimensional even when there is a single view.
fig, axes = plt.subplots(2, num_views, figsize=(5 * num_views, 10), squeeze=False)
fig.suptitle('Depth Maps (深度圖) - 花卉場景', fontsize=16, fontweight='bold')

for i in range(num_views):
    # Depth map; the trailing singleton channel is dropped for imshow.
    depth = depth_maps[0, i, :, :, 0]
    im1 = axes[0, i].imshow(depth, cmap='turbo')
    axes[0, i].set_title(f'Image {i}: Depth Map')
    axes[0, i].axis('off')
    plt.colorbar(im1, ax=axes[0, i], fraction=0.046)

    # Depth confidence.
    conf = depth_conf[0, i]
    im2 = axes[1, i].imshow(conf, cmap='hot')
    axes[1, i].set_title(f'Image {i}: Depth Confidence')
    axes[1, i].axis('off')
    plt.colorbar(im2, ax=axes[1, i], fraction=0.046)

plt.tight_layout()
plt.show()

In [None]:
# Decode the predicted pose encoding into camera extrinsic and intrinsic matrices.
from vggt.utils.pose_enc import pose_encoding_to_extri_intri

pose_enc = predictions['pose_enc']  # [B, S, 9]
extrinsic, intrinsic = pose_encoding_to_extri_intri(
    pose_enc,
    predictions['images'].shape[-2:]  # (H, W)
)

print("=" * 60)
print("相機參數:")
print("=" * 60)
# Per the VGGT documentation the extrinsic is a 3x4 [R|t] matrix, not 4x4.
print(f"Extrinsic (外參矩陣) shape: {extrinsic.shape}")  # [B, S, 3, 4]
print(f"Intrinsic (內參矩陣) shape: {intrinsic.shape}")  # [B, S, 3, 3]

print("\n第一張圖片的內參矩陣 (K matrix):")
print(intrinsic[0, 0].cpu().numpy())

# VGGT follows the OpenCV convention: the extrinsic maps world coordinates into
# the camera frame (camera-from-world). The label previously stated the opposite.
print("\n第一張圖片的外參矩陣 (世界到相機坐標轉換, camera-from-world):")
print(extrinsic[0, 0].cpu().numpy())