In [None]:
# Mount Google Drive at /content/drive; the paths used by the later cells live under it.
from google.colab import drive
drive.mount('/content/drive')

**Photo extraction**

In [None]:
# Refresh the apt package index, then install ffmpeg (used by the frame-extraction cell below).
!apt-get update
!apt-get install -y ffmpeg

In [None]:
from google.colab import files

# Open Colab's upload dialog and keep its return value in `uploaded`.
# NOTE(review): the ffmpeg cell below hard-codes `bedroom-1-square.mov` and never reads `uploaded`.
uploaded = files.upload()

The extracted frames are expected to be stored in `/content/drive/MyDrive/vggt/vggt/final/images`.

In [None]:
# Create the frame folder, then sample the video at 0.6 frames per second into numbered JPEGs.
!mkdir -p /content/drive/MyDrive/vggt/vggt/final/images
!ffmpeg -i bedroom-1-square.mov -vf fps=0.6 /content/drive/MyDrive/vggt/vggt/final/images/frame_%04d.jpg

**VGGT (generate the point cloud)**

In [None]:
# Switch the working directory; the relative `requirements.txt` and script paths below resolve from here.
%cd /content/drive/MyDrive/vggt/vggt/vggt

install requirements

In [None]:
# Install the packages listed in the requirements.txt of the current working directory.
!pip install -r requirements.txt

In [None]:
# Additional packages, installed without version pins (open3d is imported by the reconstruction cell below).
!pip install trimesh pycolmap hydra-core einops safetensors huggingface_hub open3d

get pointcloud

In [None]:
import os
import cv2
import torch
import numpy as np
import subprocess
from torchvision import transforms
from vggt.utils.pose_enc import pose_encoding_to_extri_intri
from vggt.utils.geometry import unproject_depth_map_to_point_map
from vggt.models.vggt import VGGT
from vggt.utils.load_fn import load_and_preprocess_images
import open3d as o3d
import torch, gc
from vggt.models.vggt import VGGT
from PIL import Image

# Free memory left over from a previous run of this cell before loading the model again.
gc.collect()
if torch.cuda.is_available():
    # These CUDA housekeeping calls are only meaningful (and only safe) when a GPU is present.
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 is supported on Ampere GPUs (Compute Capability 8.0+); older GPUs use float16.
# The capability query needs a CUDA device, so it is skipped on the CPU fallback.
if device == "cuda" and torch.cuda.get_device_capability()[0] >= 8:
    dtype = torch.bfloat16
else:
    dtype = torch.float16
model = VGGT.from_pretrained("facebook/VGGT-1B").to(device)
# Folder with the extracted frames; sorting keeps frame_0001, frame_0002, ... in sequence.
import os
image_folder = "/content/drive/MyDrive/vggt/vggt/final/images"
image_paths = sorted([
    os.path.join(image_folder, fname)
    for fname in os.listdir(image_folder)
    if fname.lower().endswith((".jpg", ".jpeg", ".png"))
])
images = load_and_preprocess_images(image_paths, mode="crop").to(device)  # [N, 3, H, W]
images = images.unsqueeze(0)                                              # [1, V, 3, H, W]
mean = torch.tensor([0.485, 0.456, 0.406], device=device)[None,:,None,None]
std  = torch.tensor([0.229, 0.224, 0.225], device=device)[None,:,None,None]

# Build a uint8 copy of the frames for colour lookup.
# The mean/std de-normalisation is applied only when the value range shows the frames are not
# already in [0, 1] (the same check the images_518 export later in this cell uses); applying it
# to [0, 1] data would compress and shift every colour.
inputs = images.clone()
if float(inputs.min()) < 0.0 or float(inputs.max()) > 1.0:
    inputs = inputs * std + mean
inputs = (inputs.clamp(0, 1) * 255).round().to(torch.uint8)  # [1, V, 3, H, W], uint8
# Channels last so pixels can be sampled as padded_imgs[view, row, col].
padded_imgs = inputs.squeeze(0) \
                    .permute(0,2,3,1)      \
                    .cpu().numpy()

# Inference (no gradients needed).
with torch.no_grad():
    # Only the aggregator runs under autocast; the heads below are called outside of it.
    with torch.cuda.amp.autocast(dtype=dtype):
        aggregated_tokens_list, ps_idx = model.aggregator(images)

    # Predict cameras.
    pose_enc = model.camera_head(aggregated_tokens_list)[-1]
    # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world).
    extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])

    # Predict depth maps. The 3D points are built further down from depth maps + cameras,
    # which usually leads to more accurate 3D points than the point-map branch.
    depth_map, depth_conf = model.depth_head(aggregated_tokens_list, images, ps_idx)

# Drop the leading batch axis of each prediction and move it to host memory as a NumPy array.
# Shapes per the original notes: depth [V, H, W, 1], extrinsic [V, 3, 4], intrinsic [V, 3, 3].
depth_map, extrinsic, intrinsic = (
    prediction.squeeze(0).cpu().numpy()
    for prediction in (depth_map, extrinsic, intrinsic)
)

# One flattened (H*W, 3) world-space point cloud per view. Each view is passed to the
# unprojection helper as a length-1 slice and the single result is taken back out with [0].
all_pointclouds = [
    unproject_depth_map_to_point_map(
        depth_map[view:view + 1],
        extrinsic[view:view + 1],
        intrinsic[view:view + 1],
    )[0].reshape(-1, 3)
    for view in range(depth_map.shape[0])
]

# Merge the per-view point clouds into one array.
merged_points = np.concatenate(all_pointclouds, axis=0)
print(f"Total points: {merged_points.shape[0]}")

# Save as .ply. The target folder is created here because nothing earlier in the notebook
# creates it, and the writer's boolean result is checked so a failed write is not reported
# as a success.
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(merged_points)
merged_ply_path = "/content/drive/MyDrive/vggt/vggt/final/sparse/merged_pointcloud.ply"
os.makedirs(os.path.dirname(merged_ply_path), exist_ok=True)
if not o3d.io.write_point_cloud(merged_ply_path, pcd):
    raise RuntimeError(f"Failed to write point cloud to {merged_ply_path}")
print("✅ 已保存点云到 merged_pointcloud.ply")


import os, math, gc
import numpy as np

try:
    from sklearn.neighbors import KDTree
except ImportError:
    # scikit-learn is not installed: install it into the running interpreter and retry.
    # Only ImportError is handled so that unrelated failures are not masked by a pip install.
    import sys, subprocess
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "scikit-learn"], check=True)
    from sklearn.neighbors import KDTree

from read_write_model import Camera, Image as COLImage, Point3D, write_model

def rotmat_to_qvec(R: np.ndarray) -> np.ndarray:
    """Convert a 3x3 rotation matrix to a unit quaternion ordered (qw, qx, qy, qz).

    Args:
        R: 3x3 array indexed as R[row, col].

    Returns:
        Length-4 float array, normalised to unit length.
    """
    q = np.zeros(4, dtype=float)
    trace = np.trace(R)
    if trace > 0:
        # Positive trace: the scalar part is the largest component.
        S = np.sqrt(trace + 1.0) * 2.0
        q[0] = 0.25 * S
        q[1] = (R[2,1] - R[1,2]) / S
        q[2] = (R[0,2] - R[2,0]) / S
        q[3] = (R[1,0] - R[0,1]) / S
    else:
        # Otherwise pivot on the largest diagonal entry; (i, j, k) is a cyclic permutation of the axes.
        i = int(np.argmax([R[0,0], R[1,1], R[2,2]]))
        j, k = (i + 1) % 3, (i + 2) % 3
        first, second = sorted((j, k))
        S = np.sqrt(1.0 + R[i,i] - R[first,first] - R[second,second]) * 2.0
        q[0] = (R[k,j] - R[j,k]) / S
        q[1 + i] = 0.25 * S
        q[1 + j] = (R[i,j] + R[j,i]) / S
        q[1 + k] = (R[i,k] + R[k,i]) / S
    return q / np.linalg.norm(q)

# Output directory for the COLMAP sparse model.
colmap_out = "/content/drive/MyDrive/vggt/vggt/final/sparse"
os.makedirs(colmap_out, exist_ok=True)

V = extrinsic.shape[0]
Hc, Wc = padded_imgs.shape[1:3]      # size of the preprocessed frames (518, 518 per the original note)

# 1) One PINHOLE camera and one image pose per view, at the preprocessed resolution (no rescaling).
#    COLMAP ids are 1-based, so view index n maps to id n + 1.
cameras = {}
images_dict = {}
for view_idx in range(V):
    colmap_id = view_idx + 1
    K_view = intrinsic[view_idx]
    pose = extrinsic[view_idx]

    cameras[colmap_id] = Camera(
        id=colmap_id,
        model="PINHOLE",
        width=Wc,
        height=Hc,
        params=[
            float(K_view[0,0]),   # fx
            float(K_view[1,1]),   # fy
            float(K_view[0,2]),   # cx
            float(K_view[1,2]),   # cy
        ],
    )

    images_dict[colmap_id] = COLImage(
        id=colmap_id,
        qvec=rotmat_to_qvec(pose[:, :3]),
        tvec=pose[:, 3],
        camera_id=colmap_id,
        name=os.path.basename(image_paths[view_idx]),   # original file name of the frame
        xys=[],
        point3D_ids=[],
    )

# 2) Project every view's points back into that view and collect (xyz, view id, pixel u/v, colour).
STRIDE = 3                     # keep every 3rd point to limit the problem size
xyz_chunks, obs_chunks, color_chunks = [], [], []

for view_idx in range(V):
    world_pts = all_pointclouds[view_idx][::STRIDE]            # (N, 3) subsample
    pose = extrinsic[view_idx]

    # World -> camera.
    cam_pts = (pose[:, :3] @ world_pts.T) + pose[:, 3][:, None]   # (3, N)

    # Discard points that are not in front of the camera.
    in_front = cam_pts[2] > 1e-6
    if not np.any(in_front):
        continue
    world_pts = world_pts[in_front]
    cam_pts = cam_pts[:, in_front]

    # Camera -> pixel, rounded to the nearest integer pixel.
    pixels = (intrinsic[view_idx] @ cam_pts)[:2] / cam_pts[2:3]   # (2, Nv)
    col_idx = np.round(pixels[0]).astype(int)
    row_idx = np.round(pixels[1]).astype(int)

    # Keep only projections that land inside the frame.
    inside = (col_idx >= 0) & (col_idx < Wc) & (row_idx >= 0) & (row_idx < Hc)
    if not np.any(inside):
        continue
    world_pts = world_pts[inside]
    col_idx = col_idx[inside]
    row_idx = row_idx[inside]

    xyz_chunks.append(world_pts)
    obs_chunks.append(
        np.column_stack([np.full(len(col_idx), view_idx, dtype=int), col_idx, row_idx])
    )
    # Colour lookup (RGB, uint8) at the projected pixel.
    color_chunks.append(padded_imgs[view_idx, row_idx, col_idx].astype(np.uint8))

xyz_all    = np.concatenate(xyz_chunks,   axis=0) if xyz_chunks   else np.empty((0,3))
obs_meta   = np.concatenate(obs_chunks,   axis=0) if obs_chunks   else np.empty((0,3), int)
colors_all = np.concatenate(color_chunks, axis=0) if color_chunks else np.empty((0,3), np.uint8)

print(f"[collect] candidates: {len(xyz_all):,}")

if len(xyz_all) == 0:
    raise RuntimeError("没有收集到任何候选点，检查 all_pointclouds / 投影步骤。")

# 3) Radius clustering in world space: nearby candidates are merged into a single 3D track.
scene_scale = np.linalg.norm(
    np.percentile(xyz_all, 97, axis=0) - np.percentile(xyz_all, 3, axis=0)
)
R_CLUSTER = max(1e-3, 0.005 * scene_scale)   # adaptive radius: 0.5% of the scene extent
print(f"[cluster] scene_scale≈{scene_scale:.3f}, radius={R_CLUSTER:.4f}")

tree    = KDTree(xyz_all)
visited = np.zeros(len(xyz_all), dtype=bool)

points3D = {}
pid = 1
kept = 0

for i in range(len(xyz_all)):
    if visited[i]:
        continue
    # NOTE(review): the radius query also returns neighbours that an earlier cluster already
    # marked as visited, so a candidate can contribute to more than one track — confirm intended.
    idx = tree.query_radius(xyz_all[i:i+1], r=R_CLUSTER, count_only=False)[0]
    visited[idx] = True

    # Views that contributed candidates to this cluster.
    vids = obs_meta[idx, 0]
    uniq_views = np.unique(vids)

    # A valid track needs observations from at least 2 different views.
    if len(uniq_views) < 2:
        continue

    # 3D position and colour are the cluster means (colour computed once, kept as a NumPy array).
    xyz = xyz_all[idx].mean(axis=0)
    rgb_np = colors_all[idx].mean(axis=0).astype(np.uint8)

    image_ids = []
    p2d_idxs  = []

    # Per view, the mean pixel coordinate of the cluster's candidates is the representative observation.
    for v in uniq_views:
        sel = (vids == v)
        u_mean = float(obs_meta[idx[sel], 1].astype(float).mean())
        v_mean = float(obs_meta[idx[sel], 2].astype(float).mean())

        img_id = int(v) + 1
        # Append the 2D observation to the image and remember its index for the track.
        p2d_idx = len(images_dict[img_id].xys)
        images_dict[img_id].xys.append([u_mean, v_mean])
        images_dict[img_id].point3D_ids.append(pid)

        image_ids.append(img_id)
        p2d_idxs.append(p2d_idx)

    points3D[pid] = Point3D(
        id=pid,
        xyz=xyz,
        rgb=rgb_np,
        error=0.0,
        image_ids=np.array(image_ids, dtype=np.int32),
        point2D_idxs=np.array(p2d_idxs, dtype=np.int32)
    )
    pid  += 1
    kept += 1

print(f"[tracks] kept 3D points: {kept:,}, pid_max={pid-1}")

# Convert the per-image observation lists to NumPy arrays before writing.
images_dict_np = {}
for img in images_dict.values():
    # reshape keeps the (N, 2) layout even for an image without observations.
    xys_arr  = np.asarray(img.xys, dtype=np.float64).reshape(-1, 2)   # (N,2)
    pids_arr = np.asarray(img.point3D_ids, dtype=np.int64)            # (N,)
    images_dict_np[img.id] = COLImage(
        id=img.id,
        qvec=img.qvec,              # already a NumPy vector
        tvec=img.tvec,              # already a NumPy vector
        camera_id=img.camera_id,
        name=img.name,
        xys=xys_arr,
        point3D_ids=pids_arr,
    )
# 4) Write the model (bin or txt). The array-backed images built above are the ones written;
#    previously they were built but the list-backed `images_dict` was passed instead.
write_model(cameras, images_dict_np, points3D, colmap_out, ext=".bin")
# To also export a human-readable txt copy, write it a second time:
# write_model(cameras, images_dict_np, points3D, colmap_out, ext=".txt")
### Export the preprocessed frames (images_518)
out_518 = "/content/drive/MyDrive/vggt/vggt/final/images_518"
os.makedirs(out_518, exist_ok=True)

# [V, 3, H, W] on the host, taken from `images` (not from `inputs`).
imgs_518 = images.squeeze(0).detach().cpu()

# De-normalise once, and only if the value range shows the data is (x - mean) / std
# rather than already in [0, 1].
imin = float(imgs_518.min())
imax = float(imgs_518.max())
if imin < 0.0 or imax > 1.0:
    # mean/std shaped [3, 1, 1] so they broadcast over H and W
    mean_ = torch.tensor([0.485, 0.456, 0.406]).view(3,1,1)
    std_  = torch.tensor([0.229, 0.224, 0.225]).view(3,1,1)
    imgs_518 = (imgs_518 * std_ + mean_).clamp(0, 1)

# uint8, channels-last, ready for PIL.
padded_imgs_uint8 = (
    (imgs_518 * 255.0)
    .round()
    .to(torch.uint8)
    .permute(0,2,3,1)
    .numpy()
)

# Save each frame under the same file name as its input image.
save_paths = []
for i, in_path in enumerate(image_paths):
    out_path = os.path.join(out_518, os.path.basename(in_path))
    frame = Image.fromarray(padded_imgs_uint8[i])
    frame.save(out_path, quality=95, subsampling=0)
    save_paths.append((in_path, out_path))

print(f"✅ 已保存 {len(save_paths)} 张 518×518 图片到: {out_518}")


CANVAS = 518

def letterbox_rect(h, w, canvas=518):
    """Return the centred inner rectangle of an h x w image letterboxed into a square canvas.

    The longer side is scaled to `canvas` and the result is centred; per the original note
    this is meant to follow VGGT's pad rule.

    Args:
        h: source image height.
        w: source image width.
        canvas: side length of the square canvas.

    Returns:
        Tuple (top, left, nh, nw): offset and size of the scaled image inside the canvas.
    """
    scale = canvas / max(h, w)
    inner_h, inner_w = int(round(h * scale)), int(round(w * scale))
    return (canvas - inner_h) // 2, (canvas - inner_w) // 2, inner_h, inner_w

# Rebuild, for every readable input frame, a CANVAS x CANVAS mask of its letterbox area.
# NOTE(review): the mask `m` and the name `base` are computed but never written or used in
# this cell, so `saved` only counts readable inputs — confirm whether a write call was dropped.
saved = 0
for in_path in image_paths:
    # Only the original image size is needed to recover the inner letterbox rectangle
    img0 = cv2.imread(in_path)  # BGR
    if img0 is None:
        print(f"[warn] cannot read: {in_path}")
        continue
    h0, w0 = img0.shape[:2]

    top, left, nh, nw = letterbox_rect(h0, w0, CANVAS)

    # Build the mask: valid area = 255, padding = 0
    m = np.zeros((CANVAS, CANVAS), dtype=np.uint8)
    m[top:top+nh, left:left+nw] = 255

    # Intended to be saved as a PNG named after the image (a separate directory is suggested)
    base = os.path.splitext(os.path.basename(in_path))[0]

    saved += 1

#obs_per_image = [len(images_dict[i+1].xys) for i in range(V)]
#print("observations per image:", obs_per_image, "  mean=", np.mean(obs_per_image))
#print("✅ COLMAP sparse (with tracks)  已写入：", colmap_out)

# Export the inputs for plane detection: point_map.npy, intrinsic.npy and colour frames.
plane_data_path = "/content/drive/MyDrive/vggt/vggt/final/plane_detection_data"
scene_name = "/content/drive/MyDrive/vggt/vggt/final/images_518"

# NOTE(review): scene_name is an absolute path, so os.path.join() discards plane_data_path and
# output_scene_path is the images_518 folder itself. Left unchanged because the consumer script
# is not visible here — confirm the intended scene folder.
output_scene_path = os.path.join(plane_data_path, scene_name)
os.makedirs(output_scene_path, exist_ok=True)
os.makedirs(os.path.join(output_scene_path, "color"), exist_ok=True)

# 1. Build and save point_map.npy
print("\n[1/3] 保存 point_map.npy...")
# Re-arrange all_pointclouds into [V, H, W, 3]. H and W come from the depth maps the clouds
# were unprojected from, instead of assuming 518 x 518.
V = len(all_pointclouds)
H, W = depth_map.shape[1:3]

point_map = np.zeros((V, H, W, 3), dtype=np.float32)

for vid in range(V):
    # all_pointclouds[vid] is the flattened [H*W, 3] cloud of view `vid`
    pts = all_pointclouds[vid]

    if pts.shape[0] == H * W:
        point_map[vid] = pts.reshape(H, W, 3)
    else:
        # Unexpected size: the view stays all-zero in point_map.
        print(f"  警告：视图 {vid} 的点数量 {pts.shape[0]} 不等于 {H*W}")

np.save(os.path.join(output_scene_path, "point_map.npy"), point_map)
print(f"✓ 保存 point_map.npy: {point_map.shape}")

# 2. Save intrinsic.npy with a leading batch axis
print("\n[2/3] 保存 intrinsic.npy...")
# intrinsic is [V, 3, 3] here and is stored as [1, V, 3, 3]
intrinsic_for_plane = intrinsic[np.newaxis, :, :, :]
np.save(os.path.join(output_scene_path, "intrinsic.npy"), intrinsic_for_plane)
print(f"✓ 保存 intrinsic.npy: {intrinsic_for_plane.shape}")

# 3. Copy the preprocessed colour frames as sequentially numbered PNGs
print("\n[3/3] 复制彩色图像...")
source_518_dir = "/content/drive/MyDrive/vggt/vggt/final/images_518"
target_color_dir = os.path.join(output_scene_path, "color")

# Case-insensitive extension filter, matching how the input frames were listed.
img_518_files = sorted([
    f for f in os.listdir(source_518_dir)
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
])

for i, img_file in enumerate(img_518_files, start=1):
    src = os.path.join(source_518_dir, img_file)
    dst = os.path.join(target_color_dir, f"frame_{str(i).zfill(4)}.png")

    # Re-save as PNG; the context manager closes the source file promptly.
    with Image.open(src) as img:
        img.save(dst)

**Plane adjust**

In [None]:
# Run the plane-detection script from the current working directory (the script is not part of this notebook).
!python plane_detection_v2.py

save data for openMVS

In [None]:
# Move one level up; the download and import in the next cells resolve from this directory.
%cd /content/drive/MyDrive/vggt/vggt

In [None]:
# Download COLMAP's read_write_model.py helper into the current directory (-q hides wget output).
# NOTE(review): the reconstruction cell earlier in the notebook already imports read_write_model,
# so on a fresh runtime this file must exist before that cell runs — confirm.
!wget -q https://raw.githubusercontent.com/colmap/colmap/dev/scripts/python/read_write_model.py

In [None]:
import os, numpy as np
from read_write_model import read_model, write_model, Image as COLImage, Point3D

# ========= Path settings (edit as needed) =========
ORIG_MODEL = "/content/drive/MyDrive/vggt/vggt/final/sparse"        # source model directory (its cameras and image ids/names are reused)
EXT_NPY    = "/content/drive/MyDrive/vggt/vggt/final/plane_detection_data/empty_room_new/extrinsic_adjusted.npy"  # extrinsics, [V,3,4], OpenCV convention: world->cam
PLY_PATH   = "/content/drive/MyDrive/vggt/vggt/final/plane_detection_data/empty_room_new/flattened_full_points.ply"  # edited PLY (its world frame must match the extrinsics)
OUT_MODEL  = "/content/drive/MyDrive/vggt/vggt/final/plane_detection_data/sparse"    # output directory
VOXEL_SIZE = 0.0     # optional voxel down-sampling of the PLY (world units); 0.01~0.05 for very dense clouds, 0.0 disables it
MIN_VIEWS  = 2       # minimum number of distinct views that must observe a 3D point
CHUNK      = 400000  # projection chunk size; lower it when memory is tight
# ====================================

def rotmat_to_qvec(R: np.ndarray) -> np.ndarray:
    """Turn a 3x3 rotation matrix into a normalised quaternion (qw, qx, qy, qz).

    Args:
        R: 3x3 array indexed as R[row, col].

    Returns:
        Length-4 float array of unit norm.
    """
    quat = np.zeros(4, dtype=float)
    tr = np.trace(R)
    if tr > 0:
        # Scalar-dominant case.
        S = np.sqrt(tr + 1.0) * 2.0
        quat[0] = 0.25 * S
        quat[1] = (R[2,1] - R[1,2]) / S
        quat[2] = (R[0,2] - R[2,0]) / S
        quat[3] = (R[1,0] - R[0,1]) / S
    else:
        # Pivot on the largest diagonal element; a, b, c cycle through the three axes.
        a = int(np.argmax([R[0,0], R[1,1], R[2,2]]))
        b = (a + 1) % 3
        c = (a + 2) % 3
        low, high = min(b, c), max(b, c)
        S = np.sqrt(1.0 + R[a,a] - R[low,low] - R[high,high]) * 2.0
        quat[0] = (R[c,b] - R[b,c]) / S
        quat[a + 1] = 0.25 * S
        quat[b + 1] = (R[a,b] + R[b,a]) / S
        quat[c + 1] = (R[a,c] + R[c,a]) / S
    return quat / np.linalg.norm(quat)

# 1) Read the source model: cameras plus each image's id/name/camera_id.
#    Images are processed in ascending image-id order.
cameras, images_in, _points_in = read_model(ORIG_MODEL)
img_ids = sorted(images_in)
V_model = len(img_ids)
print(f"[load] cameras={len(cameras)} images={V_model}")

# 2) Load the extrinsics (OpenCV: world->cam); row k is paired with the k-th image id.
E = np.asarray(np.load(EXT_NPY))
# Accept a [1, V, 3, 4] array by dropping its leading singleton axis.
has_batch_axis = E.ndim == 4 and E.shape[0] == 1 and E.shape[2:] == (3, 4)
E = E[0] if has_batch_axis else E
assert E.ndim == 3 and E.shape[1:] == (3,4), f"extrinsic shape must be [V,3,4], got {E.shape}"
assert E.shape[0] == V_model, f"view count mismatch: model {V_model} vs extrinsic {E.shape[0]}"

# 3) Pull fx, fy, cx, cy and the image size out of each image's camera.
#    COLMAP models that store a single focal length start with (f, cx, cy); every other model
#    starts with (fx, fy, cx, cy). Distortion parameters, if any, are ignored here.
SINGLE_FOCAL_MODELS = {
    "SIMPLE_PINHOLE", "SIMPLE_RADIAL", "RADIAL",
    "SIMPLE_RADIAL_FISHEYE", "RADIAL_FISHEYE",
}
Ks, sizes = {}, {}
for iid in img_ids:
    img = images_in[iid]
    cam = cameras[img.camera_id]
    if cam.model in SINGLE_FOCAL_MODELS:
        fx = fy = cam.params[0]; cx = cam.params[1]; cy = cam.params[2]
    else:
        fx, fy, cx, cy = cam.params[:4]
    K = np.array([[fx, 0,  cx],
                  [0,  fy, cy],
                  [0,  0,   1]], dtype=np.float64)
    Ks[iid] = (K, fx, fy, cx, cy)
    sizes[iid] = (cam.height, cam.width)

# 4) Rebuild every image with the pose from the loaded extrinsics; id, camera_id and name are
#    carried over from the source model, and the observation lists start empty.
images_out = {
    iid: COLImage(
        id=iid,
        qvec=rotmat_to_qvec(E[k, :, :3]),
        tvec=E[k, :, 3],
        camera_id=images_in[iid].camera_id,
        name=images_in[iid].name,
        xys=[],
        point3D_ids=[],
    )
    for k, iid in enumerate(img_ids)
}



# 5) Read the PLY (with optional voxel down-sampling).
import open3d as o3d
pcd = o3d.io.read_point_cloud(PLY_PATH)
if VOXEL_SIZE and VOXEL_SIZE > 0:
    pcd = pcd.voxel_down_sample(VOXEL_SIZE)
xyz_all = np.asarray(pcd.points, dtype=np.float64)
if xyz_all.shape[0] == 0:
    raise RuntimeError("PLY has 0 points.")
if pcd.has_colors():
    # Round to the nearest 8-bit level; a bare cast truncates, which biases colours downward
    # and can turn a value stored as k/255 into k-1.
    rgb_all = np.rint(np.asarray(pcd.colors) * 255.0).clip(0, 255).astype(np.uint8)
else:
    # No colours in the file: fall back to a uniform light grey.
    rgb_all = np.full((len(xyz_all),3), 200, dtype=np.uint8)
print(f"[ply] points={len(xyz_all)} colors={'yes' if pcd.has_colors() else 'no'}")

# 6) Build tracks by projection: each point collects one observation from every image it lands
#    in with z > 0, and only points seen by at least MIN_VIEWS images are exported.
points3D_out = {}
pid = 1
Rt_list = [(iid, E[k,:,:3], E[k,:,3]) for k, iid in enumerate(img_ids)]
n_views = len(Rt_list)
# A point without any observation is never exported, whatever MIN_VIEWS is set to.
min_views = max(MIN_VIEWS, 1)

N = len(xyz_all)
for start in range(0, N, CHUNK):
    xyz = xyz_all[start:start+CHUNK]
    cols = rgb_all[start:start+CHUNK]
    M = xyz.shape[0]
    # Per-chunk visibility table: seen[j, c] says whether point j projects into the c-th image
    # of Rt_list; pix_u / pix_v hold the rounded pixel of that projection. Each image yields at
    # most one observation per point, so no per-point Python lists or averaging are needed.
    seen  = np.zeros((M, n_views), dtype=bool)
    pix_u = np.zeros((M, n_views), dtype=np.int32)
    pix_v = np.zeros((M, n_views), dtype=np.int32)

    for view_col, (iid, R, t) in enumerate(Rt_list):
        (K, fx, fy, cx, cy) = Ks[iid]
        h, w = sizes[iid]
        pts_cam = (xyz @ R.T) + t[None,:]
        z = pts_cam[:,2]
        valid = z > 1e-6
        if not np.any(valid):
            continue
        x = pts_cam[valid,0]; y = pts_cam[valid,1]; zc = pts_cam[valid,2]
        u = (fx * x / zc) + cx
        v = (fy * y / zc) + cy
        ui = np.round(u).astype(np.int32)
        vi = np.round(v).astype(np.int32)
        inb = (ui>=0)&(ui<w)&(vi>=0)&(vi<h)
        if not np.any(inb):
            continue
        valid_idx = np.where(valid)[0][inb]
        seen[valid_idx, view_col]  = True
        pix_u[valid_idx, view_col] = ui[inb]
        pix_v[valid_idx, view_col] = vi[inb]

    # Export points in chunk order; within a point, images are visited in ascending image id
    # (Rt_list follows the sorted img_ids).
    for j in np.flatnonzero(seen.sum(axis=1) >= min_views):
        this_pid = pid; pid += 1
        image_ids = []; p2d_idxs = []
        for view_col in np.flatnonzero(seen[j]):
            iid = Rt_list[view_col][0]
            img = images_out[iid]
            idx2d = len(img.xys)
            img.xys.append([float(pix_u[j, view_col]), float(pix_v[j, view_col])])
            img.point3D_ids.append(this_pid)
            image_ids.append(iid); p2d_idxs.append(idx2d)

        points3D_out[this_pid] = Point3D(
            id=this_pid,
            xyz=xyz[j].astype(np.float64),
            rgb=cols[j].astype(np.uint8),
            error=0.0,
            image_ids=np.asarray(image_ids, dtype=np.int32),
            point2D_idxs=np.asarray(p2d_idxs, dtype=np.int32),
        )

print(f"[tracks] kept 3D points with >={MIN_VIEWS} views: {len(points3D_out)}")

# 7) Convert the observation lists to NumPy arrays and write the binary model.
images_out_cls = {}
for iid, img in images_out.items():
    images_out_cls[iid] = COLImage(
        id=img.id, qvec=img.qvec, tvec=img.tvec,
        camera_id=img.camera_id, name=img.name,
        # reshape keeps the (N, 2) layout even for an image without observations
        xys=np.asarray(img.xys, dtype=np.float64).reshape(-1, 2),
        point3D_ids=np.asarray(img.point3D_ids, dtype=np.int64)
    )

os.makedirs(OUT_MODEL, exist_ok=True)
write_model(cameras, images_out_cls, points3D_out, OUT_MODEL, ext=".bin")
# Only the binary files are written, so the message names exactly those.
print(f"✅ wrote cameras.bin + images.bin + points3D.bin with tracks to: {OUT_MODEL}")

**OpenMVS**

install requirements

In [None]:
# Change into the openMVS folder on Drive.
# NOTE(review): the build cell below clones and builds under /content instead — confirm this folder is needed.
%cd /content/drive/MyDrive/vggt/vggt/openMVS

In [None]:
%%bash
# Install the toolchain and libraries used to build OpenMVS.
# Strict mode plus a non-interactive apt frontend, matching the other install cells in this
# notebook, so a failed or prompting install stops the cell instead of hanging or continuing.
set -euo pipefail
export DEBIAN_FRONTEND=noninteractive

apt-get update -y

# Build tools
apt-get install -y \
    build-essential cmake git wget

# Core libraries and OpenGL/windowing headers
apt-get install -y \
    libeigen3-dev \
    libopencv-dev \
    libboost-all-dev \
    libglfw3-dev libglu1-mesa-dev libxi-dev libxrandr-dev

# Solver libraries
apt-get install -y \
    libceres-dev \
    libsuitesparse-dev

# Geometry library
apt-get install -y \
    libcgal-dev



In [None]:
%%bash
# Install the OpenMVS build dependencies in three apt steps.
# NOTE(review): the package list overlaps with the neighbouring install cells and with the
# install step at the top of the build cell — likely redundant.
# Abort on errors, unset variables and failed pipes; echo each command.
set -euxo pipefail
# Prevent apt from prompting.
export DEBIAN_FRONTEND=noninteractive

apt-get update -y
apt-get install -y build-essential cmake git pkg-config

apt-get install -y \
  libeigen3-dev \
  libopencv-dev \
  libcgal-dev \
  libboost-iostreams-dev libboost-program-options-dev libboost-system-dev libboost-serialization-dev \
  libglew-dev libglfw3-dev \
  libfreeimage-dev libpng-dev libjpeg-dev libtiff-dev

apt-get install -y libceres-dev

In [None]:
%%bash
# Same dependency set as the previous cell, installed with a single apt-get call.
# NOTE(review): the build cell below installs these packages again — likely redundant.
set -euxo pipefail
export DEBIAN_FRONTEND=noninteractive
apt-get update -y
apt-get install -y \
  build-essential cmake git pkg-config \
  libeigen3-dev libopencv-dev libcgal-dev \
  libboost-iostreams-dev libboost-program-options-dev libboost-system-dev libboost-serialization-dev \
  libglew-dev libglfw3-dev \
  libfreeimage-dev libpng-dev libjpeg-dev libtiff-dev \
  libceres-dev

Build OpenMVS from source

In [None]:
%%bash
# Build the OpenMVS command-line tools under /content/openMVS/_build/bin.
# Steps: install dependencies, fetch VCG, fetch OpenMVS at tag v2.1.0, configure, build.
set -euxo pipefail
export DEBIAN_FRONTEND=noninteractive

# 1) Build dependencies (includes the OpenGL/Mesa development headers).
apt-get update -y
apt-get install -y \
  build-essential cmake git pkg-config \
  libeigen3-dev libopencv-dev libcgal-dev \
  libboost-iostreams-dev libboost-program-options-dev libboost-system-dev libboost-serialization-dev \
  libfreeimage-dev libpng-dev libjpeg-dev libtiff-dev \
  libglew-dev libglfw3-dev \
  libgl1-mesa-dev libglu1-mesa-dev mesa-common-dev \
  libceres-dev

# 2) VCG: reuse an existing checkout (discarding local changes and untracked files) or clone it.
cd /content
if [ -d vcglib/.git ]; then
  cd vcglib; git reset --hard; git clean -fdx; git pull --rebase; cd ..
else
  git clone https://github.com/cdcseacave/VCG.git vcglib
fi
export VCG_ROOT="/content/vcglib"

# 3) OpenMVS: reuse or clone the repository, then force-checkout the v2.1.0 tag and sync submodules.
if [ -d openMVS/.git ]; then
  cd openMVS
  git reset --hard
  git clean -fdx
  git fetch --all --tags
else
  git clone --recursive https://github.com/cdcseacave/openMVS.git
  cd openMVS
fi
git checkout -f v2.1.0
git submodule update --init --recursive

# 4) Configure from scratch: Release build, CUDA disabled, VCG taken from VCG_ROOT.
rm -rf _build
cmake -S . -B _build \
  -DCMAKE_BUILD_TYPE=Release \
  -DOpenMVS_USE_CUDA=OFF \
  -DVCG_ROOT="$VCG_ROOT"

# 5) Build only the listed tools, using all available cores.
cmake --build _build --target \
  InterfaceCOLMAP DensifyPointCloud ReconstructMesh RefineMesh TextureMesh \
  -j"$(nproc)"

echo "[OK] Binaries are at: /content/openMVS/_build/bin"
ls -l /content/openMVS/_build/bin

In [None]:
# Download COLMAP's read_write_model.py to /content (-q hides wget output).
# NOTE(review): no later cell visible here imports it from /content — confirm it is still needed.
!wget -q https://raw.githubusercontent.com/colmap/colmap/dev/scripts/python/read_write_model.py -O /content/read_write_model.py


Interface

In [None]:
%%bash
# Convert the adjusted COLMAP model into an OpenMVS scene file.
# Strict mode: stop immediately if clearing/creating the output folder or the conversion fails.
set -euo pipefail

OUT="/content/drive/MyDrive/vggt/vggt/final/openmvs_out"
# Start from an empty output folder on every run.
rm -rf "$OUT"; mkdir -p "$OUT"

# -i is the folder that contains sparse/ (the tool's log shows it reading sparse/cameras.bin,
# images.bin and points3D.bin); --image-folder points at the preprocessed frames.
/content/openMVS/_build/bin/InterfaceCOLMAP \
  -i "/content/drive/MyDrive/vggt/vggt/final/plane_detection_data" \
  -o "$OUT/scene.mvs" \
  --image-folder "/content/drive/MyDrive/vggt/vggt/final/images_518"

19:29:00 [App     ] Build date: Nov 12 2025, 19:14:48
19:29:00 [App     ] CPU: Intel(R) Xeon(R) CPU @ 2.20GHz (2 cores)
19:29:00 [App     ] RAM: 12.67GB Physical Memory 0B Virtual Memory
19:29:00 [App     ] OS: Linux 6.6.105+ (x86_64)
19:29:00 [App     ] Disk: 12.62GB (100.00GB) space
19:29:00 [App     ] SSE & AVX compatible CPU & OS detected
19:29:00 [App     ] Command line: InterfaceCOLMAP -i /content/drive/MyDrive/vggt/vggt/final/plane_detection_data -o /content/drive/MyDrive/vggt/vggt/final/openmvs_out/scene.mvs --image-folder /content/drive/MyDrive/vggt/vggt/final/images_518
19:29:00 [App     ] Reading cameras: /content/drive/MyDrive/vggt/vggt/final/plane_detection_data/sparse/cameras.bin
19:29:00 [App     ] Reading images: /content/drive/MyDrive/vggt/vggt/final/plane_detection_data/sparse/images.bin
19:29:08 [App     ] Reading points: /content/drive/MyDrive/vggt/vggt/final/plane_detection_data/sparse/points3D.bin
19:29:26 [App     ] Exported data: 20 images & 5366480 vertices (26

mesh

In [None]:
%%bash
# Reconstruct a mesh from the scene written by InterfaceCOLMAP. No output path is passed,
# so the tool's default naming applies (the texturing cell below reads scene_mesh.mvs).
/content/openMVS/_build/bin/ReconstructMesh \
    "/content/drive/MyDrive/vggt/vggt/final/openmvs_out/scene.mvs"

19:29:27 [App     ] Build date: Nov 12 2025, 19:14:48
19:29:27 [App     ] CPU: Intel(R) Xeon(R) CPU @ 2.20GHz (2 cores)
19:29:27 [App     ] RAM: 12.67GB Physical Memory 0B Virtual Memory
19:29:27 [App     ] OS: Linux 6.6.105+ (x86_64)
19:29:27 [App     ] Disk: 12.62GB (100.00GB) space
19:29:27 [App     ] SSE & AVX compatible CPU & OS detected
19:29:27 [App     ] Command line: ReconstructMesh /content/drive/MyDrive/vggt/vggt/final/openmvs_out/scene.mvs
19:29:32 [App     ] Camera model loaded: platform 0; camera  0; f 0.584x0.587; poses 1
19:29:32 [App     ] Camera model loaded: platform 1; camera  0; f 0.584x0.588; poses 1
19:29:32 [App     ] Camera model loaded: platform 2; camera  0; f 0.584x0.588; poses 1
19:29:32 [App     ] Camera model loaded: platform 3; camera  0; f 0.582x0.587; poses 1
19:29:32 [App     ] Camera model loaded: platform 4; camera  0; f 0.584x0.588; poses 1
19:29:32 [App     ] Camera model loaded: platform 5; camera  0; f 0.591x0.595; poses 1
19:29:32 [App     ] Ca

texture

In [None]:
%%bash
# Texture the reconstructed mesh and list the resulting files.
# Strict mode: if texturing fails the cell stops there instead of going on to the listing.
set -euo pipefail

SCENE_DIR="/content/drive/MyDrive/vggt/vggt/final/openmvs_out"
IN_MVS="$SCENE_DIR/scene_mesh.mvs"

/content/openMVS/_build/bin/TextureMesh \
  -i "$IN_MVS" \
  -o "$SCENE_DIR/texture.mvs" \
  --export-type ply \
  --resolution-level 0 \
  --min-resolution 512 \
  --verbosity 2

ls -lh "$SCENE_DIR"/texture.*