In [1]:
import os
import torch
import numpy as np

from soundspaces.utils import load_metadata
from ss_baselines.savi.pretraining_ours.audiogoal_predictor import AudioGoalPredictor
from ss_baselines.savi.pretraining_ours.audiogoal_dataset import AudioGoalDataset
from ss_baselines.savi.config.default import get_config
from soundspaces.mp3d_utils import SCENE_SPLITS

# -----------------------
# 1) Basic settings (edit these as needed)
# -----------------------
# Use the first CUDA device when one is available, otherwise run on the CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

CONFIG_PATH = "/home/Disk/sound-space/ss_baselines/savi/config/semantic_audionav/savi.yaml"
# Dataset split to inspect: "train" / "val" / "test".
SPLIT = "val"
# Keep this False for inference so the dataset cache does not eat a huge amount of memory.
USE_CACHE = False
# Index of the dataset sample to inspect.
SAMPLE_INDEX = 0
# Which prediction outputs the model is built with.
PREDICT_LABEL = False
PREDICT_LOCATION = True

# Optional checkpoint to load; set to None to skip loading.
CKPT_PATH = "/home/Disk/yyz/sound-spaces/data/models/savi_final_depth/ckpt.73.pth"   # or None


# -----------------------
# 2) Build config / dataset
# -----------------------
# The recorded run of this cell failed with
#   FileNotFoundError: 'configs/semantic_audionav/savi/mp3d/semantic_audiogoal.yaml'
# i.e. a path that is relative to the repository root and is presumably resolved
# against the current working directory while the config is loaded. If the
# kernel was not started from the repository root, locate it by walking up from
# CONFIG_PATH until a directory containing "configs/" is found, and chdir there.
if not os.path.isdir("configs"):
    repo_root = os.path.dirname(os.path.abspath(CONFIG_PATH))
    while not os.path.isdir(os.path.join(repo_root, "configs")):
        parent_dir = os.path.dirname(repo_root)
        if parent_dir == repo_root:  # reached the filesystem root without a match
            repo_root = None
            break
        repo_root = parent_dir
    if repo_root is None:
        # Best effort only: let the config loader raise its own error below.
        print("[WARN] no 'configs' directory found above CONFIG_PATH; "
              "relative paths inside the config may fail to resolve.")
    else:
        os.chdir(repo_root)
        print(f"[INFO] working directory set to {repo_root}")

config = get_config(config_paths=CONFIG_PATH, opts=None, run_type=None)
meta_dir = config.TASK_CONFIG.SIMULATOR.AUDIO.METADATA_DIR

scenes = SCENE_SPLITS[SPLIT]

# load_metadata returns (points, graph); only the graph is handed to the dataset.
scene_graphs = {}
for scene in scenes:
    _points, graph = load_metadata(os.path.join(meta_dir, "mp3d", scene))
    scene_graphs[scene] = graph

dataset = AudioGoalDataset(
    scene_graphs=scene_graphs,
    scenes=scenes,
    split=SPLIT,
    use_polar_coordinates=False,
    use_cache=USE_CACHE,
)

print(f"[INFO] dataset split={SPLIT}, len={len(dataset)}")


# -----------------------
# 3) Build the model and optionally load weights
# -----------------------
model = AudioGoalPredictor(
    predict_label=PREDICT_LABEL,
    predict_location=PREDICT_LOCATION,
)
model = model.to(DEVICE)
model.eval()

ckpt_available = CKPT_PATH is not None and os.path.exists(CKPT_PATH)
if not ckpt_available:
    print("[WARN] ckpt not loaded (path is None or not exists).")
else:
    ckpt = torch.load(CKPT_PATH, map_location="cpu")
    # The trainer saves {"audiogoal_predictor": state_dict}; a file without that
    # key is treated as a bare state_dict.
    is_wrapped = "audiogoal_predictor" in ckpt
    weights = ckpt["audiogoal_predictor"] if is_wrapped else ckpt
    model.load_state_dict(weights, strict=True)
    if is_wrapped:
        print(f"[INFO] loaded ckpt: {CKPT_PATH}")
    else:
        print(f"[INFO] loaded ckpt (raw state_dict): {CKPT_PATH}")


# -----------------------
# 4) Fetch one sample and prepare it for a forward pass
# -----------------------
# A dataset item is (inputs_list, gt). This cell reads inputs_list[0] as the
# spectrogram and inputs_list[1] as the depth map.
# Author's note: gt has shape (3,).
inputs_list, gt = dataset[SAMPLE_INDEX]
# Author's note: torch tensor, shape (2, 65, 26).
spectrogram = inputs_list[0]
# Author's note: torch tensor, shape (1, 128, 128); not consumed by anything below.
depth = inputs_list[1]

# Add a leading batch axis -> (1, 2, 65, 26) per the author's note, as float32 on DEVICE.
spec = torch.unsqueeze(spectrogram, 0).to(DEVICE, dtype=torch.float32)
# Same treatment for the depth map -> (1,1,128,128) per the author's note.
depth = torch.unsqueeze(depth, 0).to(DEVICE, dtype=torch.float32)

# NOTE(review): the inference/report code below is disabled. It passes `x`,
# which is not defined in this cell (the batched spectrogram here is `spec`),
# and it does not feed `depth` to the model - confirm the model's expected
# inputs before re-enabling.
# with torch.no_grad():
#     pred = model({"spectrogram": x})          # shape: (1, C) or (1, C+2) or (1,2)
#     pred = pred.squeeze(0).cpu()

# gt_np = gt.cpu().numpy() if torch.is_tensor(gt) else np.array(gt)

# print("========== SAMPLE ==========")
# print("Index:", SAMPLE_INDEX)
# print("Spectrogram shape:", spectrogram.shape)

# print("\n========== GT ==========")
# # gt[0] is the label index; gt[1:] is (x, y)
# print("GT label:", int(gt_np[0]))
# print("GT xy   :", gt_np[1:])

# print("\n========== PRED ==========")
# if PREDICT_LABEL and PREDICT_LOCATION:
#     # pred[:-2] logits, pred[-2:] xy
#     logits = pred[:-2].numpy()
#     xy = pred[-2:].numpy()
#     pred_label = int(np.argmax(np.abs(logits)))
#     print("Pred label:", pred_label)
#     print("Pred xy   :", xy)
# elif PREDICT_LABEL:
#     logits = pred.numpy()
#     pred_label = int(np.argmax(np.abs(logits)))
#     print("Pred label:", pred_label)
# elif PREDICT_LOCATION:
#     xy = pred.numpy()
#     print("Pred xy:", xy)

# print("\n[INFO] raw pred tensor shape:", tuple(pred.shape))


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
2025-12-31 11:14:01.819146: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-31 11:14:02.072152: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-31 11:14:03.28

FileNotFoundError: [Errno 2] No such file or directory: 'configs/semantic_audionav/savi/mp3d/semantic_audiogoal.yaml'

In [2]:
import os, pickle
import numpy as np
from tqdm import tqdm

pkl_dir = "/home/Disk/yyz/sound-spaces/data/scene_observations/mp3d"   # directory holding one <scene>.pkl per scene
out_dir = "/home/Disk/sound-space/depth_npy/mp3d"            # root directory for the exported .npy files


def depth_to_chw(depth_map):
    """Return ``depth_map`` in channel-first ``(1, H, W)`` layout.

    Args:
        depth_map: array-like depth image. ``(H, W, 1)`` is transposed to
            ``(1, H, W)``; a plain ``(H, W)`` map gets a leading channel axis.
            Any other shape is returned unchanged.

    Returns:
        numpy.ndarray with the dtype of the input.
    """
    depth_map = np.asarray(depth_map)
    if depth_map.ndim == 2:
        return depth_map[np.newaxis, ...]
    if depth_map.ndim == 3 and depth_map.shape[-1] == 1:
        return depth_map.transpose(2, 0, 1)
    return depth_map


def export_scene_depths(pkl_path, scene_out):
    """Write every depth map stored in one scene pickle to ``scene_out``.

    The pickle is expected to be a dict keyed by ``(node, angle)`` tuples whose
    values expose a ``"depth"`` entry. Each depth map is saved as
    ``<scene_out>/<node>_<angle>.npy``.

    Args:
        pkl_path: path of the scene's ``.pkl`` file.
        scene_out: directory that receives the ``.npy`` files (created if missing).

    Returns:
        Number of depth maps written.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at pickle files you produced yourself or otherwise trust.
    with open(pkl_path, "rb") as pkl_file:
        observations = pickle.load(pkl_file)

    os.makedirs(scene_out, exist_ok=True)
    for (node, ang), obs in observations.items():
        # Optional: cast to np.float16 here to halve disk space / IO.
        np.save(os.path.join(scene_out, f"{node}_{ang}.npy"), depth_to_chw(obs["depth"]))
    return len(observations)


os.makedirs(out_dir, exist_ok=True)

# Sorted so the processing order does not depend on the filesystem.
pkl_files = sorted(name for name in os.listdir(pkl_dir) if name.endswith(".pkl"))
for fname in tqdm(pkl_files):
    scene = os.path.splitext(fname)[0]
    # Work happens inside a function so loop temporaries (notably `depth`) do
    # not overwrite same-named variables defined by earlier cells.
    export_scene_depths(os.path.join(pkl_dir, fname), os.path.join(out_dir, scene))


100%|██████████| 85/85 [00:05<00:00, 14.54it/s]
