In [None]:
import torch
import ai2thor.controller
from train.vint_train.models.nomad.nomad import NoMaD
import numpy as np

# Evaluate a NoMaD navigation policy in AI2-THOR: for every iTHOR scene, sample
# a goal, let the model drive the agent for a bounded number of steps, and
# report the success rate and the mean episode length of successful runs.
controller = ai2thor.controller.Controller(
    agentMode="default",
    visibilityDistance=1.5,
    gridSize=0.25,
    width=224,
    height=224,
    fieldOfView=90,
)

# Pick the device first so the checkpoint can be mapped onto it while loading.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load NoMaD model
# NOTE(review): NoMaD.__init__ requires `vision_encoder`, `noise_pred_net` and
# `dist_pred_net` (calling it bare raises the TypeError recorded below this
# cell). Build those sub-networks from the model's config and pass them here;
# they are not constructed in this cell because the needed classes are not
# imported by this file.
model = NoMaD()  # Adjust based on actual model initialization
checkpoint = torch.load(
    "/home/tuandang/tuandang/quanganh/visualnav-transformer/nomad.pth",
    # Without map_location a checkpoint saved from CUDA fails on CPU-only hosts.
    map_location=device,
)
# Accept both a wrapped checkpoint ({"model_state_dict": ...}) and a bare
# state dict saved directly with torch.save(model.state_dict(), ...).
if isinstance(checkpoint, dict) and "model_state_dict" in checkpoint:
    state_dict = checkpoint["model_state_dict"]
else:
    state_dict = checkpoint
model.load_state_dict(state_dict)
model.eval()
model.to(device)

# iTHOR scene ids are not contiguous: kitchens 1-30, living rooms 201-230,
# bedrooms 301-330 and bathrooms 401-430. Iterating range(1, 431) would ask
# for scenes such as FloorPlan31 that do not exist.
scene_ids = [
    *range(1, 31),
    *range(201, 231),
    *range(301, 331),
    *range(401, 431),
]

max_steps = 200  # per-episode action budget
goal_radius = 0.5  # metres; the episode succeeds inside this distance

# Evaluation loop over the iTHOR FloorPlans
results = {}
for fp in scene_ids:
    scene = f"FloorPlan{fp}"
    controller.reset(scene=scene)

    # Define navigation task (e.g., random goal position), laid out as
    # [x, y, z] with y fixed at 0.
    goal = np.random.uniform(low=[-5, 0, -5], high=[5, 0, 5])  # Example goal
    goal_xz = goal[[0, 2]]  # horizontal-plane coordinates used for the check
    success = False
    step_count = 0

    while step_count < max_steps:
        # Get current observation (RGB image)
        event = controller.step(action="Pass")  # Update state
        rgb = event.frame  # Shape: (224, 224, 3)

        # Preprocess image and goal for NoMaD
        # NOTE(review): preprocess_image / preprocess_goal are not defined in
        # this cell; they must be provided elsewhere before it runs.
        obs = preprocess_image(rgb)  # Implement preprocessing (e.g., normalize, to tensor)
        goal_input = preprocess_goal(goal)  # Adjust based on NoMaD’s goal format

        # Run model inference
        # NOTE(review): confirm this call matches NoMaD.forward's signature
        # and that the result indexes to a scalar at [0] — TODO verify.
        with torch.no_grad():
            action = model(obs.to(device), goal_input.to(device))
        action = action.cpu().numpy()

        # Execute action in AI2-THOR, passing each action only the parameter
        # it uses instead of sending both to every action.
        if action[0] > 0.5:
            controller.step(action="MoveAhead", moveMagnitude=0.25)
        else:
            controller.step(action="RotateRight", degrees=30)
        # Count the action as soon as it is executed so successful episodes
        # report the number of actions actually taken.
        step_count += 1

        # Check if goal is reached (e.g., within 0.5m); compare x/z with x/z.
        agent_pos = controller.last_event.metadata["agent"]["position"]
        agent_xz = np.array([agent_pos["x"], agent_pos["z"]])
        dist_to_goal = np.linalg.norm(agent_xz - goal_xz)
        if dist_to_goal < goal_radius:
            success = True
            break

    results[scene] = {"success": success, "steps": step_count}
    print(f"{scene}: Success={success}, Steps={step_count}")

# Summarize results
num_success = sum(r["success"] for r in results.values())
success_rate = num_success / len(results)
# With no successful episode the mean is undefined; report NaN instead of
# raising ZeroDivisionError.
if num_success:
    avg_steps = sum(r["steps"] for r in results.values() if r["success"]) / num_success
else:
    avg_steps = float("nan")
print(f"Success Rate: {success_rate:.2f}, Avg Steps (Success): {avg_steps:.2f}")

TypeError: __init__() missing 3 required positional arguments: 'vision_encoder', 'noise_pred_net', and 'dist_pred_net'