# Week 1 V2: Complete Training - ALL 3 Methods (FIXED)

## Changes from V1 (see EXPERT_DEBATE_AND_SOLUTION.md)
1. **Reward redesign**: progress reward, larger success bonus, action penalty
2. **GNS/PhysRobot parameter reduction**: 500K→~5K (GNS), 391K→~6K (PhysRobot)
3. **Timesteps**: 200K→500K
4. **Exploration**: ent_coef=0.01

**Target**: PPO >50%, GNS >30%, PhysRobot >30%


In [None]:
%%time
# Install the simulator, RL, graph-learning and plotting packages used by the
# later cells (-q keeps pip's output quiet).
!pip install mujoco gymnasium stable-baselines3[extra] torch torch-geometric matplotlib pandas -q
import torch
# Report the installed PyTorch version and whether a CUDA device is visible.
print(f'PyTorch: {torch.__version__}, CUDA: {torch.cuda.is_available()}')


In [None]:
from google.colab import drive
import os

# Mount Google Drive so that models, results and logs outlive the Colab session.
drive.mount('/content/drive')
SAVE_DIR = '/content/drive/MyDrive/medical_robotics_week1_v2'

# Create the root folder ('' entry) and its three sub-folders; exist_ok makes
# re-running the cell harmless.
for _d in ['', 'models', 'results', 'logs']:
    os.makedirs(f'{SAVE_DIR}/{_d}' if _d else SAVE_DIR, exist_ok=True)

# The floppy-disk emoji is written as a single \U escape: the former
# '\ud83d\udcbe' form creates two lone surrogates in a Python str, which
# cannot be encoded as UTF-8 when printed.
print(f'\U0001f4be {SAVE_DIR} (+ models/ results/ logs/)')


In [None]:
# === ENVIRONMENT V2 — Fixed reward shaping ===
import numpy as np
import mujoco
import gymnasium as gym
from gymnasium import spaces

# MuJoCo (MJCF) description of the push-box scene, loaded by PushBoxEnv.
# Contents, as declared in the XML below:
#   - a floor plane and two directional lights;
#   - a fixed arm base carrying a two-link arm with "shoulder" and "elbow"
#     hinge joints (both about the z axis) and an "endeffector" site at the
#     tip of the forearm;
#   - a 0.5-mass box (half-size 0.05) on a free joint, with a "box_center" site;
#   - a "goal" site at (0.5, 0.3, 0.02);
#   - one motor per arm joint with control range [-10, 10].
# The simulation uses a 0.002 timestep with the Euler integrator.
XML = '''<mujoco model="push_box">
  <compiler angle="degree" coordinate="local" inertiafromgeom="true"/>
  
  <option timestep="0.002" integrator="Euler" gravity="0 0 -9.81">
    <flag warmstart="enable"/>
  </option>
  
  <visual>
    <global offwidth="1280" offheight="720"/>
    <quality shadowsize="4096"/>
    <map force="0.1" zfar="30"/>
  </visual>
  
  <asset>
    <texture builtin="gradient" height="100" rgb1="0.3 0.5 0.7" rgb2="0.1 0.2 0.3" type="skybox" width="100"/>
    <texture builtin="flat" height="1278" mark="cross" markrgb="1 1 1" name="texgeom" random="0.01" rgb1="0.8 0.6 0.4" rgb2="0.8 0.6 0.4" type="cube" width="127"/>
    <texture builtin="checker" height="100" name="texplane" rgb1="0.2 0.2 0.2" rgb2="0.3 0.3 0.3" type="2d" width="100"/>
    <material name="MatPlane" reflectance="0.3" shininess="0.5" specular="0.5" texrepeat="3 3" texture="texplane"/>
    <material name="geom" texture="texgeom" texuniform="true"/>
  </asset>
  
  <default>
    <joint armature="0.01" damping="0.1" limited="true"/>
    <geom conaffinity="1" condim="3" contype="1" friction="0.5 0.005 0.0001" margin="0.001" material="geom" rgba="0.8 0.6 0.4 1"/>
  </default>
  
  <worldbody>
    <light directional="true" diffuse="0.8 0.8 0.8" pos="0 0 3" dir="0 0 -1"/>
    <light directional="true" diffuse="0.4 0.4 0.4" pos="0 0 3" dir="1 1 -1"/>
    <geom name="floor" type="plane" size="3 3 0.1" rgba="0.8 0.8 0.8 1" material="MatPlane"/>
    
    <body name="arm_base" pos="0 0 0.02">
      <geom name="base_geom" type="cylinder" size="0.05 0.02" rgba="0.3 0.3 0.3 1"/>
      <body name="upper_arm" pos="0 0 0.02">
        <joint name="shoulder" type="hinge" axis="0 0 1" range="-180 180" damping="0.5"/>
        <geom name="upper_arm_geom" type="capsule" fromto="0 0 0 0.4 0 0" size="0.025" rgba="0.5 0.5 0.8 1"/>
        <body name="forearm" pos="0.4 0 0">
          <joint name="elbow" type="hinge" axis="0 0 1" range="-180 180" damping="0.5"/>
          <geom name="forearm_geom" type="capsule" fromto="0 0 0 0.3 0 0" size="0.025" rgba="0.5 0.5 0.8 1"/>
          <site name="endeffector" pos="0.3 0 0" size="0.03" rgba="1 0.5 0 0.8"/>
        </body>
      </body>
    </body>
    
    <body name="box" pos="0.35 0 0.05">
      <freejoint name="box_freejoint"/>
      <geom name="box_geom" type="box" size="0.05 0.05 0.05" mass="0.5" rgba="0.2 0.8 0.2 1" friction="0.5 0.005 0.0001"/>
      <site name="box_center" pos="0 0 0" size="0.01" rgba="0 1 0 1"/>
    </body>
    
    <site name="goal" pos="0.5 0.3 0.02" size="0.06" rgba="1 0 0 0.4" type="sphere"/>
  </worldbody>
  
  <actuator>
    <motor name="shoulder_motor" joint="shoulder" gear="1.0" ctrllimited="true" ctrlrange="-10 10"/>
    <motor name="elbow_motor" joint="elbow" gear="1.0" ctrllimited="true" ctrlrange="-10 10"/>
  </actuator>
</mujoco>
'''

class PushBoxEnv(gym.Env):
    """PushBox V2: a two-joint arm must push a box onto a fixed goal.

    Observation (16 float32 values, in this order): joint positions (2),
    joint velocities (2), end-effector site position (3), box position (3),
    box free-joint velocity components ``qvel[2:5]`` (3), goal position (3).

    Action: two motor controls (shoulder, elbow), each within [-10, 10].

    Reward (V2): mild distance shaping, delta-based progress terms for
    reaching and pushing, a quadratic action penalty, and on success a bonus
    of 500 plus one point per remaining step.

    An episode terminates when the box is within ``success_threshold`` of the
    goal (measured in the x-y plane) and is truncated after
    ``max_episode_steps`` steps.
    """

    def __init__(self, render_mode=None, box_mass=0.5):
        """Build the MuJoCo model and set the box mass.

        Args:
            render_mode: Stored on the instance; ``render`` is a no-op.
            box_mass: Mass assigned to the ``box`` body (must be positive).
        """
        super().__init__()
        self.model = mujoco.MjModel.from_xml_string(XML)
        self.data = mujoco.MjData(self.model)
        self.box_mass = box_mass
        self._set_box_mass(box_mass)
        self._ee_site_id = mujoco.mj_name2id(self.model, mujoco.mjtObj.mjOBJ_SITE, 'endeffector')
        self.action_space = spaces.Box(low=-10.0, high=10.0, shape=(2,), dtype=np.float32)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(16,), dtype=np.float32)
        self.goal_pos = np.array([0.5, 0.3, 0.02])
        self.max_episode_steps = 500
        self.current_step = 0
        self.success_threshold = 0.15   # V2: relaxed from 0.1 to 0.15m
        self._prev_dist_box_goal = None  # V2: for progress reward
        self._prev_dist_ee_box = None    # V2: for reaching progress
        self.render_mode = render_mode

    def _set_box_mass(self, mass):
        """Write ``mass`` into the model, scaling the box inertia with it.

        The inertia is rescaled by the same factor as the mass so that the
        box keeps the inertia of a body of unchanged shape; changing only
        ``body_mass`` would leave the rotational inertia at its old value.

        Raises:
            ValueError: If ``mass`` is not strictly positive.
        """
        if not mass > 0:
            raise ValueError(f'box mass must be positive, got {mass!r}')
        box_body_id = mujoco.mj_name2id(self.model, mujoco.mjtObj.mjOBJ_BODY, 'box')
        old_mass = float(self.model.body_mass[box_body_id])
        if old_mass > 0:
            self.model.body_inertia[box_body_id] *= mass / old_mass
        self.model.body_mass[box_body_id] = mass
        # NOTE(review): other model constants derived from the mass are not
        # refreshed here; check whether mujoco.mj_setConst should be called.

    def set_box_mass(self, mass):
        """Change the box mass of an existing environment."""
        self._set_box_mass(mass)
        self.box_mass = mass

    def _planar_distances(self):
        """Return (end-effector-to-box, box-to-goal) distances in the x-y plane."""
        ee_xy = self.data.site_xpos[self._ee_site_id][:2]
        box_xy = self.data.qpos[2:4]
        dist_ee_box = float(np.linalg.norm(ee_xy - box_xy))
        dist_box_goal = float(np.linalg.norm(box_xy - self.goal_pos[:2]))
        return dist_ee_box, dist_box_goal

    def reset(self, seed=None, options=None):
        """Randomise the arm joints and box x/y position and start a new episode.

        Sampling uses the environment's own ``self.np_random`` generator
        (seeded by ``super().reset(seed=seed)``), so seeding one environment
        neither touches NumPy's global random state nor is affected by it.

        Returns:
            ``(observation, info)``.
        """
        super().reset(seed=seed)
        mujoco.mj_resetData(self.model, self.data)
        rng = self.np_random
        self.data.qpos[0] = rng.uniform(-0.5, 0.5)     # shoulder angle
        self.data.qpos[1] = rng.uniform(-0.5, 0.5)     # elbow angle
        self.data.qpos[2] = rng.uniform(0.25, 0.45)    # box x
        self.data.qpos[3] = rng.uniform(-0.15, 0.15)   # box y
        self.data.qpos[4] = 0.05                       # box z
        self.data.qpos[5:9] = [1, 0, 0, 0]             # box orientation (identity quaternion)
        self.data.qvel[:] = 0.0
        mujoco.mj_forward(self.model, self.data)
        self.current_step = 0
        # V2: initialise the previous distances used by the progress rewards.
        self._prev_dist_ee_box, self._prev_dist_box_goal = self._planar_distances()
        return self._get_obs(), self._get_info()

    def _get_obs(self):
        """Assemble the 16-dimensional float32 observation vector."""
        joint_pos = self.data.qpos[:2].copy()
        joint_vel = self.data.qvel[:2].copy()
        ee_pos = self.data.site_xpos[self._ee_site_id].copy()
        box_pos = self.data.qpos[2:5].copy()
        box_vel = self.data.qvel[2:5].copy()
        goal_pos = self.goal_pos.copy()
        obs = np.concatenate([joint_pos, joint_vel, ee_pos, box_pos, box_vel, goal_pos])
        return obs.astype(np.float32)

    def _get_info(self):
        """Return diagnostics as plain Python values (safe to JSON-serialise)."""
        _, distance_to_goal = self._planar_distances()
        return {'distance_to_goal': distance_to_goal,
                'success': distance_to_goal < self.success_threshold,
                'box_mass': self.box_mass, 'timestep': self.current_step}

    def step(self, action):
        """Apply ``action`` for 5 simulation sub-steps and compute the V2 reward.

        Args:
            action: Two motor controls; values outside the action space are
                clipped to its bounds before being applied and penalised.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` with
            ``reward`` a Python float and both flags Python bools.
        """
        action = np.clip(np.asarray(action, dtype=np.float64),
                         self.action_space.low, self.action_space.high)
        self.data.ctrl[:] = action
        for _ in range(5):
            mujoco.mj_step(self.model, self.data)

        dist_ee_box, dist_box_goal = self._planar_distances()

        # ====== V2: Redesigned reward ======
        # 1. Reach progress (delta-based): reward for getting closer to box
        reach_progress = self._prev_dist_ee_box - dist_ee_box
        self._prev_dist_ee_box = dist_ee_box

        # 2. Push progress (delta-based): reward for pushing box toward goal
        push_progress = self._prev_dist_box_goal - dist_box_goal
        self._prev_dist_box_goal = dist_box_goal

        # 3. Distance-based shaping (weaker, for gradient)
        reach_reward = -dist_ee_box
        push_reward = -dist_box_goal

        # 4. Action penalty (discourage torque waste)
        action_penalty = -0.01 * np.sum(action ** 2)

        reward = (
            0.5 * reach_reward +        # mild distance shaping
            1.0 * push_reward +         # mild distance shaping
            10.0 * reach_progress +     # strong progress signal for reaching
            20.0 * push_progress +      # very strong progress signal for pushing
            action_penalty              # regularization
        )

        success = dist_box_goal < self.success_threshold
        if success:
            # Counted before current_step is incremented for this step.
            remaining_steps = self.max_episode_steps - self.current_step
            reward += 500.0 + remaining_steps * 1.0   # large bonus + early completion bonus
        # ====== End V2 reward ======

        self.current_step += 1
        terminated = bool(success)
        truncated = self.current_step >= self.max_episode_steps
        return self._get_obs(), float(reward), terminated, truncated, self._get_info()

    def render(self):
        """Rendering is not implemented."""
        pass

    def close(self):
        """Nothing to release."""
        pass

def make_push_box_env(box_mass=0.5):
    """Return a zero-argument factory that builds a PushBoxEnv with ``box_mass``.

    The environment is not created here; it is created each time the
    returned callable is invoked (the form expected by vectorised wrappers).
    """
    def _build_env():
        env = PushBoxEnv(box_mass=box_mass)
        return env

    return _build_env

# Quick sanity check: build one environment, reset it once and report the
# observation/action shapes and the success threshold, then release it.
env = PushBoxEnv()
obs, info = env.reset()
print(f'Obs: {obs.shape}, Action: {env.action_space.shape}, Threshold: {env.success_threshold}m')
env.close()
print('\u2705 Environment V2 loaded (progress reward + 500 success bonus)')


In [None]:
# === AGENTS V2: Drastically reduced parameters for GNS/PhysRobot ===
import torch
import torch.nn as nn
from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback

# torch_geometric is optional: HAS_PYG records whether it could be imported.
# When it is missing, the graph-based extractor classes are not defined and
# GNSAgent falls back to a plain MLP policy.
try:
    from torch_geometric.nn import MessagePassing
    from torch_geometric.data import Data as PyGData, Batch as PyGBatch
    HAS_PYG = True
except ImportError:
    HAS_PYG = False
    print('torch_geometric not found, GNS will use non-graph fallback')

class SuccessTrackingCallback(BaseCallback):
    """Count finished training episodes and how many of them ended in success.

    Attributes:
        episode_count: Number of episodes finished so far (all sub-envs).
        success_count: Number of those whose final info had ``success`` truthy.
        success_achieved: True once at least one episode has succeeded.
        episodes_to_success: ``episode_count`` at the first success, or None
            if no episode has succeeded yet.
    """

    def __init__(self, verbose=1):
        super().__init__(verbose)
        self.episode_count = 0
        self.success_count = 0
        self.success_achieved = False
        self.episodes_to_success = None

    def _on_step(self):
        """Update the counters from the ``dones``/``infos`` of the current step.

        Returns:
            Always True, so training is never stopped by this callback.
        """
        infos = self.locals.get('infos', [{}])
        dones = self.locals.get('dones', [False])
        for i, done in enumerate(dones):
            if not done:
                continue
            self.episode_count += 1
            info = infos[i] if i < len(infos) else {}
            if info.get('success', False):
                self.success_count += 1
                if not self.success_achieved:
                    self.success_achieved = True
                    self.episodes_to_success = self.episode_count
                    # Party-popper emoji as one \U escape; the former
                    # surrogate-pair escape is not encodable when printed.
                    print(f'\n\U0001f389 First success at episode {self.episode_count}!')
            if self.episode_count % 100 == 0:
                rate = self.success_count / self.episode_count * 100
                print(f'  [Ep {self.episode_count}] Success rate so far: {rate:.1f}%')
        return True

# ──────────────────────────────────────────────────────
# AGENT 1: Pure PPO (unchanged, just added ent_coef)
# ──────────────────────────────────────────────────────
class PurePPOAgent:
    """Baseline agent: Stable-Baselines3 PPO with the default MLP policy."""

    def __init__(self, env, verbose=1):
        """Create the PPO model on ``env`` with the V2 hyper-parameters."""
        hyperparams = dict(
            learning_rate=3e-4,
            n_steps=2048,
            batch_size=64,
            n_epochs=10,
            gamma=0.99,
            ent_coef=0.01,
            verbose=verbose,
        )
        self.model = PPO('MlpPolicy', env, **hyperparams)

    def train(self, total_timesteps, callback=None):
        """Run PPO learning for ``total_timesteps`` with a progress bar."""
        self.model.learn(total_timesteps=total_timesteps,
                         callback=callback,
                         progress_bar=True)

    def predict(self, obs, deterministic=True):
        """Return only the action chosen by the policy for ``obs``."""
        prediction = self.model.predict(obs, deterministic=deterministic)
        return prediction[0]

    def save(self, path):
        """Save the underlying PPO model to ``path``."""
        self.model.save(path)

    def evaluate(self, env, n_episodes=100):
        """Roll out ``n_episodes`` episodes on ``env`` and summarise them.

        Returns:
            Dict with ``mean_reward``, ``std_reward`` (over episode returns)
            and ``success_rate`` (fraction of episodes whose final info has a
            truthy ``success`` entry).
        """
        episode_returns = []
        success_flags = []
        for _ in range(n_episodes):
            obs, info = env.reset()
            episode_return = 0
            while True:
                obs, reward, terminated, truncated, info = env.step(self.predict(obs))
                episode_return += reward
                if terminated or truncated:
                    break
            episode_returns.append(episode_return)
            success_flags.append(int(bool(info.get('success', False))))
        return {
            'mean_reward': np.mean(episode_returns),
            'std_reward': np.std(episode_returns),
            'success_rate': np.mean(success_flags),
        }

# ──────────────────────────────────────────────────────
# AGENT 2: GNS V2 — Minimal parameters (~5K total)
# ──────────────────────────────────────────────────────
if HAS_PYG:
    def obs_to_graph_batch(observations):
        """Convert a batch of observations into a PyG Batch of 2-node graphs.

        Each observation row becomes one graph with node 0 = end-effector and
        node 1 = box, connected by two directed edges (0->1 and 1->0).

        Node features (6): position (3) followed by velocity (3); the
        end-effector velocity is filled with zeros.
        Edge features (4): relative position (3) and its Euclidean length (1).

        Args:
            observations: Tensor indexed as ``observations[i][4:13]``; the
                slices 4:7, 7:10 and 10:13 are read as end-effector position,
                box position and box velocity.

        Returns:
            ``PyGBatch`` built from one ``PyGData`` per observation row, with
            fields ``x``, ``pos``, ``edge_index`` and ``edge_attr``.
        """
        dev = observations.device
        # Loop-invariant tensors are built once instead of once per sample.
        # Row 0 holds the source nodes and row 1 the target nodes.
        edge_index = torch.tensor([[0, 1], [1, 0]], dtype=torch.long, device=dev)
        ee_vel = torch.zeros(3, device=dev)

        graphs = []
        for o in observations:
            ee_pos = o[4:7]
            box_pos = o[7:10]
            box_vel = o[10:13]
            positions = torch.stack([ee_pos, box_pos])
            velocities = torch.stack([ee_vel, box_vel])
            node_feats = torch.cat([positions, velocities], dim=-1)  # [2,6]
            rel01 = box_pos - ee_pos
            # Both directions share the same length, so compute it once.
            dist = torch.linalg.vector_norm(rel01).unsqueeze(0)
            edge_attr = torch.stack([torch.cat([rel01, dist]),
                                     torch.cat([-rel01, dist])])
            graphs.append(PyGData(x=node_feats, pos=positions,
                                  edge_index=edge_index, edge_attr=edge_attr))
        return PyGBatch.from_data_list(graphs)

    class GNSGraphLayerV2(MessagePassing):
        """Single message-passing layer with sum aggregation.

        Messages are produced by an edge MLP applied to the two endpoint node
        features concatenated with the edge features; the summed messages are
        concatenated with the node features and passed through a node MLP.
        Both MLPs have one hidden ReLU layer of width ``hidden_dim``.
        """

        def __init__(self, node_dim, edge_dim, hidden_dim=32):
            super().__init__(aggr='add')
            edge_in = 2 * node_dim + edge_dim
            node_in = node_dim + edge_dim
            self.edge_mlp = nn.Sequential(
                nn.Linear(edge_in, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, edge_dim),
            )
            self.node_mlp = nn.Sequential(
                nn.Linear(node_in, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, node_dim),
            )

        def forward(self, x, edge_index, edge_attr):
            """Run one round of message passing and return the node outputs."""
            node_out = self.propagate(edge_index, x=x, edge_attr=edge_attr)
            return node_out

        def message(self, x_i, x_j, edge_attr):
            """Edge MLP on [x_i, x_j, edge_attr] (names follow PyG's _i/_j convention)."""
            edge_input = torch.cat((x_i, x_j, edge_attr), dim=-1)
            return self.edge_mlp(edge_input)

        def update(self, aggr_out, x):
            """Node MLP on the node features joined with the aggregated messages."""
            node_input = torch.cat((x, aggr_out), dim=-1)
            return self.node_mlp(node_input)

    class GNSFeaturesExtractorV2(BaseFeaturesExtractor):
        """GNS V2 features extractor: one message-passing layer, hidden width 32.

        The observation is turned into a 2-node graph (end-effector, box),
        encoded, passed through a single residual GN layer and decoded to a
        3-vector per node. The box node's 3-vector is concatenated with the
        raw 16-dimensional observation and projected to ``features_dim``.
        """

        def __init__(self, observation_space, features_dim=64):
            super().__init__(observation_space, features_dim)
            hidden = 32
            raw_edge_dim = 4
            raw_node_dim = 6
            self.node_encoder = nn.Sequential(nn.Linear(raw_node_dim, hidden), nn.ReLU())
            self.edge_encoder = nn.Sequential(nn.Linear(raw_edge_dim, hidden), nn.ReLU())
            # Single message-passing layer operating on the encoded features.
            self.gn_layer = GNSGraphLayerV2(hidden, hidden, hidden)
            self.decoder = nn.Linear(hidden, 3)
            self.feature_proj = nn.Sequential(
                nn.Linear(3 + 16, features_dim),
                nn.ReLU(),
            )

        def forward(self, observations):
            """Return the fused [B, features_dim] features for ``observations``."""
            graph = obs_to_graph_batch(observations)
            node_latent = self.node_encoder(graph.x)
            edge_latent = self.edge_encoder(graph.edge_attr)
            # Residual connection around the message-passing layer.
            node_latent = node_latent + self.gn_layer(node_latent, graph.edge_index, edge_latent)
            decoded = self.decoder(node_latent)
            # Nodes alternate ee, box, ee, box, ...; keep the box rows only.
            box_acc = decoded[1::2]
            fused = torch.cat([box_acc, observations], dim=-1)
            return self.feature_proj(fused)

class GNSAgent:
    """PPO agent using the graph features extractor when PyG is available.

    Without torch_geometric (``HAS_PYG`` false) the policy is a plain MLP with
    the same 64-64 policy/value heads.
    """

    def __init__(self, env, verbose=1):
        """Create the PPO model on ``env`` with the V2 hyper-parameters."""
        heads = dict(pi=[64, 64], vf=[64, 64])
        if not HAS_PYG:
            policy_kwargs = dict(net_arch=heads)
        else:
            policy_kwargs = dict(
                features_extractor_class=GNSFeaturesExtractorV2,
                features_extractor_kwargs=dict(features_dim=64),
                net_arch=heads,
            )
        self.model = PPO(
            'MlpPolicy', env,
            learning_rate=3e-4, n_steps=2048, batch_size=64,
            n_epochs=10, gamma=0.99, ent_coef=0.01,
            policy_kwargs=policy_kwargs, verbose=verbose,
        )

    def train(self, total_timesteps, callback=None):
        """Run PPO learning for ``total_timesteps`` with a progress bar."""
        self.model.learn(total_timesteps=total_timesteps,
                         callback=callback,
                         progress_bar=True)

    def predict(self, obs, deterministic=True):
        """Return only the action chosen by the policy for ``obs``."""
        chosen_action, _state = self.model.predict(obs, deterministic=deterministic)
        return chosen_action

    def save(self, path):
        """Save the underlying PPO model to ``path``."""
        self.model.save(path)

    def _run_episode(self, env):
        """Play one episode; return (episode return, 1 if it succeeded else 0)."""
        obs, info = env.reset()
        total = 0
        finished = False
        while not finished:
            obs, reward, terminated, truncated, info = env.step(self.predict(obs))
            total += reward
            finished = terminated or truncated
        return total, (1 if info.get('success', False) else 0)

    def evaluate(self, env, n_episodes=100):
        """Roll out ``n_episodes`` episodes and return mean/std reward and success rate."""
        rewards, successes = [], []
        for _ in range(n_episodes):
            ep_reward, ep_success = self._run_episode(env)
            rewards.append(ep_reward)
            successes.append(ep_success)
        summary = {}
        summary['mean_reward'] = np.mean(rewards)
        summary['std_reward'] = np.std(rewards)
        summary['success_rate'] = np.mean(successes)
        return summary

# ──────────────────────────────────────────────────────
# AGENT 3: PhysRobot V2 — Lightweight MLP physics (~6K params)
#   No GNN (2-node graph = waste). Direct relative-geometry MLP.
# ──────────────────────────────────────────────────────
class PhysRobotFeaturesExtractorV2(BaseFeaturesExtractor):
    """Physics-informed V2 extractor: two MLP streams fused into one feature.

    * Physics stream: a 9-input MLP on relative geometry
      [rel_pos(3), rel_vel(3), dist(1), goal_dir(2)] producing a 3-vector
      (intended as a predicted box acceleration).
    * Policy stream: an MLP on the raw 16-dimensional observation.
    * Fusion: linear + ReLU on the concatenation of both outputs.
    """

    def __init__(self, observation_space, features_dim=64):
        super().__init__(observation_space, features_dim)
        # Physics stream: relative geometry -> 3-vector.
        self.physics_net = nn.Sequential(
            nn.Linear(9, 32),
            nn.ReLU(),
            nn.Linear(32, 3),
        )
        # Policy stream on the full observation.
        self.policy_stream = nn.Sequential(
            nn.Linear(16, 64),
            nn.ReLU(),
            nn.Linear(64, features_dim),
        )
        # Fusion of policy features with the physics prediction.
        self.fusion = nn.Sequential(
            nn.Linear(features_dim + 3, features_dim),
            nn.ReLU(),
        )

    def forward(self, observations):
        """Return the fused [B, features_dim] features for ``observations``."""
        # Observation layout used here: 4:7 ee position, 7:10 box position,
        # 10:13 box velocity, 13:16 goal position.
        ee_pos = observations[:, 4:7]
        box_pos = observations[:, 7:10]
        box_vel = observations[:, 10:13]
        goal_pos = observations[:, 13:16]

        # Relative geometry between end-effector and box.
        rel_pos = box_pos - ee_pos
        # End-effector velocity is treated as zero, so the relative velocity
        # is just the box velocity (TODO: fix in P1 with Jacobian).
        rel_vel = box_vel
        dist = torch.clamp(torch.norm(rel_pos, dim=-1, keepdim=True), min=1e-6)
        # Unnormalised 2D offset from the box to the goal.
        goal_dir = goal_pos[:, :2] - box_pos[:, :2]

        physics_input = torch.cat([rel_pos, rel_vel, dist, goal_dir], dim=-1)  # [B, 9]
        physics_pred = self.physics_net(physics_input)                          # [B, 3]
        policy_features = self.policy_stream(observations)                      # [B, features_dim]

        fused_input = torch.cat([policy_features, physics_pred], dim=-1)
        return self.fusion(fused_input)

class PhysRobotAgent:
    """PPO agent whose policy uses PhysRobotFeaturesExtractorV2."""

    def __init__(self, env, verbose=1):
        """Create the PPO model on ``env`` with the V2 hyper-parameters."""
        extractor_cfg = {'features_dim': 64}
        head_layout = {'pi': [64, 64], 'vf': [64, 64]}
        policy_kwargs = {
            'features_extractor_class': PhysRobotFeaturesExtractorV2,
            'features_extractor_kwargs': extractor_cfg,
            'net_arch': head_layout,
        }
        self.model = PPO(
            'MlpPolicy',
            env,
            learning_rate=3e-4,
            n_steps=2048,
            batch_size=64,
            n_epochs=10,
            gamma=0.99,
            ent_coef=0.01,
            policy_kwargs=policy_kwargs,
            verbose=verbose,
        )

    def train(self, total_timesteps, callback=None):
        """Run PPO learning for ``total_timesteps`` with a progress bar."""
        learn_args = dict(total_timesteps=total_timesteps, callback=callback, progress_bar=True)
        self.model.learn(**learn_args)

    def predict(self, obs, deterministic=True):
        """Return only the action chosen by the policy for ``obs``."""
        return self.model.predict(obs, deterministic=deterministic)[0]

    def save(self, path):
        """Save the underlying PPO model to ``path``."""
        self.model.save(path)

    def evaluate(self, env, n_episodes=100):
        """Roll out ``n_episodes`` episodes and return mean/std reward and success rate."""
        returns = []
        outcomes = []
        episode_idx = 0
        while episode_idx < n_episodes:
            obs, info = env.reset()
            running_total = 0
            episode_over = False
            while not episode_over:
                step_action = self.predict(obs)
                obs, reward, terminated, truncated, info = env.step(step_action)
                running_total += reward
                episode_over = terminated or truncated
            returns.append(running_total)
            if info.get('success', False):
                outcomes.append(1)
            else:
                outcomes.append(0)
            episode_idx += 1
        return {'mean_reward': np.mean(returns), 'std_reward': np.std(returns),
                'success_rate': np.mean(outcomes)}

# Print parameter counts
def count_params(model):
    """Return the total number of scalar entries over ``model.parameters()``."""
    total = 0
    for tensor in model.parameters():
        total += tensor.numel()
    return total

# Build one throw-away vectorised environment and one instance of each agent
# purely to report the size of each policy network (features extractor plus
# policy/value heads), then discard them.
_tmp_env = DummyVecEnv([make_push_box_env()])
_ppo = PurePPOAgent(_tmp_env, verbose=0)
_gns = GNSAgent(_tmp_env, verbose=0)
_pr = PhysRobotAgent(_tmp_env, verbose=0)
print(f'Parameter counts:')
print(f'  Pure PPO:     {count_params(_ppo.model.policy):>8,}')
print(f'  GNS V2:       {count_params(_gns.model.policy):>8,}')
print(f'  PhysRobot V2: {count_params(_pr.model.policy):>8,}')
# Drop the temporary agents and close the environment before real training.
del _ppo, _gns, _pr
_tmp_env.close()
print('\u2705 All 3 agents V2 loaded')


In [None]:
# V2: Increased timesteps, more eval episodes
# Experiment settings shared by the training, evaluation and OOD cells.
CONFIG = {
    'ppo_timesteps': 500_000,        # training budget for Pure PPO
    'gns_timesteps': 500_000,        # training budget for GNS V2
    'physrobot_timesteps': 500_000,  # training budget for PhysRobot V2
    'n_envs': 4,                     # parallel environments per method
    'box_mass': 0.5,                 # box mass used for training and evaluation
    'eval_episodes': 100,            # evaluation episodes per method
}
print('V2 Configuration:', CONFIG)

# The estimate assumes about 5 minutes per 200K timesteps and sums the three
# budgets, so it stays correct if the methods are given different budgets.
_total_timesteps = sum(
    CONFIG[key] for key in ('ppo_timesteps', 'gns_timesteps', 'physrobot_timesteps')
)
print(f'Expected training time: ~{_total_timesteps / 200_000 * 5:.0f} min total on T4 GPU')


In [None]:
%%time
# === TRAINING ALL 3 METHODS (V2) ===
import time
import json
import traceback

# Per-method outcome: either the metrics dict or {'error': message}.
results = {}


def _train_and_evaluate(result_key, ok_label, fail_label, agent_cls, timesteps,
                        model_stem, banner, leading_newline=True):
    """Train one method, evaluate it, record the outcome and save the model.

    Args:
        result_key: Key under which the outcome is stored in ``results``.
        ok_label: Name used in the success message.
        fail_label: Name used in the failure message.
        agent_cls: Agent class, called as ``agent_cls(env, verbose=0)``.
        timesteps: Training budget passed to ``agent.train``.
        model_stem: File stem under ``{SAVE_DIR}/models/`` for the saved model.
        banner: Text printed (after the rocket emoji) in the section header.
        leading_newline: Whether a blank line precedes the section header.

    Returns:
        ``(agent, callback)``. Any exception raised while training,
        evaluating or saving is printed with its traceback and recorded as
        ``results[result_key] = {'error': ...}`` instead of propagating.
    """
    rule = '=' * 60
    print(('\n' if leading_newline else '') + rule)
    # Emojis are written as \U escapes: surrogate-pair escapes such as
    # '\ud83d\ude80' are lone surrogates in a Python str and fail on print.
    print(f'\U0001f680 {banner}')
    print(rule)

    env = DummyVecEnv([make_push_box_env(CONFIG['box_mass']) for _ in range(CONFIG['n_envs'])])
    agent = agent_cls(env, verbose=0)
    callback = SuccessTrackingCallback(verbose=1)
    eval_env = None
    start = time.time()
    try:
        agent.train(timesteps, callback=callback)
        train_time = time.time() - start
        eval_env = PushBoxEnv(box_mass=CONFIG['box_mass'])
        eval_res = agent.evaluate(eval_env, n_episodes=CONFIG['eval_episodes'])
        results[result_key] = {
            'episodes_to_success': callback.episodes_to_success,
            'total_episodes': callback.episode_count,
            'total_successes': callback.success_count,
            'timesteps': timesteps,
            'train_time': train_time,
            'success_rate': eval_res['success_rate'],
            'mean_reward': eval_res['mean_reward'],
            'std_reward': eval_res['std_reward'],
        }
        agent.save(f'{SAVE_DIR}/models/{model_stem}')
        print(f'\u2705 {ok_label}: {eval_res["success_rate"]:.2%} success, {train_time/60:.1f} min')
    except Exception as e:
        # Best effort: one failing method must not prevent the others from running.
        print(f'\u274c {fail_label} failed: {e}')
        traceback.print_exc()
        results[result_key] = {'error': str(e)}
    finally:
        # Release both environments whether or not the run succeeded.
        if eval_env is not None:
            eval_env.close()
        env.close()
    return agent, callback


# Method 1: Pure PPO
agent1, callback1 = _train_and_evaluate(
    'Pure PPO', 'PPO', 'PPO', PurePPOAgent, CONFIG['ppo_timesteps'], 'ppo_v2',
    'TRAINING PURE PPO (V2: ent_coef=0.01, 500K steps)', leading_newline=False)

# Method 2: GNS V2
agent2, callback2 = _train_and_evaluate(
    'GNS V2', 'GNS V2', 'GNS', GNSAgent, CONFIG['gns_timesteps'], 'gns_v2',
    'TRAINING GNS V2 (hidden=32, 1 MP layer, ~5K params)')

# Method 3: PhysRobot V2
agent3, callback3 = _train_and_evaluate(
    'PhysRobot V2', 'PhysRobot V2', 'PhysRobot', PhysRobotAgent,
    CONFIG['physrobot_timesteps'], 'physrobot_v2',
    'TRAINING PHYSROBOT V2 (MLP physics, ~6K params)')

print('\n' + '='*60)
print('\U0001f389 ALL TRAINING COMPLETE')
print('='*60)
for m, r in results.items():
    if 'error' not in r:
        print(f'  {m}: {r["success_rate"]:.2%} success ({r["train_time"]/60:.1f} min)')


In [None]:
# === RESULTS COMPARISON ===
import pandas as pd
import matplotlib.pyplot as plt

import json  # repeated so this cell does not rely on the training cell's import

# Only methods that finished without an error are reported, in run order.
ok_results = {method: res for method, res in results.items() if 'error' not in res}

df_data = []
for method, res in ok_results.items():
    # The key is always present but holds None when no training episode
    # succeeded, so a dict default would never apply; test for None instead.
    first_success = res.get('episodes_to_success')
    df_data.append({
        'Method': method,
        'Success Rate': f"{res['success_rate']:.1%}",
        'Mean Reward': f"{res['mean_reward']:.1f} \u00b1 {res['std_reward']:.1f}",
        'First Success (ep)': 'N/A' if first_success is None else first_success,
        'Train Time (min)': f"{res['train_time']/60:.1f}",
        'Timesteps': f"{res['timesteps']/1000:.0f}K"
    })

df = pd.DataFrame(df_data)
# Emojis use \U escapes; surrogate-pair escapes are not printable in Python.
print('\n\U0001f4ca Results Comparison (V2):')
print(df.to_string(index=False))

# Comparison with V1
print('\n\U0001f4ca Comparison with V1 results:')
v1 = {'Pure PPO': '6%', 'GNS': '0%', 'PhysRobot': '0%'}
for m, r in ok_results.items():
    v1_rate = v1.get(m.replace(' V2', ''), '?')
    print(f'  {m}: V1={v1_rate} \u2192 V2={r["success_rate"]:.1%}')

# Save (error entries included, exactly as recorded)
with open(f'{SAVE_DIR}/results/training_results_v2.json', 'w') as f:
    json.dump(results, f, indent=2, default=str)
print(f'\n\U0001f4be Saved to {SAVE_DIR}/results/training_results_v2.json')

# Bar chart
if df_data:
    methods = list(ok_results)
    rates = [ok_results[m]['success_rate'] * 100 for m in methods]
    # Colours and V1 baselines are looked up by method name so they stay
    # attached to the right bar even when one of the methods failed.
    method_colors = {'Pure PPO': '#4CAF50', 'GNS V2': '#2196F3', 'PhysRobot V2': '#FF9800'}
    colors = [method_colors.get(m, 'gray') for m in methods]
    fig, ax = plt.subplots(figsize=(8, 5))
    bars = ax.bar(methods, rates, color=colors)
    ax.set_ylabel('Success Rate (%)')
    ax.set_title('Week 1 V2: Training Results (500K steps)')
    ax.set_ylim(0, 100)
    for bar, rate in zip(bars, rates):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 2,
                f'{rate:.1f}%', ha='center', va='bottom', fontweight='bold')
    # Dashed V1 baseline over each bar; only the first line carries the
    # legend label so the legend shows the dashed line exactly once.
    baseline_drawn = False
    for i, m in enumerate(methods):
        v1_text = v1.get(m.replace(' V2', ''))
        if v1_text is None:
            continue
        v1r = float(v1_text.rstrip('%'))
        ax.plot([i-0.3, i+0.3], [v1r, v1r], 'r--', linewidth=2, alpha=0.5,
                label=None if baseline_drawn else 'V1 baseline')
        baseline_drawn = True
    if baseline_drawn:
        ax.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(f'{SAVE_DIR}/results/success_rates_v2.png', dpi=150)
    plt.show()
    print(f'\U0001f4be Chart saved')


In [None]:
# === OOD GENERALIZATION TEST (V2) ===
# Re-imported so this cell does not depend on imports made by earlier cells.
import json
import matplotlib.pyplot as plt

# Emojis use \U escapes; surrogate-pair escapes are not printable in Python.
print('\n\U0001f9ea OOD Generalization Test (different box masses)...')
mass_range = [0.25, 0.5, 0.75, 1.0, 1.5, 2.0, 3.0]
ood_results = {}
n_ood_episodes = 100

# Only agents whose training finished without an error are tested. The agent
# variables are looked up lazily by name in the notebook namespace.
agents_map = {}
for name, agent_var in [('Pure PPO', 'agent1'), ('GNS V2', 'agent2'), ('PhysRobot V2', 'agent3')]:
    if name in results and 'error' not in results[name]:
        agents_map[name] = globals()[agent_var]

for method_name, agent in agents_map.items():
    print(f'\nTesting {method_name}...')
    method_results = []
    for mass in mass_range:
        test_env = PushBoxEnv(box_mass=mass)
        try:
            # Reuse the agent's own rollout loop; its success_rate is the
            # fraction of episodes whose final info reports success.
            rate = float(agent.evaluate(test_env, n_episodes=n_ood_episodes)['success_rate'])
        finally:
            test_env.close()
        method_results.append({'mass': mass, 'success_rate': rate})
        marker = '\u2705' if rate > 0.3 else '\u26a0\ufe0f' if rate > 0 else '\u274c'
        print(f'  Mass {mass:.2f}: {rate:.1%} {marker}')
    ood_results[method_name] = method_results

# Save and plot
with open(f'{SAVE_DIR}/results/ood_results_v2.json', 'w') as f:
    json.dump(ood_results, f, indent=2)

if ood_results:
    fig, ax = plt.subplots(figsize=(10, 6))
    colors = {'Pure PPO': '#4CAF50', 'GNS V2': '#2196F3', 'PhysRobot V2': '#FF9800'}
    for method_name, method_results in ood_results.items():
        masses = [r['mass'] for r in method_results]
        rates = [r['success_rate'] * 100 for r in method_results]
        ax.plot(masses, rates, 'o-', label=method_name,
                color=colors.get(method_name, 'gray'), linewidth=2, markersize=8)
    # Vertical marker at the mass used for training.
    ax.axvline(x=0.5, color='gray', linestyle='--', alpha=0.5, label='Training mass')
    ax.set_xlabel('Box Mass (kg)')
    ax.set_ylabel('Success Rate (%)')
    ax.set_title('OOD Generalization: Success Rate vs Box Mass')
    ax.legend()
    ax.set_ylim(-5, 105)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'{SAVE_DIR}/results/ood_generalization_v2.png', dpi=150)
    plt.show()
print(f'\n\U0001f4be OOD results saved')


In [None]:
# === FINAL SUMMARY ===
# Emojis are written as single \U escapes: the former surrogate-pair escapes
# (e.g. '\ud83c\udfaf') are lone surrogates in a Python str and cannot be
# encoded when printed.
print('\n' + '='*60)
print('\U0001f3af WEEK 1 V2 EXPERIMENT COMPLETE')
print('='*60)
print(f'Models: {SAVE_DIR}/models/')
print(f'Results: {SAVE_DIR}/results/')

# One line per method that trained without an error.
print('\n\U0001f4ca Key Results:')
for method, res in results.items():
    if 'error' not in res:
        print(f'  {method}: {res["success_rate"]:.1%} success ({res["train_time"]/60:.1f}m)')

print('\n\U0001f4dd Changes from V1:')
print('  1. Reward: progress-based + 500 success bonus + action penalty')
print('  2. GNS: 500K\u2192~5K params (hidden 128\u219232, 3\u21921 MP layer)')
print('  3. PhysRobot: 391K\u2192~6K params (MLP physics, no GNN)')
print('  4. Timesteps: 200K\u2192500K, ent_coef=0.01')
print('  5. Success threshold: 0.1\u21920.15m')

print('\n\U0001f52e Next steps (P1):')
print('  - Fix ee_vel (Jacobian from joint velocities)')
print('  - Implement proper antisymmetric edge frame')
print('  - Multi-seed experiments (5 seeds)')
print('  - Multi-object environment')
