Commit

Merge pull request #197 from zuoxingdong/metric
Update metric: independent of Trajectory object, but clear arguments …
zuoxingdong committed Jul 1, 2019
2 parents 5174394 + a8f49b0 commit 1b8748f
Showing 6 changed files with 54 additions and 52 deletions.
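The net effect of the diff below is a calling-convention change: every metric now takes the raw rewards and a reach_terminal flag instead of a Trajectory object. A minimal before/after sketch of a call site, assuming the functions are importable from lagom.metric (as the changed file paths suggest) and that a trajectory exposes rewards and reach_terminal as in the tests further down:

import numpy as np
from lagom.metric import bootstrapped_returns, gae   # assumed import path

gamma, lam = 0.99, 0.95
rewards = [0.1, 0.2, 0.3]    # per-step rewards of one trajectory
Vs = [1.0, 2.0, 3.0]         # V(s_0), ..., V(s_{T-1})
last_V = 4.0                 # V(s_T), bootstrap value for a truncated trajectory
reach_terminal = False       # whether the trajectory ended in a terminal state

# Before this commit the call sites passed the Trajectory object itself,
# e.g. bootstrapped_returns(gamma, traj, last_V); now the pieces are explicit:
Qs = bootstrapped_returns(gamma, rewards, last_V, reach_terminal)
As = gae(gamma, lam, rewards, Vs, last_V, reach_terminal)
print(Qs, As)
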
6 changes: 3 additions & 3 deletions baselines/ppo/agent.py
@@ -127,9 +127,9 @@ def learn_one_update(self, data):
self.policy_optimizer.zero_grad()
policy_loss.backward()
policy_grad_norm = nn.utils.clip_grad_norm_(self.policy.parameters(), self.config['agent.max_grad_norm'])
self.policy_optimizer.step()
if self.config['agent.use_lr_scheduler']:
self.policy_lr_scheduler.step(self.total_timestep)
self.policy_optimizer.step()

clipped_Vs = old_Vs + torch.clamp(Vs - old_Vs, -eps, eps)
value_loss = torch.max(F.mse_loss(Vs, old_Qs, reduction='none'),
@@ -161,9 +161,9 @@ def learn(self, D, **kwargs):
with torch.no_grad():
last_observations = tensorify(np.concatenate([traj.last_observation for traj in D], 0), self.device)
last_Vs = self.value(last_observations).squeeze(-1)
- Qs = [bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
+ Qs = [bootstrapped_returns(self.config['agent.gamma'], traj.rewards, last_V, traj.reach_terminal)
for traj, last_V in zip(D, last_Vs)]
- As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj, V, last_V)
+ As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj.rewards, V, last_V, traj.reach_terminal)
for traj, V, last_V in zip(D, Vs, last_Vs)]

# Metrics -> Tensor, device
4 changes: 2 additions & 2 deletions baselines/vpg/agent.py
@@ -92,9 +92,9 @@ def learn(self, D, **kwargs):
with torch.no_grad():
last_observations = tensorify(np.concatenate([traj.last_observation for traj in D], 0), self.device)
last_Vs = self.V_head(self.feature_network(last_observations)).squeeze(-1)
- Qs = [bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
+ Qs = [bootstrapped_returns(self.config['agent.gamma'], traj.rewards, last_V, traj.reach_terminal)
for traj, last_V in zip(D, last_Vs)]
- As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj, V, last_V)
+ As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj.rewards, V, last_V, traj.reach_terminal)
for traj, V, last_V in zip(D, Vs, last_Vs)]

# Metrics -> Tensor, device
4 changes: 2 additions & 2 deletions lagom/metric/gae.py
@@ -5,7 +5,7 @@
from .td import td0_error


- def gae(gamma, lam, traj, Vs, last_V):
+ def gae(gamma, lam, rewards, Vs, last_V, reach_terminal):
r"""Calculate the Generalized Advantage Estimation (GAE) of a batch of episodic transitions.
Let :math:`\delta_t` be the TD(0) error at time step :math:`t`, the GAE at time step :math:`t` is calculated
@@ -15,5 +15,5 @@ def gae(gamma, lam, traj, Vs, last_V):
A_t^{\mathrm{GAE}(\gamma, \lambda)} = \sum_{k=0}^{\infty}(\gamma\lambda)^k \delta_{t + k}
"""
- delta = td0_error(gamma, traj, Vs, last_V)
+ delta = td0_error(gamma, rewards, Vs, last_V, reach_terminal)
return geometric_cumsum(gamma*lam, delta)[0].astype(np.float32)
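To make the new contract concrete, here is a minimal NumPy-only sketch of the same GAE computation (an illustration, not lagom's implementation; it mirrors the signature above and the terminal-state masking used by td0_error), checked against the first raw case in test_metric.py below:

import numpy as np

def gae_sketch(gamma, lam, rewards, Vs, last_V, reach_terminal):
    # Append the bootstrap value V(s_T), masked to 0 if a terminal state was reached.
    Vs = np.append(np.asarray(Vs, dtype=np.float32),
                   0.0 if reach_terminal else float(last_V))
    rewards = np.asarray(rewards, dtype=np.float32)
    delta = rewards + gamma*Vs[1:] - Vs[:-1]     # TD(0) errors
    out = np.zeros_like(delta)
    running = 0.0
    for t in reversed(range(len(delta))):        # discounted cumulative sum with factor gamma*lam
        running = delta[t] + gamma*lam*running
        out[t] = running
    return out

# Should reproduce the first raw test case in test_metric.py (terminal trajectory):
print(gae_sketch(1.0, 0.5, [1, 2, 3], [0.1, 1.1, 2.1], 10, True))   # ~[3.725, 3.45, 0.9]
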
12 changes: 6 additions & 6 deletions lagom/metric/returns.py
@@ -4,11 +4,11 @@
from lagom.utils import numpify


- def returns(gamma, traj):
- return geometric_cumsum(gamma, traj.rewards)[0].astype(np.float32)
+ def returns(gamma, rewards):
+ return geometric_cumsum(gamma, rewards)[0, :].astype(np.float32)


- def bootstrapped_returns(gamma, traj, last_V):
+ def bootstrapped_returns(gamma, rewards, last_V, reach_terminal):
r"""Return (discounted) accumulated returns with bootstrapping for a
batch of episodic transitions.
@@ -24,8 +24,8 @@ def bootstrapped_returns(gamma, traj, last_V):
"""
last_V = numpify(last_V, np.float32).item()

- if traj.reach_terminal:
- out = geometric_cumsum(gamma, np.append(traj.rewards, 0.0))
+ if reach_terminal:
+ out = geometric_cumsum(gamma, np.append(rewards, 0.0))
else:
- out = geometric_cumsum(gamma, np.append(traj.rewards, last_V))
+ out = geometric_cumsum(gamma, np.append(rewards, last_V))
return out[0, :-1].astype(np.float32)
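The same masking convention drives bootstrapped_returns: last_V contributes only when the trajectory was truncated rather than terminated. A small NumPy sketch of that contract (an illustration under that assumption, not the library code):

import numpy as np

def bootstrapped_returns_sketch(gamma, rewards, last_V, reach_terminal):
    # Bootstrap from V(s_T) only if the trajectory did not reach a terminal state.
    running = 0.0 if reach_terminal else float(last_V)
    out = np.zeros(len(rewards), dtype=np.float32)
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma*running
        out[t] = running
    return out

print(bootstrapped_returns_sketch(0.9, [1, 2, 3], 10.0, False))  # ~[12.52, 12.8, 12.0]
print(bootstrapped_returns_sketch(0.9, [1, 2, 3], 10.0, True))   # ~[ 5.23,  4.7,  3.0]
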
14 changes: 8 additions & 6 deletions lagom/metric/td.py
@@ -3,7 +3,7 @@
from lagom.utils import numpify


- def td0_target(gamma, traj, Vs, last_V):
+ def td0_target(gamma, rewards, Vs, last_V, reach_terminal):
r"""Calculate TD(0) targets of a batch of episodic transitions.
Let :math:`r_1, r_2, \dots, r_T` be a list of rewards and let :math:`V(s_0), V(s_1), \dots, V(s_{T-1}), V(s_{T})`
@@ -18,18 +18,19 @@ def td0_target(gamma, traj, Vs, last_V):
The state values for terminal states are masked out as zero !
"""
+ rewards = numpify(rewards, np.float32)
Vs = numpify(Vs, np.float32)
last_V = numpify(last_V, np.float32)

- if traj.reach_terminal:
+ if reach_terminal:
Vs = np.append(Vs, 0.0)
else:
Vs = np.append(Vs, last_V)
- out = traj.numpy_rewards + gamma*Vs[1:]
+ out = rewards + gamma*Vs[1:]
return out.astype(np.float32)


- def td0_error(gamma, traj, Vs, last_V):
+ def td0_error(gamma, rewards, Vs, last_V, reach_terminal):
r"""Calculate TD(0) errors of a batch of episodic transitions.
Let :math:`r_1, r_2, \dots, r_T` be a list of rewards and let :math:`V(s_0), V(s_1), \dots, V(s_{T-1}), V(s_{T})`
@@ -44,12 +45,13 @@ def td0_error(gamma, traj, Vs, last_V):
The state values for terminal states are masked out as zero !
"""
+ rewards = numpify(rewards, np.float32)
Vs = numpify(Vs, np.float32)
last_V = numpify(last_V, np.float32)

- if traj.reach_terminal:
+ if reach_terminal:
Vs = np.append(Vs, 0.0)
else:
Vs = np.append(Vs, last_V)
- out = traj.numpy_rewards + gamma*Vs[1:] - Vs[:-1]
+ out = rewards + gamma*Vs[1:] - Vs[:-1]
return out.astype(np.float32)
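For reference, td0_target and td0_error above compute the standard TD(0) quantities. Using the indexing from the docstrings (rewards :math:`r_1, \dots, r_T`, state values :math:`V(s_0), \dots, V(s_T)`):

\hat{y}_t^{\mathrm{TD}(0)} = r_{t+1} + \gamma V(s_{t+1}), \qquad \delta_t = r_{t+1} + \gamma V(s_{t+1}) - V(s_t), \qquad t = 0, \dots, T-1,

where :math:`V(s_T)` is replaced by last_V for a truncated trajectory and masked to zero when reach_terminal is True, matching the note in both docstrings.
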
66 changes: 33 additions & 33 deletions test/test_metric.py
@@ -108,32 +108,32 @@ def test_returns(num_env, init_seed, T):
runner = EpisodeRunner()
D = runner(agent, env, T)
for traj in D:
- Qs = returns(gamma, traj)
+ Qs = returns(gamma, traj.rewards)
assert np.allclose(ys[len(traj)], Qs)

# Raw test
D = Trajectory()
D.dones = [False, False, True]
D.rewards = [1, 2, 3]
- out = returns(1.0, D)
+ out = returns(1.0, D.rewards)
assert np.allclose(out, [6, 5, 3])
- out = returns(0.1, D)
+ out = returns(0.1, D.rewards)
assert np.allclose(out, [1.23, 2.3, 3])

D = Trajectory()
D.dones = [False, False, False, False, False]
D.rewards = [1, 2, 3, 4, 5]
- out = returns(1.0, D)
+ out = returns(1.0, D.rewards)
assert np.allclose(out, [15, 14, 12, 9, 5])
- out = returns(0.1, D)
+ out = returns(0.1, D.rewards)
assert np.allclose(out, [1.2345, 2.345, 3.45, 4.5, 5])

D = Trajectory()
D.dones = [False, False, False, False, False, False, False, True]
D.rewards = [1, 2, 3, 4, 5, 6, 7, 8]
- out = returns(1.0, D)
+ out = returns(1.0, D.rewards)
assert np.allclose(out, [36, 35, 33, 30, 26, 21, 15, 8])
- out = returns(0.1, D)
+ out = returns(0.1, D.rewards)
assert np.allclose(out, [1.2345678, 2.345678, 3.45678, 4.5678, 5.678, 6.78, 7.8, 8])


@@ -145,7 +145,7 @@ def test_bootstrapped_returns(gamma, last_V):
infos = [{}, {}, {}, {}, {}]
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [0.1, 0.2, 0.3, 0.4, 0.5]
- out = bootstrapped_returns(gamma, D, last_V)
+ out = bootstrapped_returns(gamma, D.rewards, last_V, D.reach_terminal)
y = [0.1 + gamma*(0.2 + gamma*(0.3 + gamma*(0.4 + gamma*(0.5 + gamma*last_V)))),
0.2 + gamma*(0.3 + gamma*(0.4 + gamma*(0.5 + gamma*last_V))),
0.3 + gamma*(0.4 + gamma*(0.5 + gamma*last_V)),
@@ -158,15 +158,15 @@ def test_bootstrapped_returns(gamma, last_V):
infos = [{}, {}, {}, {}]
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [0.1, 0.2, 0.3, 0.4]
- out = bootstrapped_returns(gamma, D, last_V)
+ out = bootstrapped_returns(gamma, D.rewards, last_V, D.reach_terminal)
y = [0.1 + gamma*(0.2 + gamma*(0.3 + gamma*(0.4 + gamma*last_V*0.0))),
0.2 + gamma*(0.3 + gamma*(0.4 + gamma*last_V*0.0)),
0.3 + gamma*(0.4 + gamma*last_V*0.0),
0.4 + gamma*last_V*0.0]
assert np.allclose(out, y)

D.step_infos[-1].done = False
- out = bootstrapped_returns(gamma, D, last_V)
+ out = bootstrapped_returns(gamma, D.rewards, last_V, D.reach_terminal)
y = [0.1 + gamma*(0.2 + gamma*(0.3 + gamma*(0.4 + gamma*last_V))),
0.2 + gamma*(0.3 + gamma*(0.4 + gamma*last_V)),
0.3 + gamma*(0.4 + gamma*last_V),
@@ -178,7 +178,7 @@ def test_bootstrapped_returns(gamma, last_V):
infos = [{}, {}, {}, {}, {}]
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [0.1, 0.2, 0.3, 0.4, 0.5]
- out = bootstrapped_returns(gamma, D, last_V)
+ out = bootstrapped_returns(gamma, D.rewards, last_V, D.reach_terminal)
y = [0.1 + gamma*(0.2 + gamma*(0.3 + gamma*(0.4 + gamma*(0.5 + gamma*last_V*0.0)))),
0.2 + gamma*(0.3 + gamma*(0.4 + gamma*(0.5 + gamma*last_V*0.0))),
0.3 + gamma*(0.4 + gamma*(0.5 + gamma*last_V*0.0)),
@@ -187,7 +187,7 @@ def test_bootstrapped_returns(gamma, last_V):
assert np.allclose(out, y)

D.step_infos[-1].done = False
- out = bootstrapped_returns(gamma, D, last_V)
+ out = bootstrapped_returns(gamma, D.rewards, last_V, D.reach_terminal)
y = [0.1 + gamma*(0.2 + gamma*(0.3 + gamma*(0.4 + gamma*(0.5 + gamma*last_V)))),
0.2 + gamma*(0.3 + gamma*(0.4 + gamma*(0.5 + gamma*last_V))),
0.3 + gamma*(0.4 + gamma*(0.5 + gamma*last_V)),
@@ -200,7 +200,7 @@ def test_bootstrapped_returns(gamma, last_V):
infos = [{}, {}]
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [0.1, 0.2]
- out = bootstrapped_returns(gamma, D, last_V)
+ out = bootstrapped_returns(gamma, D.rewards, last_V, D.reach_terminal)
y = [0.1 + gamma*(0.2 + gamma*last_V),
0.2 + gamma*last_V]
assert np.allclose(out, y)
@@ -214,15 +214,15 @@ def test_td0_target(gamma):
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [0.1, 0.2, 0.3, 0.4]
Vs = [1, 2, 3, 4]
- out = td0_target(gamma, D, Vs, 40)
+ out = td0_target(gamma, D.rewards, Vs, 40, D.reach_terminal)
y = [0.1 + gamma*2,
0.2 + gamma*3,
0.3 + gamma*4,
0.4 + gamma*40*0.0]
assert np.allclose(out, y)

D.step_infos[-1].done = False
- out = td0_target(gamma, D, Vs, 40)
+ out = td0_target(gamma, D.rewards, Vs, 40, D.reach_terminal)
y = [0.1 + gamma*2,
0.2 + gamma*3,
0.3 + gamma*4,
@@ -235,7 +235,7 @@ def test_td0_target(gamma):
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [0.1, 0.2, 0.3, 0.4]
Vs = [1, 2, 3, 4]
- out = td0_target(gamma, D, Vs, 40)
+ out = td0_target(gamma, D.rewards, Vs, 40, D.reach_terminal)
y = [0.1 + gamma*2,
0.2 + gamma*3,
0.3 + gamma*4,
@@ -248,7 +248,7 @@ def test_td0_target(gamma):
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
Vs = [1, 2, 3, 4, 5, 6]
- out = td0_target(gamma, D, Vs, 60)
+ out = td0_target(gamma, D.rewards, Vs, 60, D.reach_terminal)
y = [0.1 + gamma*2,
0.2 + gamma*3,
0.3 + gamma*4,
@@ -258,7 +258,7 @@ def test_td0_target(gamma):
assert np.allclose(out, y)

D.step_infos[-1].done = False
- out = td0_target(gamma, D, Vs, 60)
+ out = td0_target(gamma, D.rewards, Vs, 60, D.reach_terminal)
y = [0.1 + gamma*2,
0.2 + gamma*3,
0.3 + gamma*4,
@@ -276,15 +276,15 @@ def test_td0_error(gamma):
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [0.1, 0.2, 0.3, 0.4]
Vs = [1, 2, 3, 4]
- out = td0_error(gamma, D, Vs, 40)
+ out = td0_error(gamma, D.rewards, Vs, 40, D.reach_terminal)
y = [0.1 + gamma*2 - 1,
0.2 + gamma*3 - 2,
0.3 + gamma*4 - 3,
0.4 + gamma*40*0.0 - 4]
assert np.allclose(out, y)

D.step_infos[-1].done = False
- out = td0_error(gamma, D, Vs, 40)
+ out = td0_error(gamma, D.rewards, Vs, 40, D.reach_terminal)
y = [0.1 + gamma*2 - 1,
0.2 + gamma*3 - 2,
0.3 + gamma*4 - 3,
@@ -297,7 +297,7 @@ def test_td0_error(gamma):
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [0.1, 0.2, 0.3, 0.4]
Vs = [1, 2, 3, 4]
- out = td0_error(gamma, D, Vs, 40)
+ out = td0_error(gamma, D.rewards, Vs, 40, D.reach_terminal)
y = [0.1 + gamma*2 - 1,
0.2 + gamma*3 - 2,
0.3 + gamma*4 - 3,
@@ -310,7 +310,7 @@ def test_td0_error(gamma):
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
Vs = [1, 2, 3, 4, 5, 6]
- out = td0_error(gamma, D, Vs, 60)
+ out = td0_error(gamma, D.rewards, Vs, 60, D.reach_terminal)
y = [0.1 + gamma*2 - 1,
0.2 + gamma*3 - 2,
0.3 + gamma*4 - 3,
@@ -320,7 +320,7 @@ def test_td0_error(gamma):
assert np.allclose(out, y)

D.step_infos[-1].done = False
- out = td0_error(gamma, D, Vs, 60)
+ out = td0_error(gamma, D.rewards, Vs, 60, D.reach_terminal)
y = [0.1 + gamma*2 - 1,
0.2 + gamma*3 - 2,
0.3 + gamma*4 - 3,
@@ -337,9 +337,9 @@ def test_gae():
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [1, 2, 3]
Vs = [0.1, 1.1, 2.1]
- out = gae(1.0, 0.5, D, Vs, 10)
+ out = gae(1.0, 0.5, D.rewards, Vs, 10, D.reach_terminal)
assert np.allclose(out, [3.725, 3.45, 0.9])
- out = gae(0.1, 0.2, D, Vs, 10)
+ out = gae(0.1, 0.2, D.rewards, Vs, 10, D.reach_terminal)
assert np.allclose(out, [1.03256, 1.128, 0.9])

D = Trajectory()
@@ -348,9 +348,9 @@ def test_gae():
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [1, 2, 3]
Vs = [0.5, 1.5, 2.5]
- out = gae(1.0, 0.5, D, Vs, 99)
+ out = gae(1.0, 0.5, D.rewards, Vs, 99, D.reach_terminal)
assert np.allclose(out, [3.625, 3.25, 0.5])
- out = gae(0.1, 0.2, D, Vs, 99)
+ out = gae(0.1, 0.2, D.rewards, Vs, 99, D.reach_terminal)
assert np.allclose(out, [0.6652, 0.76, 0.5])

D = Trajectory()
@@ -359,9 +359,9 @@ def test_gae():
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [1, 2, 3, 4, 5]
Vs = [0.5, 1.5, 2.5, 3.5, 4.5]
- out = gae(1.0, 0.5, D, Vs, 20)
+ out = gae(1.0, 0.5, D.rewards, Vs, 20, D.reach_terminal)
assert np.allclose(out, [6.40625, 8.8125, 11.625, 15.25, 20.5])
- out = gae(0.1, 0.2, D, Vs, 20)
+ out = gae(0.1, 0.2, D.rewards, Vs, 20, D.reach_terminal)
assert np.allclose(out, [0.665348, 0.7674, 0.87, 1, 2.5])

D = Trajectory()
@@ -370,9 +370,9 @@ def test_gae():
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [1, 2, 3, 4, 5]
Vs = [0.1, 1.1, 2.1, 3.1, 4.1]
- out = gae(1.0, 0.5, D, Vs, 10)
+ out = gae(1.0, 0.5, D.rewards, Vs, 10, D.reach_terminal)
assert np.allclose(out, [5.80625, 7.6125, 9.225, 10.45, 10.9])
- out = gae(0.1, 0.2, D, Vs, 10)
+ out = gae(0.1, 0.2, D.rewards, Vs, 10, D.reach_terminal)
assert np.allclose(out, [1.03269478, 1.1347393, 1.23696, 1.348, 1.9])

D = Trajectory()
@@ -381,7 +381,7 @@ def test_gae():
D.step_infos = [StepInfo(done, info) for done, info in zip(dones, infos)]
D.rewards = [1, 2, 3, 4, 5, 6, 7, 8]
Vs = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
- out = gae(1.0, 0.5, D, Vs, 30)
+ out = gae(1.0, 0.5, D.rewards, Vs, 30, D.reach_terminal)
assert np.allclose(out, [5.84375, 7.6875, 9.375, 10.75, 11.5, 11., 8, 0.])
- out = gae(0.1, 0.2, D, Vs, 30)
+ out = gae(0.1, 0.2, D.rewards, Vs, 30, D.reach_terminal)
assert np.allclose(out, [0.206164098, 0.308204915, 0.410245728, 0.5122864, 0.61432, 0.716, 0.8, 0])
