In [1]:
import gym
import myrl
from myrl.environments import Envs
from myrl.policies import GaussianPolicy, CategoricalPolicy
from myrl.value_functions import ValueFunctionMLP, polyak
from myrl.visualizer import showit
from myrl.utils import clip_grad_norm_, ExperimentWriter, check_output, Discrete2Continuous

import torch
import copy
import torch.nn as nn

best = None
bestr = float('-inf')
bestep = -1
wll = ExperimentWriter('tb/ppo_that_works_')

In [27]:
env = gym.make('Pendulum-v0')
idim = env.observation_space.shape[0]
odim = env.action_space.shape[0]

pi = GaussianPolicy([idim, 32, odim], std=0.15)
# pi = CategoricalPolicy([idim, 32, 5])
# picon = Discrete2Continuous(-2, 2, 5, pi.act)
vfunc = ValueFunctionMLP([idim, 128, 32, odim])
tvfunc = copy.deepcopy(vfunc)
envs = Envs('Pendulum-v0', 256)
envs.evenout(200)
idim, odim, pi

255 / 256

(3,
 1,
 GaussianPolicy(
   (layers): ModuleList(
     (0): Linear(in_features=3, out_features=32, bias=True)
     (1): Linear(in_features=32, out_features=1, bias=True)
     (2): Linear(in_features=32, out_features=1, bias=True)
   )
 ))

In [28]:
opt = torch.optim.Adam(pi.parameters(), lr=1e-2)
copt = torch.optim.Adam(vfunc.parameters(), lr=1e-1)

In [29]:
wll.new()
writer = wll.writer
eps = 10000
gamma = 0.97

best = None
bestr = float('-inf')
bestep = -1

for ep in range(eps):
    oldobs, a, r, obs, d, oldprobs, smpls, _, _ = envs.rollout(pi.act, length=1, debug=0)
    adv = r - vfunc(oldobs).detach() + vfunc(obs).detach()*gamma*(1-d)
    adv = ((adv-adv.mean())/(adv.std()+1e-8)).detach()
    oldobs, a, r, obs, d, oldprobs, smpls = oldobs.detach(), a.detach(), r.detach(), obs.detach(), d.detach(), oldprobs.detach(), smpls.detach()
    for optstep in range(4):
        _, (newprobs, _, _, _) = pi.act(oldobs, smpl=smpls)
        ratio = torch.exp(newprobs-oldprobs)
        e = 0.15
        clipped = (ratio != torch.clamp(ratio, 1-e, 1+e)).float().mean().detach().item()
        loss = -torch.min(ratio*adv, torch.clamp(ratio, 1-e, 1+e)*adv).mean()
        loss *= 100000
        opt.zero_grad()
        loss.backward()
        # clip_grad_norm_(opt, 0.5)
        opt.step()
    
    for optstep in range(4):
        td = ((r + gamma*tvfunc(obs).detach() - vfunc(oldobs))**2).mean()
        copt.zero_grad()
        td.backward()
        copt.step()

    polyak(vfunc, tvfunc, 1-1/50)


    if ep%4==0:
        print(ep, loss.item(), "r", r.mean().item(), " td", td.item(), "clip", clipped)
    if ep%120==0:
        showit(env, pi.act)
        env.close()
    writer.add_scalar("a/loss", loss.item(), ep)
    writer.add_scalar("a/td", td.item(), ep)
    writer.add_scalar("a/reward", r.mean(), ep)
    writer.add_scalar("a/clipped", clipped, ep)
    writer.add_scalar("a/lr", opt.param_groups[0]['lr'], ep)



515625
692 89.9278564453125 r -5.553956031799316  td 0.5701225996017456 clip 0.0625
696 -60.144813537597656 r -5.584073543548584  td 8.002304077148438 clip 0.03125
700 -0.28833746910095215 r -5.585497856140137  td 0.7295847535133362 clip 0.07421875
704 -66.462158203125 r -5.464475154876709  td 1.2523040771484375 clip 0.05859375
708 -121.712890625 r -5.318703651428223  td 3.197056770324707 clip 0.09765625
712 -133.3583221435547 r -5.336618900299072  td 0.9044182300567627 clip 0.0625
716 260.7818603515625 r -5.183304309844971  td 3.5305228233337402 clip 0.0625
720 -262.1909484863281 r -5.436185836791992  td 1.3257014751434326 clip 0.06640625
198 /2000724 311.2313232421875 r -5.658764362335205  td 2.188645839691162 clip 0.13671875
728 -16.53679656982422 r -5.561578750610352  td 4.219154357910156 clip 0.0390625
732 -385.4676208496094 r -5.677513122558594  td 1.1578238010406494 clip 0.0625
736 123.79363250732422 r -5.153835296630859  td 0.9272525310516357 clip 0.0703125
740 12.8499231338500

KeyboardInterrupt: 

In [22]:
m = torch.tensor([[0.01, 0.1, 1], [10, 100, 1000]])
std = torch.zeros_like(m) + 0.0001 + 1 -1
dis = torch.distributions.Normal(m, std)
smpls = dis.sample()
smpls

tensor([[1.0115e-02, 9.9966e-02, 1.0002e+00],
        [1.0000e+01, 1.0000e+02, 1.0000e+03]])

In [24]:
torch.exp(dis.log_prob(smpls))

tensor([[2059.3035, 3760.7412,  877.5601],
        [1357.1128, 1720.5764, 1893.9595]])

In [23]:
dis.log_prob(smpls)

tensor([[7.6301, 8.2324, 6.7771],
        [7.2131, 7.4504, 7.5464]])

In [None]:
    if bestr < r.mean().item():
        bestr = r.mean().item()
        bestep = ep
        best = copy.deepcopy(pi)
    if ep - bestep > 25 and 0:
        pi = copy.deepcopy(best)
        opt = torch.optim.Adam(pi.parameters(), lr=1e-1)
        shedualer_increase = torch.optim.lr_scheduler.MultiplicativeLR(opt, lambda ep:1.05)
        shedualer_decrease = torch.optim.lr_scheduler.MultiplicativeLR(opt, lambda ep:0.7)
        bestep = ep
        print("swap")

In [4]:
from myrl.utils import get_batch_obs
bobs = get_batch_obs(env, 32)
a, (d, s, h) = pi(bobs)

ValueError: too many values to unpack (expected 3)

In [90]:
check_output(env, pi)

tensor([[0.5899]], grad_fn=<AddmmBackward>)

In [91]:
from myrl.policies import CategoricalPolicy
pi2 = CategoricalPolicy([3, 16, 5])

In [26]:
showit(env, pi.act)

198 /2000

-744.5402928564353