In [27]:
import gym
import myrl
from myrl.environments import Envs
from myrl.policies import GaussianPolicy, CategoricalPolicy
from myrl.value_functions import ValueFunctionMLP, polyak
from myrl.visualizer import showit
from myrl.utils import clip_grad_norm_, ExperimentWriter, check_output, Discrete2Continuous

import torch
import copy
import torch.nn as nn

best = None
bestr = float('-inf')
bestep = -1
wll = ExperimentWriter('tb/ppo_take_infinity__+++_tryagain_')

In [2]:
env = gym.make('Pendulum-v0')
idim = env.observation_space.shape[0]
odim = env.action_space.shape[0]

pi = GaussianPolicy([idim, 32, odim], std=0.15)
# pi = CategoricalPolicy([idim, 32, 5])
picon = Discrete2Continuous(-2, 2, 5, pi.act)
vfunc = ValueFunctionMLP([idim, 128, 32, odim])
tvfunc = copy.deepcopy(vfunc)
envs = Envs('Pendulum-v0', 256)
envs.evenout(200)
idim, odim, pi

255 / 256

(3,
 1,
 GaussianPolicy(
   (layers): ModuleList(
     (0): Linear(in_features=3, out_features=32, bias=True)
     (1): Linear(in_features=32, out_features=1, bias=True)
     (2): Linear(in_features=32, out_features=1, bias=True)
   )
 ))

In [3]:
opt = torch.optim.Adam(pi.parameters(), lr=1e-2)
copt = torch.optim.Adam(vfunc.parameters(), lr=1e-1)

In [4]:
wll.new()
writer = wll.writer
eps = 10000
gamma = 0.97

best = None
bestr = float('-inf')
bestep = -1

for ep in range(eps):
    oldobs, a, r, obs, d, oldprobs, smpls, _ = envs.rollout(pi.act, length=1, debug=0)
    adv = r - vfunc(oldobs).detach() + vfunc(obs).detach()*gamma*(1-d)
    adv = ((adv-adv.mean())/(adv.std()+1e-8)).detach()
    oldobs, a, r, obs, d, oldprobs, smpls = oldobs.detach(), a.detach(), r.detach(), obs.detach(), d.detach(), oldprobs.detach(), smpls.detach()
    for optstep in range(4):
        _, (newprobs, _, ps) = pi.act(oldobs, smpl=smpls)
        ratio = torch.exp(newprobs-oldprobs)
        e = 0.15
        clipped = (ratio != torch.clamp(ratio, 1-e, 1+e)).float().mean().detach().item()
        loss = -torch.min(ratio*adv, torch.clamp(ratio, 1-e, 1+e)*adv).mean()
        loss *= 100000
        opt.zero_grad()
        loss.backward()
        clip_grad_norm_(opt, 0.5)
        opt.step()
    
    for optstep in range(4):
        td = ((r + gamma*tvfunc(obs).detach() - vfunc(oldobs))**2).mean()
        copt.zero_grad()
        td.backward()
        copt.step()

    polyak(vfunc, tvfunc, 1-1/50)


    if ep%4==0:
        print(ep, loss.item(), "r", r.mean().item(), " td", td.item(), "clip", clipped)
    if ep%120==0:
        showit(env, pi.act)
        env.close()
    writer.add_scalar("a/loss", loss.item(), ep)
    writer.add_scalar("a/td", td.item(), ep)
    writer.add_scalar("a/reward", r.mean(), ep)
    writer.add_scalar("a/clipped", clipped, ep)
    writer.add_scalar("a/lr", opt.param_groups[0]['lr'], ep)



15975952148438 r -7.180079460144043  td 2.0554332733154297 clip 0.37890625
284 300.3746643066406 r -7.183871269226074  td 0.027320606634020805 clip 0.421875
288 -959.1957397460938 r -7.254042148590088  td 0.0681377574801445 clip 0.33203125
292 -383.1043701171875 r -7.201418399810791  td 0.07073328644037247 clip 0.09375
296 -290.7246398925781 r -7.1575541496276855  td 0.024050330743193626 clip 0.2109375
300 -471.5356140136719 r -6.99013614654541  td 0.07959326356649399 clip 0.1484375
304 270.5162048339844 r -6.922449111938477  td 0.05915914848446846 clip 0.3828125
308 -181.8395233154297 r -6.979433059692383  td 0.061166178435087204 clip 0.05859375
312 -135.28298950195312 r -6.979071617126465  td 0.20902277529239655 clip 0.25390625
316 -134.2525634765625 r -6.927289962768555  td 0.5249055624008179 clip 0.2421875
320 -113.45204162597656 r -6.973128795623779  td 0.3565579056739807 clip 0.0390625
324 -680.51171875 r -6.879942893981934  td 0.8860902786254883 clip 0.49609375
328 -89.878929138

KeyboardInterrupt: 

In [22]:
m = torch.tensor([[0.01, 0.1, 1], [10, 100, 1000]])
std = torch.zeros_like(m) + 0.0001 + 1 -1
dis = torch.distributions.Normal(m, std)
smpls = dis.sample()
smpls

tensor([[1.0115e-02, 9.9966e-02, 1.0002e+00],
        [1.0000e+01, 1.0000e+02, 1.0000e+03]])

In [24]:
torch.exp(dis.log_prob(smpls))

tensor([[2059.3035, 3760.7412,  877.5601],
        [1357.1128, 1720.5764, 1893.9595]])

In [23]:
dis.log_prob(smpls)

tensor([[7.6301, 8.2324, 6.7771],
        [7.2131, 7.4504, 7.5464]])

In [None]:
    if bestr < r.mean().item():
        bestr = r.mean().item()
        bestep = ep
        best = copy.deepcopy(pi)
    if ep - bestep > 25 and 0:
        pi = copy.deepcopy(best)
        opt = torch.optim.Adam(pi.parameters(), lr=1e-1)
        shedualer_increase = torch.optim.lr_scheduler.MultiplicativeLR(opt, lambda ep:1.05)
        shedualer_decrease = torch.optim.lr_scheduler.MultiplicativeLR(opt, lambda ep:0.7)
        bestep = ep
        print("swap")

In [4]:
from myrl.utils import get_batch_obs
bobs = get_batch_obs(env, 32)
a, (d, s, h) = pi(bobs)

ValueError: too many values to unpack (expected 3)

In [90]:
check_output(env, pi)

tensor([[0.5899]], grad_fn=<AddmmBackward>)

In [91]:
from myrl.policies import CategoricalPolicy
pi2 = CategoricalPolicy([3, 16, 5])

In [26]:
showit(env, pi.act)

198 /2000

-2.1498235748277876