In [2]:
import gym
from myrl.environments import Envs
from myrl.policies import GaussianPolicy
from myrl.value_functions import ValueFunctionMLP, polyak
from myrl.visualizer import showit
from myrl.utils import ExperimentWriter, check_output, global_gradient_clip 
from myrl.buffers import ReplayBuffer
import torch
import copy

# Task to train on. 'LunarLanderContinuous-v2' is the other environment this
# notebook has been pointed at; swap the string here to switch tasks.
envname = 'Pendulum-v0'

wll = ExperimentWriter('tb/ppo_pendulum_test_thatworks_')
# Single environment: used here to read the space sizes and later for showit().
env = gym.make(envname)
idim = env.observation_space.shape[0]  # length of the observation vector
odim = env.action_space.shape[0]       # length of the action vector
# Rollout environments. NOTE(review): presumably 256 copies of the task that
# are stepped together; evenout(200) semantics live in myrl — confirm there.
envs = Envs(envname, 256)
envs.evenout(200)
idim, odim

255 / 256

(3, 1)

In [60]:
# Policy: maps an observation to a Gaussian over actions (odim outputs).
pi = GaussianPolicy([idim, 64, 64, odim], std_start=0.9, std_min=0.1)
# State-value network: exactly one output per observation. The width used to be
# `odim`, which is only correct when the action space happens to be 1-D.
vfunc = ValueFunctionMLP([idim, 64, 64, 1])
# Copy of the value network that the training loop updates via polyak().
tvfunc = copy.deepcopy(vfunc)

In [61]:
# Exploration bonus built from a fixed random network and a trained estimator.
# NOTE: RandomNetworkBonus is defined in a later cell of this notebook; on a
# fresh kernel that cell has to be executed before this one.
# (A second `tvfunc = copy.deepcopy(vfunc)` used to follow here; it repeated
# the copy made when vfunc was created and has been dropped.)
bonus = RandomNetworkBonus([idim, 128, 1], [idim, 128, 1], lr=1e-4, opt_steps=1)

In [62]:
# Two independent Adam optimizers: `opt` drives the policy parameters,
# `copt` drives the value-function (critic) parameters.
POLICY_LR = 3e-4
VALUE_LR = 2e-4
opt = torch.optim.Adam(params=pi.parameters(), lr=POLICY_LR)
copt = torch.optim.Adam(params=vfunc.parameters(), lr=VALUE_LR)

In [63]:
# PPO-style training loop: collect a rollout, then run several clipped-ratio
# policy updates and squared-error critic updates on minibatches of it.
wll.new()
writer = wll.writer
eps = 10000
gamma = 1

# Hyperparameters that were previously inlined in the loop body.
n_minibatches = 15   # minibatches drawn from each rollout
n_opt_steps = 4      # gradient steps per minibatch (policy and critic each)
clip_eps = 0.15      # clipping range for the probability ratio
kl_limit = 0.03      # stop policy steps on a minibatch once mean KL exceeds this

# The exploration bonus is switched off (see the commented lines in the loop),
# so its contribution to the logged reward is zero. If the bonus is re-enabled,
# set this to the mean scaled bonus of the rollout.
bonus_mean = 0.0

# Most recent diagnostics, kept as plain floats so that logging never touches
# an undefined name when the KL early stop skips every policy step.
loss_val = float('nan')
clipped = float('nan')
td_val = float('nan')

for ep in range(0, eps):
    oldobs, a, r, obs, d, oldprobs, smpls, muold, stdold = envs.rollout(pi.act)
    # bon = bonus.get_bonus(obs)
    # bonus.step(obs)
    # r += bon*100
    r = envs.discounted_sum(r, gamma)
    # A fresh buffer every iteration: only data from this rollout is sampled.
    rbuff = ReplayBuffer(nitems=9, max_len=100000)
    rbuff.add(oldobs, a, r, obs, d, oldprobs, smpls, muold, stdold)

    for imini_batch in range(n_minibatches):
        oldobs, a, r, obs, d, oldprobs, smpls, muold, stdold = rbuff.get(len(rbuff)//10)
        # Advantage = return minus value baseline, standardised per minibatch.
        # NOTE(review): assumes r and vfunc(oldobs) have the same shape; a
        # (N,) vs (N, 1) mismatch would silently broadcast to (N, N) — confirm.
        adv = r - vfunc(oldobs).detach() #+ vfunc(obs).detach()*gamma*(1-d)
        adv = ((adv-adv.mean())/(adv.std()+1e-8)).detach()
        for optstep in range(n_opt_steps):
            # Re-evaluate the stored samples under the current policy.
            _, (newprobs, _, mu, std) = pi.act(oldobs, smpl=smpls)
            p = torch.distributions.Normal(muold, stdold)
            q = torch.distributions.Normal(mu, std)
            kldiv = torch.distributions.kl_divergence(p, q)
            if kldiv.mean().item() > kl_limit:
                # Policy moved too far from the rollout policy on this batch.
                break
            # exp(new - old) is the probability ratio when both are log-probs.
            ratio = torch.exp(newprobs-oldprobs)
            ratio_clamped = torch.clamp(ratio, 1-clip_eps, 1+clip_eps)
            # Fraction of samples whose ratio was actually clipped (diagnostic).
            clipped = (ratio != ratio_clamped).float().mean().item()
            loss = -torch.min(ratio*adv, ratio_clamped*adv).mean()
            opt.zero_grad()
            loss.backward()
            global_gradient_clip(pi)
            opt.step()
            loss_val = loss.item()

        for optstep in range(n_opt_steps):
            # Regress the value function onto the discounted returns.
            td = ((r - vfunc(oldobs))**2).mean()
            copt.zero_grad()
            td.backward()
            copt.step()
            td_val = td.item()

        polyak(vfunc, tvfunc, 1-1/50)

    # `r` is the return tensor of the last minibatch drawn above.
    reward_mean = r.mean().item()

    if ep%4==0:
        print(ep, loss_val, "r", reward_mean-bonus_mean, " td", td_val, "clip", clipped, "___", pi.last_std.mean().item(), bonus_mean)
    if ep%50==0:
        print("showit=",showit(env, pi.act, max_steps=650))
        env.close()

    writer.add_scalar("a/loss", loss_val, ep)
    writer.add_scalar("a/td", td_val, ep)
    writer.add_scalar("a/reward", reward_mean, ep)
    writer.add_scalar("a/clipped", clipped, ep)
    writer.add_scalar("a/lr", opt.param_groups[0]['lr'], ep)



KeyboardInterrupt: 

In [65]:
import gym

from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.cmd_util import make_vec_env

# Baseline for comparison: Stable-Baselines3 PPO on the same task.
# Parallel environments
env = make_vec_env(envname, n_envs=4)

# TensorBoard output is only written when `tensorboard_log` is passed to the
# constructor; `tb_log_name` is merely the run name inside that directory.
# The relative 'tb' directory matches the one used by ExperimentWriter above.
model = PPO(MlpPolicy, env, verbose=2, tensorboard_log='tb')
model.learn(total_timesteps=50000, log_interval=1, tb_log_name='ppo_stable_4')

# Release the training environments before `env` is rebound below.
env.close()

# Roll the trained model out in a single rendered environment.
env = gym.make(envname)
obs = env.reset()

for i in range(200):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
    if dones:
        # Start a new episode instead of stepping one that has finished.
        obs = env.reset()
env.close()

Using cpu device
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.24e+03 |
| time/              |           |
|    fps             | 3906      |
|    iterations      | 1         |
|    time_elapsed    | 2         |
|    total_timesteps | 8192      |
----------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -1.21e+03   |
| time/                   |             |
|    fps                  | 1847        |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.001112595 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.41       |
|    explained_variance   | -6e+03      |
|  

In [2]:
class RandomNetworkBonus():
    """Novelty bonus from a fixed random network and a trained estimator.

    The per-observation error is ``(random_net(obs) + estimator_net(obs)) ** 2``.
    Only ``estimator_net`` is optimised, so the error shrinks as the estimator
    learns to cancel the random network's output on the observations it is
    trained on.

    Args:
        random_net_arch: architecture spec passed to ``ValueFunctionMLP`` for
            the fixed random network.
        estimator_net_arch: architecture spec passed to ``ValueFunctionMLP``
            for the trained estimator.
        lr: Adam learning rate for the estimator.
        opt_steps: gradient steps taken per call to ``step``.
        weight: multiplier applied to the value returned by ``get_bonus``.
    """

    def __init__(self, random_net_arch, estimator_net_arch, lr, opt_steps=2, weight=1):
        self.random_net = ValueFunctionMLP(random_net_arch)
        self.estimator_net = ValueFunctionMLP(estimator_net_arch)
        # Only the estimator's parameters are handed to the optimizer.
        self.opt = torch.optim.Adam(self.estimator_net.parameters(), lr=lr)
        self.opt_steps = opt_steps
        self.weight = weight

    def _error(self, obs):
        """Return the element-wise squared error for ``obs``."""
        # The random network is a fixed target: evaluate it without building a
        # graph so no gradients pile up on parameters that are never updated.
        with torch.no_grad():
            target = self.random_net(obs)
        return (target + self.estimator_net(obs)) ** 2

    def step(self, obs):
        """Run ``opt_steps`` Adam updates of the estimator on ``obs``."""
        for _ in range(self.opt_steps):
            loss = self._error(obs).mean()
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()

    def get_bonus(self, obs):
        """Return the weighted bonus for ``obs`` without tracking gradients.

        Returns:
            A Python float when the result holds exactly one element,
            otherwise a tensor with the shape of the network output.
        """
        with torch.no_grad():
            bonus = self._error(obs) * self.weight
        # numel() also covers 0-d and empty results, where indexing shape[0]
        # or calling .item() would raise.
        if bonus.numel() == 1:
            return bonus.item()
        return bonus

In [None]:
class IntrinsicCuriosityModule():
    """Skeleton for an intrinsic curiosity module; the models are not built yet."""
    def __init__(self):
        # None placeholders until the two models are implemented
        # (`pass` is a statement and cannot be assigned).
        self.inverse_model = None
        self.forward_model = None
        

In [70]:
# Run the current policy in the single gym environment through showit.
# NOTE(review): relies on `env` and `pi` left over from earlier cells; `env` is
# rebound by the Stable-Baselines3 cell, so confirm it is the intended one.
showit(env, pi.act)

284 /2000

182.735506752273