In [12]:
import gym
from myrl.environments import Envs
from myrl.policies import GaussianPolicy
from myrl.value_functions import ValueFunctionMLP, polyak
from myrl.visualizer import showit
from myrl.utils import ExperimentWriter, check_output, global_gradient_clip 
from myrl.buffers import ReplayBuffer
import torch
import copy

envname = 'LunarLanderContinuous-v2'
# envname = 'Pendulum-v0'

wll = ExperimentWriter('tb/ppo_lun_curious_')
env = gym.make(envname)
idim = env.observation_space.shape[0]
odim = env.action_space.shape[0]
envs = Envs(envname, 4)
# envs.evenout(200)
idim, odim

(8, 2)

In [17]:
pi = GaussianPolicy([idim, 64, 64, odim], std_start=0.9, std_min=0.1)
vfunc = ValueFunctionMLP([idim, 64, 64, odim])
tvfunc = copy.deepcopy(vfunc)

In [18]:
bonus = RandomNetworkBonus([idim, 128, 1], [idim, 128, 1], lr=1e-4, opt_steps=1)
tvfunc = copy.deepcopy(vfunc)


In [19]:
opt = torch.optim.Adam(pi.parameters(), lr=3e-4)
copt = torch.optim.Adam(vfunc.parameters(), lr=2e-4)

In [20]:
wll.new()
writer = wll.writer
eps = 10000
gamma = 1

for ep in range(0, eps):
    oldobs, a, r, obs, d, oldprobs, smpls, muold, stdold = envs.rollout(pi.act, length=10000)
    bon = bonus.get_bonus(obs)
    bonus.step(obs)
    r += bon*100*0
    r = envs.discounted_sum(r, gamma)
    rbuff = ReplayBuffer(nitems=9, max_len=100000)
    rbuff.add(oldobs, a, r, obs, d, oldprobs, smpls, muold, stdold)
    # print(len(rbuff), "leni")

    for imini_batch in range(1):
        oldobs, a, r, obs, d, oldprobs, smpls, muold, stdold = rbuff.get(len(rbuff)//1-1)
        adv = r - vfunc(oldobs).detach() #+ vfunc(obs).detach()*gamma*(1-d)
        adv = ((adv-adv.mean())/(adv.std()+1e-8)).detach()
        for optstep in range(4):
            _, (newprobs, _, mu, std) = pi.act(oldobs, smpl=smpls)
            p = torch.distributions.Normal(muold, stdold)
            q = torch.distributions.Normal(mu, std)
            kldiv = torch.distributions.kl_divergence(p, q)
            if kldiv.mean().item() > 0.03:
                # print("stopppp", kldiv.mean().item(), optstep)
                break
            # print(kldiv)
            ratio = torch.exp(newprobs-oldprobs)
            e = 0.15
            clipped = (ratio != torch.clamp(ratio, 1-e, 1+e)).float().mean().detach().item()
            loss = -torch.min(ratio*adv, torch.clamp(ratio, 1-e, 1+e)*adv).mean()
            opt.zero_grad()
            loss.backward()
            global_gradient_clip(pi)
            opt.step()
        
        for optstep in range(4):                    
            td = ((r - vfunc(oldobs))**2).mean()
            copt.zero_grad()
            td.backward()
            copt.step()

        rsss =  r.mean().item()
        polyak(vfunc, tvfunc, 1-1/50)


    if ep%4==0:
        print(ep, loss.item(), "r", r.mean().item()-bon.mean().item()*100, " td", td.item(), "clip", clipped, "___", pi.last_std.mean().item(), bon.mean().item()*100)
    if ep%50==0:
        print("showit=",showit(env, pi.act, max_steps=650))
        env.close()
        # print(mu.squeeze(1)[:20])


    writer.add_scalar("a/loss", loss.item(), ep)
    writer.add_scalar("a/td", td.item(), ep)
    writer.add_scalar("a/reward", r.mean(), ep)
    writer.add_scalar("a/clipped", clipped, ep)
    writer.add_scalar("a/lr", opt.param_groups[0]['lr'], ep)



0 -0.0029380368068814278 r -38.44097754219547  td 3250.54248046875 clip 0.0 ___ 0.720962405204773 0.11002203682437539
showit= -174.1745670771495
4 -0.001292870263569057 r -27.718106530606747  td 2330.606201171875 clip 0.0 ___ 0.7267155051231384 2.882533334195614
8 -0.0013082309160381556 r -52.77108614798635  td 9683.8623046875 clip 0.0 ___ 0.7256836295127869 0.36927645560353994
12 -0.0004536219057627022 r -76.2408279273659  td 13141.2587890625 clip 0.0 ___ 0.7238295674324036 0.37869871594011784
16 -0.0041170730255544186 r -32.46529074013233  td 4120.0478515625 clip 0.0 ___ 0.7229253053665161 1.2391764670610428
20 -0.0028424065094441175 r -18.98022198677063  td 1.9729820489883423 clip 0.0 ___ 0.7244485020637512 18.671494722366333
24 -0.008192663080990314 r -35.165094615891576  td 4106.28564453125 clip 0.0 ___ 0.7192338705062866 0.5841810721904039
28 -0.0007566241547465324 r -71.89846699032933  td 12180.4306640625 clip 0.0 ___ 0.72895348072052 0.2779683331027627
32 0.0002081262064166367 

KeyboardInterrupt: 

In [3]:
import gym

from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.cmd_util import make_vec_env

# Parallel environments
env = make_vec_env(envname, n_envs=4)

model = PPO(MlpPolicy, env, verbose=2)
model.learn(total_timesteps=250000, log_interval=1, tb_log_name='/home/darijan/mujoco-py/tb/ppo_stable_4')

env = gym.make(envname)
obs = env.reset()

for i in range(200):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
env.close()

|    clip_range           | 0.2         |
|    entropy_loss         | -2.75       |
|    explained_variance   | 0.657       |
|    learning_rate        | 0.0003      |
|    loss                 | 20.8        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.00768    |
|    std                  | 0.959       |
|    value_loss           | 99.6        |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 131         |
|    ep_rew_mean          | -11.4       |
| time/                   |             |
|    fps                  | 924         |
|    iterations           | 11          |
|    time_elapsed         | 97          |
|    total_timesteps      | 90112       |
| train/                  |             |
|    approx_kl            | 0.007990321 |
|    clip_fraction        | 0.0781      |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.74 

In [16]:
class RandomNetworkBonus():
    def __init__(self, random_net_arch, estimator_net_arch, lr, opt_steps=2, weight=1):
        self.random_net = ValueFunctionMLP(random_net_arch)
        self.estimator_net = ValueFunctionMLP(estimator_net_arch)
        self.opt = torch.optim.Adam(self.estimator_net.parameters(), lr=lr)
        self.opt_steps = opt_steps
        self.weight = weight
    def step(self, obs):
        for i in range(self.opt_steps):
            loss = ((self.random_net(obs) + self.estimator_net(obs))**2).mean()
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()
    def get_bonus(self, obs):
        loss = ((self.random_net(obs) + self.estimator_net(obs))**2).detach()
        if loss.shape[0] > 1:
            return loss
        return loss.item()

In [None]:
class IntrinsicCuriosityModule():
    def __init__(self, ):
        self.inverse_model = pass
        self.forward_model = pass
        

In [70]:
showit(env, pi.act)

284 /2000

182.735506752273

In [23]:
obs = env.reset()
rsum = 0
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
    import time
    time.sleep(0.01)
    rsum += rewards
    print(i, end='\r')
    if dones:
        break
        print("gotovo")
    
env.close()

999

In [9]:
rsum

103.13485146332415