In [3]:
import gym
from myrl.environments import Envs
from myrl.policies import GaussianPolicy
from myrl.value_functions import ValueFunctionMLP, polyak
from myrl.visualizer import showit
from myrl.utils import ExperimentWriter, check_output, global_gradient_clip, list_gae, gae, add_list2list
from myrl.buffers import ReplayBuffer
from myrl.exploration import RandomNetworkBonus
import torch
import copy
import pickle
import numpy as np

# envname = 'LunarLanderContinuous-v2'
envname = 'Pendulum-v0'

wll = ExperimentWriter('tb/ppo_pen_works_largeGAE_')
env = gym.make(envname)
idim = env.observation_space.shape[0]
odim = env.action_space.shape[0]
envs = Envs(envname, 16)
envs.evenout(200)
idim, odim




(3, 1)

In [4]:
def objective_function(trial):
    env = gym.make(envname)
    envs = Envs(envname, 128)
    envs.evenout(200)
    seed = trial.suggest_int("seed", 0, 5)
    np.random.seed(seed)
    envs.seed(seed)
    torch.manual_seed(seed)
    eps = 50
    lr_policy = trial.suggest_loguniform("lr_policy", 1e-4, 5e-4)
    lr_vfunc  = trial.suggest_loguniform("lr_vfunc", 1e-4, 1e2)
    w1_pi     = trial.suggest_int("width_layer1_policy", 8, 128)
    #w2_pi     = trial.suggest_int("width_layer2_policy", 1, 64)
    w1_vfunc  = trial.suggest_int("width_layer1_vfunc", 4, 128)
    w2_vfunc  = trial.suggest_int("width_layer2_vfunc", 4, 128)
    std_start = trial.suggest_uniform("std_start", 0.7, 1.5)
    std_min   = trial.suggest_uniform("std_min", 0.15, 0.6)
    gamma     = trial.suggest_loguniform("gamma", 0.90, 1e0)
    lmbda     = trial.suggest_loguniform("lambda", 0.01, 1e0)
    polyak_alpha = trial.suggest_loguniform("polyak_alpha", 0.97, 1e0)

    pi = GaussianPolicy([idim, w1_pi, odim], std_start=std_start, std_min=std_min, clip_range=[-2, 2])
    vfunc = ValueFunctionMLP([idim, w1_vfunc, w2_vfunc, 1])
    tvfunc = copy.deepcopy(vfunc)
    opt = torch.optim.Adam(pi.parameters(), lr=lr_policy)
    copt = torch.optim.Adam(vfunc.parameters(), lr=lr_vfunc)

    return train(eps, pi, vfunc, tvfunc, opt, copt, gamma, lmbda, polyak_alpha, 'ppo_pendulum_')


In [5]:
def train(eps, pi, vfunc, tvfunc, opt, copt, gamma, lmbda, polyak_alpha, study_name, debug=False, device=torch.device('cpu')):
    rew_set = False
    rew_ma = None
    alpha = 1-1/15
    vfunc.to(device)
    pi.to(device)
    n_mini_batches = 20
    
    if debug:
        wll.new()
        writer = wll.writer
    for ep in range(0, eps):
        l = envs.list_rollout(pi.act, gamma=gamma, length=160)
        rss = l[0][2].mean().item()
        adv = list_gae(l, tvfunc, gamma, lmbda=lmbda)
        add_list2list(adv, l)
        rbuff = ReplayBuffer(nitems=10, max_len=100000)
        rbuff.list_add(l)

        for imini_batch in range(n_mini_batches):
            mini_batch_size = int(len(rbuff)/(n_mini_batches/3))
            oldobs, a, r, obs, d, oldprobs, smpls, muold, stdold, gae = rbuff.get(mini_batch_size)
            oldobs, a, r, obs, d, oldprobs = oldobs.to(device), a.to(device), r.to(device), obs.to(device), d.to(device), oldprobs.to(device)
            smpls, muold, stdold, gae = smpls.to(device), muold.to(device), stdold.to(device), gae.to(device)
            adv = gae.detach() - vfunc(oldobs)
            adv = ((adv-adv.mean())/(adv.std()+1e-8)).detach()
            for optstep in range(2):
                _, (newprobs, _, mu, std) = pi.act(oldobs, smpl=smpls)
                p = torch.distributions.Normal(muold, stdold)
                q = torch.distributions.Normal(mu, std)
                kldiv = torch.distributions.kl_divergence(p, q)
                if kldiv.mean().item() > 0.03:
                    print("stopppp", kldiv.mean().item(), optstep)
                    break
                ratio = torch.exp(newprobs-oldprobs)
                e = 0.15
                clipped = (ratio != torch.clamp(ratio, 1-e, 1+e)).float().mean().detach().item()
                loss = -torch.min(ratio*adv, torch.clamp(ratio, 1-e, 1+e)*adv).mean()
                opt.zero_grad()
                loss.backward()
                # global_gradient_clip(pi)
                opt.step()
            
            for optstep in range(2):                    
                td = ((gae.detach() - vfunc(oldobs))**2).mean()
                copt.zero_grad()
                td.backward()
                # global_gradient_clip(vfunc)
                copt.step()


        polyak(vfunc, tvfunc, polyak_alpha)
        if rew_set:
            rew_ma = alpha*rew_ma+(1-alpha)*r.mean().item()
        else:
            rew_ma = r.mean().item()
            rew_set = True
        print(ep, end='\r')

        if debug:
            if ep%1==0:
                print(ep, loss.item(), "r", rss, " td", td.item(), "clip", clipped, "___", pi.last_std.mean().item())
            if ep%20==0:
                pass
                # print("showit=",showit(env, pi.to(torch.device('cpu')).act, max_steps=650))
                # env.close()
            writer.add_scalar("a/std", pi.last_std.mean().item(), ep)
            writer.add_scalar("a/loss", loss.item(), ep)
            writer.add_scalar("a/td", td.item(), ep)
            writer.add_scalar("a/reward", r.mean(), ep)
    rfinal = oldobs, a, r, obs, d, oldprobs, smpls, muold, stdold, gae = rbuff.get(len(rbuff)-1)
    with open('optuna/'+study_name+'.pckl', 'wb') as f:
        pickle.dump(study, f)
    if debug:
        return rew_ma, pi, vfunc
    return -rew_ma


In [12]:
study.optimize(objective_function, n_trials=200, n_jobs=1)

0.017025327625858438, 'polyak_alpha': 0.9873055825706365}. Best is trial 66 with value: 5.33431339263916.[0m
[32m[I 2020-10-24 13:59:01,026][0m Trial 174 finished with value: 6.937841188192962 and parameters: {'seed': 1, 'lr_policy': 0.0001361565999469788, 'lr_vfunc': 23.403027932180144, 'width_layer1_policy': 90, 'width_layer1_vfunc': 122, 'width_layer2_vfunc': 95, 'std_start': 0.9680687111607704, 'std_min': 0.47841446554868183, 'gamma': 0.9686299563665551, 'lambda': 0.030112676497760258, 'polyak_alpha': 0.9820420856300301}. Best is trial 66 with value: 5.33431339263916.[0m
[32m[I 2020-10-24 14:00:18,103][0m Trial 175 finished with value: 5.927765783728624 and parameters: {'seed': 1, 'lr_policy': 0.00020479098357992817, 'lr_vfunc': 4.813878808064894, 'width_layer1_policy': 71, 'width_layer1_vfunc': 32, 'width_layer2_vfunc': 117, 'std_start': 0.8482650237675269, 'std_min': 0.5718168826406769, 'gamma': 0.9669846527816939, 'lambda': 0.022704265650466973, 'polyak_alpha': 0.989345754

KeyboardInterrupt: 

In [25]:
print(study.best_params)
study.best_params['lr_policy']
some = study.get_trials()[202].params

{'lr_policy': 0.000392148144561186, 'lr_vfunc': 0.9557335221938151, 'width_layer1_policy': 100, 'width_layer1_vfunc': 189, 'width_layer2_vfunc': 85, 'std_start': 0.6401847525381115, 'gamma': 0.821649998393548, 'lambda': 0.02157171173951722, 'polyak_alpha': 0.7743138632219162}


In [8]:
# study = optuna.create_study(study_name='PPO_Pendulum')
import pickle  
import optuna

with open('optuna/PPO_Pendulum.pckl', 'rb') as f:
    best_study = pickle.load(f)
best_study.best_params

{'lr_policy': 0.000392148144561186,
 'lr_vfunc': 0.9557335221938151,
 'width_layer1_policy': 100,
 'width_layer1_vfunc': 189,
 'width_layer2_vfunc': 85,
 'std_start': 0.6401847525381115,
 'gamma': 0.821649998393548,
 'lambda': 0.02157171173951722,
 'polyak_alpha': 0.7743138632219162}

In [6]:
params = {
'lr_policy': 0.000392148144561186,
 'lr_vfunc': 0.5,
 'width_layer1_policy': 32,
 'width_layer1_vfunc': 64,
 'width_layer2_vfunc': 32,
 'std_start': 0.640,
 'gamma': 0.95,
 'lambda': 0.75,
 'polyak_alpha': 1-1/500,
 'std_min': 0.1
 }

In [7]:
def best_f(eps=100):
    from_what = params
    seed = 0
    np.random.seed(seed)
    env.seed(seed)
    torch.manual_seed(seed)
    eps = eps
    lr_policy = from_what["lr_policy"]
    lr_vfunc  = from_what["lr_vfunc"]
    w1_pi     = from_what["width_layer1_policy"]
    w1_vfunc  = from_what["width_layer1_vfunc"]
    w2_vfunc  = from_what["width_layer2_vfunc"]
    std_start = from_what["std_start"]
    std_min   = from_what["std_min"]
    gamma     = from_what["gamma"]
    lmbda     = from_what["lambda"]
    polyak_alpha = from_what["polyak_alpha"]

    pi = GaussianPolicy([idim, w1_pi, odim], std_start=std_start, std_min=std_min)
    vfunc = ValueFunctionMLP([idim, w1_vfunc, w2_vfunc, 1])
    tvfunc = copy.deepcopy(vfunc)
    opt = torch.optim.Adam(pi.parameters(), lr=lr_policy)
    copt = torch.optim.Adam(vfunc.parameters(), lr=lr_vfunc)

    return train(eps, pi, vfunc, tvfunc, opt, copt, gamma, lmbda, polyak_alpha, "test", debug=True)

pi, vfunc, rfinal = best_f(200)

5  td 8.895524024963379 clip 0.01092896144837141 ___ 0.5623748898506165
30 -0.0015395451337099075 r -5.202552795410156  td 16.493202209472656 clip 0.09649122506380081 ___ 0.5593835711479187
31 -0.00513425562530756 r -5.802738189697266  td 8.722978591918945 clip 0.008196720853447914 ___ 0.5545316934585571
32 -0.006186817307025194 r -3.639064311981201  td 17.85369300842285 clip 0.02631578966975212 ___ 0.5506178140640259
33 -0.004197955597192049 r -5.867679119110107  td 5.00850248336792 clip 0.03551912680268288 ___ 0.5539911985397339
34 0.003887142287567258 r -5.85380220413208  td 11.565130233764648 clip 0.15789473056793213 ___ 0.5597400069236755
35 0.00031011970713734627 r -7.002562522888184  td 4.498677730560303 clip 0.008196720853447914 ___ 0.5601415038108826
36 -0.002927321707829833 r -6.360142707824707  td 22.315746307373047 clip 0.008771929889917374 ___ 0.5613582134246826
37 -0.005352415610104799 r -4.032168388366699  td 5.926567077636719 clip 0.08196721225976944 ___ 0.5592589378356

NameError: name 'study' is not defined