In [1]:
import gym
import torch
import copy
from myrl.buffers import ReplayBuffer
from myrl.visualizer import showit
from myrl.utils import ExperimentWriter, Discrete2Continuous, RunningNormalizer
from myrl.value_functions import polyak, DQN
from myrl.policies import RandomPolicy
from myrl.environments import Envs
import myrl
from myrl.utils import get_batch_obs
import numpy as np
import random 

envname = 'LunarLander-v2'
env = gym.make(envname)
envs = Envs(envname, 1)
seed = 3
np.random.seed(seed)
envs.seed(seed)

wll = ExperimentWriter('tb/'+str(random.randint(10000, 99999))+'dqn_' + envname + "-")

import pickle
import optuna

In [2]:
def objective(trial):
    env = gym.make(envname)
    envs = Envs(envname, 1)
    seed = trial.suggest_int("seed", 0, 5)
    np.random.seed(seed)
    envs.seed(seed)
    eps = 1000
    rbuff_size = trial.suggest_int("rbuff_size", 1000, 100000)
    hdim1 = trial.suggest_int("width_layer1_vfunc", 16, 256)
    hdim2 = trial.suggest_int("width_layer2_vfunc", 16, 256)
    lr    = trial.suggest_loguniform("lr", 1e-3, 1e2)
    gamma = trial.suggest_loguniform("gamma", 0.98, 1e0)
    polyak_alpha = trial.suggest_loguniform("polyak_alpha", 0.95, 1e0)
    e_greedy = trial.suggest_uniform("e_greedy", 0.08, 0.4)
    print("objective")
    
    return train(eps, rbuff_size, hdim1, hdim2, lr, gamma, polyak_alpha, e_greedy, 'dqn_all')

In [3]:
def train(eps, rbuff_size, hdim1, hdim2, lr, gamma, polyak_alpha, e_greedy, study_name, debug=False, device=torch.device('cpu'), save_study=True, dqn_file=None):
    adim = 4 #env.action_space.shape[0]
    sdim = env.observation_space.shape[0]
    if dqn_file is not None:
        dqn = DQN([sdim, hdim1, hdim2, adim]).to(device)
    else:
        with open(dqn_file, 'rb') as f:
            dqn = pickle.load(f)

    tdqn = copy.deepcopy(dqn)
    opt = torch.optim.Adam(dqn.parameters(), lr=lr)
    rbuff = ReplayBuffer(rbuff_size)

    if debug:
        wll.new()
        writer = wll.writer
    bsize = 512
    warmup = 50
    loss = torch.zeros(1, 1)
    collector = lambda obs: dqn.cpu().act(obs, epsilon=e_greedy)
    shower    = lambda obs: dqn.cpu().act(obs, epsilon=0)
    random_policy = RandomPolicy(env).act
    rew_ma_start = -12345
    rew_ma = rew_ma_start
    alpha = 1 - 1/15
    print("start")

    for ep in range(0, eps):
        if ep%1==0:
            pi = collector if len(rbuff)>bsize*5 else random_policy
            oldobs, a, r, obs, d, _, _, _ = envs.rollout(pi)
            rbuff.add(oldobs, a, r, obs, d)
            if debug:
                writer.add_scalar('dqn/reward', r.sum(), ep)
            if rew_ma == rew_ma_start:
                rew_ma = r.sum().item()
            else:
                rew_ma = alpha*rew_ma + (1-alpha)*r.sum().item()
        if ep%100==0:
            print(ep, rew_ma, len(rbuff))#, end='\r')
        if bsize*4 > len(rbuff):
            # print(len(rbuff), end='\r')
            continue
        if debug:
            if ep%10==0:
                print(ep, loss.item(), r.sum().item(), wll.z)
            if ep%50==0:
                print("showit = ", showit(env, shower, time_delta=0.01))
        # print("eppp"*10, ep)
        for i in range(50):
            # print(i)
            oldobs, a, r, obs, done = rbuff.get(bsize)
            oldobs, a, r, obs, done = oldobs.to(device), a.to(device), r.to(device), obs.to(device), done.to(device)
            for opt_step in range(3):
                target = tdqn.get_max(obs)*gamma*(1-done) + r
                dqn.to(device)
                calc = dqn.get_q(oldobs, a)
                loss = ((target - calc)**2).mean()
                opt.zero_grad()
                loss.backward()
                opt.step()
            polyak(dqn, tdqn, 1-1/750)
        if debug:
            writer.add_scalar('dqn/model', loss.item(), ep)
    if save_study:
        with open('optuna/' + study_name + '.pckl', 'wb') as f:
            pickle.dump(study, f)
    else:
        with open('optuna/' + envname + str(random.randint(0, 100000)) + '.pckl', 'wb') as f:
            pickle.dump(dqn, f)
    if debug:
        return -rew_ma, dqn
    return -rew_ma

In [24]:
# study = optuna.create_study()
study_name = 'lunarlander'
study.optimize(objective, n_trials=200, n_jobs=1)

: 139, 'width_layer2_vfunc': 108, 'lr': 6.1048738889983785, 'gamma': 0.9809804529910606, 'polyak_alpha': 0.9522893842061843, 'e_greedy': 0.3211007290521547}. Best is trial 8 with value: -18.180229011035912.[0m
objective
start
0 -293.72625732421875 127
100 -200.1617618103446 8059
200 -215.22784083452615 15689
300 -188.12780305140515 23457
400 -162.531576326161 31575
500 -188.23998974536806 39481
600 -219.86125406858844 47384
700 -230.35792337761572 55137
800 -167.91333250174523 63252
900 -139.51547336848614 70801
[32m[I 2020-10-23 23:51:29,611][0m Trial 10 finished with value: 161.18924558921276 and parameters: {'seed': 3, 'rbuff_size': 43667, 'width_layer1_vfunc': 255, 'width_layer2_vfunc': 48, 'lr': 0.06770154896486837, 'gamma': 0.9818982951884994, 'polyak_alpha': 0.9584622721732654, 'e_greedy': 0.22078515443556646}. Best is trial 8 with value: -18.180229011035912.[0m
objective
start
0 -66.03731536865234 120
100 -742.8005135586562 12109
200 -735.5304574905459 24498
300 -635.137417

KeyboardInterrupt: 

In [26]:
study.best_params, study.best_value

({'seed': 3,
  'rbuff_size': 59381,
  'width_layer1_vfunc': 153,
  'width_layer2_vfunc': 220,
  'lr': 0.002635551778581981,
  'gamma': 0.9996860608966027,
  'polyak_alpha': 0.9713959737104305,
  'e_greedy': 0.2015436906349644},
 -110.98360406523054)

In [45]:
t = {'seed': 3,
  'rbuff_size': 59381,
  'width_layer1_vfunc': 153,
  'width_layer2_vfunc': 220,
  'lr': 0.007,#0.002635551778581981,
  'gamma': 0.9996860608966027,
  'polyak_alpha': 0.9713959737104305,
  'e_greedy': 0.2015436906349644}

In [67]:
seed = 3
np.random.seed(seed)
envs.seed(seed)
# t = study.best_params
#       def train(eps, rbuff_size, hdim1, hdim2, lr, gamma, polyak_alpha, e_greedy, study_name, debug=False):
dqn_file = 'optuna/LunarLander-v288024.pckl'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("using", device)
rew, dqn = train(600, t['rbuff_size'], t['width_layer1_vfunc'], t['width_layer2_vfunc'], t['lr'], t['gamma'], t['polyak_alpha'], t['e_greedy'], 'final', debug=True, device=torch.device('cuda'), save_study=False, dqn_file=dqn_file)

using cuda
start
0 -308.8083190917969 124
30 14.982115745544434 -132.8147430419922 2
40 4.198455810546875 -92.33831787109375 2
50 8.084043502807617 -143.2115936279297 2
showit =  -72.65378352864694
60 10.400001525878906 -101.22713470458984 2
70 13.777146339416504 -59.03606414794922 2
80 3.6312055587768555 -188.18115234375 2
90 15.936710357666016 -228.52159118652344 2
100 -143.44290743814992 25989
100 3.323150157928467 -225.41937255859375 2
showit =  -124.12244103232533
110 7.867131233215332 -118.1345443725586 2
120 4.227761745452881 -295.7260437011719 2
130 6.9953460693359375 -130.1145477294922 2
140 6.6211395263671875 -162.5361328125 2
150 6.532442092895508 -354.2683410644531 2
showit =  -124.6585020750267
160 14.443611145019531 -100.92578887939453 2
170 5.078551292419434 -99.27471923828125 2
180 6.672517776489258 -197.13316345214844 2
190 2.5633692741394043 -88.73200988769531 2
200 -85.16217947181985 98624
200 3.1840453147888184 -102.52192687988281 2
showit =  -119.26770071557965
210

In [38]:
shower    = lambda obs: dqn.to(torch.device('cpu')).act(obs, epsilon=0.0)
print("showit = ", showit(env, shower, time_delta=0.001, save_file='videos/lunar_lander9.avi', fps=100))

showit =  -1.0038239166753868


In [4]:
with open('optuna/LunarLander-v24995.pckl', 'rb') as f:
    dqn = pickle.load(f)
