In [1]:
import gym
import torch
from myrl.buffers import ReplayBuffer, PrioritizedReplayBuffer
from myrl.visualizer import showit
from myrl.utils import ExperimentWriter, get_batch_obs, get_batch_a, Discrete2Continuous
from myrl.value_functions import polyak, DQN
from myrl.policies import RandomPolicy
from myrl.environments import Envs

wll = ExperimentWriter('tb/dqn_for_github_')

envname = 'Pendulum-v0'

env = gym.make(envname)
envs = Envs(envname, 1)
env.observation_space.sample()

array([ 0.6727337, -0.7722899, -5.7983904], dtype=float32)

In [2]:
rbuff = ReplayBuffer(1000000)

In [20]:
print(env.action_space, env.observation_space)
# adim = env.action_space.n 
adim = 7
sdim = env.observation_space.shape[0]
dqn = DQN([sdim, 64, 32, adim])
dqn_cont = Discrete2Continuous(-2, 2, adim, dqn.act).act
import copy
tdqn = copy.deepcopy(dqn)
dqn

Box(-2.0, 2.0, (1,), float32) Box(-8.0, 8.0, (3,), float32)


DQN(
  (layers): ModuleList(
    (0): Linear(in_features=3, out_features=64, bias=True)
    (1): Linear(in_features=64, out_features=32, bias=True)
    (2): Linear(in_features=32, out_features=7, bias=True)
  )
)

In [9]:
opt = torch.optim.Adam(dqn.parameters(), lr=1e-1)

In [22]:
wll.new()
writer = wll.writer
bsize = 128
warmup = 100
gamma = 0.97

collector = lambda obs: dqn.act(obs, epsilon=0.15)
cont_collector = Discrete2Continuous(-2, 2, adim, collector).act
shower    = lambda obs: dqn.act(obs, epsilon=0)
cont_shower = Discrete2Continuous(-2, 2, adim, shower).act
random_policy = RandomPolicy(env, True).act


for ep in range(0, 100000):
    if ep%1==0:
        pi = cont_collector if len(rbuff)*0>=bsize*30 else random_policy
        oldobs, _, r, obs, d, _, _, _, a = envs.rollout(pi, debug=0)
        rbuff.add(oldobs, a, r, obs, d)
    writer.add_scalar('dqn/reward', r.mean(), ep)
    rew = r.sum().item()
    

    if bsize*5 > len(rbuff):
        print(len(rbuff), end='\r')
        continue

    for jup in range(20):
        oldobs, a, r, obs, done = rbuff.get(bsize)
        for opt_step in range(3):
            target = tdqn.get_max(obs) * (1-done)
            calc = dqn.get_q(oldobs, a)
            loss = ((r + gamma*target - calc)**2).mean()
            opt.zero_grad()
            loss.backward()
            opt.step()          
        polyak(dqn, tdqn, 1-1/400)


    writer.add_scalar('dqn/model', loss.item(), ep)

    
    if ep%10==0:
        print(ep, loss.item(), rew, "jup")
    if ep%30==0:
        print(cont_shower(get_batch_obs(env, 10))[-1][-1].transpose(0, 1))
    if ep%500==0:
        print("showit = ", showit(env, cont_shower, time_delta=0.01, max_steps=1000))


9140625 -1217.428955078125 jup
33260 48.78451919555664 -1144.470947265625 jup
33270 44.68959045410156 -1246.6986083984375 jup
tensor([[4, 1, 1, 3, 0, 1, 1, 3, 4, 0]])
33280 49.94258117675781 -1209.6519775390625 jup
33290 52.66849899291992 -1507.5042724609375 jup
33300 48.48670959472656 -1473.4912109375 jup
tensor([[1, 4, 1, 0, 4, 1, 1, 1, 1, 4]])
33310 54.777462005615234 -1007.6829223632812 jup
33320 50.992591857910156 -1662.701904296875 jup
33330 50.831607818603516 -1794.418701171875 jup
tensor([[4, 1, 0, 4, 1, 1, 1, 4, 4, 4]])
33340 57.545894622802734 -1071.7847900390625 jup
33350 47.501731872558594 -1529.8084716796875 jup
33360 43.547607421875 -892.4580688476562 jup
tensor([[4, 1, 4, 4, 1, 1, 1, 4, 1, 1]])
33370 40.36446762084961 -849.4303588867188 jup
33380 49.456581115722656 -1452.39208984375 jup
33390 44.80898666381836 -901.8341674804688 jup
tensor([[4, 4, 1, 1, 1, 1, 1, 1, 4, 1]])
33400 53.312652587890625 -744.5797729492188 jup
33410 45.87380599975586 -1158.294921875 jup
33420 4

KeyboardInterrupt: 

In [12]:
showit(env, cont_collector ,time_delta=0.01)



-1467.5481380216147

In [19]:
cont_shower = Discrete2Continuous(-2, 2, adim, shower).act
showit(env, cont_shower ,time_delta=0.01)



-1764.5133321127173

In [21]:
cont_shower(get_batch_obs(env, 10))[-1][-1]

tensor([[0],
        [1],
        [0],
        [4],
        [1],
        [1],
        [4],
        [1],
        [1],
        [1]])