In [1]:
import gym
import torch
import copy
from myrl.buffers import ReplayBuffer
from myrl.visualizer import showit
from myrl.utils import ExperimentWriter, Discrete2Continuous
from myrl.value_functions import polyak, DQN
from myrl.policies import RandomPolicy
from myrl.environments import Envs


envname = 'BipedalWalker-v3'
envname = 'MountainCar-v0'
envname = 'Pendulum-v0'
# envname = 'CartPole-v0'
# envname = 'Boxing-ram-v0'
# envname = 'Ant-v2'
# envname = 'LunarLandŽer-v2'


wll = ExperimentWriter('tb/doubledqn_2_ ' + envname + " -")
env = gym.make(envname)
envs = Envs(envname, 1)
env.action_space, env.observation_space, env.observation_space.sample()

(Box(1,), Box(3,), array([0.29680926, 0.19608858, 1.0966952 ], dtype=float32))

In [2]:
rbuff = ReplayBuffer(700000)

In [3]:
print(env.action_space, env.observation_space)
adim = 5#env.action_space.shape[0]
sdim = env.observation_space.shape[0]
dqn1 = DQN([sdim, 32, 16, adim])
dqn2 = DQN([sdim, 32, 16, adim])
tdqn1 = copy.deepcopy(dqn1)
tdqn2 = copy.deepcopy(dqn2)
dqn1

Box(1,) Box(3,)


DQN(
  (layers): ModuleList(
    (0): Linear(in_features=3, out_features=32, bias=True)
    (1): Linear(in_features=32, out_features=16, bias=True)
    (2): Linear(in_features=16, out_features=5, bias=True)
  )
)

In [4]:
opt = torch.optim.Adam(list(dqn1.parameters())+list(dqn2.parameters()), lr=1e-2)

In [5]:
# wll.new()
writer = wll.writer
bsize = 256
warmup = 100
gamma = 0.99
loss = torch.tensor([1])

collector = lambda obs: dqn1.act(obs, epsilon=0.15)
d2c_collector = Discrete2Continuous(-2, 2, adim, collector).act
shower    = lambda obs: dqn1.act(obs, epsilon=0)
d2c_shower = Discrete2Continuous(-2, 2, adim, shower).act
random_policy = RandomPolicy(env).act
random_policy    = lambda obs: dqn1.act(obs, epsilon=1)
d2c_random_policy = Discrete2Continuous(-2, 2, adim, random_policy).act

for ep in range(670, 10000):
    obs = env.reset()
    if ep%1==0:
        pi = d2c_collector if len(rbuff)>bsize*5 else d2c_random_policy
        l1 = envs.rollout(pi)

        oldobs, _, r, obs, d, _, _, _, a = envs.rollout(pi)
        rbuff.add(oldobs, a, r, obs, d)
        writer.add_scalar('dqn/reward', r.sum(), ep)


    if bsize*10 > len(rbuff):
        print(len(rbuff), end='\r')
        continue

    if ep%10==0:
        print(ep, loss.item(), r.sum().item(), wll.z)
    if ep%50==0:
        print("showit = ", showit(env, d2c_shower))

    for i in range(15):
        oldobs, a, r, obs, done = rbuff.get(bsize)
        for opt_step in range(4):
            # target1 = tdqn1.get_max(obs) * (1-done)
            # target2 = tdqn2.get_max(obs) * (1-done)
            # target = torch.min(target1, target2)*gamma*(1-done) + r  
            import random
            if random.uniform(0, 1) > 0:
                max_action = dqn1.get_action(obs)
                target = dqn1.get_q(obs, max_action)
            else:
                max_action = dqn1.get_action(obs)
                target = dqn2.get_q(obs, max_action)

            target = target*gamma*(1-done) + r            
            calc1 = dqn1.get_q(oldobs, a)
            calc2 = dqn2.get_q(oldobs, a)
            l2 = ((target - calc2)**2).mean()
            l1 = ((target - calc1)**2).mean()
            loss = l1 + l2
            opt.zero_grad()
            loss.backward()
            opt.step()
        # polyak(dqn1, tdqn1, 1-1/100)
        # polyak(dqn2, tdqn2, 1-1/100)

    writer.add_scalar('dqn/model', loss.item(), ep)
    writer.add_scalar('dqn/model1', l1.item(), ep)
    writer.add_scalar('dqn/model2', l2.item(), ep)
    



2004006008001000120014001600180020002200240020 28.96443748474121 -1466.02099609375 1
30 31.65414047241211 -1209.1741943359375 1
40 30.62737274169922 -1328.48681640625 1
50 34.157352447509766 -1595.881591796875 1
showit =  -1200.9164926988076
60 31.42276382446289 -1563.5518798828125 1
70 39.814823150634766 -1392.8045654296875 1
80 38.701576232910156 -1339.0732421875 1
90 37.42328643798828 -1442.5106201171875 1
100 41.65359115600586 -1387.8355712890625 1
showit =  -1491.0219798034784
110 41.82170867919922 -1579.4315185546875 1
120 41.01228332519531 -1543.7633056640625 1
130 38.82516098022461 -1469.294677734375 1
140 42.485328674316406 -1501.2596435546875 1
150 40.216819763183594 -1523.367919921875 1
showit =  -1650.8421307938775
160 39.04027557373047 -1519.58642578125 1
170 5.488317012786865 -1052.0478515625 1
180 5.25505256652832 -1558.3291015625 1
190 9.042651176452637 -479.3251647949219 1
200 8.76923942565918 -1574.0145263671875 1
showit =  -1648.4716166383137
210 9.942784309387207 -1

KeyboardInterrupt: 

In [None]:
class RunningNormalizer():
    def __init__(self, alpha=0.97):
        self.m = None
        self.std = None
    def step(self, tensor):
        m = tensor.mean()
        std = tensor.std()
        if self.m is None:
            self.m = m
            self.std = std
        else:
            self.m = self.m*self.alpha + m*(1-self.alpha)
            self.std = self.std*self.alpha + std*(1-self.alpha)
        return (tensor-self.m)/self.std