In [1]:
import gym
import myrl
from myrl.buffers import ReplayBuffer, PrioritizedReplayBuffer
from myrl.environments import Envs
from myrl.value_functions import ValueFunctionMLP, polyak
from myrl.policies import LinearPolicy, RandomPolicy
from myrl.visualizer import showit
from myrl.utils import normal_noise, ExperimentWriter
import torch
import copy
from collections import deque
import numpy as np
import torch.nn as nn


z = 0
envname = 'Pendulum-v0'
env = gym.make(envname)
envs = Envs(envname, 3)
random_policy = RandomPolicy(env).act
obsdim = env.observation_space.shape[0]
adim = env.action_space.shape[0]
wll = ExperimentWriter('tb/pentd3')

device = torch.device('cuda' if torch.cuda.is_available() or 0 else 'cpu')
device

device(type='cuda')

In [8]:
rbuff = ReplayBuffer(max_len=100000)

maxer = LinearPolicy(obsdim, 32, adim)
q1 = ValueFunctionMLP(obsdim+adim, 64, 32, 1)
q2 = ValueFunctionMLP(obsdim+adim, 64, 32, 1)
tq1 = copy.deepcopy(q1)
tq2 = copy.deepcopy(q2)
opt = torch.optim.Adam(list(q1.parameters())+list(q2.parameters()), lr=1e-3)
mopt = torch.optim.Adam(maxer.parameters(), lr=1e-3)

In [9]:
def chck(pi):
    import numpy as np 
    nes = np.expand_dims(env.observation_space.sample(), axis=0)
    a, (d, sm, h)=pi.act(nes)
    return d
def clip_grad_norm_(module, max_grad_norm):
    nn.utils.clip_grad_norm_([p for g in module.param_groups for p in g["params"]], max_grad_norm)

In [11]:
wll.new()
writer = wll.writer
for i in range(1):
    eps = 10000
    gamma = 0.95
    bsize = 128
    warmup = 5000
    maxeract = lambda obs: maxer.act(obs, std=0.3)

    for ep in range(0, eps):

        for i in range(1):
            pi = random_policy if len(rbuff)<warmup else maxeract
            oldobs, a, r, obs, d, _, _, _ = envs.rollout(pi, debug=0)
            rbuff.add(oldobs, a, r, obs, d)
        writer.add_scalar("multi/rew", r.mean().item(), ep)
        rreal = r.mean().item()
        

        if len(rbuff) < bsize:
            print(len(rbuff), end='\r')
            continue  

        for put in range(20):
            # print(len(rbuff), bsize)
            # print(len(rbuff.deqs[0]))
            oldobs, a, r, obs, d = rbuff.get(bsize)


            for optstep in range(2):
                amax = maxer(obs).detach()
                amax += normal_noise(amax, 0.08)
                in1 = torch.cat((obs, amax), dim=-1)
                in2 = torch.cat((oldobs, a), dim=-1)
                target = r + gamma*(1-d*0)*torch.min(tq1(in1), tq2(in1))[0]
                td_error = ((target - q1(in2))**2) + ((target - q2(in2))**2)
                td = td_error.mean()
                opt.zero_grad()
                td.backward()
                opt.step()
            # rbuff.add(oldobs, a, r, obs, d)#, vals=-td_error)
            
            if not put%2 or 1:
                for optstep in range(4):
                    amax = maxer(obs)
                    in1 = torch.cat((obs, amax), dim=-1)
                    maxer_loss = -q1(in1).mean()
                    mopt.zero_grad()
                    maxer_loss.backward()
                    clip_grad_norm_(mopt, 0.5)
                    mopt.step()

            polyak(q1, tq1, 1-1/70)
            polyak(q2, tq2, 1-1/70)

        writer.add_scalar("multi/maxer_loss", maxer_loss.item(), ep)
        writer.add_scalar("multi/td_q", td.item(), ep)

        if ep%20==0:
            print(ep, maxer_loss.item(), r.mean().item(), td.item(), len(rbuff), chck(maxer).item(), rreal, wll.s+str(wll.z))
        if ep%200==0:
            showit(env, maxer.act, )
            env.close()


0 133.97738647460938 -6.336296081542969 8.408854484558105 100000 -1.0595449209213257 -6.513232707977295 tb/pentd36
100 135.8064727783203 -6.703187942504883 34.17329788208008 100000 -1.3588658571243286 -7.248016834259033 tb/pentd36
120 133.97274780273438 -7.099801540374756 37.11856460571289 100000 -1.2055333852767944 -7.416893482208252 tb/pentd36
140 140.18563842773438 -7.476479530334473 3.5062918663024902 100000 -1.0993207693099976 -7.140685081481934 tb/pentd36
160 136.23696899414062 -7.02285099029541 76.46674346923828 100000 0.4909179210662842 -7.197425365447998 tb/pentd36
180 138.33795166015625 -6.810129642486572 69.14186096191406 100000 -1.690279483795166 -6.618500232696533 tb/pentd36
200 136.7490234375 -6.778636932373047 3.575767993927002 100000 -0.07536043971776962 -7.258176326751709 tb/pentd36
198 /2000220 140.67257690429688 -6.941531181335449 1.0943195819854736 100000 0.14448195695877075 -7.555851459503174 tb/pentd36
240 141.18626403808594 -7.236208915710449 0.9635403156280518 1

KeyboardInterrupt: 

In [24]:
len(rbuff), len(rbuff.minheap.heap), type(rbuff.minheap.heap)

(58373, 58340, list)

In [51]:
rbuff.minheap.heap[:100]

0000,  0.0000]]),
  tensor([[0.0061, 0.0943]]),
  tensor([[-0.1348]]),
  tensor([[ 0.0172,  1.4702,  0.1591,  0.1603, -0.0201, -0.0367,  0.0000,  0.0000]]),
  tensor([[0.]])],
 [-1.2980748414993286,
  tensor([[ 0.8366,  0.4229,  1.0289, -0.7017, -0.2128, -0.0089,  0.0000,  0.0000]]),
  tensor([[-0.0011,  0.3590]]),
  tensor([[-1.7647]]),
  tensor([[ 0.8469,  0.4065,  1.0289, -0.7284, -0.2132, -0.0089,  0.0000,  0.0000]]),
  tensor([[0.]])],
 [-1.3040266036987305,
  tensor([[ 0.6615,  0.3700,  1.2344, -0.6874, -0.4556, -0.0880,  0.0000,  0.0000]]),
  tensor([[0.0196, 0.2994]]),
  tensor([[-3.2734]]),
  tensor([[ 0.6740,  0.3543,  1.2567, -0.6960, -0.4596, -0.0802,  0.0000,  0.0000]]),
  tensor([[0.]])],
 [-1.3010417222976685,
  tensor([[-0.2988,  0.0552, -0.3717,  0.0056, -0.0052,  0.0080,  0.0000,  0.0000]]),
  tensor([[0.0951, 0.3398]]),
  tensor([[-0.3845]]),
  tensor([[-0.3025,  0.0552, -0.3707, -0.0038, -0.0048,  0.0088,  0.0000,  0.0000]]),
  tensor([[0.]])],
 [-1.303085207939148,

In [52]:
from heapq import heappop
for i in range(1000):
    print(i, end='\r')
    
    heappop(rbuff.minheap.heap[:3])
    # rbuff.add(oldobs, a, r, obs, d)

0

RuntimeError: bool value of Tensor with more than one value is ambiguous

In [23]:
oldobs, a, r, obs, d = rbuff.get(bsize)

In [73]:
from heapq import heappop
heappop(rbuff.minheap.heap)

RuntimeError: bool value of Tensor with more than one value is ambiguous

In [54]:
for i in range(len(rbuff)-512):
    print(i, end='\r')
    if type(rbuff.minheap.heap[i][0]) != type(2.2) or 1:
        print(rbuff.minheap.heap[i][0])

4167480469
80.78876495361328
77.13992309570312
76.94164276123047
84.19696044921875
77.95185852050781
77.52365112304688
79.0836410522461
79.05925750732422
105.01483917236328
120.8262939453125
127.11097717285156
138.15594482421875
139.5283660888672
126.37893676757812
138.97605895996094
108.20635986328125
126.78392028808594
126.03907012939453
105.32546997070312
110.4582748413086
138.41761779785156
138.55613708496094
103.26380157470703
107.60101318359375
107.34993743896484
124.75959777832031
114.57164001464844
111.92471313476562
121.66626739501953
138.2199249267578
140.74972534179688
138.24034118652344
141.03289794921875
133.9188995361328
138.0693359375
104.6457748413086
111.25828552246094
207.53164672851562
138.18309020996094
138.4364471435547
138.85540771484375
110.75657653808594
147.15150451660156
138.9591827392578
112.87947845458984
139.85845947265625
138.87672424316406
114.1255874633789
138.19126892089844
218.67672729492188
138.2176971435547
165.14398193359375
138.22959899902344
116.9

IndexError: list index out of range

In [51]:
for i in range(len(rbuff)):
    print(type(rbuff.minheap.heap[i][0]))

<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class '

IndexError: list index out of range

In [6]:
rbuff.minheap.heap[:10]

[(-9410.3427734375,
  tensor([[ 0.3903,  0.0984,  0.6862, -0.2911, -2.2004, -2.1916,  0.0000,  0.0000]]),
  tensor([[-0.3205, -0.4049]]),
  tensor([[-100.]]),
  tensor([[ 0.3984,  0.0981,  0.7741, -0.0522, -2.2540, -1.1055,  0.0000,  0.0000]]),
  tensor([[1.]])),
 (-9410.3427734375,
  tensor([[ 0.3903,  0.0984,  0.6862, -0.2911, -2.2004, -2.1916,  0.0000,  0.0000]]),
  tensor([[-0.3205, -0.4049]]),
  tensor([[-100.]]),
  tensor([[ 0.3984,  0.0981,  0.7741, -0.0522, -2.2540, -1.1055,  0.0000,  0.0000]]),
  tensor([[1.]])),
 (-1.0529377460479736,
  tensor([[ 0.0505,  1.4812,  0.1638, -0.0376,  0.0190,  0.0817,  0.0000,  0.0000]]),
  tensor([[-0.9213, -0.4799]]),
  tensor([[-1.0936]]),
  tensor([[ 0.0522,  1.4797,  0.1638, -0.0652,  0.0231,  0.0817,  0.0000,  0.0000]]),
  tensor([[0.]])),
 (-738.4655151367188,
  tensor([[-0.0057,  1.4396, -0.1884,  0.4452,  0.0037,  0.0176,  0.0000,  0.0000]]),
  tensor([[ 0.7040, -0.6778]]),
  tensor([[-3.7636]]),
  tensor([[-0.0075,  1.4501, -0.1907,  0

In [5]:
rbuff.minheap.heap

[[ 0.6434, -0.3702]]),
  tensor([[3.9034]]),
  tensor([[-0.0140,  0.4453, -0.0687, -0.9272,  0.0980, -0.0933,  0.0000,  0.0000]]),
  tensor([[0.]])),
 (0.22102661430835724,
  tensor([[ 0.0133,  1.4321,  0.2022,  0.1684, -0.0069,  0.0168,  0.0000,  0.0000]]),
  tensor([[0.9590, 0.9289]]),
  tensor([[-3.1459]]),
  tensor([[ 0.0154,  1.4364,  0.2131,  0.1908, -0.0078, -0.0195,  0.0000,  0.0000]]),
  tensor([[0.]])),
 (0.21037587523460388,
  tensor([[-0.0191,  1.5021, -0.1917,  0.3286,  0.0188,  0.0054,  0.0000,  0.0000]]),
  tensor([[ 0.1198, -0.4621]]),
  tensor([[-1.5380]]),
  tensor([[-0.0210,  1.5097, -0.1911,  0.3356,  0.0191,  0.0070,  0.0000,  0.0000]]),
  tensor([[0.]])),
 (0.4519810378551483,
  tensor([[ 0.0539,  1.4777,  0.1638, -0.0919,  0.0272,  0.0817,  0.0000,  0.0000]]),
  tensor([[-0.9598,  0.3461]]),
  tensor([[-1.5870]]),
  tensor([[ 0.0556,  1.4750,  0.1638, -0.1186,  0.0313,  0.0817,  0.0000,  0.0000]]),
  tensor([[0.]])),
 (0.9456573724746704,
  tensor([[ 0.1240,  1.1