In [1]:
import gym
import time
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from pprint import pprint
import utils
import torch
from torch.utils.tensorboard import SummaryWriter
import os
from collections import deque
import random

envname = 'BipedalWalker-v3'
envname = 'LunarLanderContinuous-v2'
envname = 'Pendulum-v0'
env = gym.make(envname)

In [2]:
class ReplayBuffer():
    def __init__(self, max_len=100000):
        self.old_obs = deque([])
        self.a = deque([])
        self.r = deque([])
        self.obs = deque([])
        self.size = max_len
        self.length = 0
        self.max_len = max_len
    def __len__(self):
        return self.length
    def remover(self):
        if self.length > self.max_len:
            self.a.popleft()
            self.old_obs.popleft()
            self.r.popleft()
            self.obs.popleft()
            self.length -= 1
    def add(self, old_obs, a, r, obs):
        self.old_obs.append(old_obs)
        self.a.append(a)
        self.r.append(r)
        self.obs.append(obs)
        self.length += 1
        self.remover()
    def get(self, bsize):
        lidx = random.sample(range(0, self.length), bsize)
        old_obs = [self.old_obs[i] for i in lidx]
        a       = [self.a[i] for i in lidx]
        r       = [self.r[i] for i in lidx]
        obs     = [self.obs[i] for i in lidx]
        
        old_obs = torch.tensor(list(old_obs), dtype=torch.float).detach()
        a       = torch.tensor(list(a)      , dtype=torch.float).detach()
        r       = torch.tensor(list(r)      , dtype=torch.float).detach().unsqueeze(-1)
        obs     = torch.tensor(list(obs)    , dtype=torch.float).detach()
        return old_obs, a, r, obs
    
repbuff = ReplayBuffer(1000000)

In [3]:
class Model(nn.Module):
    def __init__(self, idim, h1dim, h2dim, rdim, obsdim):
        super().__init__()
        self.lin1 = nn.Linear(idim, h1dim)
        self.lin2 = nn.Linear(h1dim, h2dim)
        self.lin3 = nn.Linear(h2dim, rdim)
        self.lin4 = nn.Linear(h2dim, obsdim)
    def forward(self, oldobs, a):
        #x = torch.tensor(x, dtype=torch.float)
        x = torch.cat((oldobs, a), dim=1)
        h = self.lin1(x)
        h = F.relu(h)
        h = self.lin2(h)
        h = F.relu(h)
        r = self.lin3(h)
        obs = self.lin4(h) + oldobs
        return r, obs
    
adim = env.action_space.shape[0]
sdim = env.observation_space.shape[0]
rdim = 1
idim = adim + sdim
odim = rdim + sdim
model = Model(idim, 128, 64, rdim, sdim)
model

Model(
  (lin1): Linear(in_features=4, out_features=128, bias=True)
  (lin2): Linear(in_features=128, out_features=64, bias=True)
  (lin3): Linear(in_features=64, out_features=1, bias=True)
  (lin4): Linear(in_features=64, out_features=3, bias=True)
)

In [4]:
opt = torch.optim.Adam(model.parameters(), lr=5e-3)

In [5]:
exname = 'model_bip_lander_big_residual'
os.system('mkdir tb/' + exname)
writer = SummaryWriter('tb/'+exname)

bsize = 2048
loss = 0; rloss=0; oloss=0;
for ep in range(10000):
    obs = env.reset()
    if ep%1==0:
        for step in range(200):
            a = env.action_space.sample()
            oldobs = obs
            obs, r, done, info = env.step(a)
            repbuff.add(oldobs, a, r, obs)

            if done:
                obs = env.reset()
                break
    if bsize*3 > len(repbuff):
        continue
    oldobs, a, r, obs = repbuff.get(bsize)

    for opt_step in range(3):
        pr, pobs = model(oldobs, a)
        rloss = ((pr-r)**2).mean()
        oloss = ((pobs-obs)**2).mean()
        loss = rloss + 10*torch.abs(rloss/oloss).detach()*oloss
        opt.zero_grad()
        loss.backward()
        opt.step()
    writer.add_scalar('loss/model', loss.item(), ep)
    writer.add_scalar('loss/reward', rloss.item(), ep)
    writer.add_scalar('loss/states', oloss.item(), ep)
    
    if ep%10==0:
        print(ep, loss.item(),rloss.item(), oloss.item())

30 577.809814453125 52.528167724609375 0.07369256019592285
40 296.0943908691406 26.917673110961914 0.0012950926320627332
50 208.63461303710938 18.96678352355957 0.0009625746752135456
60 161.9499969482422 14.722726821899414 0.0006962430197745562
70 132.30828857421875 12.028026580810547 0.00020194215176161379
80 115.37997436523438 10.48908805847168 0.00043812012881971896
90 105.11175537109375 9.555614471435547 0.0002664974599611014
100 96.4204330444336 8.765493392944336 0.000300785293802619
110 81.88284301757812 7.443894863128662 0.00013955438043922186
120 72.55297088623047 6.595724105834961 0.0002432800829410553
130 62.291873931884766 5.66289758682251 0.00012828654143959284
140 52.03126525878906 4.730114936828613 0.00024562733597122133
150 43.03444290161133 3.912222146987915 9.249544382328168e-05
160 33.83195877075195 3.0756325721740723 0.0001509079447714612
170 25.59825325012207 2.3271141052246094 0.00013125757686793804
180 21.262353897094727 1.9329413175582886 0.00016264778969343752
1

KeyboardInterrupt: 

In [185]:
len(repbuff)

349494

In [6]:
def policy(obs, nsmpls=100, tlen=10):
    maxr = float('-inf')
    obs = obs.repeat(nsmpls, 1)
    rs = torch.zeros(nsmpls, 1)
    for t in range(tlen):
        a = nsample(env.action_space.sample, nsmpls)
#         print(a, "ass")
#         print(obs.shape, a.shape)
        r, obs = model(obs, a)
        rs += r
        a = a.unsqueeze(1)
        if t==0:
            ass = a
        else:
            ass = torch.cat((ass, a), dim=1)
    ii = torch.argmax(rs)
    #print(rs[ii])
#     print(ass)
    return ass[ii]
    

In [11]:
policy(torch.zeros(1, 3))[0]

tensor([1.5685])

In [131]:
env.action_space.sample().shape, env.observation_space.sample().shape

((4,), (24,))

In [7]:
def nsample(smpl_func, n=100):
    toreturn = np.expand_dims(smpl_func(), axis=0)
    for i in range(n-1):
        new = np.expand_dims(smpl_func(), axis=0)
        toreturn = np.concatenate((toreturn, new), axis=0)
    return torch.tensor(toreturn, dtype=torch.float)

nsample(env.action_space.sample).shape


torch.Size([100, 1])

In [8]:
def showit():
    dons = 0
    observation = env.reset()
    for t in range(2000):
            env.render()
            obser = np.expand_dims(observation, axis=0)
            ass = policy(torch.tensor(obser).float(), 100, 30)
            observation, reward, done, info = env.step(ass[0].numpy())
            #observation, reward, done, info = env.step(env.action_space.sample())
            rp, op = model(torch.tensor(obser).float(), ass[0].unsqueeze(0))
            print(rp.item())
            print(reward)
            print(op)
            print(obs)
            print(("---"+str(t)+"---")*10)
            
            dons += done
            if done:
                print("r", reward)
                break
            time.sleep(0.005)
            print(t, "/200", reward, end='\r')

env.close()
cv2.destroyAllWindows()

In [10]:
showit()

AddBackward0>)
tensor([[ 0.9948,  0.1020,  1.7881],
        [-0.8947, -0.4468,  7.4100],
        [-0.8634,  0.5046, -5.6968],
        ...,
        [ 0.0617, -0.9981,  5.1603],
        [ 0.4373,  0.8993, -1.4300],
        [ 0.4022, -0.9155, -1.6348]])
---133------133------133------133------133------133------133------133------133------133---
133 /200 -0.009278881078197005-0.04819381237030029
-0.014081505706157266
tensor([[ 0.9962,  0.0425, -0.1520]], grad_fn=<AddBackward0>)
tensor([[ 0.9948,  0.1020,  1.7881],
        [-0.8947, -0.4468,  7.4100],
        [-0.8634,  0.5046, -5.6968],
        ...,
        [ 0.0617, -0.9981,  5.1603],
        [ 0.4373,  0.8993, -1.4300],
        [ 0.4022, -0.9155, -1.6348]])
---134------134------134------134------134------134------134------134------134------134---
134 /200 -0.014081505706157266-0.02733445167541504
-0.004701965602051974
tensor([[ 0.9986,  0.0429, -0.0174]], grad_fn=<AddBackward0>)
tensor([[ 0.9948,  0.1020,  1.7881],
        [-0.8947, -0.446

KeyboardInterrupt: 