In [1]:
import gym
import time
import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from pprint import pprint
import utils
import torch
from torch.utils.tensorboard import SummaryWriter
import os
from collections import deque
import random
from myrl.buffers import ReplayBuffer, PrioritizedReplayBuffer
from myrl.visualizer import showit
from myrl.utils import ExperimentWriter
from myrl.value_functions import polyak
from myrl.policies import RandomPolicy
from myrl.environments import Envs


wll = ExperimentWriter('tb/dqn_lunarlander-')
envname = 'BipedalWalker-v3'
envname = 'LunarLander-v2'
# envname = 'CartPole-v0'
# envname = 'MountainCar-v0'
# envname = 'Pendulum-v0'
# envname = 'Boxing-ram-v0'
# envname = 'Breakout-ram-v0'

env = gym.make(envname)
envs = Envs(envname, 1)

In [6]:
rbuff = ReplayBuffer(100000)

In [21]:
class DQN(nn.Module):
    def __init__(self, net_arch):
        super().__init__()
    
        self.layers = nn.ModuleList([nn.Linear(a, b) for a, b in zip(net_arch[:-1], net_arch[1:])])
        self.odim = net_arch[-1]
    def forward(self, h):
        for lay in self.layers[:-1]:
            h = F.relu(lay(h))
        h = self.layers[-1](h)
        h = torch.softmax(h, dim=-1)
        return h
    def act(self, obs, epsilon=0.1, debug=True):
        obs = torch.tensor(obs).float()
        qs = self.forward(obs)
        if random.uniform(0, 1) > epsilon:
            ii = torch.argmax(qs, dim=-1).unsqueeze(-1)
        else:
            try:
                qs.shape[3]
                ii = torch.randint(0, self.odim-1, (qs.shape[0], qs.shape[1], 1))
            except:
                ii = torch.randint(0, self.odim-1, (qs.shape[0], 1))     
        dummy = torch.tensor([[1]]).float()   
        # print(ii.shape)
        return ii.numpy(), (dummy, dummy, dummy)
    
print(env.action_space, env.observation_space)
adim = 4#env.action_space.shape[0]
sdim = env.observation_space.shape[0]
dqn = DQN([sdim, 256, 128, adim])
import copy
tdqn = copy.deepcopy(dqn)
dqn

Discrete(4) Box(8,)


DQN(
  (layers): ModuleList(
    (0): Linear(in_features=8, out_features=256, bias=True)
    (1): Linear(in_features=256, out_features=128, bias=True)
    (2): Linear(in_features=128, out_features=4, bias=True)
  )
)

In [22]:
opt = torch.optim.Adam(dqn.parameters(), lr=1e-5)

In [24]:
wll.new()
writer = wll.writer
bsize = 512
warmup = 100
gamma = 1

collector = lambda obs: dqn.act(obs, epsilon=0.2)
shower    = lambda obs: dqn.act(obs, epsilon=0)
random_policy = RandomPolicy(env).act


for ep in range(0, 100000):
    obs = env.reset()
    if ep%1==100:
        pi = collector if len(rbuff)>bsize*50 else random_policy
        oldobs, a, r, obs, d, _, _, _ = envs.rollout(pi, length=20, debug=0)
        # print(oldobs.shape, a.shape, r.shape, obs.shape, d.shape)

        rbuff.add(oldobs, a.unsqueeze(-1), r, obs, d)
        writer.add_scalar('dqn/reward', r.mean(), ep)
    rew = r.mean().item()

    if bsize*50 > len(rbuff):
        print(len(rbuff), end='\r')
        continue
    
    for jup in range(10):
        oldobs, a, r, obs, done = rbuff.get(bsize)
        for opt_step in range(3):
            target = torch.max(tdqn(obs), dim=-1)[0].detach().unsqueeze(-1) * (1-done)
            frst = torch.tensor(list(range(bsize)))
            calc = dqn(oldobs)
            # print(frst)
            # print(a)showit(env, shower)
            calc = calc[frst, a.long().squeeze(-1)].unsqueeze(-1)

            loss1 = ((r + gamma*target - calc)**2)
            loss = loss1.mean()
            opt.zero_grad()
            loss.backward()
            opt.step()
        # print(oldobs.shape, a.shape, r.shape, obs.shape, done.shape, loss1.shape)
        # rbuff.add(oldobs, a, r, obs, done, vals=loss1)
    writer.add_scalar('dqn/model', loss.item(), ep)
    polyak(dqn, tdqn, 1-1/100)
    
    if ep%30==0:
        print(ep, loss.item(), rew)
    if ep%500==0:
        print("showit = ", showit(env, collector, time_delta=0.01, max_steps=1000))

0 146.79052734375 -2.276033401489258
showit =  -81.71330661426961
30 124.9271240234375 -1.5734354257583618
60 126.68909454345703 -1.7204259634017944
90 69.67231750488281 -2.405886650085449
120 191.80380249023438 -2.4016318321228027
150 148.9420623779297 -2.5640714168548584
180 136.72557067871094 -1.9324913024902344
210 183.9332733154297 -1.9276511669158936


KeyboardInterrupt: 

In [None]:
novishower = lambda obs: dqn.act(obs, 0.0)
showit(env, novishower ,max_steps=1000)

In [22]:
len(rbuff)//2000

34

In [11]:
for i in range(5):  
    print(len(rbuff.deqs[i]))
    # print(rbuff.deqs[])

20
1
20
20
20


In [43]:
def showit(env, act, time_delta=0.005, max_steps=2000):
    dons = 0
    observation = env.reset()
    total_rew = 0
    for t in range(max_steps):
            env.render()
            obser = np.expand_dims(observation, axis=0)
            action, _= act(obser)
            #print(action, _)
            if type(env.action_space) is gym.spaces.box.Box:
                observation, reward, done, info = env.step(action.squeeze(0))
            else:
                observation, reward, done, info = env.step(action.squeeze(0).squeeze(-1))
            print(action, reward)
            dons += done
            total_rew += reward
            if done:
                env.close()
                break
            time.sleep(time_delta)
            print(t, "/"+str(max_steps), end='\r')
    env.close()
    return total_rew