In [1]:
# https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
import matplotlib.pyplot as plt

In [1]:
import gym
import os
import numpy as np
from pyvirtualdisplay import Display
from collections import deque
import torch

from cartpole.config import get_cfg_defaults
from cartpole.utils import ReplayMemory, screen_to_state
#from cartpole.model import DQN

# Load the project's default configuration node (see cartpole.config).
cfg = get_cfg_defaults()

# Restrict CUDA to the devices listed in the config. The variables are set before the first
# CUDA query below (torch.cuda.is_available()).
devices = ",".join(str(i) for i in cfg.SYSTEM.DEVICES)
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = devices

# Use the first visible GPU when CUDA is available, otherwise fall back to the CPU.
torch_devices = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
"""
If you want to access the behind-the.scenes dynamics of a specific environment, 
then you use the unwrapped property.
"""
# Start a non-visible virtual display sized from the config.
# NOTE(review): presumably needed so env.render(mode="rgb_array") works on a headless machine — confirm.
display = Display(visible=0, size=cfg.SYSTEM.VIRTUAL_SCREEN)
display.start()
# Create CartPole and keep the unwrapped (innermost) environment object.
env = gym.make("CartPole-v0").unwrapped

In [13]:
# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F

class DQN(nn.Module):
    """Convolutional Q-network: three conv + batch-norm stages followed by a linear head.

    Args:
        input_size: iterable whose first two items are read as (height, width)
            of the input image; any further items are ignored.
        output_size: number of units produced by the final linear layer.
    """

    def __init__(self, input_size, output_size):
        super().__init__()

        # Three 3x3, stride-2 convolutions, each followed by batch normalisation.
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=2)
        self.bn1 = nn.BatchNorm2d(32)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=2)
        self.bn2 = nn.BatchNorm2d(64)

        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=2)
        self.bn3 = nn.BatchNorm2d(64)

        # Push the spatial size through the same three stages to size the head.
        height, width, *_unused = input_size
        conv_h_out, conv_w_out = height, width
        for _stage in range(3):
            conv_w_out = conv2d_size_out(conv_w_out, kernel_size=3, stride=2)
            conv_h_out = conv2d_size_out(conv_h_out, kernel_size=3, stride=2)

        # 64 is the channel count produced by conv3.
        flattened_features = conv_w_out * conv_h_out * 64
        self.head = nn.Linear(flattened_features, output_size)

    def forward(self, x):
        """Run the conv stack on ``x`` and return the head's output, one row per sample."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, norm in stages:
            x = F.relu(norm(conv(x)))

        # Flatten everything except the batch dimension before the linear head.
        batch_size = x.size(0)
        flat = x.view(batch_size, -1)
        return self.head(flat)
        
        
def conv2d_size_out(size, kernel_size=5, stride=2, padding=0, dilation=1):
    """Return the output length of one spatial dimension after a 2-D convolution.

    Implements the standard convolution arithmetic
    ``floor((size + 2*padding - dilation*(kernel_size - 1) - 1) / stride) + 1``.
    With the defaults ``padding=0`` and ``dilation=1`` this is exactly the
    original two-argument formula, so existing callers are unaffected.

    Args:
        size: input length (height or width) in pixels.
        kernel_size: convolution kernel length along this dimension.
        stride: convolution stride along this dimension.
        padding: zero-padding added to each side of the input.
        dilation: spacing between kernel elements.

    Returns:
        The integer output length along this dimension.
    """
    effective_kernel = dilation * (kernel_size - 1) + 1
    return (size + 2 * padding - effective_kernel) // stride + 1

In [5]:
# agent.py
import torch
import torch.optim as optim

from cartpole.utils import Transition, ReplayMemory
import random

class DqnAgent():
    """DQN agent holding a policy network, a target network and a replay memory.

    Fixes relative to the previous version of this class:
      * ``state_shape`` is stored as given (a stray trailing comma wrapped it in a 1-tuple).
      * ``_soft_update`` referenced the misspelled name ``loacl_model`` and raised
        ``NameError`` on the first target update.
      * Batches and single states are cast to float32 (actions to int64) and moved to
        ``device`` before being fed to the networks.
      * The loss is reached through ``torch.nn.functional`` so the class no longer relies
        on an ``F`` alias imported by a different notebook cell.
      * The target network's buffers (batch-norm running statistics) are refreshed together
        with its parameters; ``parameters()`` alone does not include them.

    Args:
        state_shape: shape passed to ``DQN`` as ``input_size``.
        action_space: number of discrete actions (also the network's output size).
        device: torch device that hosts both networks and all training tensors.
        soft_update_ratio: interpolation factor ``tau`` used by the target update.
        learning_rate: learning rate of the RMSprop optimizer.
        gamma: discount factor of the TD target.
        batch_size: number of transitions sampled per learning step.
        update_every: the target network is updated every this many learning steps.
        memory_size: capacity handed to ``ReplayMemory``.
    """

    def __init__(self,
                 state_shape, action_space,
                 device,
                 soft_update_ratio=0.01,
                 learning_rate=1e-4,
                 gamma=0.99,
                 batch_size=128,
                 update_every=10,
                 memory_size=10000
                 ):
        self.state_shape = state_shape
        self.action_space = action_space
        self.soft_update_ratio = soft_update_ratio
        self.gamma = gamma
        self.update_every = update_every
        self.batch_size = batch_size
        self.device = device

        # -- init -- #
        self.q_policy_net = DQN(input_size=state_shape, output_size=action_space).to(device)
        self.q_target_net = DQN(input_size=state_shape, output_size=action_space).to(device)
        self.q_target_net.load_state_dict(self.q_policy_net.state_dict())  # sync weights
        self.q_target_net.eval()

        self.optimizer = optim.RMSprop(self.q_policy_net.parameters(), lr=learning_rate)

        self.memory = ReplayMemory(capacity=memory_size)
        self.t_step = 0

    def _image_batch(self, frames):
        """Stack height x width x channel frames into a float32 NCHW tensor on ``self.device``."""
        stacked = np.stack(frames)  # (N, H, W, C)
        batch = torch.tensor(stacked, dtype=torch.float32, device=self.device)
        return batch.permute(0, 3, 1, 2)

    def step(self, state, action, next_state, reward, is_done):
        """Store one transition and, once enough are stored, run one learning step.

        ``state`` and ``next_state`` are expected to be height x width x channel arrays
        (they are permuted to NCHW before entering the networks).
        """
        self.memory.push(state, action, next_state, reward, is_done*1.)

        # Learning only starts once the memory holds more than one batch of transitions.
        if len(self.memory) <= self.batch_size:
            return

        experience = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*experience))

        states = self._image_batch(batch.state)
        next_states = self._image_batch(batch.next_state)
        actions = torch.tensor(batch.action, dtype=torch.int64, device=self.device)
        rewards = torch.tensor(batch.reward, dtype=torch.float32, device=self.device)
        dones = torch.tensor(batch.done, dtype=torch.float32, device=self.device)

        # TD target: R + gamma * max_a Q_target(s', a); the bootstrap term is zeroed for
        # terminal transitions. No gradient flows through the target network.
        with torch.no_grad():
            q_targets_next = self.q_target_net(next_states).max(1)[0]
        q_targets = rewards + (self.gamma * q_targets_next * (1 - dones))

        # Q(s, a) of the actions that were actually taken, shape (batch, 1).
        q_expected = self.q_policy_net(states).gather(1, actions.unsqueeze(1))

        loss = torch.nn.functional.smooth_l1_loss(q_expected, q_targets.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            self._soft_update(target_model=self.q_target_net,
                              local_model=self.q_policy_net,
                              tau=self.soft_update_ratio)

    def act(self, state, eps=0.):
        """Pick an action epsilon-greedily.

        With probability ``eps`` a uniformly random action index is returned; otherwise the
        index of the largest value predicted by the policy network for ``state``.
        """
        if random.random() > eps:
            batch = self._image_batch([state])
            self.q_policy_net.eval()  # swap to evaluation mode
            try:
                with torch.no_grad():
                    action_values = self.q_policy_net(batch)
            finally:
                self.q_policy_net.train()  # swap back to training mode even on error

            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_space))

    def _soft_update(self, target_model, local_model, tau):
        """Blend ``local_model`` into ``target_model``: target = tau*local + (1 - tau)*target."""
        with torch.no_grad():
            for target_params, local_params in zip(target_model.parameters(), local_model.parameters()):
                target_params.copy_(tau * local_params + (1. - tau) * target_params)
            # Buffers (e.g. batch-norm running mean/variance) are not returned by
            # parameters(); copy them so the eval-mode target net does not keep the
            # statistics it had at construction time.
            for target_buffer, local_buffer in zip(target_model.buffers(), local_model.buffers()):
                target_buffer.copy_(local_buffer)

In [6]:
# Stand-alone replay buffer used by the scratch cells further down; the agent keeps its own.
memory = ReplayMemory(capacity=int(cfg.AGENT.NUM_MEMORY_CAPACITY))
# Build the agent from the config. The configured capacity is now also passed as
# `memory_size`; previously the agent silently used its default of 10000 and the
# configured value only applied to the unused stand-alone buffer above.
agent = DqnAgent(state_shape=cfg.MODEL.INPUT_SIZE,
                 action_space=env.action_space.n,
                 device=torch_devices,
                 gamma=cfg.AGENT.GAMMA,
                 batch_size=cfg.AGENT.BATCH_SIZE,
                 memory_size=int(cfg.AGENT.NUM_MEMORY_CAPACITY),
                 )

In [7]:
import cv2
def process_image(state_image, target_size):
    """Resize a rendered frame and scale its pixel values by 1/255.

    Args:
        state_image: image array as accepted by ``cv2.resize``.
        target_size: ``(height, width, ...)`` of the result; extra entries (e.g. the
            channel count) are ignored. This matches how ``DQN`` unpacks its
            ``input_size``; the previous version read the first entry as the width,
            which transposed non-square targets.

    Returns:
        The resized image divided by 255.
    """
    h, w, *_ = target_size
    # cv2.resize expects dsize as (width, height).
    state = cv2.resize(state_image, (w, h))
    state = state / 255.
    return state
    
#test_im = env.render(mode='rgb_array')
#test_im = process_image(test_im, (128, 128, 3))

In [12]:
# Scratch: reset the environment and grab one rendered RGB frame for inspection.
env.reset()
xx = env.render(mode="rgb_array")

In [15]:
# Scratch: display an all-zero float32 array of shape (128, 128, 3).
np.zeros((128, 128, 3), dtype=np.float32)

array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       ...,

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]], dtype=float32)

In [9]:
#cfg.AGENT.NUM_EPISODE
# Per-episode returns, plus a sliding window of the last 100 for the running mean.
scores = []
scores_window = deque(maxlen=100)
eps = cfg.AGENT.EPS_START
for i_episode in range(cfg.AGENT.NUM_EPISODE):
    env.reset()
    # The state is the difference between two consecutive screens; right after a reset
    # both screens are captured back to back.
    last_screen = screen_to_state(env, target_size=cfg.MODEL.INPUT_SIZE[:2])
    current_screen = screen_to_state(env, target_size=cfg.MODEL.INPUT_SIZE[:2])
    state = current_screen - last_screen
    total_rewards = 0
    for t_counter in range(cfg.AGENT.MAX_T):
        action = agent.act(state, eps)

        vector_state, reward, is_done, _ = env.step(action)
        total_rewards += reward

        last_screen = current_screen
        current_screen = screen_to_state(env, target_size=cfg.MODEL.INPUT_SIZE[:2])
        if not is_done:
            next_state = current_screen - last_screen
        else:
            # Terminal transition: use an all-zero difference as the next state.
            next_state = current_screen - current_screen

        # Train the model on the transition that was actually taken. The previous version
        # overwrote `state` with `next_state` before this call, so every stored transition
        # had state == next_state.
        agent.step(state, action, next_state, reward, is_done*1)
        state = next_state
        #print("\rEpisode {}, Accumulated Reward: {:.3f}, remain time: {}".format(i_episode, total_rewards, t_counter), end='')

        if is_done:
            break
    scores.append(total_rewards)
    scores_window.append(total_rewards)
    print("\rEpisode {}, Accumulated Reward: {:.1f}, passed n mean reward: {:.1f}".format(i_episode,
                                                                                          total_rewards,
                                                                                          np.mean(scores_window)
                                                                                         ))
    # Decay exploration once per episode, never below the configured floor.
    eps = max(cfg.AGENT.EPS_END, eps*cfg.AGENT.EPS_DECAY)

Episode 0, Accumulated Reward: 39.0, passed n mean reward: 39.0
Episode 1, Accumulated Reward: 37.0, passed n mean reward: 38.0
Episode 2, Accumulated Reward: 20.0, passed n mean reward: 32.0
Episode 3, Accumulated Reward: 14.0, passed n mean reward: 27.5


KeyboardInterrupt: 

In [None]:
# Scratch re-computation of the TD target / expected Q-values outside the agent.
# NOTE(review): `batch` is only created by a cell further down; run that cell first.
s = torch.tensor(batch.state).permute(0, 3, 1, 2)
ns = torch.tensor(batch.next_state).permute(0, 3, 1, 2)
r = torch.tensor(batch.reward)
d = torch.tensor(batch.done)
a = torch.tensor(batch.action)

# DqnAgent names its networks `q_target_net` / `q_policy_net`; the previous
# `agent.q_target` / `agent.q_policy` raised AttributeError.
vnxt = agent.q_target_net(ns).max(1)[0].detach()
vt = r + (0.9 * vnxt * (1 - d))
qe = agent.q_policy_net(s).gather(1, a.unsqueeze(1))

#q_targets_next = self.q_target(next_state).max(axis=1)[0].detach()
#q_targets = reward + (gamma * q_targets_next * (1 - is_done))
#q_expected = self.policy_net(state).gather(1, action)

In [None]:
# Scratch: show the shape of the TD target after adding a trailing dimension.
vt.unsqueeze(1).shape

In [None]:
# Scratch: draw 32 transitions from the stand-alone `memory` and regroup them field by field.
# NOTE(review): no visible cell pushes into `memory` (the agent fills its own buffer) — confirm it is populated.
b = memory.sample(32)
batch = Transition(*zip(*b))

#state = torch.cat(batch.state)
#action = torch.cat(batch.action)
#reward = torch.cat(batch.reward)
#next_state = torch.cat(batch.next_state)
#is_done = torch.cat(batch.is_done)

In [None]:
# Scratch: convert the sampled states to a tensor and display it.
t = torch.tensor(batch.state)
#t.permute(0,3,1,2).shape
t

In [None]:
# Scratch: shape after moving the last axis to position 1 (the same permute used by the agent).
t.permute(0, 3, 1, 2).shape

In [None]:
# Scratch: regroup the transitions in `m` and convert every field to a float32 array.
# NOTE(review): `m` is not defined in any visible cell — hidden kernel state; confirm its origin.
b = Transition(*zip(*m))
batch = [np.array(b.state, np.float32), 
         np.array(b.action, dtype=np.float32), 
         np.array(b.next_state, dtype=np.float32),
         np.array(b.reward, dtype=np.float32), 
         np.array(b.done, dtype=np.float32)]

In [None]:
# NOTE(review): `Brain` and `tf` are not defined or imported in any visible cell, and DqnAgent
# exposes `q_policy_net` / `q_target_net` rather than `policy_net` / `target_net`. This cell looks
# like a leftover from a TensorFlow/Keras variant — confirm before running.
new_brain = Brain(policy_net=agent.policy_net, target_net=agent.target_net, gamma=0.9)
# NOTE(review): rebinding `optim` shadows the `torch.optim as optim` alias imported in the agent cell.
optim = tf.keras.optimizers.Adam(lr=1e-3)

In [None]:
def compute_loss(y_true, y_pred):
    """Mean squared error between ``y_true`` and ``y_pred``.

    The previous version squared the *mean* error, so errors of opposite sign
    cancelled out and a badly wrong prediction could score a loss of zero.
    The squares are now taken element-wise before averaging.

    NOTE(review): relies on a Keras-backend-like ``K`` being in scope; it is not
    imported in the visible cells.
    """
    return K.mean(K.square(y_true - y_pred))

In [None]:
# NOTE(review): `w` is not used again in the visible cells.
w = new_brain.policy_net.layers[0]
# For each pair of corresponding layers, overwrite the policy layer's weights with the
# element-wise sum of the policy and target weights.
for pl,tl in zip(new_brain.policy_net.layers, new_brain.target_net.layers):
    #for wp, tp in zip(pl.get_weights(), tl.get_weights()):
    #    pass
    pl.set_weights([wp+tp for wp,tp in zip(pl.get_weights(), tl.get_weights())])

In [None]:
# Scratch: inspect the first policy-network layer's weights.
new_brain.policy_net.layers[0].get_weights()

In [None]:
# Scratch: inspect the first target-network layer's weights for comparison.
new_brain.target_net.layers[0].get_weights()

In [None]:
"""
# Raises NotImplementedError ...
with tf.GradientTape() as tape:
    target, estimate = new_brain(batch)
    loss = compute_loss(target, estimate)    
grads = tape.gradient(loss, new_brain.trainable_variables)
"""


In [None]:
# Scratch: call the brain on the batch to obtain (target, estimate) and compute the loss between them.
target, estimate = new_brain(batch)
loss = compute_loss(target, estimate)

In [None]:
# Scratch: differentiate the loss w.r.t. the brain's trainable variables and apply the gradients.
# NOTE(review): `tf` is not imported in the visible cells; `optim` here must be the Keras optimizer
# assigned in an earlier scratch cell, not the `torch.optim` alias.
grads = tf.gradients(loss, new_brain.trainable_variables)
_ = optim.apply_gradients(zip(grads, new_brain.trainable_variables))

In [None]:
# Scratch: display the `losses` attribute of the brain.
new_brain.losses

In [None]:
import matplotlib.pyplot as plt

# Scratch: display the image held in `im`.
# NOTE(review): `im` is not defined in any visible cell — hidden kernel state.
plt.imshow(im)
plt.show()

In [None]:
# Reset the environment and plot the frame returned by screen_to_state.
# NOTE(review): the training loop passes `target_size=...` to screen_to_state; here it is called
# without it — confirm the helper provides a default.
env.reset()
plt.figure()
plt.imshow(screen_to_state(env))
plt.title('Example extracted screen')
plt.show()

In [None]:
# Scratch: print 0..99 on one line, using a leading carriage return and no newline so each
# value overwrites the previous one.
for i in range(100):
    print("\r{}".format(i), end='')