# Deep Reinforcement Learning in Action
### by Alex Zai and Brandon Brown

#### Chapter 5

##### Listing 5.1

In [1]:
import multiprocessing as mp
from multiprocessing import queues

import numpy as np
def square(x):
    return np.square(x)
x = np.arange(64)
print(x)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63]


In [2]:
mp.cpu_count()

2

In [3]:
if __name__ == '__main__': # added this line for process safety
    pool = mp.Pool(2)
    squared = pool.map(square, [x[2*i:2*i+2] for i in range(8)])
    squared

##### Listing 5.2

In [4]:
def square(i, x, queue):
    print("In process {}".format(i,))

queue = mp.Queue()
queue.put(np.square(x))
processes = []
if __name__ == '__main__': #adding this for process safety
    x = np.arange(64)
    for i in range(2):
        start_index = 2*i
        proc = mp.Process(target=square,args=(i,x[start_index:start_index+2],
                         queue)) 
        proc.start()
        processes.append(proc)

    for proc in processes:
        proc.join()

    for proc in processes:
        proc.terminate()

    results = []
    while not queue.empty():
        results.append(queue.get())

In process 0
In process 1


In [5]:
results

[array([   0,    1,    4,    9,   16,   25,   36,   49,   64,   81,  100,
         121,  144,  169,  196,  225,  256,  289,  324,  361,  400,  441,
         484,  529,  576,  625,  676,  729,  784,  841,  900,  961, 1024,
        1089, 1156, 1225, 1296, 1369, 1444, 1521, 1600, 1681, 1764, 1849,
        1936, 2025, 2116, 2209, 2304, 2401, 2500, 2601, 2704, 2809, 2916,
        3025, 3136, 3249, 3364, 3481, 3600, 3721, 3844, 3969])]

##### Listing 5.3: Pseudocode (not shown)

##### Listing 5.4

In [6]:
import torch
from torch import nn
from torch import optim
import numpy as np
from torch.nn import functional as F
import gym
import torch.multiprocessing as mp

class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.l1 = nn.Linear(4,25)
        self.l2 = nn.Linear(25,50)
        self.actor_lin1 = nn.Linear(50,2)
        self.l3 = nn.Linear(50,25)
        self.critic_lin1 = nn.Linear(25,1)
    def forward(self,x):
        x = F.normalize(x,dim=0)
        y = F.relu(self.l1(x))
        y = F.relu(self.l2(y))
        actor = F.log_softmax(self.actor_lin1(y),dim=0)
        c = F.relu(self.l3(y.detach()))
        critic = torch.tanh(self.critic_lin1(c))
        return actor, critic

##### Listing 5.5 
##### NOTE 1: This will not run on its own, you need to run listing 5.6 - 5.8 first then come back and run this cell.
##### NOTE 2: This will not record losses for plotting. If you want to record losses, you'll need to create a multiprocessing shared array and modify the `worker` function to write each loss to it. See < https://docs.python.org/3/library/multiprocessing.html > Alternatively, you could use process locks to safely write to a file.

In [11]:
MasterNode = ActorCritic()
MasterNode.share_memory()
processes = []
params = {
    'epochs':1000,
    'n_workers':1,
}
counter = mp.Value('i',0)
if __name__ == '__main__': #adding this for process safety
    for i in range(params['n_workers']):
        p = mp.Process(target=worker, args=(i,MasterNode,counter,params))
        p.start() 
        processes.append(p)
    for p in processes:
        p.join()
    for p in processes:
        p.terminate()
    
print(counter.value,processes[0].exitcode)

  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
1000 0


##### Listing 5.6

In [7]:
def worker(t, worker_model, counter, params):
    worker_env = gym.make("CartPole-v1")
    worker_env.reset()
    worker_opt = optim.Adam(lr=1e-4,params=worker_model.parameters())
    worker_opt.zero_grad()
    for i in range(params['epochs']):
        worker_opt.zero_grad()
        values, logprobs, rewards = run_episode(worker_env,worker_model)
        actor_loss,critic_loss,eplen = update_params(worker_opt,values,logprobs,rewards)
        counter.value = counter.value + 1

##### Listing 5.7

In [8]:
def run_episode(worker_env, worker_model):
    state = torch.from_numpy(worker_env.env.state).float()
    values, logprobs, rewards = [],[],[]
    done = False
    j=0
    while (done == False):
        j+=1
        policy, value = worker_model(state)
        values.append(value)
        logits = policy.view(-1)
        action_dist = torch.distributions.Categorical(logits=logits)
        action = action_dist.sample()
        logprob_ = policy.view(-1)[action]
        logprobs.append(logprob_)
        state_, _, done, info = worker_env.step(action.detach().numpy())
        state = torch.from_numpy(state_).float()
        if done:
            reward = -10
            worker_env.reset()
        else:
            reward = 1.0
        rewards.append(reward)
    return values, logprobs, rewards

##### Listing 5.8

In [9]:
def update_params(worker_opt,values,logprobs,rewards,clc=0.1,gamma=0.95):
    rewards = torch.Tensor(rewards).flip(dims=(0,)).view(-1)
    logprobs = torch.stack(logprobs).flip(dims=(0,)).view(-1)
    values = torch.stack(values).flip(dims=(0,)).view(-1)
    Returns = []
    ret_ = torch.Tensor([0])
    for r in range(rewards.shape[0]):
        ret_ = rewards[r] + gamma * ret_
        Returns.append(ret_)
    Returns = torch.stack(Returns).view(-1)
    Returns = F.normalize(Returns,dim=0)
    actor_loss = -1*logprobs * (Returns - values.detach())
    critic_loss = torch.pow(values - Returns,2)
    loss = actor_loss.sum() + clc*critic_loss.sum()
    loss.backward()
    worker_opt.step()
    return actor_loss, critic_loss, len(rewards)

##### Supplement
##### Test the trained model

In [15]:
env = gym.make("CartPole-v1")
env.reset()
!pip install gl
for i in range(100):
    state_ = np.array(env.env.state)
    state = torch.from_numpy(state_).float()
    logits,value = MasterNode(state)
    action_dist = torch.distributions.Categorical(logits=logits)
    action = action_dist.sample()
    state2, reward, done, info = env.step(action.detach().numpy())
    if done:
        print("Lost")
        env.reset()
    state_ = np.array(env.env.state)
    state = torch.from_numpy(state_).float()
    env.render()

  result = entry_point.load(False)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[31mERROR: Could not find a version that satisfies the requirement gl (from versions: none)[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
[31mERROR: No matching distribution found for gl[0m[31m
[0m

ReraisedException: Error occured while running `from pyglet.gl import *`
The original exception was:

ImportError: Library "GL" not found.

HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get install python-opengl'. If you're running on a server, you may need a virtual frame buffer; something like this should work: 'xvfb-run -s "-screen 0 1400x900x24" python <your_script.py>'

In [13]:
env.close()

### N-step actor-critic

##### Listing 5.9

In [16]:
def run_episode(worker_env, worker_model, N_steps=10):
    raw_state = np.array(worker_env.env.state)
    state = torch.from_numpy(raw_state).float()
    values, logprobs, rewards = [],[],[]
    done = False
    j=0
    G=torch.Tensor([0])
    while (j < N_steps and done == False):
        j+=1
        policy, value = worker_model(state)
        values.append(value)
        logits = policy.view(-1)
        action_dist = torch.distributions.Categorical(logits=logits)
        action = action_dist.sample()
        logprob_ = policy.view(-1)[action]
        logprobs.append(logprob_)
        state_, _, done, info = worker_env.step(action.detach().numpy())
        state = torch.from_numpy(state_).float()
        if done:
            reward = -10
            worker_env.reset()
        else:
            reward = 1.0
            G = value.detach()
        rewards.append(reward)
    return values, logprobs, rewards, G