In [1]:
!echo $CONDA_DEFAULT_ENV

tensorflow2.0


In [3]:
import tensorflow as tf
print(tf.__version__)

2.0.0


# Simpel Env

In [42]:
import numpy as np

class Env:
    def __init__(self):
        self.action_dim = 1.1
        self.action_low = 2
        self.action_high = 10000
    
    def reset(self):
        self.state = np.array([5345.6543, 1.])
        return self.state
        
    def obs(self):
        return self.state
        
    def step(self, action):
        self.state[0] /= action
        self.state[1] *= action
        
        if self.state[0] < self.state[1]:
            return self.state, -10, True
        elif self.state[0] - self.state[1] < 1.:
            return self.state, 10, True
        else:
            return self.state, 0, False

In [30]:
env = Env()
print(env.reset())
print(env.step(2))
print(env.step(2))
print(env.step(2))
print(env.step(2))
print(env.step(2))
print(env.step(2))
print(env.step(2))

[5.3456543e+03 1.0000000e+00]
(array([2.67282715e+03, 2.00000000e+00]), 0, False)
(array([1336.413575,    4.      ]), 0, False)
(array([668.2067875,   8.       ]), 0, False)
(array([334.10339375,  16.        ]), 0, False)
(array([167.05169688,  32.        ]), 0, False)
(array([83.52584844, 64.        ]), 0, False)
(array([ 41.76292422, 128.        ]), -10, True)


# A2C with Continuous Actions

## 모델 구성

In [31]:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras as keras

class Model(tf.keras.Model):
    def __init__(self):
        super().__init__('mlp_policy')
        
        # actor
        self.hidden_p1 = keras.layers.Dense(40, activation='elu')
        self.hidden_p2 = keras.layers.Dense(40, activation='elu')
        self.mu = keras.layers.Dense(1)
        self.sigma = keras.layers.Dense(1)
        
        # critic
        self.hidden_v1 = keras.layers.Dense(400, activation='elu')
        self.hidden_v2 = keras.layers.Dense(400, activation='elu')
        self.value = keras.layers.Dense(1, name='value')
        
    def call(self, inputs):
        # inputs is a numpy array, convert to tensor
        x = tf.convert_to_tensor(inputs, dtype=tf.float32)
        
        # actor
        hidden_p = self.hidden_p1(x)
        hidden_p = self.hidden_p2(hidden_p)
        mu = self.mu(hidden_p)
        sigma = self.sigma(hidden_p)
        sigma = tf.math.softplus(sigma) + 1e-5
        self.norm_dist = tfp.distributions.Normal(mu, sigma)
        action_tf_var = tf.squeeze(self.norm_dist.sample(1), axis=0)
        action_tf_var = tf.clip_by_value(action_tf_var, env.action_low, env.action_high)
        
        # critic
        hidden_v = self.hidden_v1(x)
        hidden_v = self.hidden_v2(hidden_v)
        out_value = self.value(hidden_v)
    
        return action_tf_var, out_value
    
    def action_value(self, obs):
        policy, value = self.predict(obs)

        return np.squeeze(policy, axis=-1), np.squeeze(value, axis=-1)

We'll build our stochastic policy function, estimated by the fully-connected network below. The network input is the state and output are two scalar functions, mu and delta, which are used as the mean and standard deviation of a Gaussian (normal) distribution. We will choose our actions by smapling from this distribution. The stochastic policy provides some dgree of bulit-in exploration mechanism, since the network initialization will cause a non-zero sigma value.

In [32]:
model = Model()
a, b = model.action_value(np.array([[44., 20.]]))
print(a, b)

[2.] [1.532644]


# Random Agent

In [33]:
class A2CAgent:
    def __init__(self, model):
        self.model = model
        
    def test(self, env):
        state, done, ep_reward = env.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(state[None, :])
            state, reward, done = env.step(action)
            ep_reward += reward
        return ep_reward

학습이 되지 않았을 때의 성능 테스트

In [34]:
agent = A2CAgent(model)
iter_cnt = 100
succ_cnt = 0
for _ in range(100):
    rewards_sum = agent.test(env)
    if rewards_sum == 10:
        succ_cnt += 1
print(succ_cnt / 100 * 100, "%")

0.0 %


## Loss / Objective Function

In [53]:
class A2CAgent:
    def __init__(self, model):
        self.model = model
        self.params = {
            'value': 0.5,
            'entropy': 0.001
        }
        self.model.compile(
            optimizer=keras.optimizer.Adam(lr=0.007),
            loss=[self._policy_loss, self._value_loss]
        )
        
    def test(self, env):
        state, done, ep_reward = env.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(state[None, :])
            state, reward, done = env.step(action)
            ep_reward += reward
        return ep_reward
    
    def _value_loss(self, returns, value):
        return self.params['value'] * keras.losses.mean_squared_error(returns, value)
    
    def _policy_loss(self, action, delta):
        return -tf.math.log(model.norm_dist.prob(action) + 1e-5) * delta

## Agent Trainning Loop

In [62]:
class A2CAgent:
    def __init__(self, model):
        self.model = model
        self.params = {
            'value': 0.5,
            'entropy': 0.001,
            'gamma':0.99
        }
        self.model.compile(
            optimizer=keras.optimizers.Adam(lr=0.007),
            loss=[self._policy_loss, self._value_loss],
            run_eagerly=True
        )
        
    def test(self, env):
        state, done, ep_reward = env.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(state[None, :])
            state, reward, done = env.step(action)
            ep_reward += reward
        return ep_reward
    
    def train(self, env, batch_size=32, updates=500):
        # storage helpers for a single batch of data
        actions = np.empty((batch_size,), dtype=np.int32)
        rewards, dones, values = np.empty((3, batch_size))
        states = np.empty((batch_size, ) + env.state.shape)
        
        # training loop: collect samples, send to optimize, repeat updates times
        ep_rews = [0.0]
        next_state = env.reset()
        for update in range(updates):
            # batch_size 만큼 시도해보면서 데이터를 모음
            for step in range(batch_size):
                states[step] = next_state.copy()
                actions[step], values[step] = self.model.action_value(np.expand_dims(next_state, axis=0))
                next_state, rewards[step], dones[step] = env.step(actions[step])

                ep_rews[-1] += rewards[step]
                if dones[step]:
                    ep_rews.append(0.0)
                    next_obs = env.reset()

            _, next_value = self.model.action_value(next_state[None, :])
            returns, advs = self._returns_advantages(rewards, dones, values, next_value)
            # a trick to input actions and advantages through same API
            acts_and_advs = np.concatenate([actions[:, None], advs[:, None]], axis=-1)
            # performs a full training step on the collected batch
            # note: no need to mess around with gradients, Keras API handles it
            losses = self.model.train_on_batch(states, [acts_and_advs, returns])
        return ep_rews
    
    def _returns_advantages(self, rewards, dones, values, next_value):
        # next_value is the bootstrap value estimate of a future state (the critic)
        returns = np.append(np.zeros_like(rewards), next_value, axis=-1)
        # returns are calculated as discounted sum of future rewards
        for t in reversed(range(rewards.shape[0])):
            returns[t] = rewards[t] + self.params['gamma'] * returns[t+1] * (1-dones[t])
        returns = returns[:-1]
        # advantages are returns - baseline, value estimates in our case
        advantages = returns - values
        return returns, advantages
    
    def _value_loss(self, returns, value):
        return self.params['value'] * keras.losses.mean_squared_error(returns, value)
    
    def _policy_loss(self, action, delta):
        return -tf.math.log(model.norm_dist.prob(action) + 1e-5) * delta

In [63]:
import time
st_time = time.time()
agent = A2CAgent(model)
env = Env()
env.reset()
rewards_history = agent.train(env)
print("Training Runtime: %f sec" % (time.time() - st_time))

Training Runtime: 539.118606 sec


In [None]:
iter_cnt = 100
succ_cnt = 0

for _ in range(iter_cnt):
    rewards = agent.test(env)
    print(rewards)
    if rewards == 10:
        succ_cnt += 1

print("%d %% succecss" % (succ_cnt / iter_cnt * 100))

In [97]:
state = np.array([1., 1.])
model.predict(np.expand_dims(state, axis=0))

[array([[2.]], dtype=float32), array([[-1.4321558]], dtype=float32)]

In [23]:
lr_actor = 0.00001
lr_critic = 0.00056

state = np.array([1., 1.])

# instantiate state-value function & policy network
action_tf_var, V = model.action_value(np.expand_dims(state, axis=0))

# define actor (policy) loss function
norm_dist = model.norm_dist
loss_actor = -tf.math.log(norm_dist.prob(action))

print(loss_actor)

Tensor("Neg_2:0", shape=(None, 2), dtype=float32)
