In [61]:
!echo $CONDA_DEFAULT_ENV

tensorflow2.0


In [60]:
import tensorflow as tf
print(tf.__version__)

2.0.0


# Simpel Env

In [22]:
import numpy as np

class Env:
    def __init__(self):
        self.action_dim = 1.1
        self.action_low = 2
        self.action_high = 10000
    
    def reset(self):
        self.state = np.array([5345.6543, 1.])
        return self.state
        
    def obs(self):
        return self.state
        
    def step(self, action):
        self.state[0] /= action
        self.state[1] *= action
        
        if self.state[0] < self.state[1]:
            return self.state, -10, True
        elif self.state[0] - self.state[1] < 1.:
            return self.state, 10, True
        else:
            return self.state, 0, False

In [23]:
env = Env()
print(env.reset())
print(env.step(2))
print(env.step(2))
print(env.step(2))
print(env.step(2))
print(env.step(2))
print(env.step(2))
print(env.step(2))

[5.3456543e+03 1.0000000e+00]
(array([2.67282715e+03, 2.00000000e+00]), 0, False)
(array([1336.413575,    4.      ]), 0, False)
(array([668.2067875,   8.       ]), 0, False)
(array([334.10339375,  16.        ]), 0, False)
(array([167.05169688,  32.        ]), 0, False)
(array([83.52584844, 64.        ]), 0, False)
(array([ 41.76292422, 128.        ]), -10, True)


# A2C with Continuous Actions

## 모델 구성

In [24]:
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras as keras

class Model(tf.keras.Model):
    def __init__(self):
        super().__init__('mlp_policy')
        
        # actor
        self.hidden_p1 = keras.layers.Dense(40, activation='elu')
        self.hidden_p2 = keras.layers.Dense(40, activation='elu')
        self.mu = keras.layers.Dense(1)
        self.sigma = keras.layers.Dense(1)
        
        # critic
        self.hidden_v1 = keras.layers.Dense(400, activation='elu')
        self.hidden_v2 = keras.layers.Dense(400, activation='elu')
        self.value = keras.layers.Dense(1, name='value')
        
    def call(self, inputs):
        # inputs is a numpy array, convert to tensor
        x = tf.convert_to_tensor(inputs, dtype=tf.float32)
        
        # actor
        hidden_p = self.hidden_p1(x)
        hidden_p = self.hidden_p2(hidden_p)
        mu = self.mu(hidden_p)
        sigma = self.sigma(hidden_p)
        sigma = tf.math.softplus(sigma) + 1e-5
        self.norm_dist = tfp.distributions.Normal(mu, sigma)
        action_tf_var = tf.squeeze(self.norm_dist.sample(1), axis=0)
        action_tf_var = tf.clip_by_value(action_tf_var, env.action_low, env.action_high)
        
        # critic
        hidden_v = self.hidden_v1(x)
        hidden_v = self.hidden_v2(hidden_v)
        out_value = self.value(hidden_v)
    
        return action_tf_var, out_value
    
    def action_value(self, obs):
        policy, value = self.predict(obs)

        return np.squeeze(policy, axis=-1), np.squeeze(value, axis=-1)

We'll build our stochastic policy function, estimated by the fully-connected network below. The network input is the state and output are two scalar functions, mu and delta, which are used as the mean and standard deviation of a Gaussian (normal) distribution. We will choose our actions by smapling from this distribution. The stochastic policy provides some dgree of bulit-in exploration mechanism, since the network initialization will cause a non-zero sigma value.

In [25]:
model = Model()
a, b = model.action_value(np.array([[44., 20.]]))
print(a, b)

[2.] [0.40011925]


# Random Agent

In [29]:
class A2CAgent:
    def __init__(self, model):
        self.model = model
        
    def test(self, env):
        state, done, ep_reward = env.reset(), False, 0
        while not done:
            action, _ = self.model.action_value(state[None, :])
            state, reward, done = env.step(action)
            ep_reward += reward
        return ep_reward

In [30]:
agent = A2CAgent(model)
iter_cnt = 100
succ_cnt = 0
for _ in range(100):
    rewards_sum = agent.test(env)
    if rewards_sum == 10:
        succ_cnt +=1
print(succ_cnt / 100 * 100, "%")

0.0 %


In [63]:
lr_actor = 0.00001
lr_critic = 0.00056

action = 0.

# instantiate state-value function & policy network
action_tf_var, V = model.predict(np.array([1.]))
print(action_tf_var, V)

# define actor (policy) loss function
norm_dist = model.norm_dist
loss_actorr = -tf.log(norm_dist.prob(action))

[[1.]] [[0.12043588]]


AttributeError: module 'tensorflow' has no attribute 'log'