## Checking TensorFlow version

In [2]:
import os

# Restrict the visible CUDA devices *before* TensorFlow is imported, so the
# device mask is guaranteed to be in place before any CUDA initialisation.
# NOTE(review): index '1' is machine-specific; adjust for the host in use.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import tensorflow as tf

# Report the installed TensorFlow version and whether eager execution is on.
print(tf.__version__)
print(tf.executing_eagerly())

2.0.0
True


In [4]:
# Physical GPU devices TensorFlow can see (subject to CUDA_VISIBLE_DEVICES).
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


Note that TensorFlow 2.x runs in eager mode by default, as the `True` printed above confirms.

---

# Simple Environment

### Env test

In [5]:
from env import Env

e = Env()

# Episode 1: alternate "down" (action 0) and "right" (action 1) moves,
# printing the value returned by each step.
print(e.reset())
for move in (0, 1, 0, 1):
    print(e.step(move))

print()

# Episode 2: three consecutive "down" (action 0) moves from a fresh reset.
print(e.reset())
for move in (0, 0, 0):
    print(e.step(move))

[1. 1.]
(array([2., 1.]), 1, False)
(array([2., 2.]), 1, False)
(array([3., 2.]), 1, False)
(array([3., 3.]), 1, True)

[1. 1.]
(array([2., 1.]), 1, False)
(array([3., 1.]), -1, False)
(array([4., 1.]), -1, True)


---

# A2C

## 모델 구성

구성한 모델이 정상적으로 동작하는지 확인하기 위해 입력을 넣어 보는(feeding) 테스트입니다. 아직 학습되지 않은 모델입니다.

In [6]:
from a2c import Model
import numpy as np

env = Env()
model = Model(num_actions=env.action_dim)

# Prepend an axis of length 1 to the start observation
# (presumably the batch dimension the model expects; verify against a2c.Model).
state = np.expand_dims(env.reset(), axis=0)

# Eager execution: the model is called directly, no feed_dict or tf.Session() needed.
logits, action, value = model.action_value(state)
print(logits, action, value)

[0.43724027 0.56275976] 1 [-0.25288045]


## Training

In [7]:
import time
from env import Env
from a2c import Model
from agent import A2CAgent

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; unlike time.time() it is unaffected by system clock changes.
st_time = time.perf_counter()

# A single environment instance both sizes the model's action head and is
# trained on; the original re-created Env() a second time for no effect.
env = Env()
model = Model(num_actions=env.action_dim)
agent = A2CAgent(model)

# NOTE(review): the stored output of this cell shows agent.train() raising
# AttributeError ('A2CAgent' object has no attribute '_returns_advantages');
# that must be fixed in agent.py before this cell can run to completion.
rewards_history = agent.train(env)
print("Training Runtime: %f sec" % (time.perf_counter() - st_time))

.

AttributeError: 'A2CAgent' object has no attribute '_returns_advantages'

### Results

In [5]:
import numpy as np

# Probe the model at the environment's start state. The observation returned
# by reset() is used directly (as in the earlier feed test) instead of being
# discarded and replaced by a hard-coded copy of the start coordinates.
state = np.expand_dims(env.reset(), axis=0)

# Eager execution: the model is called directly, no feed_dict or tf.Session() needed.
logits, action, value = model.action_value(state)
print(logits, action, value)

[0.445631   0.55436903] 1 [1.8962058]


In [6]:
iter_cnt = 100
# An episode is counted as a success only when agent.test() returns exactly this value.
SUCCESS_REWARD = 4

# Run the test episodes and collect the value returned by each one.
rewards = [agent.test(env) for _ in range(iter_cnt)]
succ_cnt = sum(1 for reward in rewards if reward == SUCCESS_REWARD)

print(rewards)
# Compute the percentage in integer arithmetic: the float form
# `succ_cnt / iter_cnt * 100` can land just below a whole number
# (29 / 100 * 100 == 28.999999999999996), which `%d` then truncates to 28.
print("%d %% success" % (100 * succ_cnt // iter_cnt))

[4, -1, 2, 4, -1, -1, 4, 4, 4, -1, 4, 4, 4, 4, 2, 0, 4, 4, 2, -1, 4, 2, 4, 4, 2, 4, 4, 2, 4, 2, 4, 4, 4, 4, 4, 2, 4, 2, 4, 0, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 4, 2, 4, 4, 2, 4, 2, 4, 2, -1, 2, 4, 4, 4, 4, 4, 4, 2, 4, -1, 4, 2, -1, 4, 2, 4, 4, 4, 4, 0, -1, 4, 2, -1, 2, 2]
54 % succecss
