In [1]:
import numpy as np
import _pickle
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import math

In [2]:
# Load the CartPole Env.
import gym
env = gym.make('CartPole-v0')

In [3]:
# hyperparameters
H = 10                 # 히든 레이어의 뉴런 수
batch_size = 5         # 5번을 실행하고 그 중에서 하나를 고른다.
learning_rate = 1e-2   # 학습률
                       # 학습률을 높게 하면 시스템이 데이터에 빠르게 적응하지만 
                       # 예전 데이터를 금방 잊어버릴 것입니다
                       # 학습률이 낮으면 더 느리게 학습됨. 
                       # 하지만 새로운 데이터에 있는 잡음이나 대표성 없는 데이터 포인트에 덜 민감해집니다.
gamma = 0.99           # discount factor for reward
                       # 감마가 0이라면 오직 다음 시간의 보상만을 고려한다.(빠르게 최적의 행동 결정 가능)
                       # 감마가 1이라면 바로 앞의 보상뿐만 아니라 미래의 보상도 고려한다.
D = 4                  # 입력값의 갯수
                       # x : position of cart on the track
                       # θ : angle of the pole with the vertical
                       # dx/dt : cart velocity
                       # dθ/dt : rate of change of the angle
                
                       # output으로 [1,0] 또는 [0,1]이 나오는데 이는 오른쪽으로 움직여라 or 왼쪽으로 움직이라는 뜻.

In [4]:
tf.reset_default_graph()

# 네트워크를 만들기 위해 필요한 요소들을 선언해 놓는다.
# (output이 1,0 또는 0,1이 나올 수 있도록 도와주는 요소라고 생각하면 편하다.)
observations = tf.placeholder(tf.float32, [None,D] , name="input_x")
# 처음에는 input layer에서 4개의 입력값이 첫번째 히든레이어(10개의 뉴런을 가짐)로 들어간다.
W1 = tf.get_variable("W1", shape=[D, H],
           initializer=tf.contrib.layers.xavier_initializer())
# layer 1의 activation 함수는 Lelu이다.
layer1 = tf.nn.relu(tf.matmul(observations,W1))
# 그 다음에는 다시한번 히든레이어1의 결과값들을 히든레이어2에 집어넣고 1개의 output으로 뽑아낸다.
W2 = tf.get_variable("W2", shape=[H, 1],
           initializer=tf.contrib.layers.xavier_initializer())

score = tf.matmul(layer1,W2)

# Neural Network를 Non-Linear하게 만들어주기 위한 Activation funtion.
# 'W2'에서 action에 대한 결과를 0~1로 만들어 주기 위해서 sigmoid를 이용.
probability = tf.nn.sigmoid(score)

# good policy를 얻기 위한 네트워크를 정의
tvars = tf.trainable_variables()                                 # weight
input_y = tf.placeholder(tf.float32,[None,1], name="input_y")    # output : Prob.
advantages = tf.placeholder(tf.float32,name="reward_signal")

# loss funtion으로 가중치를 좋은 이익을 주는 행위와 가능성이 낮지 않은 행동을 하는 방향으로 가중치를 보낸다. 
loglik = tf.log(input_y*(input_y - probability) + (1 - input_y)*(input_y + probability))
loss = -tf.reduce_mean(loglik * advantages) 
newGrads = tf.gradients(loss,tvars) # 새로운 정책을 얻기위한 gradient를 얻는다.

# 여러 에피소드에서 일련의 기울기를 수집한다.
# We don't just apply gradeients after every episode in order to account for noise in the reward signal.
adam = tf.train.AdamOptimizer(learning_rate=learning_rate) # Our optimizer
W1Grad = tf.placeholder(tf.float32,name="batch_grad1") # Placeholders to send the final gradients through when we update.
W2Grad = tf.placeholder(tf.float32,name="batch_grad2")
batchGrad = [W1Grad,W2Grad]
updateGrads = adam.apply_gradients(zip(batchGrad,tvars))

W0204 20:54:50.113221 11072 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [5]:
# reward를 1차원배열로 만들어 gamma를 통해 깎인 보상을 계산하여 그 결과를 리턴값으로 주는 함수
# 편의를 위한 도우미 함수임.
def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

In [6]:
xs,drs,ys = [],[],[]
running_reward = None
reward_sum = 0
episode_number = 1
total_episodes = 10000
init = tf.global_variables_initializer() # 모든 변수 초기화

# Launch the graph
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    observation = env.reset() # cartpole 환경을 초기화

    # gradient placeholder를 리셋한다.
    # 우리는 정책 네트워크를 업데이트할 준비가 될 때까지 GradBuffer의 기울기를 수집할 것이다.
    gradBuffer = sess.run(tvars)  #gradient buffer에 weight 값을 넣어라.
    for ix,grad in enumerate(gradBuffer):
        gradBuffer[ix] = grad * 0
    
    while episode_number <= total_episodes:         
        # 환경을 조성하면서 학습을 진행하면 학습 속도가 느려지기 때문에 어느정도 성능이 높아진다면,
        # 아래 if문의 코드를 실행하여 cartpole 환경을 보여준다.
        # 처음에는 안보임.
        if reward_sum/batch_size > 100 or rendering == True : 
            env.render()
            rendering = True
            
        # 임의의 값을 넣어서 나오는 결과값들(관측치)이 네트워크가 처리할 수 있는 모양으로 만들어라.
        x = np.reshape(observation,[1,D])
        
        # 위에서 만들어둔 policy network를 실행하고 수행할 작업을 가져와라.
        tfprob = sess.run(probability,feed_dict={observations: x}) #학습이 진행됨.
        action = 1 if np.random.uniform() < tfprob else 0
        
        # 진행되면서 생기는 (나중에 역전파를 위해 필요한 다양한) 중간 변수를 기록한다.
        xs.append(x) # observation
        y = 1 if action == 0 else 0 # a "fake label"
        ys.append(y)

        # cartpole 시뮬레이션을 진행하고, 이를통해 새로운 측정치를 얻는다.
        observation, reward, done, info = env.step(action)
        reward_sum += reward
        
        # reward를 기록. 
        # 이전 action에 대한 보상을 받기 위해 반드시 step()을 호출한 뒤 수행해야한다.
        drs.append(reward) 
        
        if done: 
            # 다음 에피소드로 가기위해 변수에 +1
            episode_number += 1
            # 현재 에피소드의 모든 입력값, 은닉 상태, 액션 경사, 보상을 쌓는다.
            epx = np.vstack(xs)
            epy = np.vstack(ys)
            epr = np.vstack(drs)
            # 배열 메모리를 리셋한다.
            xs,drs,ys = [],[],[]

            # 학습이 진행되면서 생긴 discounted reward를 역으로 계산한다.
            # reward의 크기를 표준화한다.(표준화 방법 = 평균을 빼주고 표준편차로 나눠준다.)
            # cartpole은 항상 1의 보상을 받기 때문에 모든 action에 대한 return값이 항상 0보다 크다.
            # 따라서 그대로 return을 사용하면 모든 action을 수용하게되고 그 정도에만 차이가 생기게 된다.
            # 하지만 이렇게 표준화한다면 변화가 커서 수렴을 잘 안하기 때문에 이 방법을 쓴다.
            # 표준화를 통해 좋지 못한 action은 버리고 좋은 action만을 수용하기 때문에 수학적으로 변화가 줄어든다.
            # 즉, 정규화 ((reward-mean)/std)가 일종의 advatage function이 될 수 있음.
            discounted_epr = discount_rewards(epr)
            discounted_epr -= np.mean(discounted_epr)
            discounted_epr /= np.std(discounted_epr)
            
            # 이번 에피소드의 gradient를 얻고 gradBuffer에 gradient를 저장하라.
            tGrad = sess.run(newGrads,feed_dict={observations: epx, input_y: epy, advantages: discounted_epr})
            for ix,grad in enumerate(tGrad):
                gradBuffer[ix] += grad
                
            # batch_size만큼 에피소드를 완료했다면, 정책 네트워크를 우리가 얻은 gradient와 함께 갱신한다.
            if episode_number % batch_size == 0: 
                sess.run(updateGrads,feed_dict={W1Grad: gradBuffer[0],W2Grad:gradBuffer[1]})
                for ix,grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0
                
                # 학습이 어떻게 진행됐는지 확인한다.
                running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
                print('Average reward for episode %.3f.  Total average reward %.3f' % (reward_sum/batch_size, running_reward/batch_size))
                # 만약 reward_sum/batch_size이 결과가 200보다 크다면 cartpole 예제가 해결된것이다.
                if reward_sum/batch_size > 200: 
                    print("Task solved in",episode_number,'episodes!')
                    break
                    
                reward_sum = 0
            
            observation = env.reset()
        
print(episode_number,'Episodes completed.')

Average reward for episode 12.200.  Total average reward 12.200
Average reward for episode 23.800.  Total average reward 12.316
Average reward for episode 13.000.  Total average reward 12.323
Average reward for episode 15.800.  Total average reward 12.358
Average reward for episode 16.600.  Total average reward 12.400
Average reward for episode 19.800.  Total average reward 12.474
Average reward for episode 18.800.  Total average reward 12.537
Average reward for episode 23.000.  Total average reward 12.642
Average reward for episode 23.800.  Total average reward 12.754
Average reward for episode 25.200.  Total average reward 12.878
Average reward for episode 12.200.  Total average reward 12.871
Average reward for episode 20.000.  Total average reward 12.942
Average reward for episode 31.200.  Total average reward 13.125
Average reward for episode 17.400.  Total average reward 13.168
Average reward for episode 24.400.  Total average reward 13.280
Average reward for episode 23.000.  Tota

Average reward for episode 123.200.  Total average reward 48.621
Average reward for episode 135.800.  Total average reward 49.493
Average reward for episode 153.400.  Total average reward 50.532
Average reward for episode 177.200.  Total average reward 51.799
Average reward for episode 153.200.  Total average reward 52.813
Average reward for episode 164.600.  Total average reward 53.931
Average reward for episode 84.000.  Total average reward 54.231
Average reward for episode 181.200.  Total average reward 55.501
Average reward for episode 155.400.  Total average reward 56.500
Average reward for episode 162.800.  Total average reward 57.563
Average reward for episode 135.600.  Total average reward 58.343
Average reward for episode 150.200.  Total average reward 59.262
Average reward for episode 142.200.  Total average reward 60.091
Average reward for episode 142.000.  Total average reward 60.910
Average reward for episode 137.400.  Total average reward 61.675
Average reward for episode

Average reward for episode 194.600.  Total average reward 146.873
Average reward for episode 200.000.  Total average reward 147.404
Average reward for episode 200.000.  Total average reward 147.930
Average reward for episode 200.000.  Total average reward 148.451
Average reward for episode 200.000.  Total average reward 148.966
Average reward for episode 197.800.  Total average reward 149.455
Average reward for episode 200.000.  Total average reward 149.960
Average reward for episode 193.400.  Total average reward 150.395
Average reward for episode 178.800.  Total average reward 150.679
Average reward for episode 182.400.  Total average reward 150.996
Average reward for episode 199.800.  Total average reward 151.484
Average reward for episode 194.400.  Total average reward 151.913
Average reward for episode 200.000.  Total average reward 152.394
Average reward for episode 200.000.  Total average reward 152.870
Average reward for episode 200.000.  Total average reward 153.341
Average re

Average reward for episode 189.200.  Total average reward 180.726
Average reward for episode 191.800.  Total average reward 180.837
Average reward for episode 200.000.  Total average reward 181.028
Average reward for episode 200.000.  Total average reward 181.218
Average reward for episode 192.200.  Total average reward 181.328
Average reward for episode 200.000.  Total average reward 181.514
Average reward for episode 200.000.  Total average reward 181.699
Average reward for episode 200.000.  Total average reward 181.882
Average reward for episode 200.000.  Total average reward 182.063
Average reward for episode 200.000.  Total average reward 182.243
Average reward for episode 187.000.  Total average reward 182.290
Average reward for episode 200.000.  Total average reward 182.468
Average reward for episode 200.000.  Total average reward 182.643
Average reward for episode 199.400.  Total average reward 182.810
Average reward for episode 200.000.  Total average reward 182.982
Average re

Average reward for episode 200.000.  Total average reward 192.469
Average reward for episode 198.400.  Total average reward 192.529
Average reward for episode 200.000.  Total average reward 192.603
Average reward for episode 200.000.  Total average reward 192.677
Average reward for episode 200.000.  Total average reward 192.750
Average reward for episode 193.400.  Total average reward 192.757
Average reward for episode 200.000.  Total average reward 192.829
Average reward for episode 200.000.  Total average reward 192.901
Average reward for episode 200.000.  Total average reward 192.972
Average reward for episode 200.000.  Total average reward 193.042
Average reward for episode 200.000.  Total average reward 193.112
Average reward for episode 200.000.  Total average reward 193.181
Average reward for episode 200.000.  Total average reward 193.249
Average reward for episode 200.000.  Total average reward 193.317
Average reward for episode 200.000.  Total average reward 193.383
Average re

KeyboardInterrupt: 