In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !apt update && apt install -y libpq-dev libsdl2-dev swig xorg-dev xvfb
    !pip install -q -U tf-agents-nightly pyvirtualdisplay gym[atari]
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "rl"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension) of the saved figure.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to matplotlib as the format.
        resolution: Output resolution in DPI.
    """
    file_name = ".".join((fig_id, fig_extension))
    path = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# 18.3 OpenAI 짐

In [None]:
import gym

시각화

In [None]:
def update_scene(num, frames, patch):
    """Animation callback: show frame ``num`` of ``frames`` in ``patch``.

    Returns a one-element tuple holding the updated artist, which is the
    shape of return value the animation callback is written to produce.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Build a matplotlib animation that plays ``frames`` in order.

    Args:
        frames: Sequence of images; the first one initialises the plot.
        repeat: Forwarded to ``FuncAnimation`` (loop the animation or not).
        interval: Forwarded to ``FuncAnimation`` (delay between frames).

    Returns:
        The ``animation.FuncAnimation`` object.
    """
    figure = plt.figure()
    image_artist = plt.imshow(frames[0])
    plt.axis('off')
    n_frames = len(frames)
    anim = animation.FuncAnimation(
        figure,
        update_scene,
        fargs=(frames, image_artist),
        frames=n_frames,
        repeat=repeat,
        interval=interval,
    )
    # Close the current figure once the animation object has been built.
    plt.close()
    return anim
def render_policy_net(model, n_max_steps=200, seed=42):
    """Run one CartPole-v1 episode driven by ``model`` and collect its frames.

    Args:
        model: Policy network whose ``predict`` output for one observation is
            used as the probability of choosing action 0.
        n_max_steps: Maximum number of steps to play.
        seed: Seed applied to both the environment and NumPy's global RNG.

    Returns:
        List of RGB frames (one per step played).
    """
    frames = []
    env = gym.make("CartPole-v1")
    try:
        env.seed(seed)
        np.random.seed(seed)
        obs = env.reset()
        for step in range(n_max_steps):
            frames.append(env.render(mode="rgb_array"))
            # predict() returns a (1, 1) array; pull out the scalar explicitly
            # instead of relying on int() converting a 2-D array, which newer
            # NumPy releases deprecate.
            left_proba = float(model.predict(obs.reshape(1, -1))[0, 0])
            action = int(np.random.rand() > left_proba)
            obs, reward, done, info = env.step(action)
            if done:
                break
    finally:
        # Release the environment even if rendering or stepping fails.
        env.close()
    return frames

환경목록

In [None]:
gym.envs.registry.all()

In [None]:
env = gym.make('CartPole-v1')
obs = env.reset()
obs

#각 관측 obs는 카트의 수평 위치, 카트 속도, 막대 각도, 막대의 각속도를 나타냄.

In [None]:
try:
    import pyvirtualdisplay
    display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()
except ImportError:
    pass

이미지를 렌더링하여 넘파이 배열로 받기

In [None]:
# Pass mode="rgb_array" so that render() returns the rendered image
# as a NumPy array.
img = env.render(mode="rgb_array")
img.shape

In [None]:
def plot_environment(env, figsize=(5,4)):
    """Render ``env`` as an RGB array, draw it with matplotlib and return it.

    Args:
        env: Environment exposing ``render(mode="rgb_array")``.
        figsize: Size of the matplotlib figure that is created.

    Returns:
        The rendered image.
    """
    plt.figure(figsize=figsize)
    frame = env.render(mode="rgb_array")
    plt.imshow(frame)
    plt.axis("off")
    return frame

In [None]:
plot_environment(env)
plt.show()

가능한 행동공간 보기

In [None]:
env.action_space

#Discrete(2) == 가능한 행동 정수 0,1 2개
#각각 왼쪽가속(0) 오른쪽가속(1)

In [None]:
#obs[2]>0 으로 오른쪽으로 기울어져 있기 때문에 오른쪽으로 가속해봄.

action=1
obs,reward,done,info = env.step(action)

In [None]:
obs

In [None]:
reward

In [None]:
done

In [None]:
info

In [None]:
if done:
    obs = env.reset()

In [None]:
env.seed(42)

def basic_policy(obs):
  """Hard-coded policy: return 0 when ``obs[2]`` is negative, otherwise 1."""
  pole_angle = obs[2]
  if pole_angle < 0:
    return 0
  return 1

# Total reward collected in each episode played with basic_policy.
totals = []

for episode in range(500):
  episode_rewards=0
  # Start a new episode and get its first observation.
  obs = env.reset()
  for step in range(200):
    # Choose the action according to the policy.
    action = basic_policy(obs)
    obs,reward,done,info = env.step(action)
    episode_rewards +=reward
    # Stop as soon as the environment reports that the episode is over.
    if done:
      break
  totals.append(episode_rewards)

In [None]:
import numpy as np
np.mean(totals),np.std(totals),np.min(totals),np.max(totals)

# 18.4 신경망 정책

신경망을 사용해 관측(obs)을 바탕으로 행동(action) 출력

cartpole의 경우엔 'p' or '1-p'의 출력


In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
n_inputs = env.observation_space.shape[0]

model = keras.models.Sequential([
                                 keras.layers.Dense(5,activation="elu", input_shape=[n_inputs]),
                                 keras.layers.Dense(1,activation="sigmoid")#p , 1-p
])

# 18.5 행동 평가

신용할당문제로 현재에서 멀어질수록 보상에 감마를 곱하여 다 더함.

많은 에피소드를 실행해 행동이익을 정규화 (평균, 표준편차)

--> 행동이익 음수는 나쁨

--> 행동이익 양수는 좋음


# 18.6 정책 그레디언트

스탭 1번 함수

In [None]:
def play_one_step(env, obs, model, loss_fn):
    """Play a single step and compute the gradients for the sampled action.

    Args:
        env: Environment exposing ``step(action)``.
        obs: Current observation; a batch axis is added before calling model.
        model: Network whose single output is used as the probability of
            action 0 ("left").
        loss_fn: Loss called as ``loss_fn(y_target, left_proba)``.

    Returns:
        Tuple ``(obs, reward, done, grads)``: the new observation, the reward,
        the episode-finished flag and the gradients of the loss with respect
        to ``model.trainable_variables``.
    """
    with tf.GradientTape() as tape:
        # Call the model on a single observation; it outputs one value,
        # the probability of moving left.
        left_proba = model(obs[np.newaxis])
        
        # Sample a uniform random number in [0, 1). If it is greater than
        # left_proba the comparison is True (action 1, right); otherwise it
        # is False (action 0, left).
        action = (tf.random.uniform([1, 1]) > left_proba)
        
        # Target probability of moving left: 1 - action
        # (e.g. the sampled action is left -> 1 - 0 = 1).
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)

        # Compute the loss with the supplied loss function.
        loss = tf.reduce_mean(loss_fn(y_target, left_proba))
    # Gradients of the loss with respect to the trainable variables.
    
    grads = tape.gradient(loss, model.trainable_variables)
    # Play the sampled action, then return the new observation, the reward,
    # the episode-finished flag and the gradients computed above.
    obs, reward, done, info = env.step(int(action[0, 0].numpy()))
    return obs, reward, done, grads

스탭함수를 이용해 여러 에피소드 플레이

In [None]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
  """Play several episodes and collect per-step rewards and gradients.

  Args:
      env: Environment passed to ``play_one_step``; reset at each episode.
      n_episodes: Number of episodes to play.
      n_max_steps: Maximum number of steps per episode.
      model: Policy network passed to ``play_one_step``.
      loss_fn: Loss function passed to ``play_one_step``.

  Returns:
      ``(all_rewards, all_grads)``: two lists with one entry per episode, each
      entry being the list of rewards / gradients for every step played.
  """
  all_rewards, all_grads = [], []
  for _ in range(n_episodes):
    episode_rewards, episode_grads = [], []
    obs = env.reset()
    for _ in range(n_max_steps):
      obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
      episode_rewards.append(reward)
      episode_grads.append(grads)
      if done:
        break
    all_rewards.append(episode_rewards)
    all_grads.append(episode_grads)
  return all_rewards, all_grads

할인계수를 적용한 보상

In [None]:
def discount_rewards(rewards, discount_rate):
    """Return the discounted sum of future rewards for every step.

    ``result[t] = rewards[t] + discount_rate * result[t + 1]``, computed from
    the last step backwards.

    Args:
        rewards: Sequence of per-step rewards (ints or floats).
        discount_rate: Discount factor applied to each later step.

    Returns:
        Float NumPy array with the same length as ``rewards``.
    """
    # Force a float array: with integer rewards np.array() would produce an
    # integer array and the in-place update below would silently truncate
    # every fractional discounted value.
    discounted = np.array(rewards, dtype=np.float64)
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_rate
    return discounted

할인적용한 보상 정규화

In [None]:
def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount the rewards of every episode, then standardise them jointly.

    Args:
        all_rewards: List of per-episode reward sequences.
        discount_rate: Discount factor forwarded to ``discount_rewards``.

    Returns:
        List with one array per episode containing the discounted rewards
        minus the global mean, divided by the global standard deviation.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # Every discounted reward is identical, so the centred values are all
        # zero; divide by 1 instead of 0 to return zeros rather than NaN.
        reward_std = 1.0
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]

In [None]:
discount_rewards([10,0,-50], discount_rate=0.8)

In [None]:
discount_and_normalize_rewards([[10,0,-50],[10,20]],discount_rate= 0.8)

In [None]:
n_iterations = 150
n_episodes_per_update = 10
n_max_steps = 200
discount_rate = 0.95

In [None]:
# Use the `learning_rate` keyword: `lr` is a deprecated alias that recent
# Keras releases no longer accept.
optimizer = keras.optimizers.Adam(learning_rate=0.01)
loss_fn = keras.losses.binary_crossentropy

In [None]:
np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Dense(5, activation="elu", input_shape=[4]),
    keras.layers.Dense(1, activation="sigmoid"),
])

In [None]:
env = gym.make("CartPole-v1")
env.seed(42);

for iteration in range(n_iterations):
    # play_multiple_episodes() as defined above takes the loss function as
    # its last positional argument; omitting it raises a TypeError.
    all_rewards, all_grads = play_multiple_episodes(
        env, n_episodes_per_update, n_max_steps, model, loss_fn)
    total_rewards = sum(map(sum, all_rewards))                     # Not shown in the book
    print("\rIteration: {}, mean rewards: {:.1f}".format(          # Not shown
        iteration, total_rewards / n_episodes_per_update), end="") # Not shown
    all_final_rewards = discount_and_normalize_rewards(all_rewards,
                                                       discount_rate)
    # For each trainable variable, average the per-step gradients weighted by
    # the normalised discounted reward of the step they were computed at.
    all_mean_grads = []
    for var_index in range(len(model.trainable_variables)):
        mean_grads = tf.reduce_mean(
            [final_reward * all_grads[episode_index][step][var_index]
             for episode_index, final_rewards in enumerate(all_final_rewards)
                 for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

env.close()

In [None]:
frames = render_policy_net(model)
plot_animation(frames)

# 18.7 마르코프 결정과정

In [None]:
transition_probabilities = [ # shape=[s, a, s']
        [[0.7, 0.3, 0.0], [1.0, 0.0, 0.0], [0.8, 0.2, 0.0]],
        [[0.0, 1.0, 0.0], None, [0.0, 0.0, 1.0]],
        [None, [0.8, 0.1, 0.1], None]]
rewards = [ # shape=[s, a, s']
        [[+10, 0, 0], [0, 0, 0], [0, 0, 0]],
        [[0, 0, 0], [0, 0, 0], [0, 0, -50]],
        [[0, 0, 0], [+40, 0, 0], [0, 0, 0]]]
possible_actions = [[0, 1, 2], [0, 2], [1]]

In [None]:
Q_value =np.full((3,3),-np.inf)

In [None]:
Q_value

In [None]:
for state, actions in enumerate(possible_actions):
  Q_value[state][actions]=0.0

In [None]:
Q_value

In [None]:
# Discount factor used in the Q-value update below.
gamma = 0.90

# Q-value iteration: every sweep recomputes Q(s, a) for each allowed action
# from the estimates of the previous sweep (Q_prev).
for iteration in range(50):
  Q_prev = Q_value.copy()
  for s in range(3):
    for a in possible_actions[s]:
      # Sum over next states sp of
      # P(sp | s, a) * (reward(s, a, sp) + gamma * max over a' of Q_prev(sp, a')).
      Q_value[s,a]=np.sum([
      transition_probabilities[s][a][sp]*(rewards[s][a][sp]+ gamma * 
                                          np.max(Q_prev[sp]))
      for sp in range(3)
      ])

In [None]:
Q_value

각 상태에 대해 가장 높은 Q value를 갖는 행동 a 찾기
Q(s,a)

In [None]:
np.argmax(Q_value,axis=1)

#array([0, 0, 1])
#상태 0에서는 행동a0
#상태 1에서는 행동a0
#상태 2에서는 행동a1

# 18.9 Q러닝

http://blog.quantylab.com/rl.html

상태 가치함수와 상태-행동 가치함수의 차이

(가치반복, q-가치반복 차이)

에이전트가 행동을 하나 실행하고 결과상태와 보상을 받는 스텝함수

In [None]:
def step(step, action):
  """Sample the MDP transition for taking ``action`` in the given state.

  Args:
      step: Index of the current state. The parameter keeps its original
          name for backward compatibility even though it holds a state.
      action: Index of the action to take in that state.

  Returns:
      ``(next_state, reward)``: the sampled next state and the reward read
      from ``rewards`` for that transition.
  """
  # Use the argument that was passed in. Previously the body read the
  # notebook-level `state` variable and ignored its first parameter.
  state = step
  probas = transition_probabilities[state][action]
  # One candidate next state per entry of the transition distribution.
  next_state = np.random.choice(len(probas), p=probas)
  reward = rewards[state][action][next_state]
  return next_state, reward

랜덤한 정책

In [None]:
def exploration_policy(state):
  """Pick one of the actions listed for ``state`` uniformly at random."""
  candidate_actions = possible_actions[state]
  return np.random.choice(candidate_actions)

In [None]:
np.random.seed(42)

Q_values = np.full((3, 3), -np.inf)
for state, actions in enumerate(possible_actions):
    Q_values[state][actions] = 0

alpha0 = 0.05 # initial learning rate
decay = 0.005 # learning rate decay
gamma = 0.90 # discount factor
state = 0 # initial state
history2 = [] # Not shown in the book

for iteration in range(10000):
    history2.append(Q_values.copy()) # Not shown
    action = exploration_policy(state)
    next_state, reward = step(state, action)
    next_value = np.max(Q_values[next_state]) # greedy policy at the next step
    alpha = alpha0 / (1 + iteration * decay)
    Q_values[state, action] *= 1 - alpha
    Q_values[state, action] += alpha * (reward + gamma * next_value)
    state = next_state

In [None]:
Q_values

최적의 행동

In [None]:
np.argmax(Q_values,axis=1)

#array([0, 0, 1])
#s0에서는 행동0
#s1에서는 행동0
#s2에서는 행동1

## 18.10 심층 Q-러닝 구현하기

DQN 구현

In [None]:
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

In [None]:
env = gym.make("CartPole-v1")
input_shape=[4]
n_outputs=2


model = keras.models.Sequential([
                                 keras.layers.Dense(32,activation="elu",
                                                    input_shape=input_shape),
                                 keras.layers.Dense(32,activation="elu"),
                                 keras.layers.Dense(n_outputs)
])

입실론 그리디 구현

입실론 확률로 랜덤한 행동,
1-입실론 확률로 q-value를 가장 크게하는 행동

In [None]:
def epsilon_greedy_policy(state, epsilon=0):
  """Epsilon-greedy action selection on top of the notebook-level ``model``.

  With probability ``epsilon`` a random action index below ``n_outputs`` is
  returned; otherwise the index of the largest predicted Q-value.
  """
  explore = np.random.rand() < epsilon
  if explore:
    return np.random.randint(n_outputs)
  Q_values = model.predict(state[np.newaxis])
  return np.argmax(Q_values[0])

In [None]:
from collections import deque

replay_memory = deque(maxlen=2000)

In [None]:
def sample_experiences(batch_size):
    """Draw ``batch_size`` experiences (with replacement) from ``replay_memory``.

    Returns:
        Five NumPy arrays ``(states, actions, rewards, next_states, dones)``,
        one per field of the stored experience tuples.
    """
    indices = np.random.randint(len(replay_memory), size=batch_size)
    batch = [replay_memory[i] for i in indices]
    # Regroup the sampled experiences field by field.
    fields = []
    for field_index in range(5):
        column = [experience[field_index] for experience in batch]
        fields.append(np.array(column))
    states, actions, rewards, next_states, dones = fields
    return states, actions, rewards, next_states, dones

In [None]:
def play_one_step(env, state, epsilon):
    """Play one epsilon-greedy step and store the transition in ``replay_memory``.

    Note: this rebinds the ``play_one_step`` name defined earlier in the
    notebook with a different signature.

    Returns:
        ``(next_state, reward, done, info)`` as returned by ``env.step``.
    """
    action = epsilon_greedy_policy(state, epsilon)
    outcome = env.step(action)
    next_state, reward, done, info = outcome
    experience = (state, action, reward, next_state, done)
    replay_memory.append(experience)
    return next_state, reward, done, info

In [None]:
batch_size = 32
discount_rate = 0.95
# Use the `learning_rate` keyword: `lr` is a deprecated alias that recent
# Keras releases no longer accept.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = keras.losses.mean_squared_error

def training_step(batch_size):
    """Run one DQN gradient step on a batch sampled from the replay memory.

    The target for each experience is
    ``reward + (1 - done) * discount_rate * max(Q(next_state))``, and the loss
    compares it with the model's Q-value for the action actually taken.
    """
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)
    next_Q_values = model.predict(next_states)
    max_next_Q_values = np.max(next_Q_values, axis=1)
    future_returns = (1 - dones) * discount_rate * max_next_Q_values
    target_Q_values = (rewards + future_returns).reshape(-1, 1)
    # One-hot mask that keeps only the Q-value of the action that was taken.
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    trainable = model.trainable_variables
    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

In [None]:
env.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

rewards = [] 
best_score = 0

In [None]:
# NOTE(review): the inner loop variable `step` rebinds the notebook-level name
# `step`, which an earlier cell defines as the MDP step() function; re-running
# that earlier Q-learning cell after this one will fail.
for episode in range(600):
    obs = env.reset()    
    for step in range(200):
        # Epsilon falls linearly with the episode index and is floored at 0.01.
        epsilon = max(1 - episode / 500, 0.01)
        obs, reward, done, info = play_one_step(env, obs, epsilon)
        if done:
            break
    rewards.append(step) # Not shown in the book
    # best_weights is only assigned once some episode's last step index
    # exceeds best_score.
    if step > best_score: # Not shown
        best_weights = model.get_weights() # Not shown
        best_score = step # Not shown
    print("\rEpisode: {}, Steps: {}, eps: {:.3f}".format(episode, step + 1, epsilon), end="") # Not shown
    # Training steps only run once the episode index is above 50.
    if episode > 50:
        training_step(batch_size)

# Restore the weights recorded at the best-scoring episode.
model.set_weights(best_weights)


In [None]:
plt.figure(figsize=(8, 4))
plt.plot(rewards)
plt.xlabel("Episode", fontsize=14)
plt.ylabel("Sum of rewards", fontsize=14)
save_fig("dqn_rewards_plot")
plt.show()

# 18.11 심층 Q-러닝 변종

### 18.11.1 고정 Q-가치 타깃

꼬리가 머리를 쫓는 상황이 발생하는 것을 방지.

훈련은 온라인 모델에서 수행.

타겟모델은 일정 에피소드마다 따로 업데이트

In [None]:
target = keras.models.clone_model(model)
target.set_weights(model.get_weights())

훈련시 다음과 같이 일정 에피소드마다 가중치 복사

In [None]:
# next_Q_values = target.predict(next_states)
# if episode % 50 ==0:
#   target.set_weights(model.get_weights())

### 18.11.2 더블 DQN

최선의 행동은 온라인모델에서 선택,

최선 행동에 대한 Q가치는 타겟모델에서 선택.

In [None]:
def training_step(batch_size):
    """Run one Double DQN gradient step on a sampled batch.

    The online ``model`` selects the best next action and the ``target`` model
    supplies the Q-value of that action when building the training target.
    """
    states, actions, rewards, next_states, dones = sample_experiences(batch_size)
    # Action selection with the online model.
    online_next_Q = model.predict(next_states)
    best_next_actions = np.argmax(online_next_Q, axis=1)
    next_mask = tf.one_hot(best_next_actions, n_outputs).numpy()
    # Action evaluation with the target model.
    target_next_Q = target.predict(next_states)
    next_best_Q_values = (target_next_Q * next_mask).sum(axis=1)
    future_returns = (1 - dones) * discount_rate * next_best_Q_values
    target_Q_values = (rewards + future_returns).reshape(-1, 1)
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    trainable = model.trainable_variables
    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

### 18.11.4 듀얼링 DQN

Q-value = V(s) + A(s,a) 로 표현

A(s,a)는 상태 s에서 행동 a를 선택했을 때 다른 가능한 행동들에 비해 얻는 이득

V와 A(s,a)를 모두 추정

In [None]:
# Build a dueling DQN: two heads on a shared trunk, one for the state value
# and one for the per-action advantages.
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)
n_outputs=2

K= keras.backend
input_states = keras.layers.Input(shape=[4])
hidden1 = keras.layers.Dense(32,activation="elu")(input_states)
hidden2 = keras.layers.Dense(32,activation="elu")(hidden1)
state_values = keras.layers.Dense(1)(hidden2)
# Estimate the advantage of every action.
raw_advantages = keras.layers.Dense(n_outputs)(hidden2)
# Subtract the maximum advantage so that the best action gets an advantage of 0
# (the best action has no relative gain over itself).
advantages = raw_advantages-K.max(raw_advantages,axis=1,keepdims=True)
Q_values = state_values +advantages

model = keras.Model(inputs=[input_states], outputs = [Q_values])

# 18.12 TF-Agents 라이브러리

In [None]:
pip install -U 'gym[atari]'

In [None]:
# pip install tensorflow-probability

In [None]:
# pip install tf_agents

In [None]:
from tf_agents.environments import suite_gym

env = suite_gym.load("Breakout-v4")
env

In [None]:
env.gym

In [None]:
env.seed(42)
env.reset()

step

timestep 반환

In [None]:
env.step(1)

In [None]:
img = env.render(mode = "rgb_array")

plt.figure(figsize=(6,8))
plt.imshow(img)
plt.axis("off")
plt.show()

In [None]:
env.current_time_step()

### 18.12.3 환경 스펙

관측,행동,타임스텝크기, 데이터타입, 이름과 최솟값,최댓값을 포함하는 스펙 제공

관측스펙.

In [None]:
env.observation_spec()

행동스펙

In [None]:
env.action_spec()

타임스텝스펙

In [None]:
env.time_step_spec()

각 행동이 무엇인지

In [None]:
env.gym.get_action_meanings()

### 18.12.4 환경래퍼와 아타리 전처리

tf_agents.environments.wrappers 패키지에 여러가지 환경래퍼 제공.

In [None]:
import tf_agents.environments.wrappers

for name in dir(tf_agents.environments.wrappers):
    obj = getattr(tf_agents.environments.wrappers, name)
    if hasattr(obj, "__base__") and issubclass(obj, tf_agents.environments.wrappers.PyEnvironmentBaseWrapper):
        print("{:27s} {}".format(name, obj.__doc__.split("\n")[0]))

In [None]:
from tf_agents.environments.wrappers import ActionRepeat

모든 행동을 4번씩 반복하게 해주는 래퍼

In [None]:
repeating_env = ActionRepeat(env, times=4)
repeating_env

In [None]:
from functools import partial
from gym.wrappers import TimeLimit

limited_repeating_env = suite_gym.load(
    "Breakout-v4",
    gym_env_wrappers=[partial(TimeLimit, max_episode_steps=10000)],
    env_wrappers=[partial(ActionRepeat, times=4)],
)

In [None]:
limited_repeating_env

In [None]:
limited_repeating_env.unwrapped

In [None]:
from tf_agents.environments import suite_atari
from tf_agents.environments.atari_preprocessing import AtariPreprocessing
from tf_agents.environments.atari_wrappers import FrameStack4

max_episode_steps = 27000 # <=> 108k ALE frames since 1 step = 4 frames
environment_name = "BreakoutNoFrameskip-v4"

env = suite_atari.load(
    environment_name,
    max_episode_steps=max_episode_steps,
    gym_env_wrappers=[AtariPreprocessing, FrameStack4])

In [None]:
env

In [None]:
env.seed(42)
env.reset()
time_step = env.step(1) # FIRE
for _ in range(4):
    time_step = env.step(3) # LEFT

In [None]:
def plot_observation(obs):
    """Display a 4-frame stacked observation as a single RGB image.

    There are only 3 colour channels for 4 frames, so the first three frames
    are used as the colour channels and the positive difference between the
    last frame and the mean of the other three is added to the red and blue
    channels, which tints the current frame pink.
    """
    stacked = obs.astype(np.float32)
    img = stacked[..., :3]
    older_mean = stacked[..., :3].mean(axis=-1)
    current_frame_delta = np.maximum(stacked[..., 3] - older_mean, 0.)
    for channel in (0, 2):  # red, then blue
        img[..., channel] += current_frame_delta
    img = np.clip(img / 150, 0, 1)
    plt.imshow(img)
    plt.axis("off")

In [None]:
plt.figure(figsize=(6, 6))
plot_observation(time_step.observation)
save_fig("preprocessed_breakout_plot")
plt.show()


In [None]:
from tf_agents.environments.tf_py_environment import TFPyEnvironment

tf_env = TFPyEnvironment(env)

In [None]:
# NOTE(review): `xv` is not defined anywhere in the visible notebook, so this
# cell presumably raises NameError on a fresh run. Looks like leftover scratch
# code; confirm the intended shape argument or delete the cell.
tf.random.uniform(xv)

# 연습문제 8. 정책 그레디언트를 사용해 LunarLander-v2 환경을 해결해보기

In [None]:
pip install gym[box2d]

In [None]:
env = gym.make("LunarLander-v2")

In [None]:
env.observation_space.shape[0]

In [None]:
env.action_space.n

In [None]:
env.reset()

In [None]:
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

n_inputs = 8

model = keras.models.Sequential([
    keras.layers.Dense(5, activation="elu", input_shape=[n_inputs]),
    keras.layers.Dense(4, activation="softmax"),
])

In [None]:
def play_one_step(env, obs, model):
    """Play one step with a softmax policy and compute its gradients.

    Note: this rebinds the ``play_one_step`` name defined earlier in the
    notebook with a different signature.

    Args:
        env: Environment exposing ``step(action)``.
        obs: Current observation; a batch axis is added before calling model.
        model: Network that outputs one probability per action.

    Returns:
        ``(obs, reward, done, grads)``: the new observation, the reward, the
        episode-finished flag and the gradients of the categorical
        cross-entropy between the sampled action and the model output.
    """
    with tf.GradientTape() as tape:
        prob = model(obs[np.newaxis])
        # Take the number of actions from the model output rather than
        # hard-coding 4.
        n_actions = prob.shape[-1]
        # Renormalise in float64 so the probabilities passed to
        # np.random.choice sum to 1 as closely as possible.
        p = prob.numpy()[0].astype(np.float64)
        p /= p.sum()
        # Sample one action from the policy distribution.
        action = int(np.random.choice(n_actions, p=p))
        # One-hot target for the sampled action, shape (1, n_actions).
        y_target = tf.one_hot([action], n_actions)
        loss = tf.reduce_mean(keras.losses.categorical_crossentropy(y_target, prob))
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, done, info = env.step(action)
    return obs, reward, done, grads


In [None]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model):
    """Play several episodes and collect per-step rewards and gradients.

    Note: this rebinds the ``play_multiple_episodes`` name defined earlier in
    the notebook; this version takes no ``loss_fn`` argument.

    Returns:
        ``(all_rewards, all_grads)``: two lists with one entry per episode,
        each entry listing the rewards / gradients of every step played.
    """
    all_rewards, all_grads = [], []
    for _ in range(n_episodes):
        episode_rewards, episode_grads = [], []
        obs = env.reset()
        for _ in range(n_max_steps):
            obs, reward, done, grads = play_one_step(env, obs, model)
            episode_rewards.append(reward)
            episode_grads.append(grads)
            if done:
                break
        all_rewards.append(episode_rewards)
        all_grads.append(episode_grads)
    return all_rewards, all_grads

In [None]:
def discount_rewards(rewards, discount_rate):
    """Return the discounted sum of future rewards for every step.

    ``result[t] = rewards[t] + discount_rate * result[t + 1]``, computed from
    the last step backwards.

    Args:
        rewards: Sequence of per-step rewards (ints or floats).
        discount_rate: Discount factor applied to each later step.

    Returns:
        Float NumPy array with the same length as ``rewards``.
    """
    # Force a float array: with integer rewards np.array() would produce an
    # integer array and the in-place update below would silently truncate
    # every fractional discounted value.
    discounted = np.array(rewards, dtype=np.float64)
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_rate
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount the rewards of every episode, then standardise them jointly.

    Args:
        all_rewards: List of per-episode reward sequences.
        discount_rate: Discount factor forwarded to ``discount_rewards``.

    Returns:
        List with one array per episode containing the discounted rewards
        minus the global mean, divided by the global standard deviation.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # Every discounted reward is identical, so the centred values are all
        # zero; divide by 1 instead of 0 to return zeros rather than NaN.
        reward_std = 1.0
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]

In [None]:
n_iterations = 150
n_episodes_per_update = 10
n_max_steps = 100
discount_rate = 1.0

In [None]:
# Use the `learning_rate` keyword: `lr` is a deprecated alias that recent
# Keras releases no longer accept.
optimizer = keras.optimizers.Adam(learning_rate=0.01)

In [None]:
model.save('saved_model/my_model')

In [None]:
# Policy-gradient training: play a batch of episodes, weight every step's
# gradients by its normalised discounted reward, and apply the mean gradient.
for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(
        env, n_episodes_per_update, n_max_steps, model)
    total_rewards = sum(map(sum, all_rewards))                     # Not shown in the book
    print("\rIteration: {}, mean rewards: {:.1f}".format(          # Not shown
        iteration, total_rewards / n_episodes_per_update), end="") # Not shown
    all_final_rewards = discount_and_normalize_rewards(all_rewards,
                                                       discount_rate)
    all_mean_grads = []
    # One averaged gradient per trainable variable, taken over every step of
    # every episode in this batch.
    for var_index in range(len(model.trainable_variables)):
        mean_grads = tf.reduce_mean(
            [final_reward * all_grads[episode_index][step][var_index]
             for episode_index, final_rewards in enumerate(all_final_rewards)
                 for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

env.close()

In [None]:
def update_scene(num, frames, patch):
    """Animation callback: show frame ``num`` of ``frames`` in ``patch``.

    Note: this repeats the ``update_scene`` definition that appears earlier
    in the notebook.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Build a matplotlib animation that plays ``frames`` in order.

    Note: this repeats the ``plot_animation`` definition that appears earlier
    in the notebook.

    Returns:
        The ``animation.FuncAnimation`` object.
    """
    figure = plt.figure()
    image_artist = plt.imshow(frames[0])
    plt.axis('off')
    n_frames = len(frames)
    anim = animation.FuncAnimation(
        figure,
        update_scene,
        fargs=(frames, image_artist),
        frames=n_frames,
        repeat=repeat,
        interval=interval,
    )
    # Close the current figure once the animation object has been built.
    plt.close()
    return anim

In [None]:
def render_policy_net(model, n_max_steps=200, seed=30):
    """Run one LunarLander-v2 episode driven by ``model`` and collect frames.

    Note: this rebinds the ``render_policy_net`` name defined earlier in the
    notebook for CartPole.

    Args:
        model: Policy network whose ``predict`` output is one probability per
            action.
        n_max_steps: Maximum number of steps to play.
        seed: Seed applied to both the environment and NumPy's global RNG.

    Returns:
        List of RGB frames (one per step played).
    """
    frames = []
    env = gym.make("LunarLander-v2")
    try:
        env.seed(seed)
        np.random.seed(seed)
        obs = env.reset()
        for step in range(n_max_steps):
            frames.append(env.render(mode="rgb_array"))
            prob = model.predict(obs.reshape(1, -1))
            # Renormalise in float64 and take the number of actions from the
            # model output rather than hard-coding 4.
            p = prob[0].astype(np.float64)
            p /= p.sum()
            action = int(np.random.choice(len(p), p=p))
            obs, reward, done, info = env.step(action)
            if done:
                break
    finally:
        # Release the environment even if rendering or stepping fails.
        env.close()
    return frames

In [None]:
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet

from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()



In [None]:
frames = render_policy_net(model,n_max_steps=200,seed=30)
plot_animation(frames)