In [0]:
#remove " > /dev/null 2>&1" to see what is going on under the hood
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

# 應用Keras-rl tensorflow 2.0 版在Gym環境學習Lunar Lander策略


In [0]:
!pip install tensorflow==2.0.0-beta > /dev/null 2>&1

In [5]:
!pip install box2d-py



In [6]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import tensorflow.compat.v2 as tf
import random
import numpy as np
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

import sys
import math

import Box2D
from Box2D.b2 import (edgeShape, circleShape, fixtureDef, polygonShape, revoluteJointDef, contactListener)

from gym import spaces
from gym.utils import colorize, seeding, EzPickle

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [7]:
tf.enable_v2_behavior()
print(tf.__version__)

2.0.0-beta0


In [0]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

In [9]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [0]:
def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env

## 模擬隨機行為

In [0]:
from random import randrange
def action_my():
  return(randrange(4))

# Wrap env 纔可以在Colab上看Lunar Lander的模擬影片

In [12]:
env = wrap_env(gym.make('LunarLander-v2')) 

np.random.seed(123)
env.seed(123)

total_timesteps = 0
for i_episode in range(5):
    observation = env.reset()
    for t in range(500):
        env.render()
        action = action_my()
        # action = env.action_space.sample()
        observation, reward, done, info = env.step(action)

        if done:
            print("Episode finished after {} timesteps".format(t+1))
            total_timesteps += (t+1)
            break

print('Average score over {} random strategy games: {}'.format(i_episode+1, total_timesteps/(i_episode+1)))
env.close()

Episode finished after 83 timesteps
Episode finished after 88 timesteps
Episode finished after 95 timesteps
Episode finished after 84 timesteps
Episode finished after 154 timesteps
Average score over 5 random strategy games: 100.8


In [13]:
show_video()

## 安裝Keras-rl tensorflow 2.0 beta0版

In [0]:
!pip install keras-rl2 > /dev/null 2>&1

# 若要重新訓練，跑這以後的程式區塊即可 ...

In [15]:
env = gym.make('LunarLander-v2')
np.random.seed(3310)
env.seed(3310)

[3310]

## 定義神經網路的架構，輸入參數分別是狀態的維度及可執行的行動數量

In [16]:
def agent(states, actions):
    model = Sequential()
    model.add(Flatten(input_shape = (1, states)))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(actions, activation='linear'))
#    model.add(Dense(actions, activation='sigmoid'))
    return model
  
model = agent(env.observation_space.shape[0], env.action_space.n)
# model = agent(env.observation_space.shape[0], env.action_space.shape[0])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 128)               1152      
_________________________________________________________________
dense_1 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_3 (Dense)              (None, 4)                 516       
Total params: 34,692
Trainable params: 34,692
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy, BoltzmannQPolicy

policy = BoltzmannQPolicy()

In [0]:
sarsa = SARSAAgent(model = model, policy = policy, nb_actions = env.action_space.n)

In [0]:
sarsa.compile('adam', metrics = ['mse'])

In [20]:
sarsa.fit(env, nb_steps = 150000, visualize = False, verbose = 1)

Training for 150000 steps ...
Interval 1 (0 steps performed)
141 episodes - episode_reward: -362.574 [-1017.734, -39.969] - loss: 152.672 - mse: 89492.171 - mean_q: -176.048

Interval 2 (10000 steps performed)
126 episodes - episode_reward: -157.550 [-571.473, 41.258] - loss: 187.306 - mse: 20911.967 - mean_q: -89.699

Interval 3 (20000 steps performed)
55 episodes - episode_reward: -166.093 [-723.394, 11.778] - loss: 225.836 - mse: 14876.596 - mean_q: -61.372

Interval 4 (30000 steps performed)
15 episodes - episode_reward: -73.397 [-327.904, 19.926] - loss: 4.679 - mse: 1018.396 - mean_q: -10.267

Interval 5 (40000 steps performed)
19 episodes - episode_reward: -36.867 [-203.643, 73.142] - loss: 9.264 - mse: 764.822 - mean_q: -6.648

Interval 6 (50000 steps performed)
19 episodes - episode_reward: -3.040 [-180.812, 217.534] - loss: 4.261 - mse: 709.607 - mean_q: -4.175

Interval 7 (60000 steps performed)
23 episodes - episode_reward: 45.651 [-155.107, 237.703] - loss: 5.226 - mse: 40

<tensorflow.python.keras.callbacks.History at 0x7f8b3b7b8860>

In [21]:
scores = sarsa.test(env, nb_episodes = 50, visualize= False)
print('Average score over 50 test games:{}'.format(np.mean(scores.history['episode_reward'])))

Testing for 50 episodes ...
Episode 1: reward: 141.169, steps: 657
Episode 2: reward: 135.758, steps: 535
Episode 3: reward: 230.162, steps: 324
Episode 4: reward: 183.328, steps: 400
Episode 5: reward: 121.921, steps: 1000
Episode 6: reward: 225.794, steps: 315
Episode 7: reward: 166.756, steps: 423
Episode 8: reward: 110.110, steps: 1000
Episode 9: reward: 186.658, steps: 394
Episode 10: reward: -4.622, steps: 285
Episode 11: reward: 149.593, steps: 560
Episode 12: reward: 230.697, steps: 222
Episode 13: reward: 165.552, steps: 579
Episode 14: reward: 74.597, steps: 948
Episode 15: reward: 77.932, steps: 1000
Episode 16: reward: 217.809, steps: 221
Episode 17: reward: 59.948, steps: 838
Episode 18: reward: 218.925, steps: 203
Episode 19: reward: 162.944, steps: 238
Episode 20: reward: 186.184, steps: 229
Episode 21: reward: 173.843, steps: 319
Episode 22: reward: 165.903, steps: 397
Episode 23: reward: 186.182, steps: 386
Episode 24: reward: 84.308, steps: 1000
Episode 25: reward: 18

In [0]:
# sarsa.save_weights('sarsa_weights.h5f', overwrite=True)

In [0]:
# sarsa.load_weights('sarsa_weights.h5f')

In [24]:
_ = sarsa.test(env, nb_episodes = 2, visualize= True)

Testing for 2 episodes ...
Episode 1: reward: 198.830, steps: 546
Episode 2: reward: 201.625, steps: 221


# 定義行動函數應用已訓練好的model，輸入參數為狀態向量

In [58]:
def action_sarsa(observation):
  observation = np.array(observation)
  observation.shape=(1,1,8)

  # print(observation)
  next_action_reference = model.predict(observation)
  # print(next_action_reference)

  if next_action_reference[0,0] > next_action_reference[0,2]: next_action = 0
  elif next_action_reference[0,1] > next_action_reference[0,2]: next_action = 1
  elif next_action_reference[0,2] > next_action_reference[0,3]: next_action = 2
  else: next_action = 3

  return(next_action)

env = wrap_env(gym.make('LunarLander-v2')) 

total_timesteps = 0
for i_episode in range(5):
    observation = env.reset()
    for t in range(500):
        env.render()
        action = action_sarsa(observation)
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            total_timesteps += (t+1)
            break

print('Average score over 50 test games:{}'.format(total_timesteps/(i_episode+1)))
env.close()  



Episode finished after 334 timesteps
Episode finished after 254 timesteps
Episode finished after 375 timesteps
Episode finished after 389 timesteps
Episode finished after 320 timesteps
Average score over 50 test games:334.4


In [59]:
show_video()