## ライブラリのインポート

In [182]:
import math
import copy
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
#os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear
from ple.games.flappybird import FlappyBird
from ple import PLE
from collections import defaultdict

## アニメーションの作成

In [183]:
def make_anim(images, fps=60, true_image=False):
    duration = len(images) / fps
    import moviepy.editor as mpy

    def make_frame(t):
        try:
            x = images[int(len(images) / duration * t)]
        except:
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip

## 定数の宣言

In [184]:
MIN_EXPLORING_RATE = 0.01
MIN_LEARNING_RATE = 0.5
ETA = 0.5
GAMMA = 0.99

## Agentクラス

In [185]:
class Agent:
    def __init__(self, num_actions):
        self.brain = Brain(num_actions)
    
    def update_Q_function(self, observation, action, reward, observation_next):
        self.brain.update_policy(observation, action, reward, observation_next)
    
    def get_action(self, observation, episode):
        action = self.brain.decide_action(observation, episode)
        return action

## Brainクラス

In [186]:
bucket_range_per_feature = {
  'next_next_pipe_bottom_y': 40,
  'next_next_pipe_dist_to_player': 512,
  'next_next_pipe_top_y': 40,
  'next_pipe_bottom_y': 20,
  'next_pipe_dist_to_player': 20,
  'next_pipe_top_y': 20,
  'player_vel': 4,
  'player_y': 16
}

class Brain:
    
    def __init__(self, num_actions):
        self.num_actions = num_actions
        
        self.q_table = defaultdict(lambda: np.zeros(num_actions))

    def decide_action(self, state, episode):
        # epsilon-greedy
        state_idx = self.get_state_idx(state)
        epsilon = 0.5 * (1 / (episode + 1))
        if epsilon <= np.random.uniform(0, 1):
            action = np.argmax(self.q_table[state_idx])  # Q値が最大の行動を選択する
        else:
            action = np.random.choice(self.num_actions)  # ランダムな行動を選択する
        return action

    def update_policy(self, state, action, reward, state_prime):
        state_idx = self.get_state_idx(state)
        state_prime_idx = self.get_state_idx(state_prime)
        # Q学習を用いてQ値を更新する
        best_q = np.max(self.q_table[state_prime_idx])
        self.q_table[state_idx][action] += ETA * (
            reward + GAMMA * best_q - self.q_table[state_idx][action])
    
    def get_state_idx(self, state):
        # パイプの絶対位置の代わりに相対位置を使用する
        state = copy.deepcopy(state)
        state['next_next_pipe_bottom_y'] -= state['player_y']
        state['next_next_pipe_top_y'] -= state['player_y']
        state['next_pipe_bottom_y'] -= state['player_y']
        state['next_pipe_top_y'] -= state['player_y']

        # sort to make list converted from dict ordered in alphabet order
        state_key = [k for k, v in sorted(state.items())]

        # do bucketing to decrease state space to speed up training
        state_idx = []
        for key in state_key:
            state_idx.append(int(state[key] / bucket_range_per_feature[key]))
        return tuple(state_idx)
        

## Environmentクラス

In [187]:
class Environment:
    
    def __init__(self):
        self.game = FlappyBird()
        #self.env = PLE(self.game, fps=30, display_screen=False)  # environment interface to game
        self.env = PLE(self.game, fps=30, display_screen=True)  # environment interface to game
        self.num_actions = len(self.env.getActionSet())
        self.agent = Agent(self.num_actions)
    
    def run(self):
        
        from IPython.display import Image, display

        reward_per_epoch = []
        lifetime_per_epoch = []
        exploring_rates = []
        learning_rates = []
        print_every_episode = 500
        show_gif_every_episode = 5000
        NUM_EPISODE = 50000
        for episode in range(0, NUM_EPISODE):
            # 環境のリセット
            self.env.reset_game()
            # record frame
            frames = [self.env.getScreenRGB()]

            # for every 500 episodes, shutdown exploration to see performance of greedy action
            #if episode % print_every_episode == 0:
            #    self.agent.shutdown_explore()

            # 状態の初期化
            state = self.game.getGameState()
            cum_reward = 0  # このエピソードにおける累積報酬の和
            t = 0

            while not self.env.game_over():

                # 行動の選択
                action = self.agent.get_action(state, episode)

                # 行動を実行し、報酬を得る
                reward = self.env.act(
                        self.env.getActionSet()[action])  # パイプを超えれば、reward +=1 失敗したら reward  -= 5

                frames.append(self.env.getScreenRGB())

                # 累積報酬
                cum_reward += reward

                # observe the result
                state_prime = self.game.getGameState()  # 次状態を得る

                # update agent
                self.agent.update_Q_function(state, action, reward, state_prime)

                # Setting up for the next iteration
                state = state_prime
                t += 1

            # update exploring_rate and learning_rate
            #self.agent.update_parameters(episode)

            if episode % print_every_episode == 0:
                print("Episode %d finished after %f time steps" % (episode, t))
                print("cumulated reward: %f" % cum_reward)
                #print("exploring rate %f" % agent.exploring_rate)
                #print("learning rate %f" % agent.learning_rate)
                reward_per_epoch.append(cum_reward)
                #exploring_rates.append(agent.exploring_rate)
                #learning_rates.append(agent.learning_rate)
                lifetime_per_epoch.append(t)

            # for every 5000 episode, record an animation
            if episode % show_gif_every_episode == 0:
                print("len frames:", len(frames))
                clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
                display(clip.ipython_display(fps=60, autoplay=1, loop=1))

## Main関数

In [188]:
flappybird_env = Environment()
flappybird_env.run()

Episode 0 finished after 57.000000 time steps
cumulated reward: -5.000000
len frames: 58


 98%|█████████▊| 58/59 [00:00<00:00, 204.82it/s]


Episode 500 finished after 62.000000 time steps
cumulated reward: -5.000000
Episode 1000 finished after 62.000000 time steps
cumulated reward: -5.000000
Episode 1500 finished after 65.000000 time steps
cumulated reward: -5.000000
Episode 2000 finished after 62.000000 time steps
cumulated reward: -5.000000
Episode 2500 finished after 62.000000 time steps
cumulated reward: -5.000000
Episode 3000 finished after 62.000000 time steps
cumulated reward: -5.000000
Episode 3500 finished after 62.000000 time steps
cumulated reward: -5.000000
Episode 4000 finished after 62.000000 time steps
cumulated reward: -5.000000
Episode 4500 finished after 62.000000 time steps
cumulated reward: -5.000000


  3%|▎         | 2/64 [00:00<00:03, 17.44it/s]

Episode 5000 finished after 62.000000 time steps
cumulated reward: -5.000000
len frames: 63


 98%|█████████▊| 63/64 [00:00<00:00, 183.22it/s]


Episode 5500 finished after 62.000000 time steps
cumulated reward: -5.000000
Episode 6000 finished after 112.000000 time steps
cumulated reward: -3.000000
Episode 6500 finished after 134.000000 time steps
cumulated reward: -3.000000
Episode 7000 finished after 54.000000 time steps
cumulated reward: -5.000000
Episode 7500 finished after 78.000000 time steps
cumulated reward: -4.000000
Episode 8000 finished after 98.000000 time steps
cumulated reward: -4.000000
Episode 8500 finished after 148.000000 time steps
cumulated reward: -2.000000
Episode 9000 finished after 98.000000 time steps
cumulated reward: -4.000000
Episode 9500 finished after 62.000000 time steps
cumulated reward: -5.000000


 17%|█▋        | 17/100 [00:00<00:00, 167.91it/s]

Episode 10000 finished after 98.000000 time steps
cumulated reward: -4.000000
len frames: 99


 99%|█████████▉| 99/100 [00:00<00:00, 253.55it/s]


Episode 10500 finished after 447.000000 time steps
cumulated reward: 6.000000
Episode 11000 finished after 62.000000 time steps
cumulated reward: -5.000000
Episode 11500 finished after 227.000000 time steps
cumulated reward: 0.000000
Episode 12000 finished after 98.000000 time steps
cumulated reward: -4.000000
Episode 12500 finished after 62.000000 time steps
cumulated reward: -5.000000
Episode 13000 finished after 175.000000 time steps
cumulated reward: -2.000000
Episode 13500 finished after 618.000000 time steps
cumulated reward: 10.000000
Episode 14000 finished after 62.000000 time steps
cumulated reward: -5.000000
Episode 14500 finished after 514.000000 time steps
cumulated reward: 7.000000


  2%|▏         | 7/439 [00:00<00:06, 69.08it/s]

Episode 15000 finished after 437.000000 time steps
cumulated reward: 5.000000
len frames: 438


100%|█████████▉| 438/439 [00:01<00:00, 249.64it/s]


Episode 15500 finished after 247.000000 time steps
cumulated reward: 0.000000
Episode 16000 finished after 889.000000 time steps
cumulated reward: 17.000000
Episode 16500 finished after 1920.000000 time steps
cumulated reward: 45.000000
Episode 17000 finished after 288.000000 time steps
cumulated reward: 1.000000
Episode 17500 finished after 134.000000 time steps
cumulated reward: -3.000000
Episode 18000 finished after 2357.000000 time steps
cumulated reward: 56.000000
Episode 18500 finished after 1680.000000 time steps
cumulated reward: 38.000000
Episode 19000 finished after 247.000000 time steps
cumulated reward: 0.000000
Episode 19500 finished after 360.000000 time steps
cumulated reward: 3.000000


  0%|          | 6/4358 [00:00<01:13, 59.52it/s]

Episode 20000 finished after 4356.000000 time steps
cumulated reward: 109.000000
len frames: 4357


100%|█████████▉| 4357/4358 [00:20<00:00, 209.81it/s]


ValueError: The duration of video __temp__.mp4 (72.6) exceeds the 'max_duration' attribute. You can increase 'max_duration', but note that embedding large videos may take all the memory away !