In [15]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym

from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

In [16]:
def display_frames_as_gif(frames):
    """Animate a sequence of image frames, save it as a movie and show it inline.

    The figure is sized from the first frame's shape (columns/72 by rows/72
    inches at 72 dpi). Despite the function name, the animation is written
    to 'movie_cartpole.mp4'; it is then rendered in the notebook through
    JSAnimation in looping mode.

    Args:
        frames: Non-empty, indexable sequence of image arrays. ``frames[0]``
            must expose ``.shape`` with at least two entries.
    """
    first_frame = frames[0]
    height_in = first_frame.shape[0] / 72.0
    width_in = first_frame.shape[1] / 72.0
    fig = plt.figure(figsize=(width_in, height_in), dpi=72)

    image = plt.imshow(first_frame)
    plt.axis('off')

    def animate(i):
        # Swap the pixel data of the single image artist for frame i.
        image.set_data(frames[i])

    movie = animation.FuncAnimation(fig, animate, frames=len(frames), interval=50)
    movie.save('movie_cartpole.mp4')
    display(display_animation(movie, default_mode='loop'))

In [17]:
# Experiment configuration

# Name of the gym task to solve
ENV = 'CartPole-v0'

# Number of discrete bins each observed variable is split into
NUM_DIZITIZED = 6

# Time discount rate
GAMMA = 0.99

# Learning rate
ETA = 0.5

# Number of steps in one trial (episode)
MAX_STEPS = 200

# Maximum number of trials (episodes)
NUM_EPISODES = 1000

In [18]:
# Agent class
class Agent:
    """Actor that delegates both learning and action selection to a Brain."""

    def __init__(self, num_states, num_actions):
        """Create the Brain, forwarding ``num_states`` and ``num_actions`` to it."""
        self.brain = Brain(num_states, num_actions)

    def update_Q_function(self, observation, action, reward, observation_next):
        """Update the Q-function by forwarding one transition to the Brain.

        Args:
            observation: Observation before the action was taken.
            action: Action that was taken.
            reward: Reward received for the transition.
            observation_next: Observation after the action was taken.
        """
        self.brain.update_Q_table(observation, action, reward, observation_next)

    def get_action(self, observation, step):
        """Decide the action for ``observation``.

        ``step`` is passed unchanged as the second argument of
        ``Brain.decide_action``.

        Returns:
            Whatever ``Brain.decide_action`` returns.
        """
        return self.brain.decide_action(observation, step)

In [19]:
# Brain of the agent: a tabular Q-function trained with Q-learning

class Brain:
    """Q-table over discretized observations with epsilon-greedy action choice."""

    def __init__(self, num_states, num_actions):
        """Build a Q-table filled with uniform random values in [0, 1).

        The table has ``NUM_DIZITIZED ** num_states`` rows (one per discrete
        state) and ``num_actions`` columns.
        """
        self.num_actions = num_actions

        table_shape = (NUM_DIZITIZED**num_states, num_actions)
        self.q_table = np.random.uniform(low=0, high=1, size=table_shape)

    def bins(self, clip_min, clip_max, num):
        """Return the ``num - 1`` interior thresholds that split
        [clip_min, clip_max] into ``num`` equal-width intervals."""
        edges = np.linspace(clip_min, clip_max, num+1)
        # Drop both end points: values beyond them land in the outermost bins.
        return edges[1:-1]

    def digitize_state(self, observation):
        """Convert a 4-element observation into a single discrete state index.

        Each variable is binned into one of ``NUM_DIZITIZED`` levels, and the
        levels are combined as digits of a base-``NUM_DIZITIZED`` number, the
        first variable being the least significant digit.
        """
        cart_pos, cart_v, pole_angle, pole_v = observation

        # (value, lower clip, upper clip) for every observed variable
        ranged_values = (
            (cart_pos, -2.4, 2.4),
            (cart_v, -3.0, 3.0),
            (pole_angle, -0.5, 0.5),
            (pole_v, -2.0, 2.0),
        )

        state_index = 0
        for digit, (value, low, high) in enumerate(ranged_values):
            level = np.digitize(value, bins=self.bins(low, high, NUM_DIZITIZED))
            state_index += level*(NUM_DIZITIZED**digit)
        return state_index

    def update_Q_table(self, observation, action, reward, observation_next):
        """Apply one Q-learning update for the given transition.

        Q(s, a) <- Q(s, a) + ETA * (reward + GAMMA * max_a' Q(s', a') - Q(s, a))
        """
        state = self.digitize_state(observation)  # discretize the current state
        state_next = self.digitize_state(observation_next)  # discretize the next state

        best_next_q = max(self.q_table[state_next])
        td_target = reward + GAMMA * best_next_q
        current_q = self.q_table[state, action]
        self.q_table[state, action] = current_q + ETA * (td_target - current_q)

    def decide_action(self, observation, episode):
        """Choose an action epsilon-greedily, with epsilon shrinking per episode.

        epsilon is ``0.5 / (episode + 1)``. A uniform sample below epsilon
        selects a random action; otherwise the action with the highest
        Q-value for the discretized state is taken.
        """
        state = self.digitize_state(observation)
        epsilon = 0.5*(1/(episode+1))

        if np.random.uniform(0, 1) < epsilon:
            # Explore
            return np.random.choice(self.num_actions)
        # Exploit
        return np.argmax(self.q_table[state])
        
    

In [22]:
class Environment:
    """CartPole training harness: owns the gym environment and the learning agent."""

    def __init__(self):
        """Create the task named by ``ENV`` and an Agent sized to its spaces."""
        self.env = gym.make(ENV)  # set up the task to run

        num_states = self.env.observation_space.shape[0]  # number of observed variables
        num_actions = self.env.action_space.n  # number of available actions
        self.agent = Agent(num_states, num_actions)  # agent acting in this environment

    def run(self):
        """Train for up to ``NUM_EPISODES`` episodes, then record one final episode.

        Rewards are shaped here and the environment's own reward is ignored:
        -1 when the episode ends before step index 195, +1 when it ends at or
        after it, 0 on every other step. Once 10 consecutive episodes have
        succeeded, one more episode is run with every frame captured; the
        frames are handed to ``display_frames_as_gif`` and training stops.
        """
        complete_episodes = 0  # consecutive episodes that lasted 195+ steps
        is_episode_final = False  # True while running the recorded final episode
        frames = []  # rendered images of the final episode

        for episode in range(NUM_EPISODES):
            observation = self.env.reset()  # start a new episode

            for step in range(MAX_STEPS):
                if is_episode_final:
                    frames.append(self.env.render(mode='rgb_array'))

                # Choose the action a_t
                action = self.agent.get_action(observation, episode)

                # Apply a_t to obtain s_{t+1}; the env's reward and info are unused
                observation_next, _, done, _ = self.env.step(action)

                # Reward shaping
                if done:
                    if step < 195:
                        reward = -1  # penalty for falling over early
                        complete_episodes = 0  # the success streak is broken
                    else:
                        reward = 1
                        complete_episodes += 1
                else:
                    reward = 0

                # Update the Q-function with this transition
                self.agent.update_Q_function(observation, action, reward, observation_next)

                # Advance to the next observation
                observation = observation_next

                if done:
                    print('{0} Episode: Finish after {1}time steps'.format(episode, step+1))
                    break

            if is_episode_final:
                # The final episode has just been played: save and show the movie.
                display_frames_as_gif(frames)
                break

            if complete_episodes >= 10:
                print('10回連続成功')
                # Make the next episode the recorded, final one; without this
                # the flag never flips and the movie is never produced.
                is_episode_final = True


In [None]:
# Entry point: build the CartPole environment (which also creates its agent) and run training
cartpole_env = Environment()
cartpole_env.run()

0 Episode: Finish after 25time steps
1 Episode: Finish after 25time steps
2 Episode: Finish after 41time steps
3 Episode: Finish after 11time steps
4 Episode: Finish after 27time steps
5 Episode: Finish after 122time steps
6 Episode: Finish after 34time steps
7 Episode: Finish after 34time steps
8 Episode: Finish after 25time steps
9 Episode: Finish after 60time steps
10 Episode: Finish after 24time steps
11 Episode: Finish after 107time steps
12 Episode: Finish after 39time steps
13 Episode: Finish after 36time steps
14 Episode: Finish after 200time steps
15 Episode: Finish after 21time steps
16 Episode: Finish after 72time steps
17 Episode: Finish after 138time steps
18 Episode: Finish after 86time steps
19 Episode: Finish after 41time steps
20 Episode: Finish after 39time steps
21 Episode: Finish after 17time steps
22 Episode: Finish after 152time steps
23 Episode: Finish after 18time steps
24 Episode: Finish after 20time steps
25 Episode: Finish after 12time steps
26 Episode: Finis

209 Episode: Finish after 92time steps
210 Episode: Finish after 158time steps
211 Episode: Finish after 200time steps
212 Episode: Finish after 150time steps
213 Episode: Finish after 28time steps
214 Episode: Finish after 160time steps
215 Episode: Finish after 200time steps
216 Episode: Finish after 200time steps
217 Episode: Finish after 200time steps
218 Episode: Finish after 159time steps
219 Episode: Finish after 182time steps
220 Episode: Finish after 191time steps
221 Episode: Finish after 200time steps
222 Episode: Finish after 107time steps
223 Episode: Finish after 116time steps
224 Episode: Finish after 200time steps
225 Episode: Finish after 200time steps
226 Episode: Finish after 200time steps
227 Episode: Finish after 200time steps
228 Episode: Finish after 200time steps
229 Episode: Finish after 200time steps
230 Episode: Finish after 200time steps
231 Episode: Finish after 200time steps
232 Episode: Finish after 200time steps
233 Episode: Finish after 200time steps
10回連続成功