In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import copy
#Jupyter notebook 에서 pop up window
%matplotlib tk 

In [2]:
# Open AI Gym을 이용한 Custom Environment 생성 예
class GridEnv(gym.Env):
    
    def __init__(self):
        # 초기 환경 구성 시 필요한 파라미터 설정
        self.map_size = (100,100)
        self.agent_pos = [0,0]
        self.obstacle = []
                        
        self.goal = [0,0]#[self.map_size[0]-1,self.map_size[1]-1]
        self.is_done = None
        # Temporal Difference 
        self.V = np.zeros([self.map_size[0],self.map_size[1]])
        self.Q = np.zeros([4,self.map_size[0],self.map_size[1]])
        self.gamma = 0.9
        self.alpha = 0.1
        self.epsillon = 1
       
        # Open AI gym 환경 정보 설정
        self.action_space = gym.spaces.Discrete(4)
        self.obs_space = gym.spaces.Discrete(3)
        
        #plt figtext 위치
        self.text_pos_x = 0.8
        self.text_pos_y = 0.9

    def step(self, action):
        self.prev_state = copy.deepcopy(self.agent_pos)
        if action == 0: #Uo
            self.agent_pos[0] += -1
        elif action == 1: #Down
            self.agent_pos[0] += +1
        elif action == 2: #Left
            self.agent_pos[1] += -1
        elif action == 3: #Right
            self.agent_pos[1] += +1
        else:
            raise Exception("Action is not defined")
      
    
        self.next_state = copy.deepcopy(self.agent_pos)
        
        #self.render_text(self.obs(),self.get_reward())
            
        return self.obs(), self.get_reward(), self._is_done(), self.next_state, self.prev_state
    
    def obs(self):
        
        if self.agent_pos in self.obstacle:
            return 0
        
        elif self.agent_pos == self.goal:    
            return 1
        elif self.agent_pos[0] < 0 \
        or self.agent_pos[1] < 0 \
        or self.agent_pos[0] > self.map_size[0]-1 \
        or self.agent_pos[1] > self.map_size[1]-1:
            return 3
        else:    
            return 2
    
    def _is_done(self):
        # 맵 밖으로 나갔을 시 
        if self.agent_pos[0] < 0 \
        or self.agent_pos[1] < 0 \
        or self.agent_pos[0] > self.map_size[0]-1 \
        or self.agent_pos[1] > self.map_size[1]-1:
            return True
        # 도착 지점 도착시
        elif self.agent_pos == self.goal:
            return True
        
        else:
            return False
            
    def reset(self):
        '''환경 초기화'''
        
        #맵 사이즈 설정
        self.world = np.zeros(self.map_size)        
        
        #에이전트의 초기위치 및 장애물 위치 설정
        for obs_x,obs_y in self.obstacle:
            self.world[obs_x,obs_y] = 2
        
        
        self.agent_pos = [self.map_size[0]-1,self.map_size[1]-1]#[np.random.randint(self.map_size[0]),np.random.randint(self.map_size[1])]

        return self.obs(),self.get_reward()
    
    def policy(self, state = None): 
        if state[0] < 0 \
        or state[1] < 0 \
        or state[0] > self.map_size[0]-1 \
        or state[1] > self.map_size[1]-1:
            state = self.prev_state
        #e-greedy policy
                    
        self.p = np.random.uniform()
        if self.p < self.epsillon:
            self.action = self.action_space.sample()
        else:
            self.action = np.argmax(self.Q, axis=0)[state[0],state[1]]
        
        
        
        return self.action
        
    def render(self,episode,step,state):
        # 시각화
        plt.ion()
        plt.title("Grid World")
        plt.figtext(self.text_pos_x,self.text_pos_y, f"Episode = {episode}")
        plt.figtext(self.text_pos_x,self.text_pos_y-0.1, "Step : {}".format(step))
        
        
        self.world[state[0],state[1]] = -1
        self.world[self.goal[0],self.goal[1]] = 3
        plt.matshow(self.world,fignum=0)
        plt.draw()
        plt.pause(0.01) #
        plt.clf()
        self.world[state[0],state[1]] = 0
        
        for obs_x,obs_y in self.obstacle:
            self.world[obs_x,obs_y] = 2
        
    def render_text(self,obs,reward):
        # 시뮬레이션 정보 출력
        plt.figtext(self.text_pos_x,self.text_pos_y-0.3, f"Reward : {reward}")
#         plt.figtext(self.text_pos_x-0.05,self.text_pos_y-0.4, f"State Value Function : \n{np.around(self.V,4)}")
#         plt.figtext(self.text_pos_x+0.1,self.text_pos_y-0.6, f"Action Value Function : \n{np.around(self.Q,4)}")
        plt.figtext(self.text_pos_x-0.05,self.text_pos_y-0.6, f"Optimal Policy : \n{np.argmax(self.Q, axis=0)}")
        plt.figtext(self.text_pos_x,self.text_pos_y-0.15, f"Eps : \n{self.epsillon}")
        if obs == 2:
            plt.figtext(self.text_pos_x,self.text_pos_y-0.2, "None")
        elif obs == 1:
            plt.figtext(self.text_pos_x,self.text_pos_y-0.2, "GOAL IN")
        elif obs == 0:
            plt.figtext(self.text_pos_x,self.text_pos_y-0.2, "Obstacle")
        elif obs == 3:
            plt.figtext(self.text_pos_x,self.text_pos_y-0.2, "Out of map")
        
            
            
    def get_reward(self):
        # 리워드 설정
        if self.agent_pos in self.obstacle:
            self.reward = -1
            return self.reward
        
        elif self.agent_pos == self.goal:
            self.reward = 1
            return self.reward
        
        # 맵 밖으로 나갔을 시 
        elif self.agent_pos[0] < 0 \
        or self.agent_pos[1] < 0 \
        or self.agent_pos[0] > self.map_size[0]-1 \
        or self.agent_pos[1] > self.map_size[1]-1:
            self.reward = -1
            return self.reward
        
        else:
            self.reward = 0
            return self.reward
        
    def value_function_update(self,action,next_action,state,next_state):
        #State value function
        try:
            self.V[state[0]][state[1]] = \
            self.V[state[0]][state[1]] + self.alpha * (self.reward + self.gamma*self.V[next_state[0]][next_state[1]]- self.V[state[0]][state[1]])
        except IndexError: # Agent가 밖으로 나갔을 시 Value function update 예외 처리
            self.V[state[0]][state[1]] = \
        self.V[state[0]][state[1]] + self.alpha * (self.reward + self.gamma*self.V[state[0]][state[1]]- self.V[state[0]][state[1]])
        #Action value function
        try:
            self.Q[action,state[0],state[1]] = \
            self.Q[action,state[0],state[1]] + self.alpha*(self.reward + self.gamma * self.Q[next_action,next_state[0],next_state[1]]-self.Q[action,state[0],state[1]])
        except IndexError:
            self.Q[action,state[0],state[1]] = \
            self.Q[action,state[0],state[1]] + self.alpha*(self.reward + self.gamma * self.Q[action,state[0],state[1]]-self.Q[action,state[0],state[1]])
        
    def close(self):
        
        pass

In [3]:
env = GridEnv()  #np.argmax(self.Q, axis=0)[state[0],state[1]]

In [4]:
for episode in range(10000): # 에피소드 수 설정
    #환경 생성
    obs,reward = env.reset()
    
    for step in range(100): # step 상한선 설정
        if episode % 100 ==0:
            env.render(episode, step,0.1)
            env.render_text(obs,reward)
       
        
        #Sampling Action
        action = env.policy(env.agent_pos)
        
        #Take action -> Observe Reward, Observe Next State
        obs, reward, is_done , next_state, state = env.step(action)
        
    
        #Choose next action from next state
        next_action = env.policy(next_state)
        
        #Value function update
        env.value_function_update(action,next_action,state,next_state)
        
      
            
        if is_done: 
            
            if env.epsillon > 0.1:
                env.epsillon -= 0.001
            else:
                env.epsilon = 0.1
            
            break;
env.close()
#plt.close()

IndexError: list index out of range