In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import copy
#Jupyter notebook 에서 pop up window
%matplotlib tk 

In [2]:
# Example of a custom environment built with OpenAI Gym
class GridEnv(gym.Env):
    """Small grid world used to demonstrate Temporal-Difference learning.

    The map is a ``map_size`` grid containing obstacle cells and one goal
    cell. Positions are two-element lists that index ``self.world`` directly
    (``world[pos[0], pos[1]]``). An episode ends when the agent reaches the
    goal or leaves the map; stepping onto an obstacle is penalised but does
    not end the episode.

    Observation codes returned by ``obs()``:
        0 - agent is on an obstacle
        1 - agent is on the goal
        2 - agent is on an ordinary cell
        3 - agent is outside the map

    Cell markers written into ``self.world`` for rendering:
        -1 agent, 0 empty, 2 obstacle, 3 goal
    """

    # Observation codes (see class docstring)
    OBS_OBSTACLE = 0
    OBS_GOAL = 1
    OBS_FREE = 2
    OBS_OUT_OF_MAP = 3

    def __init__(self):
        """Set up the map layout, the learning tables and the Gym spaces."""
        # Layout parameters of the initial environment
        self.map_size = (3,3)
        self.agent_pos = [0,0]
        self.obstacle = [[1,1],[1,2]]
        self.goal = [2,2]
        # Temporal Difference learning state
        self.V = np.zeros(self.map_size)
        self.Q = np.zeros(self.map_size)
        self.gamma = 0.9  # discount factor used in the TD target
        self.alpha = 0.1  # step size of the TD update

        print(f"Value function \n {self.V}\n")
        print(f"Action Value function \n {self.Q}\n")
        print(f"Initial Policy : Random")
        # OpenAI Gym space descriptions
        self.action_space = gym.spaces.Discrete(4)
        # obs() emits four distinct codes (0-3), so the space needs 4 entries.
        self.obs_space = gym.spaces.Discrete(4)
        # Gym's conventional attribute name, kept alongside the original one.
        self.observation_space = self.obs_space

        # Anchor position for the plt.figtext annotations
        self.text_pos_x = 0.8
        self.text_pos_y = 0.9

    def _is_out_of_map(self):
        """Return True when the agent position lies outside the grid."""
        return (
            self.agent_pos[0] < 0
            or self.agent_pos[1] < 0
            or self.agent_pos[0] > self.map_size[0] - 1
            or self.agent_pos[1] > self.map_size[1] - 1
        )

    def step(self, action):
        """Move the agent one cell and report the outcome.

        Args:
            action: 0 (Left), 1 (Right), 2 (Up) or 3 (Down). Left/Right change
                ``agent_pos[0]`` and Up/Down change ``agent_pos[1]``.
                NOTE(review): ``agent_pos[0]`` is the first index of
                ``self.world``, which ``matshow`` draws as the row, so the
                on-screen direction may not match the label - confirm.

        Returns:
            Tuple ``(obs, reward, done, next_state, state)`` where
            ``next_state`` is the position after the move and ``state`` the
            position before it. No bounds clamping is applied: the agent may
            leave the map, which ends the episode.

        Raises:
            ValueError: if ``action`` is not one of the four defined actions.
        """
        # Position before the move (attribute name kept for compatibility).
        self.post_state = copy.deepcopy(self.agent_pos)
        if action == 0: # Left
            self.agent_pos[0] += -1
        elif action == 1: # Right
            self.agent_pos[0] += +1
        elif action == 2: # Up
            self.agent_pos[1] += -1
        elif action == 3: # Down
            self.agent_pos[1] += +1
        else:
            raise ValueError("Action is not defined")

        # Hand out a copy so later steps cannot mutate the caller's next_state.
        return self.obs(), self.get_reward(), self._is_done(), list(self.agent_pos), self.post_state

    def obs(self):
        """Return the observation code for the current agent position."""
        if self.agent_pos in self.obstacle:
            return self.OBS_OBSTACLE
        if self.agent_pos == self.goal:
            return self.OBS_GOAL
        if self._is_out_of_map():
            return self.OBS_OUT_OF_MAP
        return self.OBS_FREE

    def _is_done(self):
        """Return True when the agent left the map or reached the goal."""
        return self._is_out_of_map() or self.agent_pos == self.goal

    def reset(self):
        """Reset the environment and return the initial observation code."""
        # Fresh, empty map
        self.world = np.zeros(self.map_size)

        # Mark every obstacle cell
        for obstacle_pos in self.obstacle:
            self.world[obstacle_pos[0], obstacle_pos[1]] = 2

        # Starting position of the agent for each episode
        self.agent_pos = [0,2]

        return self.obs()

    def policy(self, state = None):
        """Return a random action sampled from ``self.action_space``.

        ``state`` is accepted so callers can pass the current position, but
        the policy is random and does not use it.
        """
        return self.action_space.sample()

    def render(self,episode,step):
        """Draw the current grid with episode and step annotations.

        Requires ``reset()`` to have been called so that ``self.world`` exists.
        """
        plt.ion()
        plt.title("Grid World")
        plt.figtext(self.text_pos_x,self.text_pos_y, f"Episode = {episode}")
        plt.figtext(self.text_pos_x,self.text_pos_y-0.1, "Step : {}".format(step))

        agent_i, agent_j = self.agent_pos
        self.world[agent_i, agent_j] = -1
        self.world[self.goal[0],self.goal[1]] = 3
        plt.matshow(self.world,fignum=0)
        plt.draw()
        plt.pause(0.01) # give the GUI event loop time to redraw
        plt.clf()

        # Erase the agent marker, then restore the obstacle markers in case
        # the agent was standing on one of them.
        self.world[agent_i, agent_j] = 0
        for obstacle_pos in self.obstacle:
            self.world[obstacle_pos[0], obstacle_pos[1]] = 2

    def render_text(self,obs,reward):
        """Annotate the figure with the reward and a label for ``obs``."""
        plt.figtext(self.text_pos_x,self.text_pos_y-0.3, f"Reward : {reward}")
        if obs == 2:
            plt.figtext(self.text_pos_x,self.text_pos_y-0.2, "")
        elif obs == 1:
            plt.figtext(self.text_pos_x,self.text_pos_y-0.2, "GOAL IN")
        elif obs == 0:
            plt.figtext(self.text_pos_x,self.text_pos_y-0.2, "Obstacle")
        elif obs == 3:
            plt.figtext(self.text_pos_x,self.text_pos_y-0.2, "Out of map")

    def get_reward(self):
        """Return the reward for the current position.

        -1 on an obstacle or outside the map, +1 on the goal, 0 otherwise.
        """
        if self.agent_pos in self.obstacle:
            return -1
        if self.agent_pos == self.goal:
            return +1
        if self._is_out_of_map():
            return -1
        return 0

    def close(self):
        """Release environment resources (nothing to clean up here)."""
        pass

In [3]:
# Instantiate the grid-world environment; the constructor prints the
# zero-initialised value tables and the initial policy description.
env = GridEnv()

Value function 
 [[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

Action Value function 
 [[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

Initial Policy : Random


In [4]:
# Inspect the state-value table; it is still all zeros here because no
# update has been applied yet.
env.V

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [None]:
for episode in range(100): # number of episodes
    # Re-initialise the environment at the start of every episode
    obs = env.reset()
    for step in range(100): # upper bound on steps per episode

        env.render(episode, step)
        # Sample an action from the current policy
        action = env.policy(env.agent_pos)

        # Take the action -> observe reward and next state
        obs, reward, is_done, next_state, state = env.step(action)
        env.render_text(obs, reward)

        # TD(0) value-function update.
        # The bounds are checked explicitly instead of catching IndexError:
        # NumPy only raises for indices past the end, while a coordinate of -1
        # silently wraps to the last row/column and would bootstrap from an
        # unrelated cell.
        next_in_map = (
            0 <= next_state[0] < env.map_size[0]
            and 0 <= next_state[1] < env.map_size[1]
        )
        current_value = env.V[state[0], state[1]]
        if next_in_map:
            bootstrap_value = env.V[next_state[0], next_state[1]]
        else:
            # Agent left the map: there is no table entry for the next state,
            # so bootstrap from the current cell's own value.
            bootstrap_value = current_value
        env.V[state[0], state[1]] = \
            current_value + env.alpha * (reward + env.gamma * bootstrap_value - current_value)

        if is_done:
            break
env.close()
plt.close()