📘 **Note Format Guide**

This format serves as a structured guide for organizing lecture content, personal interpretation, experiments, and study-related questions.

| Type | What It Means | When I Use It |
|------|----------------|----------------|
| 📝 Lecture | Original material from the professor’s notes | When I’m referencing core concepts or provided code |
| 🗣️ In-Class Note | Verbal explanations shared during the lecture | When I want to record something the professor said in class but didn’t include in the official notes |
| ✍️ My Note | My thoughts, interpretations, or additional explanations | When I reflect on or explain something in my own words |
| 🔬 Experiment | Code I tried out or changed to explore further | When I test variations or go beyond the original example |
| ❓ Question | Questions I had while studying | When I want to revisit or research something more deeply |

📝
🗣️
✍️
🔬
❓

# 1. 강의노트 원본 및 영상 링크

[https://guebin.github.io/DL2025/posts/13wk-2.html](https://guebin.github.io/DL2025/posts/13wk-2.html)

# 2. Imports 📝

In [1]:
import gymnasium as gym
#---#
import numpy as np
import collections
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import IPython

# 3. Bandit 환경 설계 및 풀이 📝

## A. 대충 개념만 실습

In [2]:
action_space = [0,1] 
actions_deque = collections.deque(maxlen=500)
rewards_deque =  collections.deque(maxlen=500)
#---#

In [3]:
# Exploration phase: press a random button ten times and log each outcome.
for _ in range(10):
    action = np.random.choice(action_space)
    # Button 1 pays 10; any other button pays 1.
    reward = 10 if action == 1 else 1
    actions_deque.append(action)
    rewards_deque.append(reward)

In [4]:
actions_deque

deque([np.int64(1),
       np.int64(0),
       np.int64(0),
       np.int64(1),
       np.int64(0),
       np.int64(0),
       np.int64(0),
       np.int64(1),
       np.int64(1),
       np.int64(0)],
      maxlen=500)

In [5]:
rewards_deque

deque([10, 1, 1, 10, 1, 1, 1, 10, 10, 1], maxlen=500)

In [6]:
actions_numpy = np.array(actions_deque)
rewards_numpy = np.array(rewards_deque)

In [7]:
q0 = rewards_numpy[actions_numpy == 0].mean()
q1 = rewards_numpy[actions_numpy == 1].mean()
q_table = np.array([q0,q1])
q_table

array([ 1., 10.])

In [8]:
action = q_table.argmax()

In [9]:
# Exploitation phase: always press the button with the highest estimated
# value, then re-estimate both values from the complete history.
for _ in range(5):
    action = q_table.argmax()
    reward = 10 if action == 1 else 1
    actions_deque.append(action)
    rewards_deque.append(reward)
    # Array copies of the history so boolean masks can select per-button rewards.
    actions_numpy, rewards_numpy = np.array(actions_deque), np.array(rewards_deque)
    q0 = rewards_numpy[actions_numpy == 0].mean()  # mean reward of button 0
    q1 = rewards_numpy[actions_numpy == 1].mean()  # mean reward of button 1
    q_table = np.array([q0, q1])

In [10]:
actions_numpy

array([1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1])

In [11]:
rewards_numpy

array([10,  1,  1, 10,  1,  1,  1, 10, 10,  1, 10, 10, 10, 10, 10])

## B. 클래스를 이용한 구현 

In [12]:
class Bandit:
    """Two-button bandit with fixed payouts.

    Action 0 pays 1; every other action pays 10.  The most recent payout is
    also kept on ``self.reward``.
    """
    def __init__(self):
        self.reward = None  # nothing has been played yet
    def step(self, action):
        """Play ``action``, remember the payout and return it."""
        self.reward = 1 if action == 0 else 10
        return self.reward

In [16]:
class Agent:
    """Outline of the bandit agent; every method is still a placeholder."""
    def __init__(self):
        pass 
    def act(self):
        # Plan: with fewer than 20 experiences --> take a random action;
        # with more --> action = q_table.argmax()
        pass 
    def save_experience(self):
        # Plan: store the observed data
        pass 
    def learn(self):
        # Plan: update q_table
        pass

🗣️(

에이전트(=플레이어) 액션 --> 보상, 다음상태

In [14]:
env = Bandit()

In [15]:
env.step(1)

10

In [17]:
class Agent:
    """Bandit agent, first step: only action selection is implemented."""
    def __init__(self):
        self.action = None
        self.action_space = [0,1]
        self.q_table = None
        # Count of stored experiences; act() reads it to decide when to stop
        # exploring, so it has to exist on the instance.
        self.n_experience = 0
    def act(self):
        """Random action for the first 20 experiences, greedy action afterwards."""
        if self.n_experience < 20:
            self.action = np.random.choice(self.action_space)
        else:
            self.action = self.q_table.argmax()
    def save_experience(self):
        # Placeholder: store the observed data
        pass
    def learn(self):
        # Placeholder: update q_table
        pass

- 아래를 먼저 채우고 그에 맞춰 위의 `__init__`을 채움

In [19]:
class Agent:
    """Bandit agent with action selection only; storing and learning are stubs."""
    def __init__(self):
        self.action = None
        self.action_space = [0,1]
        self.q_table = None
        self.n_experience = 0  # number of stored experiences
    def act(self):
        """Store the next action in ``self.action``: random below 20 experiences, else greedy."""
        exploring = self.n_experience < 20
        self.action = (
            np.random.choice(self.action_space) if exploring else self.q_table.argmax()
        )
    def save_experience(self):
        # Placeholder: store the observed data
        return None
    def learn(self):
        # Placeholder: update q_table
        return None

In [21]:
class Agent:
    """Bandit agent that records its history and estimates per-button values."""
    def __init__(self):
        # Latest action and the reward received for it.
        self.action = None
        self.reward = None
        # History of actions/rewards, capped at 500 entries each.
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.action_space = [0,1]
        self.q_table = None
        self.n_experience = 0
    def act(self):
        """Random action for the first 20 experiences, greedy action afterwards."""
        if self.n_experience < 20:
            self.action = np.random.choice(self.action_space)
        else:
            self.action = self.q_table.argmax()
    def save_experience(self):
        """Append the current action/reward to the history."""
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.n_experience = self.n_experience + 1
    def learn(self):
        """Recompute ``q_table`` as the mean observed reward of each button."""
        # Boolean masks need arrays, so convert the instance's deques first.
        actions = np.array(self.actions)
        rewards = np.array(self.rewards)
        q0 = rewards[actions == 0].mean() # mean reward observed for action 0
        q1 = rewards[actions == 1].mean() # mean reward observed for action 1
        self.q_table = np.array([q0, q1])

In [22]:
class Agent:
    """Explore-then-exploit agent for the two-button bandit.

    The first 20 actions are random; after that the agent presses the button
    whose average observed reward is highest.
    """
    def __init__(self):
        # Latest action and the reward received for it.
        self.action = None
        self.reward = None
        # History of actions/rewards, capped at 500 entries each.
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.action_space = [0,1]
        self.q_table = None  # stays None until learn() sees 20 experiences
        self.n_experience = 0
    def act(self):
        """Choose the next action and store it in ``self.action``."""
        if self.n_experience < 20:
            self.action = np.random.choice(self.action_space)
            return
        self.action = self.q_table.argmax()
    def save_experience(self):
        """Append the current action/reward to the history."""
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.n_experience += 1
    def learn(self):
        """Refresh ``q_table`` once at least 20 experiences are stored."""
        if self.n_experience < 20:
            return
        past_actions = np.array(self.actions)
        past_rewards = np.array(self.rewards)
        # Mean observed reward of button 0 and of button 1.
        per_button = [past_rewards[past_actions == b].mean() for b in (0, 1)]
        self.q_table = np.array(per_button)

In [23]:
env = Bandit()
player = Agent()

In [24]:
player.act()

In [25]:
player.action

np.int64(1)

- 행동을 하기는 하나, 어떤 행동을 했는지 바로 확인하기가 직관적이지 않음

In [26]:
class Agent:
    """Explore-then-exploit bandit agent that announces every action it takes."""
    def __init__(self):
        # Latest action and the reward received for it.
        self.action = None
        self.reward = None
        # History of actions/rewards, capped at 500 entries each.
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.action_space = [0,1]
        self.q_table = None  # stays None until learn() sees 20 experiences
        self.n_experience = 0
    def act(self):
        """Choose the next action, store it in ``self.action`` and print it."""
        exploring = self.n_experience < 20
        self.action = (
            np.random.choice(self.action_space) if exploring else self.q_table.argmax()
        )
        print(f"버튼{self.action}누름!")
    def save_experience(self):
        """Append the current action/reward to the history."""
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.n_experience += 1
    def learn(self):
        """Refresh ``q_table`` once at least 20 experiences are stored."""
        if self.n_experience < 20:
            return
        past_actions = np.array(self.actions)
        past_rewards = np.array(self.rewards)
        # Mean observed reward of button 0 and of button 1.
        per_button = [past_rewards[past_actions == b].mean() for b in (0, 1)]
        self.q_table = np.array(per_button)

In [27]:
env = Bandit()
player = Agent()

In [28]:
player.act()

버튼1누름!


In [29]:
player.action

np.int64(1)

- 실질적 동작

In [31]:
player.act()
print(player.action)

버튼1누름!
1


In [32]:
player.act()
env.step(player.action)

버튼1누름!


10

In [34]:
player.act()
player.reward = env.step(player.action)
print(player.action, player.reward)

버튼0누름!
0 1


In [35]:
player.actions # history가 있을 줄 알았는데 없음

deque([], maxlen=500)

In [36]:
player.act()
player.reward = env.step(player.action)
player.save_experience()
print(player.action, player.reward)

버튼0누름!
0 1


In [37]:
player.actions

deque([np.int64(0)], maxlen=500)

In [38]:
player.rewards

deque([1], maxlen=500)

- 저장된 것을 바탕으로 q_table을 만들어야 함

In [40]:
player.act()
player.reward = env.step(player.action)
player.save_experience() # 데이터를 저장
player.learn() # 저장된 데이터를 학습

버튼1누름!


In [41]:
player.actions

deque([np.int64(0), np.int64(1), np.int64(1)], maxlen=500)

In [42]:
player.rewards

deque([1, 10, 10], maxlen=500)

In [43]:
player.q_table # 없는 이유: n_experience < 20

In [44]:
env = Bandit()
player = Agent()

In [46]:
for _ in range(19):
    player.act()
    player.reward = env.step(player.action)
    player.save_experience() # 데이터를 저장
    player.learn() # 저장된 데이터를 학습

버튼0누름!
버튼1누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼0누름!


In [47]:
player.actions

deque([np.int64(0),
       np.int64(1),
       np.int64(0),
       np.int64(1),
       np.int64(0),
       np.int64(0),
       np.int64(1),
       np.int64(0),
       np.int64(0),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(0),
       np.int64(0),
       np.int64(1),
       np.int64(1),
       np.int64(0),
       np.int64(0),
       np.int64(0)],
      maxlen=500)

In [48]:
player.rewards

deque([1, 10, 1, 10, 1, 1, 10, 1, 1, 10, 10, 10, 1, 1, 10, 10, 1, 1, 1],
      maxlen=500)

In [49]:
player.q_table # 없는 이유: n_experience < 20

In [55]:
env = Bandit()
player = Agent()

In [56]:
for _ in range(40):
    player.act()
    player.reward = env.step(player.action)
    player.save_experience() # 데이터를 저장
    player.learn() # 저장된 데이터를 학습

버튼0누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼0누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!


In [57]:
player.actions

deque([np.int64(0),
       np.int64(1),
       np.int64(0),
       np.int64(0),
       np.int64(1),
       np.int64(0),
       np.int64(1),
       np.int64(0),
       np.int64(0),
       np.int64(0),
       np.int64(1),
       np.int64(1),
       np.int64(0),
       np.int64(0),
       np.int64(0),
       np.int64(0),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(0),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1),
       np.int64(1)],
      maxlen=500)

In [58]:
player.rewards

deque([1,
       10,
       1,
       1,
       10,
       1,
       10,
       1,
       1,
       1,
       10,
       10,
       1,
       1,
       1,
       1,
       10,
       10,
       10,
       1,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10,
       10],
      maxlen=500)

In [59]:
player.q_table

array([ 1., 10.])

- 게임 종료 조건이 필요할 것 같음

In [60]:
np.array(player.rewards)[-20:] # the 20 most recent rewards

array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10])

In [61]:
np.array(player.rewards)[-20:].mean() > 9.5

np.True_

In [78]:
env = Bandit()
player = Agent()

In [79]:
# Play up to 100 rounds; stop once the mean of the last (up to) 20 rewards exceeds 9.5.
for _ in range(100):
    player.act()
    player.reward = env.step(player.action)
    player.save_experience()  # store the data
    player.learn()  # learn from the stored data
    if np.mean(list(player.rewards)[-20:]) > 9.5:
        print("---게임클리어---")
        break

버튼0누름!
버튼1누름!
버튼1누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼1누름!
버튼0누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
---게임클리어---


In [63]:
# Same loop again: the stop test runs even when fewer than 20 rewards exist.
for _ in range(100):
    player.act()
    player.reward = env.step(player.action)
    player.save_experience()  # store the data
    player.learn()  # learn from the stored data
    if np.mean(list(player.rewards)[-20:]) > 9.5:
        print("---게임클리어---")
        break

버튼1누름!
---게임클리어---


- 위와 같은 경우를 피하기 위해서

In [82]:
env = Bandit()
player = Agent()

In [83]:
# Play up to 100 rounds; the stop test is only evaluated once 20 experiences exist.
for _ in range(100):
    player.act()
    player.reward = env.step(player.action)
    player.save_experience()  # store the data
    player.learn()  # learn from the stored data
    if player.n_experience >= 20 and np.mean(list(player.rewards)[-20:]) > 9.5:
        print("---게임클리어---")
        break

버튼1누름!
버튼1누름!
버튼1누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
---게임클리어---


- 정리

```python
env, player = Bandit(), Agent()
for _ in range(100):
    # step 1: the agent picks an action
    player.act()
    # step 2: the environment turns the action into a reward
    player.reward = env.step(player.action)
    # step 3: the agent stores the data and learns from it
    player.save_experience()
    player.learn()
    # --- decide whether training is finished --- #
    if player.n_experience >= 20 and np.mean(list(player.rewards)[-20:]) > 9.5:
        print("---게임클리어---")
        break
```

)🗣️

---

In [49]:
class Agent:
    """Bandit player: 20 random presses, then always the best-looking button.

    Each action is printed as it is chosen.
    """
    def __init__(self):
        # Latest action and the reward received for it.
        self.action = None
        self.reward = None
        # History of actions/rewards, capped at 500 entries each.
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.action_space = [0,1]
        self.q_table = None  # stays None until learn() sees 20 experiences
        self.n_experience = 0
    def act(self):
        """Pick the next action (random while exploring, greedy afterwards) and print it."""
        if self.n_experience >= 20:
            self.action = self.q_table.argmax()
        else:
            self.action = np.random.choice(self.action_space)
        print(f"버튼{self.action}누름!")
    def save_experience(self):
        """Record the current action/reward pair and bump the experience count."""
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.n_experience += 1
    def learn(self):
        """Re-estimate ``q_table`` from the history; does nothing below 20 experiences."""
        if self.n_experience >= 20:
            history_a = np.array(self.actions)
            history_r = np.array(self.rewards)
            means = []
            for button in (0, 1):
                # mean reward observed after pressing this button
                means.append(history_r[history_a == button].mean())
            self.q_table = np.array(means)

In [50]:
env, player = Bandit(), Agent()
for _ in range(100):
    # step 1: the agent picks an action
    player.act()
    # step 2: the environment turns the action into a reward
    player.reward = env.step(player.action)
    # step 3: the agent stores the data and learns from it
    player.save_experience()
    player.learn()
    # --- decide whether training is finished --- #
    if player.n_experience >= 20 and np.mean(list(player.rewards)[-20:]) > 9.5:
        print("---게임클리어---")
        break

버튼1누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼1누름!
버튼0누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼0누름!
버튼0누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
버튼1누름!
---게임클리어---


# 4. 예비학습: `gym.spaces` 📝

ref: <https://gymnasium.farama.org/>

🗣️(

- 통계X, 딥러닝X 내용 (패키지 관련)
    - 딥러닝 = torch
    - 강화학습 = gym

```python
action_space = [0,1]
```

- 위를 아래와 같이 할 수도 있음

In [86]:
action_space = gym.spaces.Discrete(2) # action_space = [0,1]

In [87]:
action_space

Discrete(2)

- 주요 기능

In [88]:
action_space.sample()

np.int64(0)

In [89]:
0 in action_space

True

In [90]:
2 in action_space

False

)🗣️

`-` 예시1

In [51]:
action_space = gym.spaces.Discrete(4) 
action_space 

Discrete(4)

In [52]:
[action_space.sample() for _ in range(5)]

[0, 1, 3, 2, 3]

In [53]:
0 in action_space

True

In [54]:
4 in action_space

False

🗣️(

- 경우에 따라 State와 Action을 2차원으로 설정도 가능

[[0,0], [0,1], [0,2], ... , [3,3]]은 다음과 같은 코드로 만들 수 있음

In [91]:
gym.spaces.MultiDiscrete([4,4])

MultiDiscrete([4 4])

In [92]:
state_space = gym.spaces.MultiDiscrete([4,4])

In [93]:
state_space.sample() # numpy array로 뽑힘

array([3, 1])

)🗣️

`-` 예시2

In [55]:
state_space = gym.spaces.MultiDiscrete([4,4])
state_space

MultiDiscrete([4 4])

In [56]:
[state_space.sample() for _ in range(5)]

[array([1, 3]), array([2, 0]), array([1, 2]), array([0, 2]), array([2, 0])]

In [57]:
np.array([0,1]) in state_space

True

In [58]:
np.array([3,3]) in state_space

True

In [59]:
np.array([3,4]) in state_space

False

# 5. 4x4 Grid World 게임 설명 📝

## A. 게임설명 

`-` 문제설명: 4x4 그리드월드에서 상하좌우로 움직이는 에이전트가 목표점에 도달하도록 하는 게임

- 백문이 불여일견: <https://claude.ai/public/artifacts/76e13820-2b51-4e7e-a514-00190de17c45> (출처: 클로드)

🗣️(

- 에이전트 환경
- 에이전트 행동 - 상하좌우로 이동 --> 4개의 행동 = 0,1,2,3
- 환경은 보상을 줌 -> -1, -10, +100 중 하나를 줌
    - -1: 격자 안에 에이전트가 있음 & 에이전트의 위치가 (3,3)이 아닐 때
    - +100: (격자 안에 에이전트가 있음 &) 에이전트의 위치가 (3,3)일 때
    - -10: 에이전트가 격자 안에 있지 않음
- 에이전트 <---> 환경
    - 에이전트 --(action)--> 환경
    - 에이전트 <--(reward, state)-- 환경

)🗣️

`-` GridWorld에서 사용되는 주요변수

1. **`State`**: 각 격자 셀이 하나의 상태이며, 에이전트는 이러한 상태 중 하나에 있을 수 있음. 
2. **`Action`**: 에이전트는 현재상태에서 다음상태로 이동하기 위해 상,하,좌,우 중 하나의 행동을 취할 수 있음. 
3. **`Reward`**: 에이전트가 현재상태에서 특정 action을 하면 얻어지는 보상.
4. **`Terminated`**: 하나의 에피소드가 종료되었음을 나타내는 상태.

## B. 시각화 

In [2]:
def show(states):
    """Render a trajectory on the 4x4 grid as an inline JavaScript animation.

    Parameters
    ----------
    states : sequence of (row, col) pairs
        One entry per animation frame.  An entry outside the 4x4 grid is
        drawn half a cell beyond the border it crossed.

    The input sequence is not modified.  The marker position is derived from
    ``states[t]`` each time the frame callback runs, so drawing a frame more
    than once, or calling ``show`` again with the same list, gives the same
    picture.  Relies on ``display`` being available (IPython/Jupyter).
    """
    fig = plt.Figure()
    ax = fig.subplots()
    ax.matshow(np.zeros([4,4]), cmap='bwr',alpha=0.0)
    sc = ax.scatter(0, 0, color='red', s=500)
    ax.text(0, 0, 'start', ha='center', va='center')
    ax.text(3, 3, 'end', ha='center', va='center')
    # Minor ticks at the cell borders give the grid lines.
    ax.set_xticks(np.arange(-.5, 4, 1), minor=True)
    ax.set_yticks(np.arange(-.5, 4, 1), minor=True)
    ax.grid(which='minor', color='black', linestyle='-', linewidth=2)
    state_space = gym.spaces.MultiDiscrete([4,4])
    def update(t):
        s1,s2 = states[t]
        if states[t] not in state_space:
            # Off-grid: pull the marker back half a cell so it sits on the border.
            s1 = s1 + 0.5 if s1 < 0 else (s1 - 0.5 if s1 > 3 else s1)
            s2 = s2 + 0.5 if s2 < 0 else (s2 - 0.5 if s2 > 3 else s2)
        # The scatter offset is (x, y), i.e. (column, row), hence the swap.
        sc.set_offsets([s2,s1])
    ani = FuncAnimation(fig,update,frames=len(states))
    display(IPython.display.HTML(ani.to_jshtml()))

🗣️(

- 위 코드는 공부할 필요 없고, 사용 방법을 알면 됨

In [3]:
show(
    [[0,0],[0,1]]
)

In [4]:
show(
    [[0,0],[0,1],[0,2],[0,3],[0,4]]
) # 밖으로 나감

)🗣️

In [5]:
show([[0,0],[1,0],[2,0],[3,0],[4,0]]) # show 사용방법

# 6. 4x4 Grid World 환경 구현 📝

🗣️(

In [99]:
action_space = gym.spaces.Discrete(4)
action_space

Discrete(4)

In [100]:
action_space.sample()

np.int64(1)

In [101]:
state_space = gym.spaces.MultiDiscrete([4,4])
state_space

MultiDiscrete([4 4])

In [102]:
state_space.sample()

array([3, 2])

In [103]:
# state가 하나 있다고 하면
state = np.array([1,1]) # state_space.sample()
state

array([1, 1])

In [105]:
state = state + np.array([0,1]) # action
state

array([1, 2])

In [106]:
state = state + np.array([-1,0]) # action
state

array([0, 2])

- action은 총 4가지
    - 0, 1, 2, 3
    - 정의 순서는 상관 없음 

In [107]:
a2d = {0:np.array([0,1]), 1:np.array([0,-1]), 2:np.array([1,0]), 3:np.array([-1,0])}
a2d[0] # 0번 행동 (action)

array([0, 1])

In [108]:
# state를 다음과 같이 업데이트하면 됨
action = 0
state = state + a2d[action]
state

array([0, 3])

In [109]:
action = 2
state = state + a2d[action]
state

array([1, 3])

- class에 저장

In [119]:
class GridWorld:
    """Minimal grid world: tracks a position and moves it by one cell per step."""
    def __init__(self):
        # action id -> displacement added to the state
        self.a2d = {0:np.array([0,1]), 1:np.array([0,-1]), 2:np.array([1,0]), 3:np.array([-1,0])}
        self.state = np.array([0,0]) # initial state
    def step(self, action):
        """Move the state by the displacement of ``action``."""
        # Use the instance's own table rather than a notebook-level ``a2d``.
        self.state = self.state + self.a2d[action]

In [120]:
env = GridWorld()

In [121]:
env.state

array([0, 0])

In [122]:
env.step(0)

In [123]:
env.state

array([0, 1])

- 보상

In [124]:
class GridWorld:
    """Minimal grid world: tracks a position; rewards are not implemented yet."""
    def __init__(self):
        # action id -> displacement added to the state
        self.a2d = {0:np.array([0,1]), 1:np.array([0,-1]), 2:np.array([1,0]), 3:np.array([-1,0])}
        self.state = np.array([0,0]) # initial state
    def step(self, action):
        """Move the state by the displacement of ``action``."""
        # Use the instance's own table rather than a notebook-level ``a2d``.
        self.state = self.state + self.a2d[action]
        # Planned reward scheme (not implemented yet):
        # state == (3,3)                            --> +100
        # state != (3,3) but state in state_space   --> -1
        # state not in state_space                  --> -10

In [125]:
env.state

array([0, 1])

In [126]:
s1,s2 = env.state

In [127]:
s1, s2, env.state

(np.int64(0), np.int64(1), array([0, 1]))

```python
class GridWorld:
    # Work-in-progress draft: the reward logic in step() is only partly
    # written (the elif below has no condition yet).
    def __init__(self):
        self.a2d = {0:np.array([0,1]), 1:np.array([0,-1]), 2:np.array([1,0]), 3:np.array([-1,0])}
        self.state = np.array([0,0]) # initial state
        self.reward = None
        self.terminated = False
    def step(self, action):
        self.state = self.state + a2d[action]
        # Rewards to hand out:
        # state == (3,3) --> +100
        if (s1==3) and (s2==3):
            self.reward = 100
            self.terminated = True
        elif 
        # state != (3,3) but state in state_space --> -1
        # state not in state_space --> -10
```

```python
class GridWorld:
    """Draft grid world: goal and in-grid rewards done, off-grid case still open."""
    def __init__(self):
        # action id -> displacement added to the state
        self.a2d = {0:np.array([0,1]), 1:np.array([0,-1]), 2:np.array([1,0]), 3:np.array([-1,0])}
        self.state_space = gym.spaces.MultiDiscrete([4,4])
        self.state = np.array([0,0]) # initial state
        self.reward = None
        self.terminated = False
    def step(self, action):
        """Move by ``action`` and update ``reward``/``terminated`` for the new state."""
        # Use the instance's own table rather than a notebook-level ``a2d``.
        self.state = self.state + self.a2d[action]
        # Unpack the fresh position so the goal test sees this step's state.
        s1,s2 = self.state
        # state == (3,3) --> +100 and the episode ends
        if (s1==3) and (s2==3):
            self.reward = 100
            self.terminated = True
        # state != (3,3) but inside the grid --> -1
        elif self.state in self.state_space:
            self.reward = -1
            self.terminated = False
        # Still to do: state not in state_space --> -10
```

```python
class GridWorld:
    """4x4 grid world: +100 at (3,3), -1 elsewhere in the grid, -10 off the grid."""
    def __init__(self):
        # action id -> displacement added to the state
        self.a2d = {0:np.array([0,1]), 1:np.array([0,-1]), 2:np.array([1,0]), 3:np.array([-1,0])}
        self.state_space = gym.spaces.MultiDiscrete([4,4])
        self.state = np.array([0,0]) # initial state
        self.reward = None
        self.terminated = False
    def step(self, action):
        """Move by ``action``; return ``(state, reward, terminated)``."""
        # Use the instance's own table rather than a notebook-level ``a2d``.
        self.state = self.state + self.a2d[action]
        # Unpack the fresh position so the goal test sees this step's state.
        s1,s2 = self.state
        # state == (3,3) --> +100 and the episode ends
        if (s1==3) and (s2==3):
            self.reward = 100
            self.terminated = True
        # state != (3,3) but inside the grid --> -1
        elif self.state in self.state_space:
            self.reward = -1
            self.terminated = False
        # state outside the grid --> -10 and the episode ends
        else:
            self.reward = -10
            self.terminated = True
        return self.state, self.reward, self.terminated
```

In [159]:
class GridWorld:
    """4x4 grid world: +100 at (3,3), -1 elsewhere in the grid, -10 off the grid."""
    def __init__(self):
        # action id -> displacement added to the state
        self.a2d = {
            0: np.array([0,1]), # →
            1: np.array([0,-1]), # ←  
            2: np.array([1,0]),  # ↓
            3: np.array([-1,0])  # ↑
        }
        self.state_space = gym.spaces.MultiDiscrete([4,4])
        self.state = np.array([0,0])
        self.reward = None
        self.terminated = False
    def step(self, action):
        """Move by ``action``; return ``(state, reward, terminated)``."""
        # Use the instance's own table rather than a notebook-level ``a2d``.
        self.state = self.state + self.a2d[action]
        s1,s2 = self.state
        if (s1==3) and (s2==3):
            # goal reached: episode ends
            self.reward = 100
            self.terminated = True
        elif self.state in self.state_space:
            # ordinary move inside the grid
            self.reward = -1
            self.terminated = False
        else:
            # left the grid: episode ends
            self.reward = -10
            self.terminated = True
        return self.state, self.reward, self.terminated

In [162]:
env = GridWorld()

In [169]:
action_space = gym.spaces.Discrete(4)
action = action_space.sample()
env.step(action) # 다음 상태, 보상 값, 종료 여부

(array([-2, -1]), -10, True)

- 지금 구현 X: 게임 재시작 시 상태 초기화

In [170]:
env = GridWorld()

In [171]:
# The action space never changes, so build it once instead of on every iteration.
action_space = gym.spaces.Discrete(4)
for _ in range(3):
    action = action_space.sample()
    print(env.step(action))

(array([ 0, -1]), -10, True)
(array([ 0, -2]), -10, True)
(array([ 0, -3]), -10, True)


In [172]:
env = GridWorld()

In [173]:
# The action space never changes, so build it once instead of on every iteration.
action_space = gym.spaces.Discrete(4)
for _ in range(10):
    action = action_space.sample()
    state, reward, terminated = env.step(action)
    print(state, reward, terminated)
    if env.terminated == True:
        # Episode over: put the agent back on the start cell by hand.
        env.state = np.array([0,0])
        break

[0 1] -1 False
[0 2] -1 False
[0 3] -1 False
[0 2] -1 False
[0 1] -1 False
[0 2] -1 False
[-1  2] -10 True


In [202]:
class GridWorld:
    """4x4 grid world with reset: +100 at (3,3), -1 in the grid, -10 off the grid."""
    def __init__(self):
        # action id -> displacement added to the state
        self.a2d = {
            0: np.array([0,1]), # →
            1: np.array([0,-1]), # ←  
            2: np.array([1,0]),  # ↓
            3: np.array([-1,0])  # ↑
        }
        self.state_space = gym.spaces.MultiDiscrete([4,4])
        self.state = np.array([0,0])
        self.reward = None
        self.terminated = False
    def step(self, action):
        """Move by ``action``; return ``(state, reward, terminated)``."""
        # Use the instance's own table rather than a notebook-level ``a2d``.
        self.state = self.state + self.a2d[action]
        s1,s2 = self.state
        if (s1==3) and (s2==3):
            # goal reached: episode ends
            self.reward = 100
            self.terminated = True
        elif self.state in self.state_space:
            # ordinary move inside the grid
            self.reward = -1
            self.terminated = False
        else:
            # left the grid: episode ends
            self.reward = -10
            self.terminated = True
        return self.state, self.reward, self.terminated
    def reset(self):
        """Return to the start cell, clear ``terminated`` and return the state."""
        self.state = np.array([0,0])
        self.terminated = False
        return self.state

In [206]:
env = GridWorld()

In [207]:
# The action space never changes, so build it once instead of on every iteration.
action_space = gym.spaces.Discrete(4)
for _ in range(10):
    action = action_space.sample()
    state, reward, terminated = env.step(action)
    print(state, reward, terminated)
    if env.terminated == True:
        env.reset()
        break

[0 1] -1 False
[-1  1] -10 True


- 에피소드 별로 게임이 진행되는 것을 확인 가능

- 프린트하는 코드 대체

In [208]:
class GridWorld:
    """4x4 grid world that logs every transition it performs.

    Rewards: +100 at (3,3), -1 elsewhere in the grid, -10 off the grid.
    """
    def __init__(self):
        # action id -> displacement added to the state
        self.a2d = {
            0: np.array([0,1]), # →
            1: np.array([0,-1]), # ←  
            2: np.array([1,0]),  # ↓
            3: np.array([-1,0])  # ↑
        }
        self.state_space = gym.spaces.MultiDiscrete([4,4])
        self.state = np.array([0,0])
        self.reward = None
        self.terminated = False
    def step(self, action):
        """Move by ``action``, print the transition, return ``(state, reward, terminated)``."""
        # Use the instance's own table rather than a notebook-level ``a2d``.
        self.state = self.state + self.a2d[action]
        s1,s2 = self.state
        if (s1==3) and (s2==3):
            # goal reached: episode ends
            self.reward = 100
            self.terminated = True
        elif self.state in self.state_space:
            # ordinary move inside the grid
            self.reward = -1
            self.terminated = False
        else:
            # left the grid: episode ends
            self.reward = -10
            self.terminated = True
        # The previous state is recovered by undoing the displacement just applied.
        print(
            f"action = {action}\t"
            f"state = {self.state - self.a2d[action]} -> {self.state}\t"
            f"reward = {self.reward}\t"
            f"termiated = {self.terminated}"
        )
        return self.state, self.reward, self.terminated
    def reset(self):
        """Return to the start cell, clear ``terminated`` and return the state."""
        self.state = np.array([0,0])
        self.terminated = False
        return self.state

In [299]:
env = GridWorld()

In [300]:
# Build the action space once; it does not change between iterations.
action_space = gym.spaces.Discrete(4)
for _ in range(50):
    action = action_space.sample()
    env.step(action)
    if env.terminated == True:
        env.reset()
        break

action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 0	state = [1 0] -> [1 1]	reward = -1	termiated = False
action = 1	state = [1 1] -> [1 0]	reward = -1	termiated = False
action = 2	state = [1 0] -> [2 0]	reward = -1	termiated = False
action = 0	state = [2 0] -> [2 1]	reward = -1	termiated = False
action = 0	state = [2 1] -> [2 2]	reward = -1	termiated = False
action = 0	state = [2 2] -> [2 3]	reward = -1	termiated = False
action = 2	state = [2 3] -> [3 3]	reward = 100	termiated = True


- 위와 같이 100점을 받았을 때 에이전트가 그 행동을 반복하려면 환경을 만들고 전달해야 함
    - 일단 환경은 만들었음

)🗣️

In [8]:
class GridWorld:
    """4x4 grid world environment that logs every transition.

    Rewards: +100 on reaching (3,3), -1 for any other cell inside the grid,
    -10 for leaving the grid.  Reaching the goal or leaving the grid sets
    ``terminated``.
    """
    def __init__(self):
        # action id -> displacement added to the state
        self.a2d = {
            0: np.array([0,1]), # →
            1: np.array([0,-1]), # ←  
            2: np.array([1,0]),  # ↓
            3: np.array([-1,0])  # ↑
        }
        self.state_space = gym.spaces.MultiDiscrete([4,4])
        self.state = np.array([0,0])
        self.reward = None
        self.terminated = False
    def step(self,action):
        """Apply ``action``, print the transition, return ``(state, reward, terminated)``."""
        self.state = self.state + self.a2d[action]
        row, col = self.state
        at_goal = (row == 3) and (col == 3)
        if at_goal:
            self.reward, self.terminated = 100, True
        elif self.state in self.state_space:
            self.reward, self.terminated = -1, False
        else:
            # the move left the grid
            self.reward, self.terminated = -10, True
        # The previous state is recovered by undoing the displacement just applied.
        print(
            f"action = {action}\t"
            f"state = {self.state - self.a2d[action]} -> {self.state}\t"
            f"reward = {self.reward}\t"
            f"termiated = {self.terminated}"
        )
        return self.state, self.reward, self.terminated
    def reset(self):
        """Put the agent back on (0,0), clear ``terminated`` and return the state."""
        self.terminated = False
        self.state = np.array([0,0])
        return self.state

In [302]:
env = GridWorld()

In [303]:
# Random walk: sample actions until the episode terminates (at most 50 steps).
action_space = gym.spaces.Discrete(4)
for _ in range(50):
    action = action_space.sample()
    env.step(action)
    if not (env.terminated == True):
        continue
    env.reset()
    break

action = 1	state = [0 0] -> [ 0 -1]	reward = -10	termiated = True


# 7. "에이전트 $\Leftrightarrow$ 환경" 상호작용 구현 📝

`-` 우리가 구현하고 싶은 기능 

- `.act()`: 액션을 결정 --> 여기서는 그냥 랜덤액션 
- `.save_experience()`: 데이터를 저장 --> 여기에 일단 초점을 맞추자
- `.learn()`: 데이터로부터 학습 --> 패스 

🗣️(

- 랜덤액션이기 때문에 일단 save_experience()에만 초점을 둠
- 위의 Agent 코드 참고하여 현재 상황에 맞게 수정하면

```python
class Agent:
    """Random grid-world agent that records its actions and rewards."""
    def __init__(self):
        # Latest action and the reward received for it.
        self.action = None
        self.reward = None
        # History of actions/rewards, capped at 500 entries each.
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.action_space = gym.spaces.Discrete(4)
        self.n_experience = 0
    def act(self):
        """Sample an action from the action space into ``self.action``."""
        self.action = self.action_space.sample()
    def save_experience(self):
        """Append the current action/reward to the history."""
        for log, value in ((self.actions, self.action), (self.rewards, self.reward)):
            log.append(value)
        self.n_experience += 1
    def learn(self):
        """No learning yet."""
        return None
```

- state 관련 변수 필요
- state, action, reward, next_state, terminated 5개 외우면 편함 SARST
- 참고) SARSA: 알고리즘 이름
    - S: current state
    - A: action
    - R: reward (from env)
    - S: next state (from env)
    - A: (current state = next state) action
    - 반복

In [6]:
class RandomAgent:
    """Agent that acts at random and records full transitions.

    One transition is (state, action, reward, next_state, terminated),
    abbreviated SARST in these notes.
    """

    def __init__(self):
        # Current transition (SARST); nothing has been observed yet.
        self.state = self.action = self.reward = None
        self.next_state = self.terminated = None
        # One bounded history per transition field (latest 500 entries each).
        self.states = collections.deque(maxlen=500)
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.next_states = collections.deque(maxlen=500)
        self.terminations = collections.deque(maxlen=500)
        # Four discrete actions to sample from.
        self.action_space = gym.spaces.Discrete(4)
        # Number of save_experience() calls made so far.
        self.n_experience = 0

    def act(self):
        """Sample an action from ``action_space`` into ``self.action``."""
        self.action = self.action_space.sample()

    def save_experience(self):
        """Append the current transition to the histories and count it."""
        transition = (
            (self.states, self.state),
            (self.actions, self.action),
            (self.rewards, self.reward),
            (self.next_states, self.next_state),
            (self.terminations, self.terminated),
        )
        for history, value in transition:
            history.append(value)
        self.n_experience += 1

    def learn(self):
        """Do nothing; the random agent does not learn."""
        pass

In [9]:
# Fresh agent and environment for the interaction loops that follow.
player = RandomAgent()
env = GridWorld()

In [10]:
# Run one episode (at most 50 steps) with the random agent.
# The agent has to know its starting state before the first transition is
# saved; without this, the first entry appended to player.states is None.
player.state = env.reset()
for t in range(50):
    # step1 -- the agent chooses an action
    player.act()
    # step2 -- the environment reacts with next_state, reward, terminated
    player.next_state, player.reward, player.terminated = env.step(player.action)
    # step3 -- the agent stores the transition and learns
    player.save_experience()
    player.learn()  # no-op for the random agent
    # step4 -- the next state becomes the current state
    player.state = player.next_state
    if env.terminated:
        # Leave both environment and agent ready for the next episode.
        player.state = env.reset()
        break

action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 2	state = [1 0] -> [2 0]	reward = -1	termiated = False
action = 0	state = [2 0] -> [2 1]	reward = -1	termiated = False
action = 0	state = [2 1] -> [2 2]	reward = -1	termiated = False
action = 3	state = [2 2] -> [1 2]	reward = -1	termiated = False
action = 2	state = [1 2] -> [2 2]	reward = -1	termiated = False
action = 0	state = [2 2] -> [2 3]	reward = -1	termiated = False
action = 2	state = [2 3] -> [3 3]	reward = 100	termiated = True


- 위는 하나의 에피소드
- 여러 에피소드를 진행하고 싶다면

In [11]:
# Play 19 episodes back to back; each one is capped at 50 steps.
for e in range(1, 20):
    # Every episode starts from the environment's reset state.
    player.state = env.reset()
    for t in range(50):
        # step1 -- the agent chooses an action
        player.act()
        # step2 -- the environment reacts with (next_state, reward, terminated)
        outcome = env.step(player.action)
        player.next_state, player.reward, player.terminated = outcome
        # step3 -- the agent stores the transition and learns (learn is a no-op)
        player.save_experience()
        player.learn()
        # step4 -- the next state becomes the current state
        player.state = player.next_state
        if env.terminated:
            break

action = 0	state = [0 0] -> [0 1]	reward = -1	termiated = False
action = 3	state = [0 1] -> [-1  1]	reward = -10	termiated = True
action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 1	state = [1 0] -> [ 1 -1]	reward = -10	termiated = True
action = 3	state = [0 0] -> [-1  0]	reward = -10	termiated = True
action = 0	state = [0 0] -> [0 1]	reward = -1	termiated = False
action = 2	state = [0 1] -> [1 1]	reward = -1	termiated = False
action = 3	state = [1 1] -> [0 1]	reward = -1	termiated = False
action = 0	state = [0 1] -> [0 2]	reward = -1	termiated = False
action = 2	state = [0 2] -> [1 2]	reward = -1	termiated = False
action = 2	state = [1 2] -> [2 2]	reward = -1	termiated = False
action = 3	state = [2 2] -> [1 2]	reward = -1	termiated = False
action = 0	state = [1 2] -> [1 3]	reward = -1	termiated = False
action = 0	state = [1 3] -> [1 4]	reward = -10	termiated = True
action = 1	state = [0 0] -> [ 0 -1]	reward = -10	termiated = True
action = 0	state = [0 0] -> [0 1

- 구분해서 보기가 어려우므로

In [17]:
# Same 19-episode run, now printing a banner whenever an episode terminates
# so the per-step log lines can be told apart by episode.
for e in range(1, 20):
    # Every episode starts from the environment's reset state.
    player.state = env.reset()
    for t in range(50):
        # step1 -- the agent chooses an action
        player.act()
        # step2 -- the environment reacts with (next_state, reward, terminated)
        outcome = env.step(player.action)
        player.next_state, player.reward, player.terminated = outcome
        # step3 -- the agent stores the transition and learns (learn is a no-op)
        player.save_experience()
        player.learn()
        # step4 -- the next state becomes the current state
        player.state = player.next_state
        if not env.terminated:
            continue
        print(f"---에피소드{e}종료---")
        break

action = 0	state = [0 0] -> [0 1]	reward = -1	termiated = False
action = 1	state = [0 1] -> [0 0]	reward = -1	termiated = False
action = 3	state = [0 0] -> [-1  0]	reward = -10	termiated = True
---에피소드1종료---
action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 1	state = [1 0] -> [ 1 -1]	reward = -10	termiated = True
---에피소드2종료---
action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 2	state = [1 0] -> [2 0]	reward = -1	termiated = False
action = 2	state = [2 0] -> [3 0]	reward = -1	termiated = False
action = 2	state = [3 0] -> [4 0]	reward = -10	termiated = True
---에피소드3종료---
action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 3	state = [1 0] -> [0 0]	reward = -1	termiated = False
action = 3	state = [0 0] -> [-1  0]	reward = -10	termiated = True
---에피소드4종료---
action = 3	state = [0 0] -> [-1  0]	reward = -10	termiated = True
---에피소드5종료---
action = 1	state = [0 0] -> [ 0 -1]	reward = -10	termiated = True
---에피소드6종료---
action = 1

- 학습이 잘 된 경우를 보기 위해 에피소드 별로 score를 계산하면

In [21]:
# Play 19 episodes and record the total reward (score) of each one.
scores = []
for e in range(1, 20):
    score = 0
    # Every episode starts from the environment's reset state.
    player.state = env.reset()
    for t in range(50):
        # step1 -- the agent chooses an action
        player.act()
        # step2 -- the environment reacts with next_state, reward, terminated
        player.next_state, player.reward, player.terminated = env.step(player.action)
        # step3 -- the agent stores the transition and learns
        player.save_experience()
        player.learn()  # no-op for the random agent
        # step4 -- the next state becomes the current state
        player.state = player.next_state
        score += player.reward
        if env.terminated:
            print(f"---에피소드{e}종료---")
            break
    # Record the score whether the episode terminated or hit the 50-step cap,
    # so that scores[i] always belongs to episode i + 1.
    scores.append(score)

action = 0	state = [0 0] -> [0 1]	reward = -1	termiated = False
action = 1	state = [0 1] -> [0 0]	reward = -1	termiated = False
action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 1	state = [1 0] -> [ 1 -1]	reward = -10	termiated = True
---에피소드1종료---
action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 3	state = [1 0] -> [0 0]	reward = -1	termiated = False
action = 1	state = [0 0] -> [ 0 -1]	reward = -10	termiated = True
---에피소드2종료---
action = 0	state = [0 0] -> [0 1]	reward = -1	termiated = False
action = 1	state = [0 1] -> [0 0]	reward = -1	termiated = False
action = 3	state = [0 0] -> [-1  0]	reward = -10	termiated = True
---에피소드3종료---
action = 3	state = [0 0] -> [-1  0]	reward = -10	termiated = True
---에피소드4종료---
action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 0	state = [1 0] -> [1 1]	reward = -1	termiated = False
action = 1	state = [1 1] -> [1 0]	reward = -1	termiated = False
action = 1	state = [1 0] -> [ 1 -1]	rewa

In [22]:
# Display the list of per-episode scores.
scores

[-13,
 -12,
 -12,
 -10,
 -13,
 95,
 -25,
 -10,
 -10,
 -11,
 -21,
 -10,
 -10,
 -10,
 -10,
 -14,
 -10,
 -11,
 -10]

- 목표 지점에 도달한 경우(95)를 찾을 수 있음 (랜덤 에이전트이므로 학습의 결과가 아니라 우연히 성공한 경우임)
- 이렇게 성공적으로 된 경우 멈추고 싶다면

In [23]:
# Play up to 19 episodes, recording each score, and stop at the first
# episode whose total reward is positive (i.e. the goal was reached).
scores = []
for e in range(1, 20):
    score = 0
    # Every episode starts from the environment's reset state.
    player.state = env.reset()
    for t in range(50):
        # step1 -- the agent chooses an action
        player.act()
        # step2 -- the environment reacts with next_state, reward, terminated
        player.next_state, player.reward, player.terminated = env.step(player.action)
        # step3 -- the agent stores the transition and learns
        player.save_experience()
        player.learn()  # no-op for the random agent
        # step4 -- the next state becomes the current state
        player.state = player.next_state
        score += player.reward
        if env.terminated:
            print(f"---에피소드{e}종료---")
            break
    # Record the score even when the 50-step cap was hit. Otherwise the
    # check below would read a stale entry, or raise IndexError on an empty
    # list if the very first episode never terminated.
    scores.append(score)
    if score > 0:
        break

action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 1	state = [1 0] -> [ 1 -1]	reward = -10	termiated = True
---에피소드1종료---
action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 0	state = [1 0] -> [1 1]	reward = -1	termiated = False
action = 3	state = [1 1] -> [0 1]	reward = -1	termiated = False
action = 0	state = [0 1] -> [0 2]	reward = -1	termiated = False
action = 1	state = [0 2] -> [0 1]	reward = -1	termiated = False
action = 1	state = [0 1] -> [0 0]	reward = -1	termiated = False
action = 0	state = [0 0] -> [0 1]	reward = -1	termiated = False
action = 0	state = [0 1] -> [0 2]	reward = -1	termiated = False
action = 1	state = [0 2] -> [0 1]	reward = -1	termiated = False
action = 2	state = [0 1] -> [1 1]	reward = -1	termiated = False
action = 1	state = [1 1] -> [1 0]	reward = -1	termiated = False
action = 3	state = [1 0] -> [0 0]	reward = -1	termiated = False
action = 3	state = [0 0] -> [-1  0]	reward = -10	termiated = True
---에피소드2종료---
action =

- 17번째 에피소드에서 종료됨

In [24]:
# Last 10 recorded next states, stacked into one array (one row per step).
np.array(player.next_states)[-10:]

array([[0, 1],
       [1, 1],
       [2, 1],
       [1, 1],
       [1, 2],
       [1, 3],
       [1, 2],
       [1, 3],
       [2, 3],
       [3, 3]])

In [25]:
# Pass the last 10 recorded next states to `show`, which is defined elsewhere
# in the notebook (presumably it draws the visited cells -- confirm there).
show(np.array(player.next_states)[-10:])

- 첫 번째 상태도 강제적으로 넣으면

In [26]:
# Prepend the start cell [0, 0] so the path begins where the episode began,
# then append the last 10 recorded next states.
paths = [np.array([0, 0]), *list(player.next_states)[-10:]]
show(paths)

)🗣️

In [65]:
class RandomAgent:
    """Agent that picks random actions and logs every transition it sees."""

    def __init__(self):
        # Fields of the transition currently being processed.
        self.state = self.action = self.reward = None
        self.next_state = self.terminated = None
        # Bounded logs, one per field; each keeps the latest 500 entries.
        self.states = collections.deque(maxlen=500)
        self.actions = collections.deque(maxlen=500)
        self.rewards = collections.deque(maxlen=500)
        self.next_states = collections.deque(maxlen=500)
        self.terminations = collections.deque(maxlen=500)
        # Four discrete actions to sample from.
        self.action_space = gym.spaces.Discrete(4)
        # Total number of transitions saved so far.
        self.n_experience = 0

    def act(self):
        """Store a freshly sampled action in ``self.action``."""
        self.action = self.action_space.sample()

    def save_experience(self):
        """Log the current transition field by field and count it."""
        self.states.append(self.state)
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.next_states.append(self.next_state)
        self.terminations.append(self.terminated)
        self.n_experience += 1

    def learn(self):
        """Do nothing; the random agent does not learn."""
        pass

In [66]:
# Fresh agent and environment for the interaction loops that follow.
player = RandomAgent()
env = GridWorld()

In [67]:
# Run one episode (at most 50 steps) with the random agent.
# Seed the agent's state from the environment first; without this, the first
# transition is saved with state None.
player.state = env.reset()
for t in range(50):
    # step1 -- the agent chooses an action
    player.act()
    # step2 -- the environment reacts with next_state, reward, terminated
    player.next_state, player.reward, player.terminated = env.step(player.action)
    # step3 -- the agent stores the transition and learns
    player.save_experience()
    player.learn()
    # step4 -- the next state becomes the current state
    player.state = player.next_state
    if env.terminated:
        # Leave both environment and agent ready for the next episode.
        player.state = env.reset()
        break

action = 3	state = [0 0] -> [-1  0]	reward = -10	termiated = True


In [68]:
# Play up to 99 episodes, each running until the environment terminates, and
# stop after the first episode that ends with a positive score.
scores = []
score = 0
for e in range(1, 100):
    # --- episode start ---
    while True:
        # step1 -- the agent chooses an action
        player.act()
        # step2 -- the environment returns next_state, reward, terminated
        player.next_state, player.reward, player.terminated = env.step(player.action)
        # step3 -- the agent stores the transition and learns
        player.save_experience()
        player.learn()
        # step4 -- the reward counts toward the score in either case
        score += player.reward
        if not env.terminated:
            # Episode continues: the next state becomes the current state.
            player.state = player.next_state
            continue
        # Episode over: record its score and prepare the next episode.
        scores.append(score)
        score = 0
        player.state = env.reset()
        print(f"---에피소드{e}종료---")
        break
    # --- episode end ---
    if scores[-1] > 0:
        break

action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 1	state = [1 0] -> [ 1 -1]	reward = -10	termiated = True
---에피소드1종료---
action = 1	state = [0 0] -> [ 0 -1]	reward = -10	termiated = True
---에피소드2종료---
action = 3	state = [0 0] -> [-1  0]	reward = -10	termiated = True
---에피소드3종료---
action = 1	state = [0 0] -> [ 0 -1]	reward = -10	termiated = True
---에피소드4종료---
action = 3	state = [0 0] -> [-1  0]	reward = -10	termiated = True
---에피소드5종료---
action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 3	state = [1 0] -> [0 0]	reward = -1	termiated = False
action = 0	state = [0 0] -> [0 1]	reward = -1	termiated = False
action = 3	state = [0 1] -> [-1  1]	reward = -10	termiated = True
---에피소드6종료---
action = 3	state = [0 0] -> [-1  0]	reward = -10	termiated = True
---에피소드7종료---
action = 2	state = [0 0] -> [1 0]	reward = -1	termiated = False
action = 1	state = [1 0] -> [ 1 -1]	reward = -10	termiated = True
---에피소드8종료---
action = 0	state = [0 0] -> [0 1]	reward

In [71]:
# Prepend the start cell [0, 0] so the path begins where the episode began,
# then append the last 10 recorded next states.
paths = [np.array([0, 0]), *list(player.next_states)[-10:]]
show(paths)