In [None]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

# Create the FrozenLake-v0 environment through gym
env = gym.make("FrozenLake-v0")

[2022-12-10 19:23:26,171] Making new env: FrozenLake-v0


In [None]:
# The table dimensions come straight from the environment's discrete spaces.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
# One row per state, one column per action; every Q-value starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

# Hyperparameters
num_episodes, max_steps_per_episode = 40000, 1000
learning_rate, discount_rate = 0.1, 0.99

# Total reward collected in each training episode, in episode order.
rewards_all_episodes = list()

In [None]:
def epsilonGreedyExplore(env, state, Q_table, e, episodes):
    """Choose an action with an epsilon-greedy rule whose epsilon decays linearly.

    The exploration probability is ``1 - e / episodes``: with that probability
    a random action is drawn from ``env.action_space``; otherwise the action
    with the largest Q-value for ``state`` is taken.

    Args:
        env: Environment exposing ``action_space.sample()``.
        state: Row index into ``Q_table`` for the current state.
        Q_table: 2-D array indexed as ``Q_table[state, action]``.
        e: Index of the current episode.
        episodes: Total number of episodes (the decay horizon).

    Returns:
        The selected action.
    """
    explore_probability = 1 - e / episodes
    # Explore: defer to the environment's own random action sampler.
    if np.random.rand() < explore_probability:
        return env.action_space.sample()
    # Exploit: best known action for this state (first index wins on ties).
    return np.argmax(Q_table[state, :])

In [None]:
def softmaxExplore(env, state, Q_table, tau=1):
    """Sample an action from a Boltzmann (softmax) distribution over Q-values.

    Each action ``a`` is chosen with probability proportional to
    ``exp(Q_table[state, a] / tau)``.

    Args:
        env: Environment exposing ``action_space.n`` (number of actions).
        state: Row index into ``Q_table`` for the current state.
        Q_table: 2-D array indexed as ``Q_table[state, action]``.
        tau: Temperature; must be non-zero. Larger values flatten the
            distribution, smaller values make it greedier.

    Returns:
        The sampled action index in ``range(env.action_space.n)``.

    Raises:
        ValueError: If ``tau`` is zero, or if the number of Q-values for
            ``state`` does not match ``env.action_space.n``.
    """
    if tau == 0:
        raise ValueError("tau must be non-zero")

    num_action = env.action_space.n
    preferences = np.asarray(Q_table, dtype=float)[state, :] / tau
    # Shift by the maximum before exponentiating: the probabilities are
    # unchanged, but exp() can no longer overflow to inf and produce NaNs.
    weights = np.exp(preferences - preferences.max())
    action_prob = weights / weights.sum()

    # Sample over however many actions the environment has instead of a
    # hard-coded four-action list.
    action = np.random.choice(num_action, 1, p=action_prob)[0]
    return action

In [None]:
# Q-learning training

# Explore with the epsilon-greedy rule
strategy = "epsilon-greedy"

for episode in range(num_episodes):
    # Fresh episode: reset the environment and the running reward
    state = env.reset()
    done = False
    rewards_current_episode = 0
    for step in range(max_steps_per_episode):

        # Pick the next action with the configured exploration strategy
        if strategy != "epsilon-greedy":
            action = softmaxExplore(env, state, q_table)
        else:
            action = epsilonGreedyExplore(env, state, q_table, episode, num_episodes)

        new_state, reward, done, info = env.step(action)

        # Blend the old estimate with the bootstrapped one-step target
        best_next_value = np.max(q_table[new_state, :])
        q_table[state, action] = (
            q_table[state, action] * (1 - learning_rate)
            + learning_rate * (reward + discount_rate * best_next_value)
        )

        rewards_current_episode += reward
        state = new_state

        # The environment signalled the end of the episode
        if done:
            break
    rewards_all_episodes.append(rewards_current_episode)

In [None]:
# Average reward (success rate) per block of 2000 episodes
episodes_per_block = 2000

# Report only the most recent training run: if rewards_all_episodes has
# accumulated more than num_episodes entries (e.g. a training cell was run
# more than once), older entries would otherwise distort the block averages.
latest_rewards = np.asarray(rewards_all_episodes[-num_episodes:], dtype=float)
rewards_per_thousand_episodes = [
    latest_rewards[start:start + episodes_per_block]
    for start in range(0, len(latest_rewards), episodes_per_block)
]
count = episodes_per_block

print("********2000 에피소드당 평균 reward ********\n")
for r in rewards_per_thousand_episodes:
    # A true mean over the block, independent of the block's actual length
    print(count, ": ", str(r.mean()))
    count += episodes_per_block

# Print the updated q_table
print("\n\n********Q_table********\n")
print(q_table)

********2000 에피소드당 평균 reward ********

2000 :  0.010500000000000006
4000 :  0.017500000000000012
6000 :  0.020000000000000014
8000 :  0.022500000000000017
10000 :  0.02650000000000002
12000 :  0.03150000000000002
14000 :  0.036500000000000025
16000 :  0.03800000000000003
18000 :  0.059500000000000046
20000 :  0.058500000000000045
22000 :  0.07900000000000006
24000 :  0.08700000000000006
26000 :  0.11600000000000009
28000 :  0.1285000000000001
30000 :  0.16400000000000012
32000 :  0.21750000000000017
34000 :  0.2815000000000002
36000 :  0.35150000000000026
38000 :  0.4870000000000004
40000 :  0.6889999999999795


********Q_table********

[[0.59070313 0.50460526 0.50430182 0.51733545]
 [0.32129499 0.36567694 0.3491045  0.53906892]
 [0.43045316 0.42297028 0.42517377 0.50209038]
 [0.34671855 0.26705765 0.33973301 0.48150242]
 [0.61953063 0.34601448 0.33415734 0.3617573 ]
 [0.         0.         0.         0.        ]
 [0.3460861  0.1889122  0.20133815 0.16331143]
 [0.         0.         0.

In [None]:
# Play three episodes, always taking the greedy action from q_table.
for episode in range(3):
    # Reset the per-episode variables.
    state = env.reset()
    done = False
    print("*****에피소드 ", episode+1, "*****\n\n\n\n")
    time.sleep(1)
    for step in range(max_steps_per_episode):
        # Redraw the board for the current state.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)
        # Take the action with the largest Q-value in the current state.
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if not done:
            # Episode continues: move on to the new state.
            state = new_state
            continue

        # Episode finished: a reward of 1 means the goal was reached,
        # anything else is reported as falling into a hole.
        if reward == 1:
            print("****목표에 도달하였습니다.!****")
        else:
            print("****Hole에 빠지고 말았습니다.****")
        time.sleep(3)
        if reward != 1:
            clear_output(wait=True)
        break
env.close()



SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
****목표에 도달하였습니다.!****


In [None]:
# Explore with the Boltzmann (softmax) rule
strategy = "softmax"

# Start this run from scratch. Without the reset, q_table is still the table
# produced by the previous training run (so this run would be warm-started),
# and rewards_all_episodes would hold the rewards of both runs, which makes
# the per-block averages reported afterwards meaningless.
q_table = np.zeros((state_space_size, action_space_size))
rewards_all_episodes = []

# Q-learning training
for episode in range(num_episodes):
    # Fresh episode: reset the environment and the running reward
    state = env.reset()
    done = False
    rewards_current_episode = 0
    for step in range(max_steps_per_episode):

        # Pick the next action with the configured exploration strategy
        if strategy == "epsilon-greedy":
            action = epsilonGreedyExplore(env, state, q_table, episode, num_episodes)
        else:
            action = softmaxExplore(env, state, q_table)

        new_state, reward, done, info = env.step(action)

        # Blend the old estimate with the bootstrapped one-step target
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state
        rewards_current_episode += reward

        # The environment signalled the end of the episode
        if done:
            break
    rewards_all_episodes.append(rewards_current_episode)

In [None]:
# Average reward (success rate) per block of 2000 episodes
episodes_per_block = 2000

# Report only the most recent training run: rewards_all_episodes may also
# contain rewards from an earlier run, and splitting the whole list into
# num_episodes/2000 chunks would then give chunks longer than 2000 episodes
# and "averages" that can exceed 1.
latest_rewards = np.asarray(rewards_all_episodes[-num_episodes:], dtype=float)
rewards_per_thousand_episodes = [
    latest_rewards[start:start + episodes_per_block]
    for start in range(0, len(latest_rewards), episodes_per_block)
]
count = episodes_per_block

print("********2000 에피소드당 평균 reward ********\n")
for r in rewards_per_thousand_episodes:
    # A true mean over the block, independent of the block's actual length
    print(count, ": ", str(r.mean()))
    count += episodes_per_block

# Print the updated q_table
print("\n\n********Q_table********\n")
print(q_table)

********2000 에피소드당 평균 reward ********

2000 :  0.02800000000000002
4000 :  0.04250000000000003
6000 :  0.058000000000000045
8000 :  0.07450000000000005
10000 :  0.11800000000000009
12000 :  0.16600000000000012
14000 :  0.2445000000000002
16000 :  0.3815000000000003
18000 :  0.6329999999999857
20000 :  1.175999999999926
22000 :  0.029500000000000023
24000 :  0.03800000000000003
26000 :  0.03800000000000003
28000 :  0.035000000000000024
30000 :  0.034500000000000024
32000 :  0.03250000000000002
34000 :  0.03250000000000002
36000 :  0.03250000000000002
38000 :  0.036000000000000025
40000 :  0.030000000000000023


********Q_table********

[[0.51838264 0.50742959 0.50058661 0.50763382]
 [0.3697972  0.39201139 0.39756126 0.48278307]
 [0.3826824  0.41068956 0.42866755 0.47025703]
 [0.31325793 0.28265121 0.23765821 0.46123478]
 [0.53721901 0.36603718 0.28263476 0.40060473]
 [0.         0.         0.         0.        ]
 [0.29244727 0.15964546 0.24865934 0.0527394 ]
 [0.         0.         0.  

In [None]:
# Play three episodes, always taking the greedy action from q_table.
for episode in range(3):
    # Reset the per-episode variables.
    state = env.reset()
    done = False
    print("*****에피소드 ", episode+1, "*****\n\n\n\n")
    time.sleep(1)
    for step in range(max_steps_per_episode):
        # Redraw the board for the current state.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)
        # Take the action with the largest Q-value in the current state.
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if not done:
            # Episode continues: move on to the new state.
            state = new_state
            continue

        # Episode finished: a reward of 1 means the goal was reached,
        # anything else is reported as falling into a hole.
        if reward == 1:
            print("****목표에 도달하였습니다.!****")
        else:
            print("****Hole에 빠지고 말았습니다.****")
        time.sleep(3)
        if reward != 1:
            clear_output(wait=True)
        break
env.close()

SFFF
FHFH
FFFH
HF[41mF[0mG
  (Right)
****목표에 도달하였습니다.!****
