Họ và tên: Huỳnh Thiện Tùng

MSSV: 19522492

LAB: 05

In [1]:
import gym
import numpy as np
import random

In [2]:
# The three gym environments on which Q-learning and SARSA are compared below.
env1 = gym.make('FrozenLake-v0')
env2 = gym.make('FrozenLake8x8-v0')
env3 = gym.make('Taxi-v3')

In [3]:
# Hyperparameters shared by q_learning and sarsa_learning
# Discount factor applied to the bootstrapped next-state value.
gamma = 0.99
# Step size of each Q-table update.
learning_rate = 0.1
# Exploration probability used at episode 0.
max_epsilon = 1.0
# Floor that epsilon decays towards as the episode index grows.
min_epsilon = 0.01
# Rate of the exponential decay: epsilon = min + (max - min) * exp(-rate * episode).
epsilon_decay_rate = 0.005

# Number of training episodes per run.
num_episodes = 20000
# Upper bound on the number of steps taken within one training episode.
num_steps_per_episode = 100

In [4]:
def q_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate):
    """Train a tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Parameters
    ----------
    env : object
        Environment exposing ``observation_space.n``, ``action_space.n``,
        ``action_space.sample()``, ``reset()`` (returns the initial state index)
        and ``step(action)`` (returns ``(next_state, reward, done, info)``).
    num_episodes : int
        Number of training episodes. Zero (or a negative value) trains nothing
        and returns the all-zero table.
    num_steps_per_episode : int
        Maximum number of steps taken in one episode.
    learning_rate : float
        Step size of the Q-value update.
    gamma : float
        Discount factor applied to the best next-state value.
    max_epsilon, min_epsilon, epsilon_decay_rate : float
        Exploration schedule:
        ``epsilon = min + (max - min) * exp(-decay_rate * episode)``.

    Returns
    -------
    q_table : numpy.ndarray
        Array of shape ``(n_states, n_actions)`` with the learned Q-values.
    rewards_all : list
        Sum of rewards collected in each training episode.
    """
    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    rewards_all = []
    for episode in range(num_episodes):
        state = env.reset()
        reward_episode = 0.0
        # Epsilon is fixed for the whole episode and decays with the episode index.
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate*episode)

        for _ in range(num_steps_per_episode):
            # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])

            next_state, reward, done, _ = env.step(action)

            # Off-policy target: bootstrap from the best action in the next state.
            q_table[state, action] = q_table[state, action] * (1 - learning_rate) + learning_rate * (reward + gamma * np.max(q_table[next_state,:]))

            reward_episode += reward
            state = next_state

            if done:
                break
        rewards_all.append(reward_episode)

    if rewards_all:
        print(f'Episode {episode} finished')
        print("Performance : {}".format(sum(rewards_all) / num_episodes))
    else:
        # No episode ran: `episode` is unbound and the average is undefined.
        print("Performance : {}".format(float("nan")))
    return q_table, rewards_all

In [5]:
def sarsa_learning(env, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate):
    """Train a tabular SARSA agent with an epsilon-greedy policy.

    Parameters
    ----------
    env : object
        Environment exposing ``observation_space.n``, ``action_space.n``,
        ``action_space.sample()``, ``reset()`` (returns the initial state index)
        and ``step(action)`` (returns ``(next_state, reward, done, info)``).
    num_episodes : int
        Number of training episodes. Zero (or a negative value) trains nothing
        and returns the all-zero table.
    num_steps_per_episode : int
        Maximum number of steps taken in one episode.
    learning_rate : float
        Step size of the Q-value update.
    gamma : float
        Discount factor applied to the value of the next (state, action) pair.
    max_epsilon, min_epsilon, epsilon_decay_rate : float
        Exploration schedule:
        ``epsilon = min + (max - min) * exp(-decay_rate * episode)``.

    Returns
    -------
    q_table : numpy.ndarray
        Array of shape ``(n_states, n_actions)`` with the learned Q-values.
    rewards_all : list
        Sum of rewards collected in each training episode.
    """
    q_table = np.zeros((env.observation_space.n, env.action_space.n))
    rewards_all = []
    for episode in range(num_episodes):
        state = env.reset()
        reward_episode = 0.0
        # Epsilon is fixed for the whole episode and decays with the episode index.
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-epsilon_decay_rate*episode)

        # Pick the first action epsilon-greedily before entering the step loop.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state, :])

        for _ in range(num_steps_per_episode):
            # Execute the action that was already chosen.
            next_state, reward, done, _ = env.step(action)

            # Choose the action that will be taken from the next state (epsilon-greedy).
            if random.uniform(0, 1) < epsilon:
                next_action = env.action_space.sample()
            else:
                next_action = np.argmax(q_table[next_state, :])

            # On-policy target: bootstrap from the action actually selected next.
            q_table[state, action] = q_table[state, action] + learning_rate * (reward + gamma * q_table[next_state, next_action] - q_table[state, action])

            reward_episode += reward
            state = next_state
            action = next_action

            if done:
                break
        rewards_all.append(reward_episode)

    if rewards_all:
        print(f'Episode {episode} finished')
        print("Performance : {}".format(sum(rewards_all) / num_episodes))
    else:
        # No episode ran: `episode` is unbound and the average is undefined.
        print("Performance : {}".format(float("nan")))
    return q_table, rewards_all

In [6]:
def play(env, q_table, render=False):
    """Run one episode following the greedy policy of ``q_table``.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()``, ``step(action)`` returning
        ``(next_state, reward, done, info)`` and, when ``render`` is true,
        ``render()``.
    q_table : numpy.ndarray
        Q-values indexed as ``q_table[state, action]``.
    render : bool, optional
        When true, draw the environment after every step, pause 0.2 s, and
        clear the previous frame (only when IPython is available).

    Returns
    -------
    tuple
        ``(total_reward, steps)`` for the episode.
    """
    if render:
        # Imported here because only the rendering path needs them and the
        # notebook's import cell does not provide them.
        import time
        try:
            from IPython import display
        except ImportError:
            # Outside IPython there is no cell output to clear.
            display = None

    state = env.reset()
    total_reward = 0
    steps = 0
    done = False
    # The loop ends only when the environment reports done.
    while not done:
        action = np.argmax(q_table[state, :])
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        steps += 1
        if render:
            env.render()
            time.sleep(0.2)
            # Keep the final frame visible; clear only intermediate frames.
            if not done and display is not None:
                display.clear_output(wait=True)
        state = next_state

    return (total_reward, steps)

In [7]:
def play_multiple_times(env, q_table, max_episodes):
    """Play ``max_episodes`` greedy episodes and print success statistics.

    An episode counts as a success when its total reward is positive. The
    average number of steps is taken over successful episodes only.

    Parameters
    ----------
    env : object
        Environment passed through to ``play``.
    q_table : numpy.ndarray
        Q-values passed through to ``play``.
    max_episodes : int
        Number of episodes to play.
    """
    success = 0
    list_of_steps = []
    for _ in range(max_episodes):
        total_reward, steps = play(env, q_table)

        if total_reward > 0:
            success += 1
            list_of_steps.append(steps)

    # np.mean([]) yields nan but also emits a RuntimeWarning; report nan directly.
    average_steps = np.mean(list_of_steps) if list_of_steps else float("nan")

    print(f'Number of successes: {success}/{max_episodes}')
    print(f'Average number of steps: {average_steps}')

# MAP: FrozenLake-v0

## Q_learning

In [61]:
# Initialize the Q-value table with zeros (one row per state, one column per action)
q_table = np.zeros((env1.observation_space.n, env1.action_space.n))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [62]:
q_table, rewards_all = q_learning(env1, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished
Performance : 0.6215


In [63]:
q_table

array([[5.28737179e-01, 4.47673774e-01, 4.42523322e-01, 4.59180230e-01],
       [1.59034133e-01, 2.15691138e-01, 2.11256988e-01, 4.43397520e-01],
       [4.00510765e-01, 2.38580114e-01, 2.22021718e-01, 2.62719314e-01],
       [1.68056158e-01, 0.00000000e+00, 4.54203407e-04, 1.49600148e-05],
       [5.46975915e-01, 4.47734358e-01, 3.00754165e-01, 2.69082244e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.82129870e-01, 1.13628980e-01, 1.60214758e-01, 9.72132201e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.29368691e-01, 2.71139828e-01, 3.65388126e-01, 5.89063500e-01],
       [4.40452394e-01, 6.64652390e-01, 4.86202606e-01, 4.25229431e-01],
       [6.28101600e-01, 3.58396915e-01, 3.18943743e-01, 2.34180490e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.61688137e-01, 4.64823738e-01, 7.25062103e

In [64]:
sum(rewards_all)

12430.0

In [68]:
play_multiple_times(env1, q_table, 1000)

Number of successes: 753/1000
Average number of steps: 38.440903054448874


## SARSA

In [52]:
# Initialize the Q-value table with zeros (one row per state, one column per action)
q_table = np.zeros((env1.observation_space.n, env1.action_space.n))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [53]:
q_table, rewards_all = sarsa_learning(env1, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished
Performance : 0.64635


In [54]:
q_table

array([[4.79603975e-01, 4.18807269e-01, 4.07985659e-01, 4.06377516e-01],
       [2.91766363e-01, 1.64541243e-01, 2.26971268e-01, 3.72674728e-01],
       [3.52459732e-01, 2.56138448e-01, 2.30860468e-01, 2.27677501e-01],
       [3.63155367e-05, 0.00000000e+00, 2.11316836e-05, 2.88843473e-01],
       [5.05752438e-01, 3.80941246e-01, 3.41734305e-01, 3.60657382e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.07976913e-01, 1.37848827e-01, 1.50551710e-01, 1.52078384e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.32374373e-01, 3.95436854e-01, 3.74964757e-01, 5.53709715e-01],
       [4.76154548e-01, 6.36279318e-01, 4.70961408e-01, 3.99272149e-01],
       [6.09662218e-01, 3.20611073e-01, 2.65168789e-01, 2.30194044e-01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.25284487e-01, 5.47554541e-01, 7.83404559e

In [55]:
sum(rewards_all)

12927.0

In [60]:
play_multiple_times(env1, q_table, 1000)

Number of successes: 740/1000
Average number of steps: 36.924324324324324


# MAP: FrozenLake8x8-v0

## Q_learning

In [18]:
# Initialize the Q-value table with zeros (one row per state, one column per action)
q_table = np.zeros((env2.observation_space.n, env2.action_space.n))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [19]:
q_table, rewards_all = q_learning(env2, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished
Performance : 0.0


In [20]:
q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],


In [21]:
sum(rewards_all)

0.0

In [22]:
play_multiple_times(env2, q_table, 1000)

Number of successes: 0/1000
Average number of steps: nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


## SARSA

In [23]:
# Initialize the Q-value table with zeros (one row per state, one column per action)
q_table = np.zeros((env2.observation_space.n, env2.action_space.n))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [24]:
q_table, rewards_all = sarsa_learning(env2, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished
Performance : 0.0


In [25]:
q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],


In [26]:
sum(rewards_all)

0.0

In [27]:
play_multiple_times(env2, q_table, 1000)

Number of successes: 0/1000
Average number of steps: nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


# MAP: Taxi-v3

## Q_learning

In [28]:
# Initialize the Q-value table with zeros (one row per state, one column per action)
q_table = np.zeros((env3.observation_space.n, env3.action_space.n))
print(q_table)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [29]:
q_table, rewards_all = q_learning(env3, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished
Performance : 0.20855


In [30]:
q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-0.03666757, -2.72743051, -3.06046268, -1.54379613,  9.6220697 ,
        -4.24885632],
       [-0.78836435,  1.95384805, -0.93778362,  2.99985073, 14.11880599,
        -3.79496831],
       ...,
       [-1.28214366, -1.10318794, -1.28081771, -1.26395688, -3.51293925,
        -1.926829  ],
       [-2.59029988, -2.57933129, -2.53993924, -2.57230181, -6.77639713,
        -6.28481984],
       [-0.1       , -0.1       , -0.271     , 12.34861424, -1.        ,
        -1.        ]])

In [31]:
sum(rewards_all)

4171.0

In [32]:
play_multiple_times(env3, q_table, 1000)

Number of successes: 1000/1000
Average number of steps: 13.174


## SARSA

In [88]:
# Initialize the Q-value table with zeros (one row per state, one column per action)
q_table = np.zeros((env3.observation_space.n, env3.action_space.n))
print(q_table)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [89]:
q_table, rewards_all = sarsa_learning(env3, num_episodes, num_steps_per_episode, learning_rate, gamma, max_epsilon, min_epsilon, epsilon_decay_rate)

Episode 19999 finished
Performance : 0.24895


In [90]:
q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-1.47880422, -1.73933141, -3.53625235, -3.95984075,  8.59495851,
        -3.5112623 ],
       [ 0.57057963,  1.38664333,  0.71136611,  0.02632344, 13.01321411,
        -3.28759794],
       ...,
       [-2.33417349, -2.28672833, -2.31937568,  2.96007823, -3.65955129,
        -5.21722204],
       [-3.50482   , -3.50178433, -3.48122694,  1.18778051, -4.57297855,
        -5.22084875],
       [-0.3781    , -0.227872  , -0.289     , 11.91112778, -2.71      ,
        -1.919701  ]])

In [91]:
sum(rewards_all)

4979.0

In [98]:
play_multiple_times(env3, q_table, 1000)

Number of successes: 1000/1000
Average number of steps: 12.994


# Nhận xét

Để đánh giá performance của 2 thuật toán Q_learning và SARSA, em dùng công thức sau:



> ```print("Performance : {}".format(sum(rewards_all) / num_episodes))```


nghĩa là lấy tổng số điểm thưởng chia cho số episode. Cuối cùng là bấm `Restart and run all` để chạy lại toàn bộ notebook.

Qua nhiều lần thực thi và quan sát, em có một số nhận xét:

Xét về tiêu chí tổng điểm thưởng (sum of rewards_all) thì SARSA nhiều hơn

*   Map: FrozenLake-v0: 12927 > 12430
*   Map: Taxi-v3: 4979 > 4171

Xét về tiêu chí trung bình số bước đi (Average number of steps) thì SARSA ít hơn

*   Map: FrozenLake-v0: 36.92 < 38.44
*   Map: Taxi-v3: 12.99 < 13.17

Xét về tiêu chí performance thì SARSA tốt hơn

*   Map: FrozenLake-v0: 0.6464 > 0.6215
*   Map: Taxi-v3: 0.2490 > 0.2086

Nhìn chung, thuật toán SARSA cho kết quả performance tốt hơn thuật toán Q_Learning ở 2 map FrozenLake-v0 và Taxi-v3. Riêng map FrozenLake8x8-v0, cả 2 thuật toán đều không học được (tổng điểm thưởng bằng 0) nên em không so sánh.