In [1]:
import gym
import numpy as np
import random


In [1]:
import gym
# Build the FrozenLake-v1 environment with render_mode="human" for the
# interactive exploration cells that follow.
env = gym.make('FrozenLake-v1',render_mode="human")

In [264]:
# Inspect the state space; the recorded cell output is Discrete(16).
env.observation_space

Discrete(16)

In [265]:
# Inspect the action space; the recorded cell output is Discrete(4).
env.action_space

Discrete(4)

In [5]:
# Start a fresh episode and render it.
env.reset()
env.render()
# Take action 2 once (2 is named RIGHT in the hand-written policy cell further
# down); the 5-tuple returned by step() is displayed as the cell output.
env.step(2)

(1, 0.0, False, False, {'prob': 0.3333333333333333})

In [7]:
# Close the environment that was created with render_mode="human".
env.close()

In [2]:
def print_policy(pi, env, n_cols=4, blank_states=(5, 7, 11, 12, 15)):
    """Print a policy as a text grid with one cell per state.

    Args:
        pi: Callable mapping a state index to an action index in 0..3.
        env: Object exposing ``observation_space.n`` (the number of states).
        n_cols: Number of cells printed per row.
        blank_states: State indices whose cell is printed empty. The default
            is the list that used to be hard-coded here (presumably the hole
            and goal cells of the default 4x4 map -- verify before reusing it
            with another map).

    ``pi`` is only queried for states that are actually printed, so a policy
    does not have to be defined on the blank states.
    """
    print('Policy:')
    # Glyph for action index 0, 1, 2, 3 (LEFT, DOWN, RIGHT, UP in this notebook).
    arrows = ('<', 'v', '>', '^')
    blank = frozenset(blank_states)
    for s in range(env.observation_space.n):
        print("| ", end="")
        if s in blank:
            print("".rjust(9), end=" ")
        else:
            print(str(s).zfill(2), arrows[pi(s)].rjust(6), end=" ")
        # Close the row after every n_cols cells.
        if (s + 1) % n_cols == 0:
            print("|")

In [3]:
def test_game(env, pi):
    results = []
    for _ in range(100):
        state,_ = env.reset()
        Done = False
        while not Done:
            action = pi(state)
            state, reward, Done, _ ,_= env.step(action)
        results.append(reward>0)
    return np.sum(results)/len(results)

In [4]:
env = gym.make('FrozenLake-v1')
env.reset()
# Draw ONE random action per state up front. The previous lambda rebuilt its
# lookup table (and redrew every action) on each call, so the policy printed
# below was not the policy that was then evaluated.
random_actions = np.random.choice(env.action_space.n, env.observation_space.n)
random_pi = lambda s: random_actions[s]

print_policy(random_pi,env)
print('Reaches goal {:.2f}%. '.format(
    test_game(env, random_pi)*100))

Policy:
| 00      < | 01      < | 02      ^ | 03      ^ |
| 04      > |           | 06      v |           |
| 08      v | 09      > | 10      v |           |
|           | 13      < | 14      v |           |
Reaches goal 4.00%. 


In [7]:
# Action indices, in the order the environment numbers them in this notebook.
LEFT, DOWN, RIGHT, UP = range(4)


def human_pi(s):
    """Hand-written policy: return the action chosen by hand for state ``s``."""
    return {
        0: RIGHT,
        1: RIGHT,
        2: DOWN,
        3: LEFT,
        4: DOWN,
        5: LEFT,
        6: DOWN,
        7: LEFT,
        8: RIGHT,
        9: RIGHT,
        10: DOWN,
        11: LEFT,
        12: LEFT,
        13: RIGHT,
        14: RIGHT,
        15: LEFT,
    }[s]


# Show the hand-written policy, then measure how often it reaches the goal.
print_policy(human_pi, env)
print('Reaches goal {:.2f}%. '.format(100 * test_game(env, human_pi)))

Policy:
| 00      > | 01      > | 02      v | 03      < |
| 04      v |           | 06      v |           |
| 08      > | 09      > | 10      v |           |
|           | 13      > | 14      > |           |
Reaches goal 2.00%. 


In [13]:
# Monte Carlo

In [34]:
def select_action(state, Q, mode="both", epsilon=0.5):
    """Pick an action for ``state`` from the action-value table ``Q``.

    Args:
        state: Index of the current state (row of ``Q``).
        Q: Table indexed as ``Q[state][action]``.
        mode: ``"explore"`` for a uniformly random action, ``"exploit"`` for
            the greedy (argmax) action, ``"both"`` to mix the two.
        epsilon: In ``"both"`` mode, the probability of taking a random action
            instead of the greedy one. Defaults to 0.5, the value that used to
            be hard-coded.

    Returns:
        The selected action index.

    Raises:
        ValueError: If ``mode`` is not one of the three supported values
            (previously the function silently returned ``None``).
    """
    n_actions = len(Q[state])
    if mode == "explore":
        return np.random.randint(n_actions)
    if mode == "exploit":
        return np.argmax(Q[state])
    if mode == "both":
        # One uniform draw decides between exploiting and exploring.
        if np.random.random() > epsilon:
            return np.argmax(Q[state])
        return np.random.randint(n_actions)
    raise ValueError(
        "mode must be 'explore', 'exploit' or 'both', got {!r}".format(mode))

In [28]:
def play_game(env, Q ,max_steps=200):
    """Play one episode with the mixed explore/exploit policy and record it.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        Q: Action-value table passed through to ``select_action``.
        max_steps: Upper bound on the number of transitions recorded.

    Returns:
        Object array of shape ``(T, 4)`` whose rows are
        ``(state, action, terminated, reward)``, with ``T <= max_steps``.
    """
    state, _ = env.reset()
    episode = []

    # A bounded loop records at most max_steps transitions; the previous
    # post-append counter check allowed max_steps + 1.
    for _ in range(max_steps):
        action = select_action(state, Q, mode='both')
        next_state, reward, terminated, truncated, _ = env.step(action)
        episode.append((state, action, terminated, reward))
        # Stop on truncation as well: the environment must be reset before it
        # is stepped again once either flag is set.
        if terminated or truncated:
            break
        state = next_state

    # reshape keeps the (T, 4) layout even when no transition was recorded.
    return np.array(episode, dtype=object).reshape(-1, 4)

In [21]:
def monte_carlo(env, episodes=10000, test_policy_freq=1000, gamma=0.9):
    """First-visit Monte Carlo control on a tabular environment.

    Each episode is generated by ``play_game``; for the first occurrence of
    every (state, action) pair in the episode, the discounted return from that
    step is averaged into ``Q``.

    Args:
        env: Environment with discrete ``observation_space`` / ``action_space``.
        episodes: Number of training episodes.
        test_policy_freq: Evaluate and print the greedy policy every this many
            episodes; 0 disables the periodic evaluation.
        gamma: Discount factor (default 0.9, the value that used to be
            hard-coded).

    Returns:
        ``(pi, Q)``: the greedy policy ``pi(state) -> action`` and the
        action-value table of shape ``(nS, nA)``.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    Q = np.zeros((nS, nA), dtype=np.float64)
    # Running sum / count of observed returns per (state, action); this
    # replaces the ever-growing per-pair lists of returns.
    return_sum = np.zeros((nS, nA), dtype=np.float64)
    return_count = np.zeros((nS, nA), dtype=np.int64)

    # Greedy policy. It reads Q at call time, so it always reflects the latest
    # estimates, and it is defined even when episodes == 0.
    pi = lambda s: np.argmax(Q[s])

    for i in range(episodes):
        episode = play_game(env, Q)

        # Discounted return from every step, computed in one backward pass
        # instead of re-summing the episode tail for each step.
        returns_from = np.zeros(len(episode), dtype=np.float64)
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            G = episode[t][-1] + gamma * G  # last column holds the reward
            returns_from[t] = G

        visited = np.zeros((nS, nA), dtype=bool)
        for t, (state, action, _, _) in enumerate(episode):
            if visited[state][action]:
                continue  # first-visit: later occurrences are ignored
            visited[state][action] = True
            return_sum[state][action] += returns_from[t]
            return_count[state][action] += 1
            Q[state][action] = return_sum[state][action] / return_count[state][action]

        if test_policy_freq and i % test_policy_freq == 0:
            print("Test episode {} Reaches goal {:.2f}%. ".format
                  (i, test_game(env, pi)*100))

    return pi,Q

In [32]:
# Create a FrozenLake-v1 environment (no render_mode argument) for training.
env = gym.make('FrozenLake-v1')
# Run Monte Carlo for 20000 episodes; keep the returned policy and Q-table.
policy_mc,Q = monte_carlo(env,episodes=20000)


Test episode 0 Reaches goal 0.00%. 
Test episode 1000 Reaches goal 22.00%. 
Test episode 2000 Reaches goal 27.00%. 
Test episode 3000 Reaches goal 14.00%. 
Test episode 4000 Reaches goal 22.00%. 
Test episode 5000 Reaches goal 30.00%. 
Test episode 6000 Reaches goal 16.00%. 
Test episode 7000 Reaches goal 29.00%. 
Test episode 8000 Reaches goal 22.00%. 
Test episode 9000 Reaches goal 24.00%. 
Test episode 10000 Reaches goal 35.00%. 
Test episode 11000 Reaches goal 37.00%. 
Test episode 12000 Reaches goal 28.00%. 
Test episode 13000 Reaches goal 45.00%. 
Test episode 14000 Reaches goal 44.00%. 
Test episode 15000 Reaches goal 40.00%. 
Test episode 16000 Reaches goal 46.00%. 
Test episode 17000 Reaches goal 47.00%. 
Test episode 18000 Reaches goal 47.00%. 
Test episode 19000 Reaches goal 45.00%. 


In [33]:
# Show the policy returned by monte_carlo, then evaluate it once more.
print_policy(policy_mc, env)
print('Reaches goal {:.2f}%. '.format(100 * test_game(env, policy_mc)))

Policy:
| 00      v | 01      ^ | 02      > | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 44.00%. 


In [48]:
def select_action(state, Q):
    """Sample an action for ``state`` from a softmax over its Q-values."""
    Qvalue = Q[state]
    # Shift by the row maximum before exponentiating so the largest exponent
    # is exactly 0.
    weights = np.exp(Qvalue - np.max(Qvalue))
    return np.random.choice(len(Qvalue), p=weights / np.sum(weights))

In [49]:
def sarsa(env,lr = 0.01,episodes=100, gamma=0.9,test_policy_freq=1000):
    """Tabular SARSA with actions sampled by ``select_action``.

    Args:
        env: Environment with discrete ``observation_space`` / ``action_space``
            whose ``step`` returns
            ``(state, reward, terminated, truncated, info)``.
        lr: Learning rate of the temporal-difference update.
        episodes: Number of training episodes.
        gamma: Discount factor.
        test_policy_freq: Evaluate and print the greedy policy every this many
            episodes; 0 disables the periodic evaluation.

    Returns:
        ``(pi, Q)``: the greedy policy ``pi(state) -> action`` and the
        action-value table of shape ``(nS, nA)``.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    Q = np.zeros((nS, nA), dtype=np.float64)

    # Greedy policy. It reads Q at call time, so it always reflects the latest
    # estimates, and it is defined even when episodes == 0.
    pi = lambda s: np.argmax(Q[s])

    for i in range(episodes):
        state, _ = env.reset()
        action = select_action(state, Q)
        done = False
        while not done:
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_action = select_action(next_state, Q)
            # Bootstrap from the next pair unless the episode really
            # terminated; a truncated episode still has a successor value.
            target = reward + gamma * Q[next_state][next_action] * (not terminated)
            Q[state][action] += lr * (target - Q[state][action])
            state, action = next_state, next_action
            # End the episode on truncation too, so the environment is reset
            # before it is stepped again.
            done = terminated or truncated

        if test_policy_freq and i % test_policy_freq == 0:
            print("Test episode {} Reaches goal {:.2f}%. ".format
                  (i, test_game(env, pi)*100))

    return pi,Q

In [50]:
# Create a FrozenLake-v1 environment for the SARSA run.
env = gym.make('FrozenLake-v1')
# Train for 50000 episodes with lr=0.01; keep the returned policy and Q-table.
policy_sarsa,Q_sarsa = sarsa(env,lr=0.01,episodes=50000)

Test episode 0 Reaches goal 0.00%. 
Test episode 1000 Reaches goal 9.00%. 
Test episode 2000 Reaches goal 7.00%. 
Test episode 3000 Reaches goal 5.00%. 
Test episode 4000 Reaches goal 10.00%. 
Test episode 5000 Reaches goal 9.00%. 
Test episode 6000 Reaches goal 19.00%. 
Test episode 7000 Reaches goal 18.00%. 
Test episode 8000 Reaches goal 18.00%. 
Test episode 9000 Reaches goal 26.00%. 
Test episode 10000 Reaches goal 15.00%. 
Test episode 11000 Reaches goal 22.00%. 
Test episode 12000 Reaches goal 45.00%. 
Test episode 13000 Reaches goal 36.00%. 
Test episode 14000 Reaches goal 7.00%. 
Test episode 15000 Reaches goal 48.00%. 
Test episode 16000 Reaches goal 45.00%. 
Test episode 17000 Reaches goal 11.00%. 
Test episode 18000 Reaches goal 16.00%. 
Test episode 19000 Reaches goal 7.00%. 
Test episode 20000 Reaches goal 14.00%. 
Test episode 21000 Reaches goal 29.00%. 
Test episode 22000 Reaches goal 19.00%. 
Test episode 23000 Reaches goal 5.00%. 
Test episode 24000 Reaches goal 70.00

In [58]:
# Show the policy returned by sarsa, then evaluate it once more.
print_policy(policy_sarsa, env)
print('Reaches goal {:.2f}%. '.format(100 * test_game(env, policy_sarsa)))

Policy:
| 00      < | 01      ^ | 02      < | 03      ^ |
| 04      < |           | 06      > |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 78.00%. 


In [63]:
def select_action(state, Q,temp=1.0):
    """Sample an action from a temperature-scaled softmax over ``Q[state]``.

    Args:
        state: Index of the current state (row of ``Q``).
        Q: Action-value table; ``Q[state]`` must support array arithmetic.
        temp: Softmax temperature, must be > 0. Smaller values concentrate the
            distribution on the greedy action. Defaults to 1.0, which is the
            plain softmax, so two-argument callers such as ``sarsa`` keep
            working after this definition replaces the earlier one.

    Returns:
        The sampled action index.

    Raises:
        ValueError: If ``temp`` is not strictly positive.
    """
    if temp <= 0:
        raise ValueError("temp must be positive, got {}".format(temp))
    Qvalue = Q[state]
    scaled_Q = Qvalue / temp
    # Shift by the maximum before exponentiating so the largest exponent is 0.
    norm_Q = scaled_Q - np.max(scaled_Q)
    exp_Q = np.exp(norm_Q)
    probs = exp_Q / np.sum(exp_Q)
    return np.random.choice(len(Qvalue), p=probs)

In [64]:
def q_learning(env,lr = 0.001,episodes=100, gamma=0.9,test_policy_freq=1000):
    """Tabular Q-learning with softmax exploration and a decaying temperature.

    Args:
        env: Environment with discrete ``observation_space`` / ``action_space``
            whose ``step`` returns
            ``(state, reward, terminated, truncated, info)``.
        lr: Learning rate of the temporal-difference update.
        episodes: Number of training episodes.
        gamma: Discount factor.
        test_policy_freq: Evaluate and print the greedy policy every this many
            episodes; 0 disables the periodic evaluation.

    Returns:
        ``(pi, Q)``: the greedy policy ``pi(state) -> action`` and the
        action-value table of shape ``(nS, nA)``.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    Q = np.zeros((nS, nA), dtype=np.float64)
    # One temperature per episode, log-spaced from 10**0 down to 10**-2.
    temp_array = np.logspace(0,-2,num=episodes)

    # Greedy policy. It reads Q at call time, so it always reflects the latest
    # estimates, and it is defined even when episodes == 0.
    pi = lambda s: np.argmax(Q[s])

    for i in range(episodes):
        state, _ = env.reset()
        done = False
        while not done:
            action = select_action(state, Q,temp_array[i])
            next_state, reward, terminated, truncated, _ = env.step(action)
            # Bootstrap from the best next action unless the episode really
            # terminated; a truncated episode still has a successor value.
            target = reward + gamma * Q[next_state].max() * (not terminated)
            Q[state][action] += lr * (target - Q[state][action])
            state = next_state
            # End the episode on truncation too, so the environment is reset
            # before it is stepped again.
            done = terminated or truncated

        if test_policy_freq and i % test_policy_freq == 0:
            print("Test episode {} Reaches goal {:.2f}%. ".format
                  (i, test_game(env, pi)*100))

    return pi,Q

In [68]:
# Create a FrozenLake-v1 environment for the Q-learning run.
env = gym.make('FrozenLake-v1')
# Train for 50000 episodes with lr=0.01; keep the returned policy and Q-table.
policy_q_learning,Q_q_learning = q_learning(env,lr=0.01,episodes=50000)

Test episode 0 Reaches goal 0.00%. 
Test episode 1000 Reaches goal 5.00%. 
Test episode 2000 Reaches goal 26.00%. 
Test episode 3000 Reaches goal 22.00%. 
Test episode 4000 Reaches goal 18.00%. 
Test episode 5000 Reaches goal 31.00%. 
Test episode 6000 Reaches goal 16.00%. 
Test episode 7000 Reaches goal 25.00%. 
Test episode 8000 Reaches goal 23.00%. 
Test episode 9000 Reaches goal 23.00%. 
Test episode 10000 Reaches goal 43.00%. 
Test episode 11000 Reaches goal 29.00%. 
Test episode 12000 Reaches goal 29.00%. 
Test episode 13000 Reaches goal 40.00%. 
Test episode 14000 Reaches goal 48.00%. 
Test episode 15000 Reaches goal 45.00%. 
Test episode 16000 Reaches goal 42.00%. 
Test episode 17000 Reaches goal 30.00%. 
Test episode 18000 Reaches goal 29.00%. 
Test episode 19000 Reaches goal 20.00%. 
Test episode 20000 Reaches goal 65.00%. 
Test episode 21000 Reaches goal 71.00%. 
Test episode 22000 Reaches goal 41.00%. 
Test episode 23000 Reaches goal 70.00%. 
Test episode 24000 Reaches goal

In [62]:
# Show the policy returned by q_learning, then evaluate it once more.
print_policy(policy_q_learning, env)
print('Reaches goal {:.2f}%. '.format(100 * test_game(env, policy_q_learning)))

Policy:
| 00      < | 01      ^ | 02      > | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 74.00%. 
