# 十分钟强化学习第四讲：蒙特卡罗方法

![FrozenLake 游戏演示](frozen_lake.gif)

In [1]:
from help import FrozenLake, print_policy, test_game
import numpy as np

In [2]:
def select_action(state, Q, mode="both", epsilon=0.5):
    """Pick an action index for ``state`` from the action-value table ``Q``.

    Args:
        state: Index into ``Q``; ``Q[state]`` is the sequence of action values.
        Q: Indexable action-value table.
        mode: ``"explore"`` draws a uniformly random action index,
            ``"exploit"`` returns the greedy ``np.argmax(Q[state])``, and
            ``"both"`` mixes the two using ``epsilon``.
        epsilon: Only used when ``mode == "both"``. The greedy action is taken
            when a uniform draw from [0, 1) is greater than ``epsilon``;
            otherwise a random action is drawn. The default 0.5 keeps the
            original even split between exploring and exploiting.

    Returns:
        The selected action index.

    Raises:
        ValueError: If ``mode`` is not one of the three supported values
            (previously this fell through and silently returned ``None``).
    """
    if mode not in ("explore", "exploit", "both"):
        raise ValueError(
            "mode must be 'explore', 'exploit' or 'both', got {!r}".format(mode)
        )

    if mode == "exploit":
        # np.argmax returns the first maximal index, so ties go to the
        # lowest-numbered action.
        return np.argmax(Q[state])

    if mode == "explore":
        return np.random.randint(len(Q[state]))

    # mode == "both": one uniform draw decides between greedy and random.
    if np.random.random() > epsilon:
        return np.argmax(Q[state])
    return np.random.randint(len(Q[state]))

In [3]:
def play_game(env, Q, max_steps=200):
    """Play one episode in ``env`` and record every transition.

    Actions are chosen by ``select_action(state, Q, mode='both')``, i.e. a mix
    of greedy and random moves.

    Args:
        env: Environment exposing ``reset()`` (returns the initial state) and
            ``step(action)`` (returns ``(next_state, reward, finished)``).
        Q: Action-value table forwarded to ``select_action``.
        max_steps: Upper bound on the number of transitions recorded. The
            episode is cut off once this many steps have been taken, even if
            the environment has not reported ``finished``.

    Returns:
        An object-dtype NumPy array built from the list of
        ``(state, action, finished, reward)`` tuples, one per step. If
        ``max_steps`` is 0 or negative no step is taken and the array is empty.
    """
    state = env.reset()
    episode = []
    finished = False
    step = 0

    # The step cap lives in the loop condition so that at most ``max_steps``
    # transitions are stored (checking it after the append allowed one extra).
    while not finished and step < max_steps:
        action = select_action(state, Q, mode='both')
        next_state, reward, finished = env.step(action)
        # Reward is kept in the last slot of each record.
        episode.append((state, action, finished, reward))
        state = next_state
        step += 1

    return np.array(episode, dtype=object)

In [4]:
def monte_carlo(env, episodes=10000, test_policy_freq=1000, gamma=0.9):
    """Estimate Q with first-visit Monte Carlo and return the greedy policy.

    For every episode produced by ``play_game``, the discounted return that
    follows the first occurrence of each (state, action) pair is averaged into
    ``Q[state, action]``.

    Args:
        env: Environment passed through to ``play_game`` and ``test_game``.
        episodes: Number of training episodes. With 0 the untrained (all-zero)
            table and its greedy policy are returned.
        test_policy_freq: Every this many episodes the greedy policy is passed
            to ``test_game`` and the result, scaled by 100, is printed as a
            percentage. A falsy value (0 or None) disables these evaluations.
        gamma: Discount factor applied per step when accumulating returns.
            The default 0.9 matches the previously hard-coded value.

    Returns:
        ``(pi, Q)`` where ``Q`` is the ``(16, 4)`` float64 action-value table
        and ``pi(s)`` returns ``np.argmax(Q[s])``. ``pi`` reads ``Q`` live, so
        it always reflects the latest estimates.
    """
    # Table size is hard-coded; presumably the 4x4 FrozenLake grid with four
    # moves -- TODO confirm against the environment.
    nS, nA = 16, 4
    Q = np.zeros((nS, nA), dtype=np.float64)

    # Running sum and count of first-visit returns per (state, action). This
    # yields the same average as storing every return in a list, without the
    # unbounded memory growth or the re-summing on every update.
    return_sums = np.zeros((nS, nA), dtype=np.float64)
    return_counts = np.zeros((nS, nA), dtype=np.int64)

    def pi(s):
        """Greedy action for state ``s`` under the current ``Q``."""
        return np.argmax(Q[s])

    for i in range(episodes):
        episode = play_game(env, Q)

        if len(episode) > 0:
            # Reward is the last field of every recorded transition.
            rewards = episode[:, -1].astype(np.float64)

            # One backward pass gives the discounted return from every
            # timestep: G_t = r_t + gamma * G_{t+1}.
            returns_from = np.empty(len(rewards), dtype=np.float64)
            running = 0.0
            for t in range(len(rewards) - 1, -1, -1):
                running = rewards[t] + gamma * running
                returns_from[t] = running

            visited = np.zeros((nS, nA), dtype=bool)
            for t, (state, action, _, _) in enumerate(episode):
                if visited[state, action]:
                    continue  # first-visit: later occurrences are ignored
                visited[state, action] = True
                return_sums[state, action] += returns_from[t]
                return_counts[state, action] += 1
                Q[state, action] = (
                    return_sums[state, action] / return_counts[state, action]
                )

        if test_policy_freq and i % test_policy_freq == 0:
            print("Test episode {} Reaches goal {:.2f}%. ".format(
                i, test_game(env, pi) * 100))

    return pi, Q

In [5]:
# Instantiate the FrozenLake environment provided by the local `help` module.
env = FrozenLake()

In [8]:
# Seed NumPy's global RNG so the random draws made by select_action are
# repeatable across Restart & Run All. Whether FrozenLake draws from the same
# RNG is not visible here -- TODO confirm. Re-run this cell to refresh the
# recorded output, which was produced before the seed was added.
np.random.seed(0)
policy_mc, Q = monte_carlo(env, episodes=20000)

Test episode 0 Reaches goal 0.00%. 
Test episode 1000 Reaches goal 7.00%. 
Test episode 2000 Reaches goal 13.00%. 
Test episode 3000 Reaches goal 31.00%. 
Test episode 4000 Reaches goal 48.00%. 
Test episode 5000 Reaches goal 38.00%. 
Test episode 6000 Reaches goal 43.00%. 
Test episode 7000 Reaches goal 66.00%. 
Test episode 8000 Reaches goal 71.00%. 
Test episode 9000 Reaches goal 78.00%. 
Test episode 10000 Reaches goal 70.00%. 
Test episode 11000 Reaches goal 76.00%. 
Test episode 12000 Reaches goal 78.00%. 
Test episode 13000 Reaches goal 79.00%. 
Test episode 14000 Reaches goal 68.00%. 
Test episode 15000 Reaches goal 71.00%. 
Test episode 16000 Reaches goal 70.00%. 
Test episode 17000 Reaches goal 68.00%. 
Test episode 18000 Reaches goal 68.00%. 
Test episode 19000 Reaches goal 71.00%. 


In [7]:
# Display the policy returned by monte_carlo, then evaluate it once more with
# test_game; the result is scaled by 100 and printed as a percentage.
print_policy(policy_mc)
print('Reaches goal {:.2f}%. '.format(test_game(env, policy_mc)*100))

Policy:
| 00      > | 01      ^ | 02      < | 03      ^ |
| 04      < |           | 06      < |           |
| 08      ^ | 09      v | 10      < |           |
|           | 13      > | 14      v |           |
Reaches goal 42.00%. 


蒙特卡罗方法的缺点：
- 必须等到一轮游戏（episode）结束后才能更新 Q 值
- 回报由整条轨迹累积而来，噪声（方差）较大，学习效率较低