# 十分钟强化学习第七讲：从Q表到神经网络

### 使用Q表的缺点：
- 无法处理state/acton过多的情况
- 无法处理连续值的state/action
- 不具备泛化能力

In [29]:
import torch
from torch.nn import Linear
import numpy as np
import gym

In [30]:
def decay_schedule(init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):
    decay_steps = int(max_steps * decay_ratio)
    rem_steps = max_steps - decay_steps
    values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[::-1]
    values = (values - values.min()) / (values.max() - values.min())
    values = (init_value - min_value) * values + min_value
    values = np.pad(values, (0, rem_steps), 'edge')
    return values

In [31]:
def one_hot(x,size):
    result = np.zeros(size)
    result[x] = 1
    return result 

In [32]:
def conv2tensor(x,size):
    x = one_hot(x,size)
    x = torch.from_numpy(x).float()
    return x

In [33]:
def select_action(q_value, epsilon):
    q_value_np = q_value.clone().detach().numpy().squeeze()
    if np.random.random() > epsilon:
        final_move = q_value_np.argmax()
    else:
        final_move = np.random.randint(len(q_value_np))
    return final_move

In [34]:
def Simple_DQN(env,lr = 0.001,episodes=100, max_step = 100,gamma=0.9,test_policy_freq=100):

    nS, nA = env.observation_space.n, env.action_space.n
    epsilons = decay_schedule(1,0.01,0.8, episodes)

    model = Linear(nS, nA)
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    results = []
    
    for i in range(episodes): 
        state, _ = env.reset()
        state = conv2tensor(state,nS)
        finished = False
        step = 0
        while not finished :
            q_value = model(state)

            # take action
            action = select_action(q_value,epsilons[i])
            next_state, reward, finished, _, _ = env.step(action)
            next_state = conv2tensor(next_state,nS)

            # find target
            target = q_value.clone().detach()
            q_value_next = model(next_state).detach().numpy().squeeze()
            td_target = reward + gamma * q_value_next.max() * (not finished)
            target[action] = td_target
            
            optimizer.zero_grad()
            td_error = loss_fn(q_value,target)
            td_error.backward()
            optimizer.step()
            state = next_state

            step += 1
            if step >= max_step:
                break

        if finished:
            results.append(reward)

        
        if (i>0) and (i % test_policy_freq == 0):
            results_array = np.array(results)
            print("Running episode  {} Reaches goal {:.2f}%. ".format(
                i, 
                results_array[-100:].mean()*100))

    return 

In [35]:
env = gym.make('FrozenLake-v1')
Simple_DQN(env,lr = 0.001,episodes=20000, max_step = 100,gamma=0.9,test_policy_freq=1000)

Running episode  1000 Reaches goal 3.00%. 
Running episode  2000 Reaches goal 3.00%. 
Running episode  3000 Reaches goal 3.00%. 
Running episode  4000 Reaches goal 9.00%. 
Running episode  5000 Reaches goal 10.00%. 
Running episode  6000 Reaches goal 12.00%. 
Running episode  7000 Reaches goal 24.00%. 
Running episode  8000 Reaches goal 21.00%. 
Running episode  9000 Reaches goal 24.00%. 
Running episode  10000 Reaches goal 37.00%. 
Running episode  11000 Reaches goal 36.00%. 
Running episode  12000 Reaches goal 41.00%. 
Running episode  13000 Reaches goal 62.00%. 
Running episode  14000 Reaches goal 40.00%. 
Running episode  15000 Reaches goal 59.00%. 
Running episode  16000 Reaches goal 49.00%. 
Running episode  17000 Reaches goal 51.00%. 
Running episode  18000 Reaches goal 61.00%. 
Running episode  19000 Reaches goal 67.00%. 


In [36]:
env = gym.make('FrozenLake-v1',map_name="8x8")
Simple_DQN(env,lr = 0.001,episodes=20000, max_step = 100,gamma=0.9,test_policy_freq=1000)

Running episode  1000 Reaches goal 1.00%. 
Running episode  2000 Reaches goal 2.00%. 
Running episode  3000 Reaches goal 0.00%. 
Running episode  4000 Reaches goal 0.00%. 
Running episode  5000 Reaches goal 2.00%. 
Running episode  6000 Reaches goal 2.00%. 
Running episode  7000 Reaches goal 2.00%. 
Running episode  8000 Reaches goal 6.00%. 
Running episode  9000 Reaches goal 2.00%. 
Running episode  10000 Reaches goal 6.00%. 
Running episode  11000 Reaches goal 18.00%. 
Running episode  12000 Reaches goal 15.00%. 
Running episode  13000 Reaches goal 14.00%. 
Running episode  14000 Reaches goal 15.00%. 
Running episode  15000 Reaches goal 8.00%. 
Running episode  16000 Reaches goal 10.00%. 
Running episode  17000 Reaches goal 2.00%. 
Running episode  18000 Reaches goal 2.00%. 
Running episode  19000 Reaches goal 10.00%. 


### 面临的问题
- Non-stationary target
- No independent and identically distributed