In [1]:
import torch
from torch.nn import Linear
import numpy as np
import random
import gym

In [2]:
def one_hot(x,size):
    """Build a zero array of shape ``size`` and set the entries at index ``x`` to 1.

    Args:
        x: Index (anything NumPy accepts as an index) of the hot entry.
        size: Shape passed straight to ``np.zeros``.

    Returns:
        A float NumPy array that is 1 at ``x`` and 0 everywhere else.
    """
    encoded = np.zeros(size)
    encoded[x] = 1.0
    return encoded

In [3]:
def conv2tensor(x,size):
    """One-hot encode index ``x`` (via ``one_hot``) and return it as a float torch tensor."""
    # from_numpy gives a float64 tensor; .float() converts it to float32 for the model.
    return torch.from_numpy(one_hot(x, size)).float()

In [5]:
def get_action(q_value, n_game, max_explore=2000):
    """Pick an action from Q-values using a linearly decaying exploration schedule.

    A random integer in ``[0, max_explore]`` is compared with
    ``max_explore - n_game``: while it is smaller, the action is sampled from a
    softmax over the Q-values; otherwise the greedy (argmax) action is taken.
    Once ``n_game >= max_explore`` the choice is always greedy.

    Args:
        q_value: Torch tensor of Q-values for the current state.
        n_game: Index of the current episode; larger values mean less exploration.
        max_explore: Number of episodes over which exploration decays
            (defaults to the previously hard-coded 2000).

    Returns:
        The index of the chosen action (a NumPy integer).
    """
    q_value_np = q_value.detach().numpy().squeeze()
    epsilon = max_explore - n_game
    if random.randint(0, max_explore) < epsilon:
        # Subtract the max before exponentiating: the softmax is unchanged
        # mathematically, but exp() can no longer overflow to inf and turn the
        # probabilities into NaN (which makes np.random.choice raise).
        exp_q = np.exp(q_value_np - q_value_np.max())
        prob = exp_q / exp_q.sum()
        final_move = np.random.choice(len(prob), p=prob)
    else:
        final_move = q_value_np.argmax()
    return final_move

In [6]:
def Simple_DQN(env,lr = 0.001,episodes=100, max_step = 100,gamma=0.9,test_policy_freq=100):
    """Train a single linear layer as a Q-function with online one-step TD updates.

    States are one-hot encoded with ``conv2tensor`` and actions are chosen with
    ``get_action``. After every environment step the Q-value of the action taken
    is regressed (MSE) towards ``reward + gamma * max_a Q(next_state, a)``, with
    the bootstrap term dropped when the episode terminated.

    Args:
        env: Environment whose ``observation_space.n`` and ``action_space.n``
            give the number of states and actions. ``reset()`` must return
            ``(state, info)`` and ``step(action)`` a five-tuple; the third and
            fourth items are treated as the terminated / truncated flags
            (Gym >= 0.26 style API -- confirm against the installed version).
        lr: Learning rate for the Adam optimizer.
        episodes: Number of episodes to run.
        max_step: Maximum number of steps per episode.
        gamma: Discount factor.
        test_policy_freq: Print the recent success rate every this many episodes.

    Returns:
        None. Progress is reported through ``print``.
    """
    nS, nA = env.observation_space.n, env.action_space.n
    model = Linear(nS, nA)
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    results = []
    for i in range(episodes):
        state, _ = env.reset()
        state = conv2tensor(state, nS)
        finished = False
        step = 0
        while not finished:
            q_value = model(state)

            # Select an action and interact with the environment.
            action = get_action(q_value, n_game=i)
            next_state, reward, finished, truncated, _ = env.step(action)
            next_state = conv2tensor(next_state, nS)

            # Compute the target value: only the entry of the action taken
            # differs from the current prediction, so only it produces a loss.
            with torch.no_grad():
                best_next_q = model(next_state).max().item()
            target = q_value.clone().detach()
            # Bootstrap only when the episode did not terminate; a truncation
            # (time limit) is not a terminal state, so it still bootstraps.
            target[action] = reward + gamma * best_next_q * (not finished)

            optimizer.zero_grad()
            td_error = loss_fn(q_value, target)
            td_error.backward()
            optimizer.step()
            state = next_state
            step += 1
            # Stop on the env's own time limit as well as on our step cap, so
            # we never keep stepping an environment that asked for a reset.
            if truncated or step >= max_step:
                break

        # Record every episode. Episodes that ran out of steps count as a
        # failure (0) instead of being dropped, otherwise the success rate is
        # computed only over episodes that happened to terminate.
        results.append(reward if finished else 0.0)

        if (i > 0) and (i % test_policy_freq == 0):
            results_array = np.array(results)
            print("Running episode  {} Reaches goal {:.2f}%. ".format(
                i,
                results_array[-100:].mean()*100))

    return

In [8]:
# Train the linear Q-function on the default FrozenLake map.
env = gym.make('FrozenLake-v1')
Simple_DQN(
    env,
    lr=0.001,
    episodes=5000,
    max_step=100,
    gamma=0.9,
    test_policy_freq=200,
)

Running episode  200 Reaches goal 0.00%. 
Running episode  400 Reaches goal 2.00%. 
Running episode  600 Reaches goal 2.00%. 
Running episode  800 Reaches goal 4.00%. 
Running episode  1000 Reaches goal 1.00%. 
Running episode  1200 Reaches goal 5.00%. 
Running episode  1400 Reaches goal 5.00%. 
Running episode  1600 Reaches goal 7.00%. 
Running episode  1800 Reaches goal 8.00%. 
Running episode  2000 Reaches goal 34.00%. 
Running episode  2200 Reaches goal 54.00%. 
Running episode  2400 Reaches goal 55.00%. 
Running episode  2600 Reaches goal 47.00%. 
Running episode  2800 Reaches goal 49.00%. 
Running episode  3000 Reaches goal 47.00%. 
Running episode  3200 Reaches goal 66.00%. 
Running episode  3400 Reaches goal 46.00%. 
Running episode  3600 Reaches goal 38.00%. 
Running episode  3800 Reaches goal 37.00%. 
Running episode  4000 Reaches goal 40.00%. 
Running episode  4200 Reaches goal 41.00%. 
Running episode  4400 Reaches goal 44.00%. 
Running episode  4600 Reaches goal 39.00%. 
R

In [10]:
# Same training run as above, but on the larger 8x8 FrozenLake map.
env = gym.make('FrozenLake-v1', map_name="8x8")
Simple_DQN(
    env,
    lr=0.001,
    episodes=5000,
    max_step=100,
    gamma=0.9,
    test_policy_freq=200,
)

Running episode  200 Reaches goal 0.00%. 
Running episode  400 Reaches goal 0.00%. 
Running episode  600 Reaches goal 0.00%. 
Running episode  800 Reaches goal 0.00%. 
Running episode  1000 Reaches goal 0.00%. 
Running episode  1200 Reaches goal 1.00%. 
Running episode  1400 Reaches goal 0.00%. 
Running episode  1600 Reaches goal 2.00%. 
Running episode  1800 Reaches goal 0.00%. 
Running episode  2000 Reaches goal 1.00%. 
Running episode  2200 Reaches goal 2.00%. 
Running episode  2400 Reaches goal 0.00%. 
Running episode  2600 Reaches goal 0.00%. 
Running episode  2800 Reaches goal 0.00%. 
Running episode  3000 Reaches goal 0.00%. 
Running episode  3200 Reaches goal 0.00%. 
Running episode  3400 Reaches goal 0.00%. 
Running episode  3600 Reaches goal 0.00%. 
Running episode  3800 Reaches goal 0.00%. 
Running episode  4000 Reaches goal 0.00%. 
Running episode  4200 Reaches goal 0.00%. 
Running episode  4400 Reaches goal 0.00%. 
Running episode  4600 Reaches goal 0.00%. 
Running episode

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import gym
import numpy as np
import random
from collections import deque

In [97]:
class Linear_QNet(nn.Module):
    """Two-layer fully connected Q-network: input -> hidden (ReLU) -> output.

    Args:
        input_size: Width of the input vector.
        hidden_size: Width of the hidden layer.
        output_size: Number of outputs (one Q-value per action).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # The first layer must be named ``linear1`` (forward() uses that name)
        # and must project to ``hidden_size`` so its output matches linear2's input.
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the network output for ``x`` (last dimension ``input_size``)."""
        x = F.relu(self.linear1(x))
        x = self.linear2(x)
        return x

In [98]:
class QTrainer:
    """Owns an online Q-network, a target copy of it, and the optimisation step.

    The online network is trained with Adam and a Smooth-L1 loss; the target
    network only changes when ``copy_model`` is called.
    """

    def __init__(self, lr, gamma,input_dim, hidden_dim, output_dim):
        self.gamma = gamma
        # Online network first, then the target network (same architecture).
        self.model = Linear_QNet(input_dim, hidden_dim, output_dim)
        self.target_model = Linear_QNet(input_dim, hidden_dim, output_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.SmoothL1Loss()
        # Start with both networks holding identical weights.
        self.copy_model()

    def copy_model(self):
        """Overwrite the target network's weights with the online network's."""
        online_weights = self.model.state_dict()
        self.target_model.load_state_dict(online_weights)

    def train_step(self, state, action, reward, next_state, done):
        """Run one gradient step on the given transition(s).

        The inputs are converted with ``torch.tensor``; ``action`` is used as
        an index along the last dimension of the online network's output.
        The regression target is
        ``reward + gamma * max_a Q_target(next_state, a) * (1 - done)``.
        """
        state_t = torch.tensor(state, dtype=torch.float)
        next_state_t = torch.tensor(next_state, dtype=torch.float)
        action_idx = torch.tensor(action, dtype=torch.long).unsqueeze(-1)
        reward_t = torch.tensor(reward, dtype=torch.float)
        not_done = 1 - torch.tensor(done, dtype=torch.long)

        # Q-value the online network assigns to the action actually taken.
        predicted = self.model(state_t).gather(-1, action_idx).squeeze()

        # The target network only provides a constant; no gradient is needed.
        with torch.no_grad():
            best_next = self.target_model(next_state_t).max(-1).values
        expected = (reward_t + self.gamma * best_next * not_done).squeeze()

        self.optimizer.zero_grad()
        loss = self.criterion(predicted, expected)
        loss.backward()
        self.optimizer.step()

In [99]:
class Agent:
    """DQN agent: replay memory, exploration policy and a ``QTrainer``.

    Args:
        env: Accepted for signature compatibility; not used by this class.
        state_space: Size of the (one-hot) state vector.
        action_space: Number of discrete actions.
        hidden_dim: Hidden width forwarded to ``QTrainer``.
        max_explore: Number of games over which exploration decays to zero.
        gamma: Discount factor forwarded to ``QTrainer``.
        max_memory: Maximum number of transitions kept in the replay memory.
        lr: Learning rate forwarded to ``QTrainer``.
    """

    def __init__(self,env,state_space, action_space, hidden_dim = 16,max_explore=1000, gamma = 0.9,
                max_memory=5000, lr=0.001):
        self.max_explore = max_explore
        self.memory = deque(maxlen=max_memory)  # oldest transitions drop out first
        self.nS = state_space
        self.nA = action_space
        self.step = 0
        self.n_game = 0
        self.trainer = QTrainer(lr, gamma, self.nS, hidden_dim, self.nA)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def train_long_memory(self,batch_size):
        """Train on up to ``batch_size`` transitions sampled from the replay memory.

        Does nothing when the memory is empty (there is nothing to unpack).
        """
        if not self.memory:
            return
        if len(self.memory) > batch_size:
            mini_sample = random.sample(self.memory, batch_size)  # list of tuples
        else:
            mini_sample = self.memory
        states, actions, rewards, next_states, dones = zip(*mini_sample)
        states = np.array(states)
        next_states = np.array(next_states)
        self.trainer.train_step(states, actions, rewards, next_states, dones)

    def get_action(self, state, n_game, explore=True):
        """Choose an action for ``state``.

        While exploring, a random integer in ``[0, max_explore]`` is compared
        with ``max_explore - n_game``; if it is smaller the action is sampled
        from a softmax over the predicted Q-values, otherwise (or when
        ``explore`` is False) the greedy action is returned.

        Returns:
            The index of the chosen action (a NumPy integer).
        """
        state = torch.tensor(state, dtype=torch.float)
        prediction = self.trainer.model(state).detach().numpy().squeeze()
        epsilon = self.max_explore - n_game
        if explore and random.randint(0, self.max_explore) < epsilon:
            # Shift by the max so exp() cannot overflow to inf and produce NaN
            # probabilities; the resulting softmax is mathematically identical.
            exp_q = np.exp(prediction - prediction.max())
            prob = exp_q / exp_q.sum()
            final_move = np.random.choice(len(prob), p=prob)
        else:
            final_move = prediction.argmax()
        return final_move

    @staticmethod
    def one_hot(x,size):
        """Return a zero array of shape ``size`` with the entries at index ``x`` set to 1."""
        result = np.zeros(size)
        result[x] = 1
        return result

In [88]:
def train(env, max_game=5000, max_step=100, batch_size=256, target_update_freq=10, report_freq=200):
    """Train a DQN ``Agent`` (replay memory + target network) on ``env``.

    Args:
        env: Environment whose ``observation_space.n`` / ``action_space.n``
            give the number of states and actions. ``reset()`` must return
            ``(state, info)`` and ``step(action)`` a five-tuple; the third and
            fourth items are treated as the terminated / truncated flags
            (Gym >= 0.26 style API -- confirm against the installed version).
        max_game: Number of games (episodes) to play.
        max_step: Maximum number of steps per game.
        batch_size: Number of transitions sampled for each training step.
        target_update_freq: Copy the online network into the target network
            every this many environment steps.
        report_freq: Print the recent success rate every this many games.

    Returns:
        None. Progress is reported through ``print``.
    """
    nS = env.observation_space.n
    agent = Agent(env,
                  state_space=nS,
                  action_space=env.action_space.n,
                  hidden_dim=16,
                  max_explore=2000, gamma=0.9,
                  max_memory=50000, lr=0.001)
    results = []
    state_new, _ = env.reset()
    state_new = Agent.one_hot(state_new, nS)
    total_step = 0
    # Strict '<' so exactly max_game games are played ('<=' played one extra).
    while agent.n_game < max_game:
        state_old = state_new
        action = agent.get_action(state_old, agent.n_game, explore=True)
        state_new, reward, done, truncated, _ = env.step(action)
        state_new = Agent.one_hot(state_new, nS)
        # Only real termination is stored as 'done'; a time-limit cut-off is
        # not a terminal state, so the trainer should still bootstrap from it.
        agent.remember(state_old, action, reward, state_new, done)
        agent.train_long_memory(batch_size=batch_size)
        agent.step += 1
        total_step += 1

        if total_step % target_update_freq == 0:
            agent.trainer.copy_model()

        # End the game on termination, on the env's own time limit, or once
        # max_step steps have been taken ('>=' caps the game at max_step steps).
        if done or truncated or agent.step >= max_step:
            results.append(reward > 0)
            state_new, _ = env.reset()
            state_new = Agent.one_hot(state_new, nS)
            agent.step = 0
            agent.n_game += 1

            if agent.n_game % report_freq == 0:
                # Success rate over the most recent (up to) 100 games, as a
                # percentage; a raw sum is only a percentage for exactly 100.
                print("Running episode  {}, step {} Reaches goal {:.2f}%. ".format(
                    agent.n_game, total_step, 100.0 * np.mean(results[-100:])))



In [89]:
# Train the replay-memory / target-network agent on the 8x8 FrozenLake map.
env = gym.make('FrozenLake-v1', map_name="8x8")
train(env, max_game=5000)

Running episode  200, step 6256 Reaches goal 0.00%. 
Running episode  400, step 12852 Reaches goal 0.00%. 
Running episode  600, step 19180 Reaches goal 0.00%. 
Running episode  800, step 26462 Reaches goal 3.00%. 
Running episode  1000, step 33680 Reaches goal 0.00%. 
Running episode  1200, step 41236 Reaches goal 5.00%. 
Running episode  1400, step 49514 Reaches goal 4.00%. 
Running episode  1600, step 57593 Reaches goal 12.00%. 
Running episode  1800, step 66808 Reaches goal 22.00%. 
Running episode  2000, step 76686 Reaches goal 30.00%. 
Running episode  2200, step 88129 Reaches goal 32.00%. 
Running episode  2400, step 99131 Reaches goal 32.00%. 
Running episode  2600, step 109721 Reaches goal 25.00%. 
Running episode  2800, step 118519 Reaches goal 20.00%. 
Running episode  3000, step 129145 Reaches goal 32.00%. 
Running episode  3200, step 138852 Reaches goal 29.00%. 
Running episode  3400, step 149629 Reaches goal 34.00%. 
Running episode  3600, step 160265 Reaches goal 33.00%.