In [1]:
import random
import gym
import numpy as np
import collections
from tqdm import tqdm
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import rl_utils
import copy

In [2]:
class PolicyNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(PolicyNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x), dim=1)

In [3]:
class ValueNet(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim):
        super(ValueNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [4]:
class ActorCritic:
    def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr,
                 gamma, device):
        # 策略网络
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim).to(device)
        self.critic = ValueNet(state_dim, hidden_dim).to(device)  # 价值网络
        # 策略网络优化器
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr)  # 价值网络优化器
        self.gamma = gamma
        self.device = device

    def take_action(self, state):
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        probs = self.actor(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item(),probs[0].tolist()

    def update(self, transition_dict):
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(
            self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)

        # 时序差分目标
        td_target = rewards + self.gamma * self.critic(next_states) * (1 -
                                                                       dones)
        td_delta = td_target - self.critic(states)  # 时序差分误差
        log_probs = torch.log(self.actor(states).gather(1, actions))
        actor_loss = torch.mean(-log_probs * td_delta.detach())
        # 均方误差损失函数
        critic_loss = torch.mean(
            F.mse_loss(self.critic(states), td_target.detach()))
        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()
        actor_loss.backward()  # 计算策略网络的梯度
        critic_loss.backward()  # 计算价值网络的梯度
        self.actor_optimizer.step()  # 更新策略网络的参数
        self.critic_optimizer.step()  # 更新价值网络的参数

In [5]:
class ReplayBuffer:
    ''' 经验回放池 '''
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)  # 队列,先进先出

    def add(self, state, action, reward, next_state, done):  # 将数据加入buffer
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):  # 从buffer中采样数据,数量为batch_size
        transitions = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = zip(*transitions)
        return np.array(state), action, reward, np.array(next_state), done

    def size(self):  # 目前buffer中数据的数量
        return len(self.buffer)

In [6]:
class Qnet(torch.nn.Module):
    ''' 只有一层隐藏层的Q网络 '''
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Qnet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))  # 隐藏层使用ReLU激活函数
        # x = torch.softmax(x,dim=1)
        return self.fc2(x)

In [7]:
class DQN:
    ''' DQN算法 '''
    def __init__(self, state_dim, hidden_dim, action_dim, learning_rate, gamma,
                 epsilon, target_update, device):
        self.action_dim = action_dim
        self.q_net = Qnet(state_dim, hidden_dim,
                          self.action_dim).to(device)  # Q网络
        # 目标网络
        self.target_q_net = Qnet(state_dim, hidden_dim,
                                 self.action_dim).to(device)
        # 使用Adam优化器
        self.optimizer = torch.optim.Adam(self.q_net.parameters(),
                                          lr=learning_rate)
        self.gamma = gamma  # 折扣因子
        self.epsilon = epsilon  # epsilon-贪婪策略
        self.target_update = target_update  # 目标网络更新频率
        self.count = 0  # 计数器,记录更新次数
        self.device = device

    def take_action(self, state):  # epsilon-贪婪策略采取动作
        if np.random.random() < self.epsilon:
            # action = np.random.randint(self.action_dim)
            action2 = np.random.random(2)
            action = action2.argmax()
        else:
            state = torch.tensor([state], dtype=torch.float).to(self.device)
            action = self.q_net(state).argmax().item()
            action2 =self.q_net(state)[0].tolist()
        return action,action2

    def update(self, transition_dict):
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(
            self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)

        q_values = self.q_net(states).gather(1, actions)  # Q值
        # 下个状态的最大Q值
        max_next_q_values = self.target_q_net(next_states).max(1)[0].view(
            -1, 1)
        q_targets = rewards + self.gamma * max_next_q_values * (1 - dones
                                                                )  # TD误差目标
        dqn_loss = torch.mean(F.mse_loss(q_values, q_targets))  # 均方误差损失函数
        self.optimizer.zero_grad()  # PyTorch中默认梯度会累积,这里需要显式将梯度置为0
        dqn_loss.backward()  # 反向传播更新参数
        self.optimizer.step()

        if self.count % self.target_update == 0:
            self.target_q_net.load_state_dict(
                self.q_net.state_dict())  # 更新目标网络
        self.count += 1

In [8]:
lr = 2e-3
actor_lr = 1e-3
critic_lr = 1e-2
num_episodes = 500
hidden_dim = 128
gamma = 0.98
epsilon = 0.01
target_update = 10
buffer_size = 10000
minimal_size = 500
batch_size = 64
device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")

env_name = 'CartPole-v0'
env = gym.make(env_name)
random.seed(0)
np.random.seed(0)
# env.seed(0)
torch.manual_seed(0)
replay_buffer = ReplayBuffer(buffer_size)
state_dim = env.observation_space.shape[0]
# print(state_dim)
action_dim = env.action_space.n
# print(action_dim)
# agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon,
#             target_update, device)
agent = ActorCritic(state_dim, hidden_dim, action_dim, actor_lr, critic_lr,
                    gamma, device)

return_list = []
for i in range(10):
    with tqdm(total=int(num_episodes / 10), desc='Iteration %d' % i) as pbar:
        for i_episode in range(int(num_episodes / 10)):
            episode_return = 0
            state = env.reset()[0]
            # print(state)
            done = False
            transition_dict = {'states': [], 'actions': [], 'next_states': [], 'rewards': [], 'dones': []}
            while not done:
                action,action2 = agent.take_action(state)
                # print(action2)
                # print(action)
                # env.step(action)
                next_state, reward, done, _,_ = env.step(action)
                transition_dict['states'].append(state)
                transition_dict['actions'].append(action)
                transition_dict['next_states'].append(next_state)
                transition_dict['rewards'].append(reward)
                transition_dict['dones'].append(done)
                # print(next_state, reward, done)
                # print(next_state)
                # next_state = next_state[0]
                # replay_buffer.add(state, action, reward, next_state, done)
                
                state = next_state
                episode_return += reward
                # 当buffer数据的数量超过一定值后,才进行Q网络训练
                # if replay_buffer.size() > minimal_size:
                #     b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                #     transition_dict = {
                #         'states': b_s,
                #         'actions': b_a,
                #         'next_states': b_ns,
                #         'rewards': b_r,
                #         'dones': b_d
                #     }
            agent.update(transition_dict)
            return_list.append(episode_return)
            if (i_episode + 1) % 10 == 0:
                pbar.set_postfix({
                    'episode':
                    '%d' % (num_episodes / 10 * i + i_episode + 1),
                    'return':
                    '%.3f' % np.mean(return_list[-10:])
                })
            pbar.update(1)

  logger.warn(
  state = torch.tensor([state], dtype=torch.float).to(self.device)
  if not isinstance(terminated, (bool, np.bool8)):
Iteration 0:  84%|████████▍ | 42/50 [00:04<00:00,  9.25it/s, episode=40, return=19.100]


ValueError: Expected parameter probs (Tensor of shape (1, 2)) of distribution Categorical(probs: torch.Size([1, 2])) to satisfy the constraint Simplex(), but found invalid values:
tensor([[nan, nan]], device='cuda:0', grad_fn=<DivBackward0>)

In [None]:
import pickle 

from copy import copy
data = []
observations = []
actions = []
rewards = []
dones = []
next_observations = []

for episode in range(50):
    observations = []
    actions = []
    rewards = []
    dones = []
    next_observations = []
    state = env.reset()[0]
    done = False
    rs = 0
    for step in range(200):
        # env.render()
        tmp_state = copy(state)
        action,action2 = agent.take_action(state)
        next_state, reward, done, _,_ = env.step(action)

        observations.append(state)
        actions.append(action2)
        print(action2)

        # if action == 0:
        #     actions.append([1, 0])
        # else:
        #     actions.append([0, 1])
        rewards.append([reward])
        dones.append([done])
        next_observations.append(next_state)
        rs+=reward

        state = next_state
        if done:
            break
    print(rs)
    data.append({
        'observations': observations,
        'actions': actions,
        'rewards': rewards,
        'dones': dones,
        'next_observations': next_observations
    })

    #     one_date.append(tmp_state)
    # data.append(one_date)
env.close()
# data['observations'] = observations
# data['actions'] = actions
# data['rewards'] = rewards
# data['dones'] = dones
# data['next_observations'] = next_observations
with open('data.pkl', 'wb') as f:
    pickle.dump(data, f)

[56.154075622558594, 55.94548034667969]
[55.9342041015625, 56.018768310546875]
[56.17174530029297, 55.969112396240234]
[55.931522369384766, 56.03379821777344]
[56.18491744995117, 55.988258361816406]
[55.923641204833984, 56.04429244995117]
[56.19386291503906, 56.003360748291016]
[55.91044998168945, 56.05051803588867]
[56.198726654052734, 56.0147590637207]
[55.8917236328125, 56.052650451660156]
[56.1995735168457, 56.02269744873047]
[55.867103576660156, 56.05076217651367]
[56.196327209472656, 56.02730941772461]
[55.83606719970703, 56.0448112487793]
[56.18885040283203, 56.02866744995117]
[55.797969818115234, 56.034671783447266]
[56.17687225341797, 56.02675247192383]
[55.75196838378906, 56.02009201049805]
[56.16002655029297, 56.02145767211914]
[55.697044372558594, 56.00072479248047]
[56.13225555419922, 56.01066207885742]
[55.63197326660156, 55.976104736328125]
[56.061527252197266, 55.98318099975586]
[55.555274963378906, 55.945648193359375]
[55.97831344604492, 55.949501037597656]
[55.5217742

In [None]:
data[0]['actions'][0]

[56.154075622558594, 55.94548034667969]

In [None]:
# [
#     {
#         "observations": [[], [], [], [], [], [], [], [], [], []],
#         "actions": [[], [], [], [], [], [], [], [], [], []],
#         "rewards": [[], [], [], [], [], [], [], [], [], []],
#         "dones": [[], [], [], [], [], [], [], [], [], []],
#         "next_observations": [[], [], [], [], [], [], [], [], [], []]
#     },
#     {

#     },
#     {

#     },
# ]


In [None]:
# [[],[],[],[]]