In [7]:
import random
import gym
import math
import numpy as np
import collections
from tqdm import tqdm
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import rl_utils
import itertools
from gym import spaces
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"


class VAnet(torch.nn.Module):
    ''' 只有一层隐藏层的A网络和V网络 '''
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(VAnet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)  # 共享网络部分  #共享网络部分的线性层
        self.fc_A = torch.nn.Linear(hidden_dim, action_dim)
        self.fc_V = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        A = self.fc_A(F.relu(self.fc1(x)))
        V = self.fc_V(F.relu(self.fc1(x)))
        Q = V + A - A.mean(1).view(-1, 1)  # Q值由V值和A值计算得到
        return Q
    
#构造网络，训练网络和目标网络共用该结构
class Qnet(torch.nn.Module):
    ''' 只有一层隐藏层的Q网络 '''
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(Qnet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))  # 隐藏层使用ReLU激活函数
        return self.fc2(x)

In [10]:
#模型构建
class DQN:
    ''' DQN算法,包括Double DQN和Dueling DQN '''
    def __init__(self,
                 state_dim,
                 hidden_dim,
                 action_dim,
                 learning_rate,
                 gamma,
                 epsilon_start,
                 epsilon_end,
                 decay_rate,
                 target_update,
                 device,
                 dqn_type='VanillaDQN'):
        self.action_dim = action_dim
        if dqn_type == 'DuelingDQN':  # Dueling DQN采取不一样的网络框架
            self.q_net = VAnet(state_dim, hidden_dim,
                               self.action_dim).to(device)
            self.target_q_net = VAnet(state_dim, hidden_dim,
                                      self.action_dim).to(device)
        else:
            self.q_net = Qnet(state_dim, hidden_dim,
                              self.action_dim).to(device)
            self.target_q_net = Qnet(state_dim, hidden_dim,
                                     self.action_dim).to(device)
        self.optimizer = torch.optim.Adam(self.q_net.parameters(),
                                          lr=learning_rate)
        self.gamma = gamma
#         self.epsilon = epsilon

        self.frame_idx = 0  # 用于epsilon的衰减计数
        self.epsilon = epsilon_start
        self.epsilon_start=epsilon_start
        self.epsilon_end = epsilon_end
        self.decay_rate = decay_rate
        
        self.target_update = target_update
        self.count = 0
        self.dqn_type = dqn_type
        self.device = device

    def update_epsilon(self,step):
        if self.epsilon > self.epsilon_end:
            
            # 逐步递减epsilon值
            decay_factor=math.exp(math.log(self.epsilon_end/self.epsilon_start)/self.decay_rate)
            self.epsilon=self.epsilon_start*(decay_factor**step)
            #self.frame_idx+=1
            print(f"Epsilon: {self.epsilon}")
            
    def take_action(self, state):
        if np.random.random() < self.epsilon:
            action = np.random.randint(self.action_dim)
        else:
            state = torch.tensor([state], dtype=torch.float).to(self.device)
            action = self.q_net(state).argmax().item()
        return action

    def max_q_value(self, state): #计算给定状态的最大Q值
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        return self.q_net(state).max().item()

    def update(self, transition_dict):
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(
            self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)

        q_values = self.q_net(states).gather(1, actions)
        if self.dqn_type == 'DoubleDQN':
            ## .max(1)输出tuple每个特征的最大state_value及其索引，[1]获取的每个特征的动作索引shape=[b]
            max_action = self.q_net(next_states).max(1)[1].view(-1, 1)
            # 下个状态的state_value。下一时刻的状态输入到目标网络，得到每个动作对应的奖励，使用训练出来的action索引选取最优动作
            max_next_q_values = self.target_q_net(next_states).gather(
                1, max_action)
        else:
            max_next_q_values = self.target_q_net(next_states).max(1)[0].view(
                -1, 1)
        #目标网络计算出的，当前状态的state_value
        q_targets = rewards + self.gamma * max_next_q_values * (1 - dones)
        #预测值和目标值的均方误差损失
        dqn_loss = torch.mean(F.mse_loss(q_values, q_targets))
        # 梯度清零，保证每个优化步骤都以新的梯度值开始，避免梯度累积的问题
        self.optimizer.zero_grad()
        # 梯度反传
        dqn_loss.backward()
        # 更新训练网络的参数
        self.optimizer.step()

        # 更新目标网络参数
        if self.count % self.target_update == 0: #满足目标网络更新频率target_update，则将q_net的参数复制到target_q_net，以更新目标网络的权重
            self.target_q_net.load_state_dict(self.q_net.state_dict())
        self.count += 1


lr = 1e-3
num_episodes =4
hidden_dim = 128
gamma = 0.99
# epsilon = 0.01

# 初始化一个EGreedyDecay对象
epsilon_start = 0.1  # 初始epsilon值
epsilon_end = 0.01  # 最终epsilon值
decay_rate = 1000 # epsilon递减率

target_update = 10
buffer_size = 10000
minimal_size = 500
batch_size = 64

maxstdlist=[]
minstdlist=[]
avestdlist=[]

device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")

env_name = 'MyEnv-v0'
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = 24  

def train_DQN(agent, env, num_episodes, replay_buffer, minimal_size,
              batch_size):
    #env.clear_list()
    #return_list = []
    #max_q_value_list = []
    #max_q_value = 0
    for i in range(10):
        with tqdm(total=int(num_episodes / 10),
                  desc='Iteration %d' % i) as pbar:
            for i_episode in range(int(num_episodes / 10)):
                episode_return = 0
                
                state = env.reset()
                done = False
                while not done:
                    for step in range(num_episodes):
                        agent.update_epsilon(step)
                    #print("epsilon",agent.epsilon)
                    action = agent.take_action(state)
                    
#                    print("action",action)
                    max_q_value = agent.max_q_value(
                        state) * 0.005 + max_q_value * 0.995  # 平滑处理
                    max_q_value_list.append(max_q_value)  # 保存每个状态的最大Q值
#                     action_continuous = dis_to_con(action, env,
#                                                    agent.action_dim)
#                    next_state, reward, done, _ = env.step([action_continuous])
                    next_state, reward, done, _ = env.step(action)
#                     print(len(env.stdsoc))
                    replay_buffer.add(state, action, reward, next_state, done)
                    state = next_state
                    episode_return += reward
                    if replay_buffer.size() > minimal_size:
                        b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(
                            batch_size)
                        transition_dict = {
                            'states': b_s,
                            'actions': b_a,
                            'next_states': b_ns,
                            'rewards': b_r,
                            'dones': b_d
                        }
                        agent.update(transition_dict)
                return_list.append(episode_return)
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({
                        'episode':
                        '%d' % (num_episodes / 10 * i + i_episode + 1),
                        'return':
                        '%.3f' % np.mean(return_list[-10:])
                    })
                pbar.update(1)
    return return_list, max_q_value_list


#
#     print("minstd",minstd)
#     print("maxstd",maxstd)
#     print("average",average)
#     print("minstdlist",minstdlist)
#     print("maxstdlist",maxstdlist)
#     print("avestdlist",avestdlist)
    
    



In [9]:
random.seed(0)
np.random.seed(0)
env.seed(0)
torch.manual_seed(0)
#实例化经验池
replay_buffer = rl_utils.ReplayBuffer(buffer_size)

# agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon,
#             target_update, device)
# return_list, max_q_value_list = train_DQN(agent, env, num_episodes,
#                                           replay_buffer, minimal_size,
#                                           batch_size)

# episodes_list = list(range(len(return_list)))
# mv_return = rl_utils.moving_average(return_list, 9)  #平滑曲线，可以使用rl_utils.moving_average函数对回报值进行移动平均处理
# plt.plot(episodes_list, mv_return )
# plt.xlabel('Episodes')
# plt.ylabel('Returns')
# plt.title('DQN on {}'.format(env_name))
# plt.show()

# frames_list = list(range(len(max_q_value_list)))
# plt.plot(frames_list, max_q_value_list)
# plt.axhline(0, c='orange', ls='--')
# plt.axhline(10, c='red', ls='--')
# plt.xlabel('Frames')
# plt.ylabel('Q value')
# plt.title('DQN on {}'.format(env_name))
# plt.show()

# #计算标准差
# # lens=len(env.stdsoc)
# # print("lens",lens)
# # minstd=min(env.stdsoc)
# # maxstd=max(env.stdsoc)
# # average = np.mean(env.stdsoc)
# # print("minstd",minstd)
# # print("maxstd",maxstd)
# # print("average",average)

# cal_std(agent, env, num_episodes, replay_buffer, minimal_size, batch_size)

agent = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon_start,epsilon_end,decay_rate, target_update, device)
return_list_vanilla, max_q_value_list_vanilla = train_DQN(agent, env, num_episodes, replay_buffer, minimal_size, batch_size)

agent1 = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon_start,epsilon_end,decay_rate, target_update, device, 'DoubleDQN')
return_list_double, max_q_value_list_double = train_DQN(agent1, env, num_episodes, replay_buffer, minimal_size, batch_size)

agent2 = DQN(state_dim, hidden_dim, action_dim, lr, gamma, epsilon_start,epsilon_end,decay_rate, target_update, device, 'DuelingDQN')
return_list_dueling, max_q_value_list_dueling = train_DQN(agent2, env, num_episodes, replay_buffer, minimal_size, batch_size)
# Plot returns
episodes_list = list(range(len(return_list_vanilla)))
mv_return_vanilla = rl_utils.moving_average(return_list_vanilla, 9)
mv_return_double = rl_utils.moving_average(return_list_double, 5)
mv_return_dueling = rl_utils.moving_average(return_list_dueling, 5)

plt.figure(figsize=(12, 6))
plt.subplot(2, 1, 1)
plt.plot(episodes_list, mv_return_vanilla, label='Vanilla DQN')
plt.plot(episodes_list, mv_return_double, label='Double DQN')
plt.plot(episodes_list, mv_return_dueling, label='Dueling DQN')
plt.axhline(0.0036540217569138974*-1000, c='red', ls='--')
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('Comparison of Returns on {}'.format(env_name))
plt.legend()

# Plot Q values
frames_list = list(range(len(max_q_value_list_vanilla)))
plt.subplot(2, 1, 2)
plt.plot(frames_list, max_q_value_list_vanilla, label='Vanilla DQN')
plt.plot(frames_list, max_q_value_list_double, label='Double DQN')
plt.plot(frames_list, max_q_value_list_dueling, label='Dueling DQN')
plt.axhline(0, c='orange', ls='--')
plt.axhline(10, c='red', ls='--')
plt.xlabel('Frames')
plt.ylabel('Q value')
plt.title('Comparison of Q Values on {}'.format(env_name))
plt.legend()

plt.tight_layout()
plt.show()

Iteration 0:   0%|                                                                               | 0/4 [00:00<?, ?it/s]

Epsilon: 0.1
Epsilon: 0.09977000638225533
Epsilon: 0.0995405417351527
Epsilon: 0.09931160484209337
Epsilon: 0.09908319448927674
Epsilon: 0.09885530946569387
Epsilon: 0.09862794856312103
Epsilon: 0.09840111057611335
Epsilon: 0.09817479430199841
Epsilon: 0.09794899854086986
Epsilon: 0.09772372209558103
Epsilon: 0.09749896377173865
Epsilon: 0.09727472237769647
Epsilon: 0.09705099672454892
Epsilon: 0.09682778562612486
Epsilon: 0.09660508789898128
Epsilon: 0.096382902362397
Epsilon: 0.09616122783836639
Epsilon: 0.09594006315159324
Epsilon: 0.09571940712948437
Epsilon: 0.09549925860214352
Epsilon: 0.0952796164023651
Epsilon: 0.09506047936562806
Epsilon: 0.09484184633008963
Epsilon: 0.09462371613657922
Epsilon: 0.09440608762859225
Epsilon: 0.09418895965228403
Epsilon: 0.09397233105646369
Epsilon: 0.09375620069258792
Epsilon: 0.09354056741475508
Epsilon: 0.09332543007969898
Epsilon: 0.09311078754678291
Epsilon: 0.09289663867799351
Epsilon: 0.0926829823379348
Epsilon: 0.09246981739382212
Epsilo




UnboundLocalError: local variable 'max_q_value' referenced before assignment