In [4]:
import torch                                    # 导入torch
import torch.nn as nn                           # 导入torch.nn
import torch.nn.functional as F                 # 导入torch.nn.functional
import numpy as np                              # 导入numpy
import gym                                      # 导入gym

# Hyperparameters and environment setup
# CartPole environment; `.unwrapped` returns the raw environment without the
# wrappers that gym.make() puts around it.
env = gym.make('CartPole-v0').unwrapped
# Probability of taking the greedy (highest-valued) action; otherwise a random
# action is taken.
EPSILON = 0.9
# Length of the observation vector (4 for CartPole).
N_STATES = env.observation_space.shape[0]
# Number of discrete actions (2 for CartPole).
N_ACTIONS = env.action_space.n

# Define the Net class (the network that maps a state to per-action values)
class Net(nn.Module):
    """Fully connected network mapping a state vector to one value per action.

    Two hidden layers of 50 units, each followed by ReLU, feed a linear output
    layer with N_ACTIONS units. Each layer's weight matrix is re-drawn from a
    normal distribution (mean 0, std 0.1); biases keep the initialisation that
    nn.Linear gives them.
    """

    def __init__(self):
        # A subclass of nn.Module has to run the parent constructor first.
        super().__init__()

        hidden = 50
        layout = (
            ('fc1', N_STATES, hidden),
            ('fc2', hidden, hidden),
            ('out', hidden, N_ACTIONS),
        )
        # Layers are created and initialised one after another, in this order,
        # and stored under the attribute names forward() relies on.
        for attr_name, n_in, n_out in layout:
            layer = nn.Linear(n_in, n_out)
            layer.weight.data.normal_(0, 0.1)
            setattr(self, attr_name, layer)

    def forward(self, x):
        """Return the action values for `x`.

        Args:
            x: Tensor whose last dimension matches the input size of `fc1`.

        Returns:
            Tensor with the same leading dimensions as `x` and one entry per
            output unit of `out` in the last dimension.
        """
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = F.relu(hidden_layer(activations))
        return self.out(activations)

# Define the DQN class (wraps a single pretrained evaluation network used for action selection)
class DQN(object):
    """Inference-only agent that selects actions with a saved network.

    The network is loaded from disk once and is only used for forward passes;
    nothing in this class trains it.
    """

    def __init__(self, model_path='./dqn_eval_v3.pth'):
        """Load the saved evaluation network and put it in evaluation mode.

        Args:
            model_path: Checkpoint file passed to torch.load. Defaults to the
                path the script originally hard-coded.
        """
        # NOTE(review): this loads a whole pickled object (presumably a `Net`
        # instance, so the `Net` class must be defined before this runs), not
        # a state_dict. Unpickling can execute arbitrary code, so only load
        # checkpoints from a trusted source. Newer PyTorch releases default
        # torch.load to weights_only=True, which is expected to reject this
        # kind of file -- confirm against the installed version.
        self.eval_net = torch.load(model_path)
        self.eval_net.eval()

    def choose_action(self, x, epsilon=None):
        """Choose an action for state `x`.

        With probability `epsilon` the action with the highest network output
        is returned; otherwise an action index is drawn uniformly from
        [0, N_ACTIONS).

        Args:
            x: State observation; anything torch.FloatTensor accepts
                (e.g. a list or numpy array of floats).
            epsilon: Probability of taking the greedy action. None (the
                default) uses the module-level EPSILON; 1.0 always acts
                greedily.

        Returns:
            The chosen action index as a Python int (both branches).
        """
        greedy_prob = EPSILON if epsilon is None else epsilon

        # np.random.uniform() is in [0, 1); at or above greedy_prob, explore.
        if np.random.uniform() >= greedy_prob:
            return int(np.random.randint(0, N_ACTIONS))

        # Add a batch dimension of size 1 in front of the state.
        state = torch.unsqueeze(torch.FloatTensor(x), 0)
        # No gradients are needed to pick an action, so skip graph building.
        with torch.no_grad():
            action_values = self.eval_net(state)
        # Index of the largest value in the (single) batch row.
        return int(torch.argmax(action_values, dim=1)[0])


def _shaped_reward(next_state):
    """Return the shaped reward for the state reached after a step.

    The reward is the sum of two terms: one that grows as the first state
    component (x) gets closer to 0 relative to env.x_threshold, and one that
    grows as the third component (theta) gets closer to 0 relative to
    env.theta_threshold_radians. The second and fourth components are unused.
    """
    cart_x, _x_dot, pole_theta, _theta_dot = next_state
    position_term = (env.x_threshold - abs(cart_x)) / env.x_threshold - 0.8
    angle_term = (env.theta_threshold_radians - abs(pole_theta)) / env.theta_threshold_radians - 0.5
    return position_term + angle_term


def _play_episode(agent):
    """Run one episode until the environment reports done.

    Returns:
        The sum of the shaped rewards over every step of the episode.
    """
    state = env.reset()
    total = 0
    finished = False
    while not finished:
        # env.render() can be called here to display the episode.
        action = agent.choose_action(state)
        # The environment's own reward and info dict are not used; the
        # shaped reward is computed from the new state instead.
        state, _, finished, _ = env.step(action)
        total += _shaped_reward(state)
    return total


dqn = DQN()
# Play episodes back to back with no stopping condition, printing each
# episode's total shaped reward rounded to 2 decimals.
while True:
    print(f'reward_sum:{round(_play_episode(dqn), 2)}')

reward_sum:556.27
reward_sum:196.71
reward_sum:487.37
