In [1]:
import os
from typing import Dict, List, Tuple

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output
from torch.distributions import Categorical
import math
import torch.nn as nn

In [2]:
env = gym.make("Pendulum-v1", render_mode="rgb_array")

device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu"
)

obs_dim = env.observation_space.shape[0]
action_info = env.action_space
print(env.observation_space.shape, obs_dim, action_info)

(3,) 3 Box(-2.0, 2.0, (1,), float32)


  return torch._C._cuda_getDeviceCount() > 0


In [3]:
class ActionNetwork(nn.Module):
    def __init__(self, in_dim: int, out_dim: int):
        """Initialization."""
        super(ActionNetwork, self).__init__()

        self.layers = nn.Sequential(
            nn.Linear(in_dim, 128), 
            nn.ReLU(),
            nn.Linear(128, 128), 
            nn.ReLU(), 
            nn.Linear(128, out_dim)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward method implementation."""
        return self.layers(x)
    
class QValueNetwork(nn.Module):
    def __init__(self, in_dim: int, out_dim: int):
        '''状态价值，用来评估动作的好坏程度'''
        super(QValueNetwork, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_dim + out_dim, 128), 
            nn.ReLU(),
            nn.Linear(128, 128), 
            nn.ReLU(), 
            nn.Linear(128, 1)
        )
        
    def forward(self, x: torch.Tensor, a: torch.Tensor) -> torch.Tensor:
        """Forward method implementation."""
        t = torch.cat([x, a], 0)
        return self.layers(t)

In [4]:
actor = ActionNetwork(obs_dim, 1)
critic = QValueNetwork(obs_dim, 1)

actor_optimizer = optim.Adam(actor.parameters(), lr=0.0001)
critic_optimizer = optim.Adam(critic.parameters(), lr=0.0001)

In [5]:
def select_action(state):
    '''策略网络做出决策，给出一个动作'''
    action = actor(torch.FloatTensor(state).to(device))
    return action

def critic_value(state, action):
    return critic(torch.FloatTensor(state).to(device), action.to(device))

In [6]:
gamma = 0.98

steps = []
U_s = []
view_losses = []
max_epoch = 2000
for i in range(max_epoch):
    score = 0
    step = 0
    
    state, _ = env.reset(seed=3)
    trajectories = []
    while True:
        '''让策略网络做预测'''
        action = select_action(state)
        next_state, reward, terminated, truncated, _ = env.step(np.float32(action.detach().numpy()))
        done = terminated or truncated
        
        step += 1
        
        trajectories.append([state, action, reward, next_state, done])
        if done or step > 200:
            steps.append(step)
            break
        
        state = next_state
        
    # 结束一轮游戏
    for state, action, reward, next_state, done in trajectories[::-1]:        
        mask = 1 - done
#         print('---------', torch.tensor(state), action, torch.tensor(state).shape, action.shape)
#         '''让价值网络做预测'''
#         result = torch.cat([torch.tensor(state), action], 0)
#         print('---------', result)
        q_t = critic_value(state, action)
        
        next_action = select_action(next_state)
        next_q_t = critic_value(next_state, next_action)
        '''计算价值网络的 TD 目标和 TD 误差'''
        y_t = reward + gamma * next_q_t * mask
        print(q_t, y_t)
        critic_loss = F.smooth_l1_loss(q_t, y_t)
        '''更新价值网络'''
        critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_optimizer.step()
        
        new_action = select_action(state)
        actor_loss = -critic_value(state, new_action)
    
        '''更新策略网络'''
        actor_optimizer.zero_grad()
        actor_loss.backward()
        actor_optimizer.step()

tensor([0.0805], grad_fn=<AddBackward0>) tensor([-9.5283], grad_fn=<AddBackward0>)
tensor([0.0723], grad_fn=<AddBackward0>) tensor([-9.3830], grad_fn=<AddBackward0>)


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [128, 1]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

In [None]:
# 观察曲线
plt.figure(figsize=(20, 5))
# plt.subplot(131)
# plt.title('frame %s. score: %s' % (frame_idx, np.mean(scores[-10:])))
# plt.title('score')
# plt.plot(scores)
plt.subplot(132)
plt.title('steps')
plt.plot(steps)
# plt.subplot(133)
# plt.title('epsilons')
# plt.plot(epsilons)
plt.show()