<a href="https://colab.research.google.com/github/zoujiulong/Reinforcement-Learning/blob/main/PPO(MC).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from abc import abstractproperty
from gym.wrappers import TimeLimit
import gymnasium as gym
import numpy as np
import torch
from torch import nn
from torch.optim import Adam
import random
from collections import deque
from tqdm import tqdm
from torch.distributions import Categorical
import matplotlib.pyplot as plt

# Hyperparameters
EPISODES = 1000  # number of episodes train() runs
GAMMA = 0.99  # discount factor used when accumulating per-step returns
LR = 1e-3  # learning rate passed to both the actor and critic Adam optimizers
BATCH_SIZE = 8  # NOTE(review): not referenced anywhere in the visible code - confirm before relying on it
eps=0.2  # PPO clipping range: probability ratios are clamped to [1 - eps, 1 + eps]
# Actor (policy) and critic (state-value) networks

class Actor(nn.Module):
    """Policy network: maps a state vector to one unnormalised logit per action."""

    def __init__(self, state_dim, action_dim):
        """Build a two-hidden-layer MLP.

        Args:
            state_dim: Size of the last dimension of the input.
            action_dim: Number of logits produced.
        """
        super().__init__()
        hidden_units = 128
        # Layers are created in input-to-output order and stored under ``fc``
        # so parameter names stay ``fc.0``, ``fc.2`` and ``fc.4``.
        stack = [
            nn.Linear(state_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, action_dim),
        ]
        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Return the action logits for ``x``."""
        logits = self.fc(x)
        return logits

class Critic(nn.Module):
    """State-value network: maps a state vector to a single scalar output."""

    def __init__(self, state_dim):
        """Build a two-hidden-layer MLP with one output unit.

        Args:
            state_dim: Size of the last dimension of the input.
        """
        super().__init__()
        width = 128
        input_layer = nn.Linear(state_dim, width)
        hidden_layer = nn.Linear(width, width)
        output_layer = nn.Linear(width, 1)
        # Stored under ``fc`` so parameter names stay ``fc.0``, ``fc.2``, ``fc.4``.
        self.fc = nn.Sequential(
            input_layer,
            nn.ReLU(),
            hidden_layer,
            nn.ReLU(),
            output_layer,
        )

    def forward(self, x):
        """Return the value estimate for ``x``; the last dimension has size 1."""
        value = self.fc(x)
        return value

def update(old_net, new_net):
    """Copy every parameter and buffer of ``new_net`` into ``old_net``.

    Args:
        old_net: Module that is overwritten in place.
        new_net: Module whose state is copied; it is left unchanged.
    """
    latest_state = new_net.state_dict()
    old_net.load_state_dict(latest_state)

def to_tensor(x):
    """Build a new ``float32`` tensor holding the values of ``x``."""
    target_dtype = torch.float32
    return torch.tensor(x, dtype=target_dtype)

# Training loop
def _discounted_returns(rewards, gamma):
    """Compute the discounted return-to-go for each step of one episode.

    Args:
        rewards: Per-step rewards in the order they were received.
        gamma: Discount factor.

    Returns:
        A list ``G`` with one entry per reward, where
        ``G[t] = rewards[t] + gamma * G[t + 1]`` and the return after the
        last step is taken to be 0.
    """
    returns = []
    running = 0.0
    # Accumulate back-to-front with append + one reverse instead of
    # insert(0, ...), which is quadratic in the episode length.
    for reward in reversed(rewards):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


def train():
    """Train a clipped-surrogate PPO agent on CartPole-v1 using Monte Carlo returns.

    Episodes are sampled with ``old_actor``. After every ``update_every``
    episodes, one gradient step per stored episode is applied to ``actor``
    and ``critic`` (their losses are summed and back-propagated together),
    the episode buffer is cleared and ``old_actor`` is re-synchronised with
    ``actor``. The total reward of every tenth episode is printed, and the
    reward curve is plotted once training finishes.
    """
    update_every = 10  # episodes collected between optimisation phases

    # Use the gymnasium API end to end: the time limit is requested through
    # make() rather than a legacy ``gym`` wrapper, so step() consistently
    # returns the five-element (obs, reward, terminated, truncated, info) tuple.
    env = gym.make("CartPole-v1", max_episode_steps=500)
    state_dim = env.observation_space.shape[0]
    print('state_dim',state_dim)
    action_dim = env.action_space.n

    # Current policy and the frozen copy used to collect data.
    actor = Actor(state_dim, action_dim)
    old_actor = Actor(state_dim, action_dim)
    update(old_actor, actor)
    critic = Critic(state_dim)
    actor_opt = Adam(actor.parameters(), lr=LR)
    critic_opt = Adam(critic.parameters(), lr=LR)
    mse_loss = nn.MSELoss()

    trajectory_batch = []
    epoch = list(range(EPISODES))
    reward_l = []

    try:
        for episode in tqdm(range(EPISODES), desc="Training"):
            state, _ = env.reset()
            total_reward = 0
            done = False
            trajectory = []

            # Collect one episode with the old policy.
            while not done:
                state_tensor = to_tensor(state)
                # Sampling needs no gradients; the stored log-probability is
                # only ever used as a constant in the probability ratio.
                with torch.no_grad():
                    dist = Categorical(logits=old_actor(state_tensor))
                    action = dist.sample()
                    log_prob = dist.log_prob(action)
                next_state, reward, terminated, truncated, _ = env.step(action.item())
                done = terminated or truncated
                trajectory.append((state_tensor, action, float(reward), log_prob))
                state = next_state
                total_reward += reward
            trajectory_batch.append(trajectory)

            # Optimise on the buffered episodes.
            if (episode + 1) % update_every == 0:
                for traj in trajectory_batch:
                    states, actions, rewards, old_log_probs = zip(*traj)

                    # Every per-step quantity below is kept 1-D with length T
                    # (states are (T, state_dim)). Mixing (T, 1) and (T, 1, 1)
                    # tensors would silently broadcast the advantage to
                    # (T, T, 1) and pair each step with every other step's return.
                    states_tensor = torch.stack(states)
                    actions_tensor = torch.stack(actions)
                    old_log_probs_tensor = torch.stack(old_log_probs)
                    returns_tensor = to_tensor(_discounted_returns(rewards, GAMMA))

                    values = critic(states_tensor).squeeze(-1)
                    # The advantage is a constant for the policy gradient.
                    advantage = (returns_tensor - values).detach()

                    # Probability ratio between the current and the old policy.
                    new_dist = Categorical(logits=actor(states_tensor))
                    new_log_probs = new_dist.log_prob(actions_tensor)
                    ratios = (new_log_probs - old_log_probs_tensor).exp()

                    # Clipped surrogate objective.
                    surrogate = torch.min(
                        ratios * advantage,
                        torch.clamp(ratios, 1 - eps, 1 + eps) * advantage,
                    )
                    actor_loss = -surrogate.mean()
                    critic_loss = mse_loss(values, returns_tensor)

                    critic_opt.zero_grad()
                    actor_opt.zero_grad()
                    (critic_loss + actor_loss).backward()
                    critic_opt.step()
                    actor_opt.step()
                trajectory_batch.clear()
                # The actor only changes in this branch, so this is the only
                # point at which the old policy can be out of date.
                update(old_actor, actor)

            reward_l.append(total_reward)
            if episode % 10 == 0:
                print(f"Episode {episode}, Total reward: {total_reward:.1f}")
    finally:
        # Release the environment even if training is interrupted.
        env.close()

    plt.plot(epoch,reward_l,label='reward')
    plt.legend()
    plt.show()

# Start training only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
  # Debug aid: uncomment to turn on autograd anomaly detection while chasing a bad gradient.
  # torch.autograd.set_detect_anomaly(True)
  train()