<a href="https://colab.research.google.com/github/t-snd/cartPole/blob/main/runCartPole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque

# 共通のポリシーネットワーク（REINFORCEとGRPO用）
class PolicyNet(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, action_dim)

    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# 共通のActor-Critic/PPOネットワーク
class ActorCriticNet(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorCriticNet, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim)
        )
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, state):
        return self.actor(state), self.critic(state)

# 共通のエピソード実行関数
def run_episode(env, agent, algorithm, max_steps=200):
    state, _ = env.reset()
    states, actions, log_probs = [], [], []
    episode_reward = 0

    for t in range(max_steps):
        state_tensor = torch.FloatTensor(state)
        if algorithm in ['reinforce', 'grpo']:
            logits = agent(state_tensor)
            probs = torch.softmax(logits, dim=-1)
            action = torch.multinomial(probs, 1).item()
            log_prob = torch.log(probs[action])
        else:  # actor-critic or ppo
            action_logits, _ = agent(state_tensor)
            probs = torch.softmax(action_logits, dim=-1)
            action = torch.multinomial(probs, 1).item()
            log_prob = torch.log(probs[action])

        states.append(state)
        actions.append(action)
        log_probs.append(log_prob)

        state, reward, done, _, _ = env.step(action)
        episode_reward += reward
        if done:
            break

    return states, actions, log_probs, episode_reward

# REINFORCEの実装
def reinforce_update(net, optimizer, states, actions, log_probs, total_reward):
    normalized_reward = total_reward / 200.0
    policy_loss = []
    for log_prob in log_probs:
        policy_loss.append(-log_prob * normalized_reward)
    policy_loss = torch.stack(policy_loss).sum()

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
    return policy_loss.item()

# Actor-Criticの実装
def actor_critic_update(net, optimizer, states, actions, total_reward, gamma=0.99):
    states = torch.FloatTensor(states)
    actions = torch.LongTensor(actions)

    actor_losses, critic_losses = [], []
    for t in range(len(states) - 1):
        action_logits, value = net(states[t].unsqueeze(0))
        _, next_value = net(states[t+1].unsqueeze(0))
        advantage = total_reward - value.detach()

        action_probs = torch.softmax(action_logits, dim=-1)
        log_prob = torch.log(action_probs[0, actions[t]])
        actor_losses.append(-log_prob * advantage)
        critic_losses.append(advantage.pow(2))

    total_loss = torch.stack(actor_losses).sum() + 0.5 * torch.stack(critic_losses).sum()
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()
    return total_loss.item()

# PPOの実装
def ppo_update(net, optimizer, states, actions, total_reward, clip_ratio=0.2, value_loss_coef=0.5, entropy_coef=0.01):
    states = torch.FloatTensor(states)
    actions = torch.LongTensor(actions)

    old_action_probs, old_values = net(states)
    old_action_probs = old_action_probs.detach()
    old_values = old_values.detach()

    curr_action_probs, curr_values = net(states)
    advantages = total_reward - old_values.squeeze()

    ratios = curr_action_probs.gather(1, actions.unsqueeze(1)).squeeze() / \
             old_action_probs.gather(1, actions.unsqueez(1)).squeeze()
    surr1 = ratios * advantages
    surr2 = torch.clamp(ratios, 1 - clip_ratio, 1 + clip_ratio) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()

    value_loss = (curr_values.squeeze() - total_reward).pow(2).mean()
    entropy_loss = -(curr_action_probs * torch.log(curr_action_probs + 1e-10)).sum(dim=1).mean()

    total_loss = policy_loss + value_loss_coef * value_loss - entropy_coef * entropy_loss
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()
    return total_loss.item()

# GRPOの実装
def grpo_update(net, optimizer, trajectories, eps=0.2, n_iterations=10):
    rewards = [r for _, _, _, r in trajectories]
    mean_reward = np.mean(rewards)
    std_reward = np.std(rewards) + 1e-8
    advantages = [(r - mean_reward) / std_reward for r in rewards]

    for _ in range(n_iterations):
        loss = 0
        for (states, old_log_probs, actions, _), advantage in zip(trajectories, advantages):
            trajectory_loss = 0
            for t in range(len(states)):
                state_tensor = torch.FloatTensor(states[t])
                new_logits = net(state_tensor)
                new_probs = torch.softmax(new_logits, dim=-1)
                new_log_prob = torch.log(new_probs[actions[t]])
                ratio = torch.exp(new_log_prob - old_log_probs[t])
                clipped_ratio = torch.clamp(ratio, 1 - eps, 1 + eps)
                trajectory_loss += -torch.min(ratio * advantage, clipped_ratio * advantage)
            loss += trajectory_loss / len(states)
        loss /= len(trajectories)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss.item()

# メインのトレーニング関数
def train_algorithm(algorithm, env_name='CartPole-v0', num_episodes=5000, lr=0.001):
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    # アルゴリズムに応じたネットワークとオプティマイザーの初期化
    if algorithm in ['reinforce', 'grpo']:
        agent = PolicyNet(state_dim, action_dim)
    else:  # actor-critic or ppo
        agent = ActorCriticNet(state_dim, action_dim)
    optimizer = optim.Adam(agent.parameters(), lr=lr)

    episode_rewards = deque(maxlen=100)
    trajectories = []  # GRPO用

    for episode in range(num_episodes):
        states, actions, log_probs, total_reward = run_episode(env, agent, algorithm)

        if algorithm == 'reinforce':
            reinforce_update(agent, optimizer, states, actions, log_probs, total_reward)
        elif algorithm == 'actor-critic':
            actor_critic_update(agent, optimizer, states, actions, total_reward)
        elif algorithm == 'ppo':
            ppo_update(agent, optimizer, states, actions, total_reward)
        elif algorithm == 'grpo':
            trajectories.append((states, log_probs, actions, total_reward))
            if len(trajectories) >= 5:  # 5つのトラジェクトリを収集したら更新
                grpo_update(agent, optimizer, trajectories)
                trajectories = []

        episode_rewards.append(total_reward)
        avg_reward = np.mean(episode_rewards)

        if episode % 100 == 0:
            print(f'{algorithm.upper()} Episode {episode}, Avg Reward: {avg_reward:.2f}')

        if avg_reward > 195:
            print(f'{algorithm.upper()} Solved in {episode} episodes!')
            break

    env.close()
    return agent, episode_rewards

# 4つのアルゴリズムを比較実行
if __name__ == '__main__':
    algorithms = ['reinforce', 'actor-critic', 'ppo', 'grpo']
    results = {}

    for algo in algorithms:
        print(f"\nTraining {algo.upper()}")
        agent, rewards = train_algorithm(algo)
        results[algo] = rewards

    # 結果の簡単な比較（オプション）
    for algo, rewards in results.items():
        print(f"{algo.upper()} Final Avg Reward: {np.mean(list(rewards)[-100:]):.2f}")


Training REINFORCE
REINFORCE Episode 0, Avg Reward: 14.00


  logger.deprecation(


REINFORCE Episode 100, Avg Reward: 19.02
REINFORCE Episode 200, Avg Reward: 23.30
REINFORCE Episode 300, Avg Reward: 27.34
REINFORCE Episode 400, Avg Reward: 33.68
REINFORCE Episode 500, Avg Reward: 41.95
REINFORCE Episode 600, Avg Reward: 41.45
REINFORCE Episode 700, Avg Reward: 49.55
REINFORCE Episode 800, Avg Reward: 53.29
REINFORCE Episode 900, Avg Reward: 65.83
REINFORCE Episode 1000, Avg Reward: 69.97
REINFORCE Episode 1100, Avg Reward: 82.05
REINFORCE Episode 1200, Avg Reward: 130.43
REINFORCE Episode 1300, Avg Reward: 155.99
REINFORCE Episode 1400, Avg Reward: 137.05
REINFORCE Episode 1500, Avg Reward: 153.22
REINFORCE Episode 1600, Avg Reward: 175.13
REINFORCE Episode 1700, Avg Reward: 185.55
REINFORCE Episode 1800, Avg Reward: 191.20
REINFORCE Episode 1900, Avg Reward: 153.98
REINFORCE Episode 2000, Avg Reward: 195.40
REINFORCE Solved in 2000 episodes!

Training ACTOR-CRITIC
ACTOR-CRITIC Episode 0, Avg Reward: 19.00
ACTOR-CRITIC Episode 100, Avg Reward: 23.15
ACTOR-CRITIC Epi

AttributeError: 'Tensor' object has no attribute 'unsqueez'