In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gym

class PolicyNet(nn.Module):
    def __init__(self, action_size=2):
        super().__init__()
        self.l1 = nn.Linear(4, 128)  # Assuming state size is 4 for CartPole-v0
        self.l2 = nn.Linear(128, action_size)

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = self.l2(x)
        x = F.softmax(x, dim=-1)
        return x

class ValueNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(4, 128)  # Assuming state size is 4 for CartPole-v0
        self.l2 = nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = self.l2(x)
        return x

class Agent:
    def __init__(self):
        self.gamma = 0.98
        self.lr_pi = 0.0002
        self.lr_v = 0.0005
        self.action_size = 2

        self.pi = PolicyNet()
        self.v = ValueNet()
        self.optimizer_pi = optim.Adam(self.pi.parameters(), lr=self.lr_pi)
        self.optimizer_v = optim.Adam(self.v.parameters(), lr=self.lr_v)

    def get_action(self, state):
        state = torch.from_numpy(state).float()
        probs = self.pi(state)
        action = torch.multinomial(probs, 1).item()
        return action, probs[action].item()

    def update(self, state, action_prob, reward, next_state, done):
        state = torch.from_numpy(state).float()
        next_state = torch.from_numpy(next_state).float()

        # ========== (1) Update V network ===========
        with torch.no_grad():
            target = reward + self.gamma * self.v(next_state) * (1 - done)
        v = self.v(state)
        loss_v = F.mse_loss(v, target)

        # ========== (2) Update pi network ===========
        delta = target - v
        loss_pi = -torch.log(action_prob) * delta.item()

        self.v.zero_grad()
        self.pi.zero_grad()
        loss_v.backward()
        loss_pi.backward()
        self.optimizer_v.step()
        self.optimizer_pi.step()

# 以下は学習のループ部分です（変更なし）
episodes = 3000
env = gym.make('CartPole-v0')
agent = Agent()
reward_history = []

for episode in range(episodes):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        action, prob = agent.get_action(state)
        next_state, reward, done, info = env.step(action)

        agent.update(state, prob, reward, next_state, done)

        state = next_state
        total_reward += reward

    reward_history.append(total_reward)
    if episode % 100 == 0:
        print("episode :{}, total reward : {:.1f}".format(episode, total_reward))

# plot
import matplotlib.pyplot as plt

def plot_total_reward(reward_history):
    plt.plot(reward_history)
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Total Reward per Episode')
    plt.show()

plot_total_reward(reward_history)


  logger.warn(
  deprecation(
  deprecation(


TypeError: log(): argument 'input' (position 1) must be Tensor, not float