In [1]:
import torch
from torch import nn, Tensor
import torch.nn.functional as F

In [2]:
class PolicyNetwork(nn.Module):
    """Two-layer MLP that maps a state vector to action probabilities.

    Args:
        state_size: Number of input features (size of the last input dim).
        action_size: Number of output entries (one probability per action).
        hidden_size: Width of the single hidden layer. Defaults to 16, the
            value that used to be hard-coded.
    """

    def __init__(self, state_size, action_size, hidden_size=16):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return softmax probabilities over the last dimension of ``x``."""
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        # Softmax guarantees the output is a probability distribution.
        return F.softmax(logits, dim=-1)
class ValueNetwork(nn.Module):
    """Two-layer MLP that maps a state vector to a single scalar output.

    Args:
        state_size: Number of input features (size of the last input dim).
        hidden_size: Width of the single hidden layer. Defaults to 16, the
            value that used to be hard-coded.
    """

    def __init__(self, state_size, hidden_size=16):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        # Single output unit: the estimated value of the state.
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the raw (unactivated) output with a trailing dim of 1."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

In [3]:
from src.PPO import PPO
class MyPPO(PPO):
    """PPO agent wired to a gym environment with a shaped reward.

    Adds environment interaction on top of the ``PPO`` base class: rollout
    collection (``learn``) and a greedy evaluation episode (``show``).
    Relies on the base class for ``ExperienceHistory``, ``getAction``,
    ``train`` and ``PolicyNetwork``.
    """

    # Normalisation scales used by ``rewardFunc``.
    # NOTE(review): these look like half of CartPole's termination limits
    # (about 0.209 rad and 2.4) -- confirm that this is the intent.
    ANGLE_SCALE = 0.1045
    POSITION_SCALE = 1.2

    def __init__(self, env, policyNetwork, valueNetwork):
        """Store ``env`` and hand both networks to the ``PPO`` base class."""
        super().__init__(policyNetwork, valueNetwork)
        self.env = env

    def rewardFunc(self, observation):
        """Return a shaped reward computed from one observation.

        Reads ``observation[0]`` as the position and ``observation[2]`` as
        the angle. Each term equals 1 at zero, 0 at its scale, and becomes
        negative beyond it; the two terms are summed.
        """
        angle = abs(observation[2])
        position = abs(observation[0])
        angle_term = (self.ANGLE_SCALE - angle) / self.ANGLE_SCALE
        position_term = (self.POSITION_SCALE - position) / self.POSITION_SCALE
        return angle_term + position_term

    def learn(self, timeStep=1000, dataNum = 4096, lr=0.003, episode=0.2, epoch=10, batchSize=256):
        """Alternate between collecting rollouts and calling ``self.train``.

        Args:
            timeStep: Number of collect-then-train iterations.
            dataNum: Minimum number of transitions to gather before each
                training call. Only checked between episodes, so the buffer
                may end up holding more than this.
            lr: Forwarded to ``self.train`` as ``lr``.
            episode: Forwarded to ``self.train`` as ``episode``.
                NOTE(review): presumably the PPO clipping epsilon -- confirm
                against the base class.
            epoch: Forwarded to ``self.train`` as ``epochs``.
            batchSize: Forwarded to ``self.train`` as ``batch_size``.
        """
        print("start learning")
        for i in range(timeStep):
            episodes_finished = 0
            total_steps = 0
            print(f"time step:{i + 1}", end=" ")
            while len(self.ExperienceHistory['oldstate']) < dataNum:
                state, _info = self.env.reset()
                done = False
                while not done:
                    action = self.getAction(state)

                    # The environment's own reward is discarded in favour of
                    # the shaped reward below.
                    next_state, _env_reward, terminated, truncated, _info = self.env.step(action)
                    # An episode ends on termination OR on time-limit
                    # truncation; ignoring `truncated` lets a good policy
                    # keep a single episode running without bound.
                    done = bool(terminated or truncated)
                    reward = self.rewardFunc(next_state)

                    self.ExperienceHistory['oldstate'].append(state)
                    self.ExperienceHistory['state'].append(next_state)
                    self.ExperienceHistory['action'].append(action)
                    self.ExperienceHistory['reward'].append(reward)
                    self.ExperienceHistory['done'].append(int(done))
                    state = next_state
                    total_steps += 1
                episodes_finished += 1

            # No episode is played if the buffer already held enough data;
            # avoid a ZeroDivisionError in that case.
            if episodes_finished:
                average_steps = total_steps / episodes_finished
            else:
                average_steps = float("nan")
            print("生存平均:", average_steps)
            self.train(epochs=epoch, lr=lr, episode=episode, batch_size=batchSize)

            if i % 100 == 0:
                self.show()

    def show(self):
        """Play one episode with greedy (argmax) actions and print its length."""
        state, _info = self.env.reset()
        done = False
        surviveTime = 0
        while not done:
            # Pure inference: no autograd graph is needed.
            with torch.no_grad():
                action = int(torch.argmax(self.PolicyNetwork(torch.tensor(state))))
            state, _reward, terminated, truncated, _info = self.env.step(action)
            done = bool(terminated or truncated)
            surviveTime += 1
        print(f"survived time:{surviveTime}")

In [4]:
import gym

# Fixed seed so this stochastic training run can be reproduced.
SEED = 0
# Seeds PyTorch's global RNG (used for network weight initialisation).
torch.manual_seed(SEED)

env = gym.make('CartPole-v1')
# Seed the environment's RNG once; later seedless reset() calls continue from it.
env.reset(seed=SEED)

# Size the networks from the environment instead of hard-coding 4 and 2.
state_size = int(env.observation_space.shape[0])
action_size = int(env.action_space.n)

policyNetwork = PolicyNetwork(state_size, action_size)
valueNetwork = ValueNetwork(state_size)
agent = MyPPO(env, policyNetwork=policyNetwork, valueNetwork=valueNetwork)
agent.learn(lr=0.001, dataNum=4096)

start learning
time step:1 

  if not isinstance(terminated, (bool, np.bool8)):


生存平均: 21.305699481865286


  State = torch.tensor(self.ExperienceHistory['oldstate'])


survived time:8
time step:2 生存平均: 20.142156862745097
time step:3 生存平均: 19.830917874396135
time step:4 生存平均: 20.63819095477387
time step:5 生存平均: 20.633165829145728
time step:6 生存平均: 19.70673076923077
time step:7 生存平均: 21.90374331550802
time step:8 生存平均: 27.55704697986577
time step:9 生存平均: 37.5
time step:10 生存平均: 61.59701492537314
time step:11 生存平均: 83.96
time step:12 生存平均: 109.17948717948718
time step:13 生存平均: 151.75862068965517
time step:14 生存平均: 172.79166666666666
time step:15 生存平均: 219.05263157894737
time step:16 生存平均: 220.68421052631578


KeyboardInterrupt: 