In [5]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import gym

class PPO:
    """Small PPO-style actor-critic agent for discrete action spaces.

    A single Keras model with a shared two-layer trunk produces both a
    softmax policy over ``action_dim`` actions and a scalar state-value
    estimate. ``train`` applies one gradient step of the clipped surrogate
    objective plus a value loss and an entropy bonus.

    Args:
        state_dim: Size of the flat observation vector fed to the network.
        action_dim: Number of discrete actions (width of the softmax head).
        lr: Learning rate passed to the Adam optimizer.
        gamma: Discount factor. It is stored on the instance but no method
            of this class reads it; callers computing returns may use it.
        epsilon: Clipping range for the probability ratio in ``train``.
    """

    def __init__(self, state_dim, action_dim, lr, gamma, epsilon):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.model = self.create_model()
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        self.optimizer = Adam(learning_rate=self.lr)

    def create_model(self):
        """Build the shared-trunk network with a policy head and a value head.

        Returns:
            A Keras ``Model`` mapping a ``(batch, state_dim)`` input to
            ``[action_probabilities, state_value]``.
        """
        inputs = Input(shape=(self.state_dim,))
        x = Dense(64, activation='relu')(inputs)
        x = Dense(64, activation='relu')(x)
        out_actions = Dense(self.action_dim, activation='softmax')(x)
        out_value = Dense(1)(x)
        model = Model(inputs, [out_actions, out_value])
        return model

    def train(self, states, actions, advantages, rewards, old_predictions):
        """Run one optimizer step on a batch of transitions.

        Args:
            states: Batch of observations, shape ``(batch, state_dim)``.
            actions: Integer indices of the actions that were taken, one per
                sample. They select which probability enters the ratio.
            advantages: Advantage estimate per sample. If ``None``, it is
                computed as ``rewards - values`` with gradients stopped.
            rewards: Value-regression target per sample (for example the
                discounted return).
            old_predictions: Action probabilities, shape
                ``(batch, action_dim)``, produced by the policy that
                collected the batch.

        Returns:
            The scalar total loss tensor for this step.
        """
        states = tf.cast(states, tf.float32)
        rewards = tf.reshape(tf.cast(rewards, tf.float32), [-1])
        old_predictions = tf.cast(old_predictions, tf.float32)

        # One-hot mask so only the probability of the action actually taken
        # contributes to the importance ratio.
        action_ids = tf.reshape(tf.cast(actions, tf.int32), [-1])
        action_mask = tf.one_hot(action_ids, self.action_dim)
        old_prob = tf.reduce_sum(old_predictions * action_mask, axis=1)

        with tf.GradientTape() as tape:
            predictions, values = self.model(states, training=True)
            values = values[:, 0]

            if advantages is None:
                # The advantage is a constant weight for the policy term; the
                # value head must only learn through v_loss.
                adv = tf.stop_gradient(rewards - values)
            else:
                adv = tf.reshape(tf.cast(advantages, tf.float32), [-1])

            new_prob = tf.reduce_sum(predictions * action_mask, axis=1)
            # Small constant guards against division by a zero probability.
            p_ratio = new_prob / (old_prob + 1e-10)
            clipped_ratio = tf.clip_by_value(
                p_ratio, 1 - self.epsilon, 1 + self.epsilon
            )
            p_loss = -tf.reduce_mean(
                tf.minimum(p_ratio * adv, clipped_ratio * adv)
            )

            v_loss = tf.reduce_mean((rewards - values) ** 2)

            # Per-sample entropy of the policy, averaged to a scalar so that
            # total_loss is a scalar as well.
            entropy = -tf.reduce_sum(
                predictions * tf.math.log(predictions + 1e-10), axis=1
            )
            entropy_bonus = tf.reduce_mean(entropy)

            total_loss = p_loss + 0.5 * v_loss - 0.01 * entropy_bonus

        grads = tape.gradient(total_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables)
        )
        return total_loss

    def act(self, state):
        """Sample an action for a single observation from the current policy.

        Args:
            state: One observation of length ``state_dim``.

        Returns:
            The sampled action index in ``[0, action_dim)``.
        """
        batch = np.asarray([state], dtype=np.float32)
        # Calling the model directly avoids the per-call overhead of
        # `predict`, which is meant for large batched inference.
        prediction, _ = self.model(batch, training=False)
        probs = np.asarray(prediction)[0].astype(np.float64)
        # Renormalise so float32 rounding cannot trip the sum-to-one check
        # in np.random.choice.
        probs = probs / probs.sum()
        action = np.random.choice(self.action_dim, p=probs)
        return action

env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Create the PPO instance
ppo = PPO(state_dim, action_dim, lr=1e-3, gamma=0.99, epsilon=0.2)

# Rollout loop
# NOTE(review): this loop only samples actions from the current policy; it
# never calls ppo.train, so the network weights are not updated here.
episodes = 1000
max_steps_per_episode = 200  # Upper bound on the number of steps per episode
threshold = 50  # Episode reward at which the loop stops early

try:
    for episode in range(episodes):
        # Older gym returns just the observation from reset(); gym>=0.26 and
        # gymnasium return an (observation, info) tuple.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False
        total_rewards = 0  # Accumulated reward, used to monitor performance

        for step in range(max_steps_per_episode):
            action = ppo.act(state)

            # Older gym returns (obs, reward, done, info); newer versions
            # return (obs, reward, terminated, truncated, info).
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result

            total_rewards += reward
            state = next_state

            if done:
                break

        print(f"Episode: {episode + 1}, Total Reward: {total_rewards}")

        # Optional: stop early once an episode reaches `threshold` reward
        if total_rewards >= threshold:
            print("Stopping training as the agent has achieved the desired performance.")
            break

    print(f'Number of Episodes: {episode + 1}')
finally:
    # Release the environment even if the loop raises.
    env.close()

  and should_run_async(code)
  deprecation(
  deprecation(




  if not isinstance(terminated, (bool, np.bool8)):


Episode: 1, Total Reward: 23.0
Episode: 2, Total Reward: 16.0
Episode: 3, Total Reward: 12.0
Episode: 4, Total Reward: 25.0
Episode: 5, Total Reward: 30.0
Episode: 6, Total Reward: 27.0
Episode: 7, Total Reward: 45.0
Episode: 8, Total Reward: 18.0
Episode: 9, Total Reward: 59.0
Stopping training as the agent has achieved the desired performance.
Number of Episodes: 9
