In [1]:
import numpy as np
import tensorflow as tf
import gym
from gym.wrappers import RecordVideo

2024-08-14 14:23:11.529052: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-14 14:23:11.529711: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-14 14:23:11.532024: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-14 14:23:11.539101: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-14 14:23:11.554168: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

In [2]:
# Build the CartPole-v1 environment; the cell's displayed value is the shape
# of the first observation returned by reset().
env = gym.make("CartPole-v1")
np.shape(env.reset()[0])

(4,)

In [3]:
# Define the actor and critic networks.
# The input shape is read from the environment's observation space instead of
# calling env.reset(), so building the models does not touch the env state.
observation_shape = env.observation_space.shape

# Actor: maps an observation to a probability distribution over actions.
actor = tf.keras.Sequential([
    tf.keras.layers.Input(observation_shape),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(env.action_space.n, activation='softmax')
])

# Critic: maps an observation to a single scalar value estimate.
# It declares its input explicitly (like the actor) so its weights are
# created at construction time rather than on the first call.
critic = tf.keras.Sequential([
    tf.keras.layers.Input(observation_shape),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1)
])


In [4]:
# One independent Adam optimizer per network, both with the same step size.
actor_optimizer, critic_optimizer = (
    tf.keras.optimizers.Adam(learning_rate=0.001) for _ in range(2)
)


In [5]:
# Per-step training log; every list receives exactly one entry per env step.
history = {
    "actor_loss": [],
    "critic_loss": [],
    "advantage": [],
    "state": [],
    "action": [],
    "episode": []
}
# Main training loop: one-step (TD(0)) actor-critic, updated after every step.
num_episodes = 1000
gamma = 0.99

for episode in range(num_episodes):
    state = env.reset()[0]
    episode_reward = 0

    for t in range(1, 10000):  # Limit the number of time steps
        state_batch = np.array([state])

        # A fresh tape per step: a single tape spanning the whole episode
        # keeps every forward pass alive until the episode ends.
        # persistent=True because two separate gradients are taken from it.
        with tf.GradientTape(persistent=True) as tape:
            # Choose an action using the actor
            action_probs = actor(state_batch)
            action = np.random.choice(env.action_space.n, p=action_probs.numpy()[0])

            # Take the chosen action and observe the next state and reward.
            # `terminated` and `truncated` are distinct signals in the
            # five-value step API; both end the episode.
            next_state, reward, terminated, truncated, info = env.step(action)

            # Compute the advantage (one-step TD error).
            state_value = critic(state_batch)[0, 0]
            next_state_value = critic(np.array([next_state]))[0, 0]
            # Do not bootstrap from a terminal state. A truncated episode is
            # only cut short, so bootstrapping from next_state is still valid.
            bootstrap_mask = 0.0 if terminated else 1.0
            # The target is treated as a constant so the critic is only
            # trained through V(state), not through its own target.
            td_target = tf.stop_gradient(
                float(reward) + gamma * bootstrap_mask * next_state_value)
            advantage = td_target - state_value

            # Compute actor and critic losses.
            # The advantage is a fixed weight for the policy-gradient term.
            actor_loss = -tf.math.log(action_probs[0, action]) * tf.stop_gradient(advantage)
            critic_loss = tf.square(advantage)

        episode_reward += reward

        # Update actor and critic. Gradients are taken outside the tape
        # context so the gradient computation itself is not recorded.
        actor_gradients = tape.gradient(actor_loss, actor.trainable_variables)
        critic_gradients = tape.gradient(critic_loss, critic.trainable_variables)
        del tape  # release the resources held by the persistent tape
        actor_optimizer.apply_gradients(zip(actor_gradients, actor.trainable_variables))
        critic_optimizer.apply_gradients(zip(critic_gradients, critic.trainable_variables))

        history["action"].append(action)
        history["actor_loss"].append(actor_loss.numpy())
        history["critic_loss"].append(critic_loss.numpy())
        history["state"].append(state_batch)
        history["advantage"].append(advantage.numpy())
        history["episode"].append(episode)

        state = next_state

        if terminated or truncated:
            print(f"Done at {t}")
            break

    if episode % 10 == 0:
        print(f"Episode {episode}, Reward: {episode_reward}")

env.close()




  if not isinstance(terminated, (bool, np.bool8)):


Done at 11
Episode 0, Reward: 11.0
Done at 12
Done at 14
Done at 19
Done at 16
Done at 11
Done at 31
Done at 24
Done at 12
Done at 18
Done at 42
Episode 10, Reward: 42.0
Done at 24
Done at 24
Done at 15
Done at 11
Done at 19
Done at 14
Done at 11
Done at 22
Done at 12
Done at 15
Episode 20, Reward: 15.0
Done at 10
Done at 39
Done at 9
Done at 22
Done at 31
Done at 35
Done at 36
Done at 13
Done at 35
Done at 31
Episode 30, Reward: 31.0
Done at 14
Done at 23
Done at 13
Done at 13
Done at 20
Done at 14
Done at 20
Done at 16
Done at 18
Done at 26
Episode 40, Reward: 26.0
Done at 10
Done at 15
Done at 36
Done at 11
Done at 20
Done at 16
Done at 60
Done at 36
Done at 67
Done at 19
Episode 50, Reward: 19.0
Done at 24
Done at 65
Done at 43
Done at 29
Done at 33
Done at 56
Done at 45
Done at 63
Done at 34
Done at 30
Episode 60, Reward: 30.0
Done at 33
Done at 22
Done at 71
Done at 46
Done at 84
Done at 32
Done at 18
Done at 35
Done at 31
Done at 21
Episode 70, Reward: 21.0
Done at 20
Done at 32

In [None]:
import pandas as pd

# Tabulate the per-step training log: one column per history key,
# one row per environment step.
pd.DataFrame.from_dict(history)

Unnamed: 0,actor_loss,critic_loss,advantage,state,action,episode
0,0.775312,1.237291,1.112336,"[[0.028243657, -0.008713742, -0.04648111, -0.0...",1,0
1,0.695130,1.005937,1.002964,"[[0.028243657, -0.008713742, -0.04648111, -0.0...",0,0
2,0.772181,1.240795,1.113910,"[[0.028243657, -0.008713742, -0.04648111, -0.0...",1,0
3,0.844709,1.490984,1.221058,"[[0.028243657, -0.008713742, -0.04648111, -0.0...",1,0
4,0.913361,1.754193,1.324459,"[[0.028243657, -0.008713742, -0.04648111, -0.0...",1,0
...,...,...,...,...,...,...
22018,-0.236335,0.107310,-0.327581,"[[0.0041085887, -0.035895947, -0.0006111813, -...",1,999
22019,-0.148713,0.049983,-0.223570,"[[0.0041085887, -0.035895947, -0.0006111813, -...",0,999
22020,-0.115961,0.025781,-0.160564,"[[0.0041085887, -0.035895947, -0.0006111813, -...",1,999
22021,-0.037960,0.002760,-0.052535,"[[0.0041085887, -0.035895947, -0.0006111813, -...",1,999


In [None]:

from IPython import display as ipythondisplay
from PIL import Image

# Separate CartPole instance created with render_mode="rgb_array";
# its render() output is fed to Image.fromarray below.
render_env = gym.make(
    "CartPole-v1",
    render_mode="rgb_array",
)

def render_episode(env: gym.Env, actor: tf.keras.Model, max_steps: int):
  """Roll out one episode with `actor` and collect rendered frames.

  Actions are sampled from the actor's output probabilities. The frame from
  right after reset is always captured; after that a frame is captured every
  10th step.

  Args:
    env: Environment whose `render()` result can be passed to
      `Image.fromarray`.
    actor: Model called on a batch of one observation; row 0 of its output is
      used as the action probabilities.
    max_steps: Upper bound on the number of environment steps.

  Returns:
    List of PIL images, in the order they were captured.
  """
  state, info = env.reset()
  state = tf.constant(state, dtype=tf.float32)
  screen = env.render()
  images = [Image.fromarray(screen)]

  for i in range(1, max_steps + 1):
    # The actor is called on a batch, so add a leading batch axis of size 1.
    state = tf.expand_dims(state, 0)
    action_probs = actor(state)
    action = np.random.choice(env.action_space.n, p=action_probs.numpy()[0])

    state, reward, done, truncated, info = env.step(action)
    state = tf.constant(state, dtype=tf.float32)

    # Render screen every 10 steps
    if i % 10 == 0:
      screen = env.render()
      images.append(Image.fromarray(screen))

    # Stop on either end-of-episode signal. Checking only `done` lets the
    # rollout continue past a time-limit cutoff until max_steps is reached.
    if done or truncated:
      print(f"Done at {i}")
      break

  return images


# Save GIF image
try:
    images = render_episode(render_env, actor, 100000)
finally:
    # Always release the rendering environment, even if the rollout fails.
    render_env.close()
image_file = 'cartpole-v1.gif'
# loop=0: loop forever, duration=1: play each frame for 1ms
images[0].save(
    image_file, save_all=True, append_images=images[1:], loop=0, duration=1)


Done at 36


In [None]:
from tensorflow_docs.vis import embed

# Display the GIF written to `image_file` inline in the notebook.
embed.embed_file(image_file)
