In [5]:
import numpy as np
import tensorflow as tf
import gym

# Hyperparameters
learning_rate = 0.01
gamma = 0.99
num_episodes = 500

# Environment
env = gym.make('CartPole-v1')

# Policy network
policy = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(128, activation='relu', input_shape=(env.observation_space.shape[0],)),
        tf.keras.layers.Dense(env.action_space.n, activation='softmax'),
    ]
)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Train the policy network
for episode in range(num_episodes):
    state = env.reset()
    done = False
    episode_states, episode_actions, episode_rewards = [], [], []

    # Collect a trajectory
    while not done:
        action_probs = policy(tf.convert_to_tensor(state.reshape(1, -1), dtype=tf.float32)).numpy().squeeze()
        action = np.random.choice(env.action_space.n, p=action_probs)
        next_state, reward, done, _ = env.step(action)

        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)

        state = next_state

    # Compute discounted rewards
    discounted_rewards = []
    cumulative_reward = 0
    for r in reversed(episode_rewards):
        cumulative_reward = r + gamma * cumulative_reward
        discounted_rewards.insert(0, cumulative_reward)
    discounted_rewards = np.array(discounted_rewards)
    discounted_rewards = (discounted_rewards - np.mean(discounted_rewards)) / np.std(discounted_rewards)

    # Update policy
    with tf.GradientTape() as tape:
        log_probs = tf.math.log(
            policy(tf.convert_to_tensor(np.vstack([s.reshape(1, -1) for s in episode_states]), dtype=tf.float32))
        )
        selected_log_probs = tf.reduce_sum(
            tf.one_hot(episode_actions, depth=env.action_space.n) * log_probs, axis=1
        )
        loss = -tf.reduce_sum(selected_log_probs * discounted_rewards)

    grads = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))

    # Print episode information
    episode_reward = sum(episode_rewards)
    print(f'Episode {episode + 1}, Reward: {episode_reward}')

# Test the trained policy
state = env.reset()
done = False
while not done:
    env.render()
    action_probs = policy(tf.convert_to_tensor(state.reshape(1, -1), dtype=tf.float32)).numpy().squeeze()
    action = np.argmax(action_probs)
    state, _, done, _ = env.step(action)

env.close()


AttributeError: 'tuple' object has no attribute 'reshape'