In [11]:
import gym
import numpy as np
import tensorflow as tf
from collections import deque
import random

# Hyperparameters (every tunable knob of the DQN experiment lives here)
learning_rate = 2.5e-4           # step size handed to the Adam optimizer
discount_factor = 0.99           # weight on the bootstrapped next-state value
epsilon_initial = 1.0            # exploration rate for the very first episode
epsilon_min = 0.1                # floor below which the exploration rate never drops
epsilon_decay = 0.9995           # multiplicative decay, raised to the episode index
batch_size = 32                  # transitions sampled from replay memory per update
memory_size = 1_000_000          # capacity (maxlen) of the replay deque
target_update_freq = 1_000       # step-count modulus for copying weights to the target network
num_episodes = 5_000             # number of episodes the training loop runs
max_steps_per_episode = 10_000   # upper bound on steps taken inside one episode

# Create the Gym environment shared by the later cells (network construction,
# replay collection, training and evaluation all read this `env`).
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]  # length of one observation vector; used as the network input width

# Define DQN network
def build_model():
    """Construct a fresh Q-network.

    The network is a fully connected stack: two hidden ReLU layers of 24 units
    each, followed by a linear output layer with one unit per discrete action
    (``env.action_space.n``). Inputs are vectors of length ``state_size``.
    The model is returned without being compiled.

    Returns:
        tf.keras.Sequential: a newly created model built from new layer objects.
    """
    first_hidden = tf.keras.layers.Dense(24, activation='relu', input_shape=(state_size,))
    second_hidden = tf.keras.layers.Dense(24, activation='relu')
    q_value_head = tf.keras.layers.Dense(env.action_space.n, activation='linear')
    return tf.keras.Sequential([first_hidden, second_hidden, q_value_head])

  deprecation(
  deprecation(


In [12]:
# Two structurally identical networks: `DQN` is the one updated by gradient
# steps, `target_network` supplies the next-state Q-values used in the Bellman
# targets. The target starts out as an exact copy of the online weights.
DQN = build_model()
target_network = build_model()
target_network.set_weights(DQN.get_weights())

# Replay memory: a bounded deque, so once `memory_size` transitions are stored
# every further append discards the oldest entry.
memory = deque(maxlen=memory_size)

# Optimizer and loss used by the manual GradientTape update in the training loop
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_function = tf.keras.losses.Huber()

# Function to perform epsilon-greedy action selection
def epsilon_greedy_policy(state, epsilon):
    """Choose an action for ``state`` using epsilon-greedy exploration.

    Args:
        state: a single observation (1-D array-like of length ``state_size``).
        epsilon: probability of taking a uniformly random action instead of
            the greedy one. ``0`` is fully greedy, ``1`` is fully random.

    Returns:
        Index of the selected action: a sample from ``env.action_space`` when
        exploring, otherwise the argmax of the online network's Q-values.
    """
    # Strict "<" so that epsilon == 0 can never explore (rand() may return 0.0)
    # while epsilon == 1 always does (rand() is always < 1).
    if np.random.rand() < epsilon:
        return env.action_space.sample()  # Explore: random action

    # Call the model directly instead of DQN.predict(): for one observation
    # this avoids predict()'s per-call batching overhead, which otherwise
    # dominates the cost when invoked once per environment step.
    observation = np.asarray(state, dtype=np.float32)[np.newaxis, :]
    q_values = np.array(DQN(observation, training=False))
    return int(np.argmax(q_values[0]))  # Exploit: greedy action

In [13]:
# Warm up the replay memory with transitions gathered by a uniformly random
# policy so the first minibatches can be sampled right away.
#
# Only a small slice of the buffer is pre-filled. Filling all `memory_size`
# slots (1,000,000 with the current setting) would cost that many environment
# steps before training starts and would leave the buffer dominated by
# random-policy data, because each training step adds just one new transition.
replay_warmup_size = min(memory_size, 50 * batch_size)

state = env.reset()
for _warmup_step in range(replay_warmup_size):
    action = env.action_space.sample()
    next_state, reward, done, _info = env.step(action)
    memory.append((state, action, reward, next_state, done))
    # Begin a new episode after a terminal step; otherwise continue from next_state
    state = env.reset() if done else next_state

# Training loop
# `global_step` counts environment steps across ALL episodes. The target
# network is synced on this counter; the per-episode `step` index restarts at
# 0 every episode, so using it would sync at the start of every episode
# instead of every `target_update_freq` steps.
global_step = 0

for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0
    # Exploration rate decays geometrically with the episode index, floored at epsilon_min
    epsilon = max(epsilon_min, epsilon_initial * epsilon_decay ** episode)

    for step in range(max_steps_per_episode):
        # Select action using epsilon-greedy policy
        action = epsilon_greedy_policy(state, epsilon)

        # Take action and observe next state and reward
        next_state, reward, done, _info = env.step(action)
        total_reward += reward
        global_step += 1

        # Store transition in replay memory
        memory.append((state, action, reward, next_state, done))

        # Learn from a random minibatch once enough transitions are stored
        if len(memory) >= batch_size:
            minibatch = random.sample(memory, batch_size)

            # Split the list of transition tuples into one array per field
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(np.array, zip(*minibatch))
            states_batch = states_batch.astype(np.float32)
            next_states_batch = next_states_batch.astype(np.float32)
            action_batch = action_batch.astype(np.int64)
            done_batch = done_batch.astype(bool)

            # Direct model calls instead of predict(): cheaper for a single
            # small batch. np.array(...) yields a writable copy to edit below.
            target_q_values = np.array(DQN(states_batch, training=False), dtype=np.float32)
            next_q_values = np.array(target_network(next_states_batch, training=False), dtype=np.float32)

            # Bellman targets, vectorised over the batch:
            #   r                              if the transition ended the episode
            #   r + gamma * max_a' Q_target    otherwise
            bootstrap = discount_factor * next_q_values.max(axis=1)
            td_targets = reward_batch + np.where(done_batch, 0.0, bootstrap)

            # Only the taken action's entry is replaced; the other entries keep
            # the current prediction and therefore contribute zero loss.
            target_q_values[np.arange(batch_size), action_batch] = td_targets

            # Train the DQN
            with tf.GradientTape() as tape:
                q_values = DQN(states_batch, training=True)
                loss = loss_function(target_q_values, q_values)
            gradients = tape.gradient(loss, DQN.trainable_variables)
            optimizer.apply_gradients(zip(gradients, DQN.trainable_variables))

        # Update state
        state = next_state

        # Update target network every target_update_freq environment steps
        if global_step % target_update_freq == 0:
            target_network.set_weights(DQN.get_weights())

        # Break if episode is done
        if done:
            break

    # Print episode information
    print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")

  if not isinstance(terminated, (bool, np.bool8)):






[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode 133/5000, Total Reward: 26.0
Episode 134/5000, Total Reward: 15.0
Episode 135/5000, Total Reward: 14.0
Episode 136/5000, Total Reward: 13.0
Episode 137/5000, Total Reward: 10.0
Episode 138/5000, Total Reward: 13.0
Episode 139/5000, Total Reward: 27.0
Episode 140/5000, Total Reward: 15.0
Episode 141/5000, Total Reward: 19.0
Episode 142/5000, Total Reward: 13.0
Episode 143/5000, Total Reward: 32.0
Episode 144/5000, Total Reward: 17.0
Episode 145/5000, Total Reward: 16.0
Episode 146/5000, Total Reward: 17.0
Episode 147/5000, Total Reward: 25.0
Episode 148/5000, Total Reward: 12.0
Episode 149/5000, Total Reward: 34.0
Episode 150/5000, Total Reward: 15.0
Episode 151/5000, Total Reward: 11.0
Episode 152/5000, Total Reward: 36.0
Episode 153/5000, Total Reward: 42.0
Episode 154/5000, Total Reward: 30.0
Episode 155/5000, Total Reward: 22.0
Episode 156/5000, Total Reward: 17.0
Episode 157/5000, Total Reward: 20.0
Episode 15

KeyboardInterrupt: 

In [14]:
# Evaluate the purely greedy policy (no exploration) over 100 episodes and
# report the mean undiscounted return.
total_rewards = []
for _ in range(100):
    state = env.reset()
    total_reward = 0
    done = False
    while not done:
        # Call the model directly rather than DQN.predict(): predict() carries
        # per-call overhead that is wasteful for one observation per step.
        observation = np.asarray(state, dtype=np.float32)[np.newaxis, :]
        q_values = np.array(DQN(observation, training=False))
        action = int(np.argmax(q_values[0]))
        next_state, reward, done, _info = env.step(action)
        total_reward += reward
        state = next_state
    total_rewards.append(total_reward)

print(f"Average Total Reward over 100 episodes: {np.mean(total_rewards)}")

Average Total Reward over 100 episodes: 9.29


In [1]:
# import gym
# import numpy as np
# import tensorflow as tf

# # Define hyperparameters
# learning_rate = 0.01
# discount_factor = 0.99
# epsilon = 0.1
# num_episodes = 100

# # Create environment
# env = gym.make('CartPole-v1')
# state_size = env.observation_space.shape[0]
# action_size = env.action_space.n

# # Define Q-network
# model = tf.keras.Sequential([
#     tf.keras.layers.Dense(24, activation='relu', input_shape=(state_size,)),
#     tf.keras.layers.Dense(24, activation='relu'),
#     tf.keras.layers.Dense(action_size, activation='linear')
# ])
# model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
#               loss='mse')

# # Q-learning algorithm
# for episode in range(num_episodes):
#     state = env.reset()
#     state = np.reshape(state, [1, state_size])

#     done = False
#     total_reward = 0

#     while not done:
#         # Choose action
#         if np.random.rand() <= epsilon:
#             action = env.action_space.sample()  # Explore action space
#         else:
#             q_values = model.predict(state)
#             action = np.argmax(q_values[0])  # Exploit learned values

#         # Take action
#         next_state, reward, done, _ = env.step(action)
#         next_state = np.reshape(next_state, [1, state_size])
#         total_reward += reward

#         # Update Q-value
#         target = reward + discount_factor * np.max(model.predict(next_state)[0])
#         q_values = model.predict(state)
#         q_values[0][action] = target
#         model.fit(state, q_values, epochs=1, verbose=0)

#         state = next_state

#     # Decay epsilon
#     if epsilon > 0.01:
#         epsilon *= 0.99

#     # Print episode information
#     print(f"Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}")

# # Evaluate trained model
# total_rewards = []
# for _ in range(100):
#     state = env.reset()
#     state = np.reshape(state, [1, state_size])
#     done = False
#     total_reward = 0
#     while not done:
#         action = np.argmax(model.predict(state)[0])
#         next_state, reward, done, _ = env.step(action)
#         next_state = np.reshape(next_state, [1, state_size])
#         total_reward += reward
#         state = next_state
#     total_rewards.append(total_reward)

# print(f"Average Total Reward over 100 episodes: {np.mean(total_rewards)}")

  deprecation(
  deprecation(




  if not isinstance(terminated, (bool, np.bool8)):


Episode 1/100, Total Reward: 10.0
Episode 2/100, Total Reward: 10.0
Episode 3/100, Total Reward: 11.0
Episode 4/100, Total Reward: 13.0
Episode 5/100, Total Reward: 9.0
Episode 6/100, Total Reward: 10.0
Episode 7/100, Total Reward: 8.0
Episode 8/100, Total Reward: 10.0
Episode 9/100, Total Reward: 9.0
Episode 10/100, Total Reward: 13.0
Episode 11/100, Total Reward: 10.0
Episode 12/100, Total Reward: 10.0
Episode 13/100, Total Reward: 10.0
Episode 14/100, Total Reward: 10.0
Episode 15/100, Total Reward: 9.0
Episode 16/100, Total Reward: 9.0
Episode 17/100, Total Reward: 9.0
Episode 18/100, Total Reward: 10.0
Episode 19/100, Total Reward: 10.0
Episode 20/100, Total Reward: 11.0
Episode 21/100, Total Reward: 9.0
Episode 22/100, Total Reward: 9.0
Episode 23/100, Total Reward: 9.0
Episode 24/100, Total Reward: 9.0
Episode 25/100, Total Reward: 12.0
Episode 26/100, Total Reward: 10.0
Episode 27/100, Total Reward: 10.0
Episode 28/100, Total Reward: 10.0
Episode 29/100, Total Reward: 9.0
Episo