In [3]:
# Re-importing TensorFlow and necessary libraries
import numpy as np
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber
from tensorflow.keras import Model
from collection import deque
import random
import tensorflow as tf
import gym
import datetime
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

class ReplayBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        sample_size = min(len(self.buffer), batch_size)
        return random.sample(self.buffer, sample_size)


# Reinitializing the environment and model parameters
env = gym.make('Pendulum-v1', render_mode='human')

num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

def create_model():
    inputs = Input(shape=(num_states,))
    layer1 = Dense(32, activation="relu")(inputs)
    layer2 = Dense(64, activation="relu")(layer1)
    action = Dense(num_actions, activation="linear")(layer2)
    return Model(inputs=inputs, outputs=action)

model = create_model()
model_target = create_model()

initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=10000, decay_rate=0.9, staircase=True)
optimizer = Adam(learning_rate=lr_schedule)

loss_function = Huber()

# DQN parameters
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.01
epsilon_max = 1.0
epsilon_interval = epsilon_max - epsilon_min
epsilon_random_frames = 50000
epsilon_greedy_frames = 1000000.0
batch_size = 32
max_steps_per_episode = 200
update_after_actions = 4
update_target_network = 10000
max_memory_length = 100000

# Initialize replay buffers
action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []
running_reward = 0
episode_count = 0
frame_count = 0

# Main training loop
while True:
    observation, _ = env.reset()
    state = np.array(observation, dtype=np.float32)
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        frame_count += 1
        # Use epsilon-greedy for exploration
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            action = env.action_space.sample()
        else:
            action_probs = model.predict(state.reshape(1, -1))[0]
            action = np.clip(action_probs, env.action_space.low[0], env.action_space.high[0])

        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        state_next, reward, done, _, _ = env.step(action)
        state_next = np.array(state_next)
        episode_reward += reward

        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        state = state_next

        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            indices = np.random.choice(range(len(done_history)), size=batch_size)

            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = [rewards_history[i] for i in indices]
            action_sample = [action_history[i] for i in indices]
            done_sample = tf.convert_to_tensor([float(done_history[i]) for i in indices])

            future_rewards = model_target.predict(state_next_sample)
            updated_q_values = rewards_sample + gamma * tf.reduce_max(future_rewards, axis=1)
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample
            
            action_sample = np.array(action_sample).astype(int)  # Convert to NumPy array and then cast to int
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                q_values = model(state_sample)
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if frame_count % update_target_network == 0:
            model_target.set_weights(model.get_weights())

        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        if running_reward > -200:
            print(f"Solved at episode {episode_count}: average reward: {running_reward:.2f}!")
            break

    episode_reward = 0
    episode_count += 1

    if episode_count % 10 == 0:
        print(f"Episode {episode_count}: average reward: {running_reward:.2f}")
        env.render() 

    # model.save('DQN.h5')


ImportError: cannot import name 'deque' from 'collection' (c:\Users\zachary\anaconda3\envs\tf_env\lib\site-packages\collection\__init__.py)

Collecting collection
  Downloading collection-0.1.6.tar.gz (5.0 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: collection
  Building wheel for collection (setup.py): started
  Building wheel for collection (setup.py): finished with status 'done'
  Created wheel for collection: filename=collection-0.1.6-py3-none-any.whl size=5098 sha256=64f15095cc9a24cc77dc7e83400d920fb486a77688d28dfb22594588e229f541
  Stored in directory: c:\users\zachary\appdata\local\pip\cache\wheels\a5\70\eb\1d28795e9384ab3b9be6359bdde9e1652f6e7dab9d26844f70
Successfully built collection
Installing collected packages: collection
Successfully installed collection-0.1.6



[notice] A new release of pip is available: 23.3 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
