In [13]:
import numpy as np
import tensorflow as tf
import gym
import matplotlib.pyplot as plt
from collections import deque
import random
from IPython.display import clear_output

In [14]:
class DoubleDQN(tf.keras.Model):
    def __init__(self, n_actions, n_features, learning_rate=0.005, reward_decay=0.9, e_greedy=0.9, replace_target_iter=200, memory_size=3000, batch_size=32, e_greedy_increment=None):
        super(DoubleDQN, self).__init__()
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.learn_step_counter = 0
        self.memory = np.zeros((self.memory_size, n_features*2+2))
        self._build_net()
        self.optimizer = tf.optimizers.RMSprop(self.lr)

    def _build_net(self):
        self.eval_net = self.create_network()
        self.target_net = self.create_network()

    def create_network(self):
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(20, activation='relu', input_shape=(self.n_features,)),
            tf.keras.layers.Dense(self.n_actions)
        ])
        return model

    def store_transition(self, state, action, reward, next_state, done):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        # Flatten the action if it's an array
        if isinstance(a, np.ndarray):
            a = a.flatten()
        # Store transition
        transition = np.hstack((s, a, [r], s_))
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        if np.random.uniform() < self.epsilon:  # choosing action
            action = np.random.randint(0, self.n_actions)
        else:
            observation = np.array(observation)[np.newaxis, :]
            actions_value = self.eval_net.predict(observation)
            action = np.argmax(actions_value)

        return action

    def learn(self):
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.target_net.set_weights(self.eval_net.get_weights())

        sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        q_next = self.target_net.predict(batch_memory[:, -self.n_features:])
        q_eval = self.eval_net.predict(batch_memory[:, :self.n_features])

        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        reward = batch_memory[:, self.n_features + 1]

        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        self.eval_net.train_on_batch(batch_memory[:, :self.n_features], q_target)

        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

In [15]:
# Replay Buffer
class ReplayBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        sample_size = min(len(self.buffer), batch_size)
        return random.sample(self.buffer, sample_size)

# Create Model
def create_model(num_states, num_actions):
    inputs = Input(shape=(num_states,))

    # Add the first hidden layer with 64 units and ReLU activation
    layer1 = Dense(64, activation="relu")(inputs)
    # Add dropout to the first hidden layer
    dropout1 = Dropout(0.7)(layer1)

    # Add the second hidden layer with 238 units and ReLU activation
    layer2 = Dense(128, activation="relu")(dropout1)
    # Add dropout to the second hidden layer
    dropout2 = Dropout(0.7)(layer2)

    # Add the output layer with 'num_actions' units and tanh activation
    action = Dense(num_actions, activation="tanh")(dropout2)
    return Model(inputs=inputs, outputs=action)

def smooth_data(data, window_percent):
    window_size = int(len(data) * window_percent)
    window_size = max(1, window_size)
    smoothed = np.convolve(data, np.ones(window_size)/window_size, mode='valid')
    return window_size, smoothed

def plot_live(loss_log, human_score_log, ylabel='Value'):
    # Smooth data
    window_size_ll, smoothed_data_ll = smooth_data(loss_log, 0.05)
    window_size_hs, smoothed_data_hs = smooth_data(human_score_log, 0.05)

    # Plotting
    clear_output(wait=True)
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

    # Loss Log Graph
    ax1.plot(loss_log, label='Original Data', linestyle='--', alpha=0.4)
    ax1.plot(np.arange(window_size_ll - 1, len(loss_log)), smoothed_data_ll, label='Smoothed Data')
    ax1.set_title("Loss Log Over Time")
    ax1.set_ylabel(ylabel)
    ax1.set_xlabel('Episode')
    ax1.grid(True)
    ax1.legend()

    # Human Score Log Graph
    ax2.plot(human_score_log, label='Original Data', linestyle='--', alpha=0.4)
    ax2.plot(np.arange(window_size_hs - 1, len(human_score_log)), smoothed_data_hs, label='Smoothed Data')
    ax2.set_title("Human Score Log Over Time")
    ax2.set_ylabel(ylabel)
    ax2.set_xlabel('Episode')
    ax2.grid(True)
    ax2.legend()

    plt.tight_layout()
    plt.show()

In [16]:
import numpy as np
import gym
import matplotlib.pyplot as plt
from collections import deque
import random

# Initialize Environment and Model
env = gym.make('Pendulum-v1')
num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

# Initialize DoubleDQN models
model = DoubleDQN(n_actions=num_actions, n_features=num_states)
model_target = DoubleDQN(n_actions=num_actions, n_features=num_states)

# Replay Buffer and other settings as before

# Training loop
running_reward = 0
episode_count = 0
frame_count = 0
human_score_log = []
loss_log = []

while True:
    state = env.reset()
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        frame_count += 1

        # Action selection
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            action = env.action_space.sample()
        else:
            action = model.choose_action(state)
        
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

# Environment step
        next_state, reward, done, _, _ = env.step(action)
        modified_reward, human_score = reward_mod(reward)
        human_score_log.append(human_score)
        episode_reward += modified_reward

        # Store transition and update state
        model.store_transition(state, action, modified_reward, next_state, done)
        state = next_state

        if frame_count % update_after_actions == 0 and len(buffer.buffer) > batch_size:
            indices = np.random.choice(range(len(buffer.buffer)), size=batch_size)
            minibatch = [buffer.buffer[i] for i in indices]

            state_sample = np.array([x[0] for x in minibatch])
            action_sample = np.array([x[1] for x in minibatch])
            rewards_sample = np.array([x[2] for x in minibatch])
            next_state_sample = np.array([x[3] for x in minibatch])
            done_sample = np.array([x[4] for x in minibatch])

            future_rewards = model_target.predict(next_state_sample)
            updated_q_values = rewards_sample + gamma * tf.reduce_max(future_rewards, axis=1)
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            with tf.GradientTape() as tape:
                q_values = model(state_sample)
                loss = loss_function(updated_q_values, q_values)
                loss_log.append(loss)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if frame_count % update_target_network == 0:
            model_target.set_weights(model.get_weights())

        # Check for end of episode
        if done:
            break

    # Running reward update and logging
    running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

    # Plotting and saving
    if episode_count % 500 == 0:
        plot_live(loss_log, human_score_log, ylabel='Value')

    if episode_count % 500 == 0:
        print(f"Episode {episode_count}: average reward: {running_reward:.2f}")

    if episode_count % 5000 == 0:
        model.save_weights(f'save/model_episode_{episode_count}.h5')

    episode_count += 1

    # Check for convergence/solution
    if running_reward > (reward_mod(0)[0] * 200) * 0.98:
        print(f"Solved at episode {episode_count}: average reward: {running_reward:.2f}!")
        model.save_weights('save/DQNSuccess.h5')
        break

NameError: name 'max_steps_per_episode' is not defined