In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import Huber
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import TensorBoard
from collections import deque
import time
import gym
import random
import datetime


# Replay Buffer
class ReplayBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        sample_size = min(len(self.buffer), batch_size)
        return random.sample(self.buffer, sample_size)

# log scaling for reward
def logarithmic_scaling(reward):
    if reward > 0:
        return np.log(1 + reward)
    elif reward < 0:
        return -np.log(1 - reward)
    else:
        return 0

def reward_mod(reward):
        normalized_reward = (reward / -16.2736044) * 100
        
        print(f"normalise {normalized_reward}")
        # Logarithmic Scaling
        log_scaled_reward = logarithmic_scaling(normalized_reward)

        print(f"log {log_scaled_reward}")
        # Clipping Rewards
        min_clip_value = -1
        max_clip_value = 1
        clipped_reward = np.clip(log_scaled_reward, min_clip_value, max_clip_value)
        
        return clipped_reward, (reward / -16.2736044) * 100

# Create Model
def create_model(num_states, num_actions):
    inputs = Input(shape=(num_states,))
    layer1 = Dense(32, activation="relu")(inputs)
    layer2 = Dense(64, activation="relu")(layer1)
    action = Dense(num_actions, activation="tanh")(layer2) # Using tanh for output
    return Model(inputs=inputs, outputs=action)

# Initialize Environment and Model
env = gym.make('Pendulum-v1', render_mode='human')
num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

model = create_model(num_states, num_actions)
model_target = create_model(num_states, num_actions)

# Learning rate schedule
initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=10000, decay_rate=0.9, staircase=True)
optimizer = Adam(learning_rate=lr_schedule)
loss_function = Huber()

# DQN parameters
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.01
epsilon_max = 1.0
epsilon_interval = epsilon_max - epsilon_min
epsilon_random_frames = 50000
epsilon_greedy_frames = 1000000.0
batch_size = 32
max_steps_per_episode = 200
update_after_actions = 4
update_target_network = 10000
max_memory_length = 100000
buffer = ReplayBuffer(max_memory_length)

# TensorBoard setup
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Early Stopping
early_stopping_callback = EarlyStopping(monitor='loss', patience=10)

# Training loop
running_reward = 0
episode_count = 0
frame_count = 0


human_score_log = []

while True:
    observation, _ = env.reset()
    state = np.array(observation, dtype=np.float32)
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        frame_count += 1
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            action = env.action_space.sample()
        else:
            action_probs = model.predict(state.reshape(1, -1))[0]
            action = np.clip(action_probs, env.action_space.low[0], env.action_space.high[0])

        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        next_state, reward, done, _, _ = env.step(action)

        modified_reward, human_score = reward_mod(reward)
        
        human_score_log.append(human_score)
        episode_reward += modified_reward

        next_state = np.array(next_state)

        buffer.add(state, action, reward, next_state, done)
        state = next_state

        if frame_count % update_after_actions == 0 and len(buffer.buffer) > batch_size:
            indices = np.random.choice(range(len(buffer.buffer)), size=batch_size)
            minibatch = [buffer.buffer[i] for i in indices]

            state_sample = np.array([x[0] for x in minibatch])
            action_sample = np.array([x[1] for x in minibatch])
            rewards_sample = np.array([x[2] for x in minibatch])
            next_state_sample = np.array([x[3] for x in minibatch])
            done_sample = np.array([x[4] for x in minibatch])

            future_rewards = model_target.predict(next_state_sample)
            updated_q_values = rewards_sample + gamma * tf.reduce_max(future_rewards, axis=1)
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            with tf.GradientTape() as tape:
                q_values = model(state_sample)
                loss = loss_function(updated_q_values, q_values)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if frame_count % update_target_network == 0:
            model_target.set_weights(model.get_weights())

        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward
        if running_reward > -200:
            print(f"Solved at episode {episode_count}: average reward: {running_reward:.2f}!")
            break

    logs = {'reward': episode_reward, 'running_reward': running_reward, 'loss': loss}
    tensorboard_callback.on_epoch_end(episode_count, logs)
    episode_count += 1

    if episode_count % 10 == 0:
        print(f"Episode {episode_count}: average reward: {running_reward:.2f}")

    # Save Model
    if episode_count % 500 == 0:
        model.save('save/model_episode_{}.h5'.format(episode_count))

    if episode_count % 1000 == 0:
        break


  if not isinstance(terminated, (bool, np.bool8)):


normalise 27.872827142757767
log 3.3629009155432357
Solved at episode 0: average reward: 0.05!


NameError: name 'loss' is not defined

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs