### Reinforcement Learning - Resource Manager

Kudos to:

https://www.gymlibrary.dev/content/environment_creation/

https://www.youtube.com/watch?v=bD6V3rcr_54&ab_channel=NicholasRenotte 

Version 1.0:

- Export the Environment to python file
    - Register it
- Add Deep Learning with DQN



### Imports

In [1]:
import gym
from gym import spaces
import numpy as np
import pygame
from gym.envs.registration import register
import tensorflow as tf
from tensorflow import keras

### Environment

In [2]:
from ResourceManagerEnvironment import ResourceManagerEnv

In [3]:
#register the environment
register(
    id='Resource-Manager-v1-1',
    #loading syntax: 'PythonFileName:EnvironmentClassName'
    entry_point='ResourceManagerEnvironment:ResourceManagerEnv',
    max_episode_steps=300,
)


In [4]:
env = gym.make('Resource-Manager-v1-1')

  logger.warn(


## Deep Learning

In [8]:
class DQNAgent:
    def __init__(self, num_actions, grid_size, epsilon=1.0, epsilon_decay=0.995, min_epsilon=0.01, learning_rate=0.001, gamma=0.99):
        #define number of action, here we have 4 actions: up, down, left, right
        self.num_actions = num_actions
        self.grid_size = grid_size

        #hyperparameters
        self.batch_size = 32
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon = epsilon
        self.min_epsilon = min_epsilon
        self.learning_rate = learning_rate
        self.gamma = gamma

        #define q_network and target_network
        self.q_network = self.build_q_network()
        self.target_network = self.build_q_network()

    def build_q_network(self):
        model = keras.Sequential([
            keras.layers.Input(shape=(self.grid_size, self.grid_size)),  # Define the state shape
            keras.layers.Dense(64, activation='relu'),
            keras.layers.Dense(64, activation='relu'),
            keras.layers.Dense(self.num_actions)  # Output layer with num_actions units
            ])
        model.compile(optimizer=keras.optimizers.Adam(learning_rate=self.learning_rate), loss='mse')
        return model


    def choose_action(self, state):

        action = np.random.randint(self.num_actions)
        '''
        if np.random.rand() < self.epsilon:
            #explore action space randomly
            action = np.random.randint(self.num_actions)
        else:
            #exploit learned values
            q_values = self.q_network(state[None, :])
            action = np.argmax(q_values[0])
        '''
        return action

    def train(self, state, action, reward, next_state, done):
        #state = np.array(state)
        state = np.expand_dims(state, axis=0)
        # Get the Q-values of the current state
        q_values = self.q_network.predict(state)
        print(f"Q-values: {q_values}")

        # Calculate the target Q-values using the Bellman equation
        if done:
            q_values[0][action] = reward
        else:
            next_q_values = self.target_network.predict(next_state)
            q_values[0][action] = reward + self.gamma * np.max(next_q_values)

        # Train the Q-network using the updated Q-values
        loss = self.q_network.train_on_batch(state, q_values)

        return loss



    def update_target_network(self):
        self.target_network.set_weights(self.q_network.get_weights())

    def decay_epsilon(self):
        if self.epsilon > self.min_epsilon:
            self.epsilon *= self.epsilon_decay


In [9]:
num_actions = 4
grid_size = 15

env = ResourceManagerEnv(grid_size=grid_size)
dqn_agent = DQNAgent(num_actions, grid_size=grid_size)

In [10]:

episodes = 5  # Number of episodes
max_steps = 300  # Maximum steps per episode

for episode in range(episodes):
    state = env.reset()
    done = False

    for step in range(max_steps):
        # Choose an action using the epsilon-greedy strategy (you may need to implement this in your DQNAgent class)
        action = dqn_agent.choose_action(state)
        print(f"Selected action: {action}")

        # Take the chosen action and observe the next state, reward, and whether the episode is done
        next_state, reward, done, _, info = env.step(action)

        # Train the agent using the observed experience
        dqn_agent.train(state, action, reward, next_state, done)

        # Update the current state
        state = next_state

        if done:
            break

    # Perform any necessary updates after an episode (e.g., decay epsilon, update target network)
    dqn_agent.decay_epsilon()
    dqn_agent.update_target_network()


Selected action: 3
Q-values: [[[ 0.          0.          0.          0.        ]
  [ 0.05117085 -0.08004584 -0.16986465 -0.03255314]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]]
Selected action: 1
Q-values: [[[-0.00099989 -0.00099989 -0.00099989 -0.00099989]
  [ 0.09023882 -0.15286566 -0.05210505 -0.08724376]
  [-0.00099989 -0.00099989 -0.00099989 -0.00099989]
  [-

## Evaluation