## <center>CSE 546: Reinforcement Learning</center>
### <center>Prof. Alina Vereshchaka</center>
<!-- ### <center>Fall 2022</center> -->

Welcome to the Assignment 2, Part 1: Introduction to Deep Reinforcement Learning and Neural Networks! The goal of this assignment is to make you comfortable with the application of different Neural Network structures depending on how the Reinforcement Learning environment is set up.

In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [2]:
# Imports
import cv2
import gymnasium as gym
from gymnasium import spaces
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import numpy as np
from time import time


# Defining the Wumpus World Environment.
class WumpusWorldEnvironment(gym.Env):
    """This class implements the Wumpus World environment."""

    def __init__(self, observation_type, action_type):
        """This method initializes the environment.

        :param string observation_type: - It can take four values: 1. 'integer' 2. 'vector' 3. 'image' 4. 'float'
                                          determining the type of observation returned to the agent.

        :param string action_type: It can take three values: 1. 'discrete' 2. 'continuous' 3. 'multi_discrete'
                                   determining the type of action the agent can take."""

        self.observation_type = observation_type.lower()

        self.environment_width = 6
        self.environment_height = 6

        self.observation_space = spaces.Discrete(self.environment_width * self.environment_height)

        # Action.
        self.action_type = action_type.lower()

        if self.action_type == 'discrete':
            self.action_space = spaces.Discrete(4)
        elif self.action_type == 'continuous':
            self.action_space = spaces.Box(low=np.array([-1]), high=np.array([1]))
        elif self.action_type == 'multi_discrete':
            self.action_space = spaces.MultiDiscrete([2, 2, 2, 2])
        else:
            raise Exception('Invalid action type. Valid action types are: '
                            '\n1. discrete \n2. continuous \n3. multi_discrete')

        # Positions of environment objects.
        self.agent_pos = np.asarray([0, 0])

        self.breeze_pos = np.asarray([[1, 0], [3, 0], [5, 0], [2, 1], [4, 1], [1, 2], [3, 2], [5, 2], [0, 3],
                                      [2, 3], [1, 4], [3, 4], [5, 4], [0, 5], [2, 5], [4, 5]])

        self.gold_pos = np.asarray([4, 5])

        self.pit_pos = np.asarray([[2, 0], [5, 1], [2, 2], [0, 4], [2, 4], [3, 5], [5, 5]])

        self.stench_pos = np.asarray([[3, 2], [2, 3], [4, 3], [3, 4]])

        self.wumpus_pos = np.asarray([3, 3])

        self.gold_quantity = 1

        self.timesteps = 0
        self.max_timesteps = 1000

        # Creating the mapping from the co-ordinates to integers representing the grid blocks.
        self.coordinates_state_mapping = {}
        for i in range(self.environment_height):
            for j in range(self.environment_width):
                self.coordinates_state_mapping[f'{np.asarray([j, i])}'] = i * self.environment_width + j

        self.start_time = time()

    def reset(self):
        """This method resets the agent position and returns the state as the observation.

        :returns observation: - Observation received by the agent (Type depends on the parameter observation_type)."""

        self.agent_pos = np.asarray([0, 0])

        observation = self.return_observation()
        self.timesteps = 0
        self.gold_quantity = 1
        info = {}

        self.start_time = time()

        return observation, info

    def return_observation(self):
        """This method returns the observation.

        :returns observation - Observation received by the agent (Type depends on the parameter observation_type)."""

        if self.observation_type == 'integer':
            observation = self.coordinates_state_mapping[f'{self.agent_pos}']
        elif self.observation_type == 'vector':
            observation = self.agent_pos
        elif self.observation_type == 'image':
            observation = self.render(plot=False)
        elif self.observation_type == 'float':
            time_elapsed = time() - self.start_time
            observation = time_elapsed
        else:
            raise Exception('Invalid observation type. Valid observation types are: '
                            '\n1. integer \n2. vector \n3. image \n4. float')

        return observation

    def take_action(self, action):
        """This method takes the action.

        :param action: - Action taken by the agent (Type depends on the parameter action_type)."""

        if self.action_type == 'discrete':
            if action == 0:
                self.agent_pos[0] += 1  # Right.
            elif action == 1:
                self.agent_pos[0] -= 1  # Left.
            elif action == 2:
                self.agent_pos[1] += 1  # Up.
            elif action == 3:
                self.agent_pos[1] -= 1  # Down.
            else:
                raise Exception('InvalidAction: Discrete action can take values 0, 1, 2 or 3.')

        elif self.action_type == 'continuous':
            if -1 <= action <= -0.5:
                self.agent_pos[0] += 1  # Right.
            elif -0.5 < action <= 0:
                self.agent_pos[0] -= 1  # Left.
            elif 0 < action <= 0.5:
                self.agent_pos[1] += 1  # Up.
            elif 0.5 < action <= 1:
                self.agent_pos[1] -= 1  # Down.
            else:
                raise Exception('InvalidAction: Continuous action has a range [-1, 1].')

        elif self.action_type == 'multi_discrete':
            if action[0] == 1:
                self.agent_pos[0] += 1  # Right.
            if action[1] == 1:
                self.agent_pos[0] -= 1  # Left.
            if action[2] == 1:
                self.agent_pos[1] += 1  # Up.
            if action[3] == 1:
                self.agent_pos[1] -= 1  # Down.
            if len(action) != 4 or (action[0] not in [0, 1] or action[1] not in [0, 1] or action[2] not in [0, 1]
                                    or action[3] not in [0, 1]):
                raise Exception(
                    'InvalidAction: Multi-Discrete action takes binary values in the array [0, 0, 0, 0]. '
                    'Refer to the assignment problem statement on environment details.')

    def step(self, action):
        """This method implements what happens when the agent takes a particular action. It changes the agent's
        position (While not allowing it to go out of the environment space.), maps the environment co-ordinates to a
        state, defines the rewards for the various states, and determines when the episode ends.

        :param action: - Action taken by the agent (Type depends on the parameter action_type).

        :returns observation: - Observation received by the agent (Type depends on the parameter observation_type).
                 int reward: - Integer value that's used to measure the performance of the agent.
                 bool done: - Boolean describing whether the episode has ended.
                 dict info: - A dictionary that can be used to provide additional implementation information."""

        self.take_action(action)

        # Ensuring that the agent doesn't go out of the environment.
        self.agent_pos = np.clip(self.agent_pos, a_min=[0, 0],
                                 a_max=[self.environment_width - 1, self.environment_height - 1])

        observation = self.return_observation()

        reward, terminated, truncated = None, None, None
        info = {}

        return observation, reward, terminated, truncated, info

    def render(self, mode='human', plot=False):
        """This method renders the environment.

        :param string mode: 'human' renders to the current display or terminal and returns nothing.

        :param boolean plot: Boolean indicating whether we show a plot or not.

                             If False, the method returns a resized NumPy array representation of the environment
                             to be used as the state.

                             If True it plots the environment.

        :returns array preprocessed_image: Grayscale NumPy array representation of the environment."""

        fig, ax = plt.subplots(figsize=(15, 15))
        ax.set_xlim(0, 6)
        ax.set_ylim(0, 6)

        def plot_image(plot_pos):
            """This is a helper function to render the environment. It checks which objects are in a particular
            position on the grid and renders the appropriate image.

            :param arr plot_pos: Co-ordinates of the grid position which needs to be rendered."""

            # Initially setting every object to not be plotted.
            plot_agent, plot_breeze, plot_gold, plot_pit, plot_stench, plot_wumpus = \
                False, False, False, False, False, False

            # Checking which objects need to be plotted by comparing their positions.
            if np.array_equal(self.agent_pos, plot_pos):
                plot_agent = True
            if any(np.array_equal(self.breeze_pos[i], plot_pos) for i in range(len(self.breeze_pos))):
                plot_breeze = True
            if self.gold_quantity > 0:  # Gold isn't plotted if it has already been picked by one of the agents.
                if np.array_equal(plot_pos, self.gold_pos):
                    plot_gold = True
            if any(np.array_equal(self.pit_pos[i], plot_pos) for i in range(len(self.pit_pos))):
                plot_pit = True
            if any(np.array_equal(self.stench_pos[i], plot_pos) for i in range(len(self.stench_pos))):
                plot_stench = True
            if np.array_equal(plot_pos, self.wumpus_pos):
                plot_wumpus = True

            # Plot for Agent.
            if plot_agent and \
                    all(not item for item in
                        [plot_breeze, plot_gold, plot_pit, plot_stench, plot_wumpus]):
                agent = AnnotationBbox(OffsetImage(plt.imread('./images/agent.png'), zoom=0.28),
                                       np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(agent)

            # Plot for Breeze.
            elif plot_breeze and \
                    all(not item for item in
                        [plot_agent, plot_gold, plot_pit, plot_stench, plot_wumpus]):
                breeze = AnnotationBbox(OffsetImage(plt.imread('./images/breeze.png'), zoom=0.28),
                                        np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(breeze)

            # Plot for Gold.
            elif plot_gold and \
                    all(not item for item in
                        [plot_agent, plot_breeze, plot_pit, plot_stench, plot_wumpus]):
                gold = AnnotationBbox(OffsetImage(plt.imread('./images/gold.png'), zoom=0.28),
                                      np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(gold)

            # Plot for Pit.
            elif plot_pit and \
                    all(not item for item in
                        [plot_agent, plot_breeze, plot_gold, plot_stench, plot_wumpus]):
                pit = AnnotationBbox(OffsetImage(plt.imread('./images/pit.png'), zoom=0.28),
                                     np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(pit)

            # Plot for Stench.
            elif plot_stench and \
                    all(not item for item in
                        [plot_agent, plot_breeze, plot_gold, plot_pit, plot_wumpus]):
                stench = AnnotationBbox(OffsetImage(plt.imread('./images/stench.png'), zoom=0.28),
                                        np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(stench)

            # Plot for Wumpus.
            elif plot_wumpus and \
                    all(not item for item in
                        [plot_agent, plot_breeze, plot_gold, plot_pit, plot_stench]):
                wumpus = AnnotationBbox(OffsetImage(plt.imread('./images/wumpus.png'), zoom=0.28),
                                        np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(wumpus)

            # Plot for Agent and Breeze.
            elif all(item for item in [plot_agent, plot_breeze]) and \
                    all(not item for item in
                        [plot_gold, plot_pit, plot_stench, plot_wumpus]):
                agent_breeze = AnnotationBbox(OffsetImage(plt.imread('./images/agent_breeze.png'), zoom=0.28),
                                              np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(agent_breeze)

            # Plot for Agent and Pit.
            elif all(item for item in [plot_agent, plot_pit]) and \
                    all(not item for item in
                        [plot_breeze, plot_gold, plot_stench, plot_wumpus]):
                agent_pit = AnnotationBbox(OffsetImage(plt.imread('./images/agent_dead_pit.png'), zoom=0.28),
                                           np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(agent_pit)

            # Plot for Agent and Stench.
            elif all(item for item in [plot_agent, plot_stench]) and \
                    all(not item for item in
                        [plot_breeze, plot_gold, plot_pit, plot_wumpus]):
                agent_stench = AnnotationBbox(OffsetImage(plt.imread('./images/agent_stench.png'), zoom=0.28),
                                              np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(agent_stench)

            # Plot for Agent, Breeze and Stench.
            elif all(item for item in [plot_agent, plot_breeze, plot_stench]) and \
                    all(not item for item in
                        [plot_gold, plot_pit, plot_wumpus]):
                agent_breeze_stench = AnnotationBbox(OffsetImage(plt.imread('./images/agent_breeze_stench.png'),
                                                                 zoom=0.28), np.add(plot_pos, [0.5, 0.5]),
                                                     frameon=False)
                ax.add_artist(agent_breeze_stench)

            # Plot for Agent and Wumpus.
            elif all(item for item in [plot_agent, plot_wumpus]) and \
                    all(not item for item in
                        [plot_gold, plot_pit, plot_stench, plot_breeze]):
                agent_wumpus = AnnotationBbox(OffsetImage(plt.imread('./images/agent_dead_wumpus_alive.png'),
                                                          zoom=0.28), np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(agent_wumpus)

            # Plot for Breeze and Gold.
            elif all(item for item in [plot_breeze, plot_gold]) and \
                    all(not item for item in
                        [plot_agent, plot_pit, plot_stench, plot_wumpus]):
                breeze_gold = AnnotationBbox(OffsetImage(plt.imread('./images/breeze_gold.png'), zoom=0.28),
                                             np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(breeze_gold)

            # Plot for Breeze and Stench.
            elif all(item for item in [plot_breeze, plot_stench]) and \
                    all(not item for item in
                        [plot_agent, plot_gold, plot_pit, plot_wumpus]):
                breeze_stench = AnnotationBbox(OffsetImage(plt.imread('./images/breeze_stench.png'), zoom=0.28),
                                               np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(breeze_stench)

            # Plot for Breeze, Stench, and Gold.
            elif all(item for item in [plot_breeze, plot_gold, plot_stench]) and \
                    all(not item for item in
                        [plot_agent, plot_pit, plot_wumpus]):
                breeze_gold_stench = AnnotationBbox(OffsetImage(plt.imread('./images/breeze_gold_stench.png'),
                                                                zoom=0.28), np.add(plot_pos, [0.5, 0.5]),
                                                    frameon=False)
                ax.add_artist(breeze_gold_stench)

            # Plot for Stench and Gold.
            elif all(item for item in [plot_stench, plot_gold]) and \
                    all(not item for item in
                        [plot_agent, plot_breeze, plot_pit, plot_wumpus]):
                stench_gold = AnnotationBbox(OffsetImage(plt.imread('./images/stench_gold.png'), zoom=0.28),
                                             np.add(plot_pos, [0.5, 0.5]), frameon=False)
                ax.add_artist(stench_gold)

        coordinates_state_mapping_2 = {}
        for j in range(self.environment_height * self.environment_width):
            coordinates_state_mapping_2[j] = np.asarray(
                [j % self.environment_width, int(np.floor(j / self.environment_width))])

        # Rendering the images for all states.
        for position in coordinates_state_mapping_2:
            plot_image(coordinates_state_mapping_2[position])

        plt.xticks([0, 1, 2, 3, 4, 5])
        plt.yticks([0, 1, 2, 3, 4, 5])
        plt.grid()

        if plot:  # Displaying the plot.
            plt.show()
        else:  # Returning the preprocessed image representation of the environment.
            fig.canvas.draw()
            img = np.array(fig.canvas.renderer.buffer_rgba())[:, :, :1]
            width = int(84)
            height = int(84)
            dim = (width, height)
            # noinspection PyUnresolvedReferences
            preprocessed_image = cv2.resize(img, dim, interpolation=cv2.INTER_AREA)
            plt.close(fig)
            return preprocessed_image

In [3]:
# # Imports
# from environment import WumpusWorldEnvironment

# Environment

We will be working with an implementation of the Wumpus World environment. The environment comes from the book "Artificial Intelligence: A Modern Approach" by Stuart J. Russell and Peter Norvig.

### ENVIRONMENT DETAILS:

The environment is a 6 x 6 grid world containing a total of 36 grid blocks.

#### ENVIRONMENT OBJECTS:
The environment consists of the following objects:

1. **Agent** - The agent starts in the grid block at the bottom left corner whose co-ordinates are [0, 0]. The goal of our agent is to collect the Gold while avoiding the Wumpus and the pits.

2. **Wumpus** - The monster which would eat the agent if they are in the same grid block.

3. **Pit** - The agent must avoid falling into the pits.

4. **Gold** - The agent must collect the Gold.

5. **Breeze** - Breeze surrounds the Pits and warn the agent of a Pit in an adjacent grid block.

6. **Stench** - Stench surrounds the Wumpus and warns the agent of the Wumpus in an adjacent grid block.

#### ENVIRONMENT OBSERVATIONS:

Our implementation of the environment provides you with four different types of observations:

1. **Integer** - Integer in the range [0 - 35]. This represents the grid block the agent is in. E.g., if the agent is in the bottom left grid block (starting position) the observation would be 0, if the agent is in the grid block containing the Gold the observation would be 34, if the agent is in the top right grid block the observation would be 35.

2. **Vector** -

    **2.1.** A vector of length 2 representing the agent co-ordinates. The first entry represents the x co-ordinate and the second entry represets the y co-ordinate. E.g., if the agent is in the bottom left grid block (starting position) the observation would be [0, 0], if the agent is in the grid block containing the Gold the observation would be [4, 5], if the agent is in the top right grid block the observation would be [5, 5].
    
    **2.2.** A vector of length 36 representing the one-hot encoding of the integer observation (refer type 1 above). E.g., if the agent is in the bottom left grid block (starting position) the observation would be [1, 0, ..., 0, 0], if the agent is in the grid block containing the Gold the observation would be [0, 0, ..., 1, 0], if the agent is in the top right grid block the observation would be [0, 0, ..., 0, 1].


3. **Image** - Image render of the environment returned as an NumPy array. The image size is 84 * 84 (same size used in the DQN paper). E.g., if the agent is in the bottom right grid block the observation is:

    Observation: (84 * 84)

     [[255 255 255 ... 255 255 255]

     [255 255 255 ... 255 255 255]

     [255 255 255 ... 255 255 255]

     ...

     [255 255 255 ... 255 255 255]

     [255 255 255 ... 255 255 255]

     [255 255 255 ... 255 255 255]]

    Observation type: <class 'numpy.ndarray'>

    Observation Shape: (84, 84)

    Visually, it looks like:
    <img src="./images/environment_render.png" width="500" height="500">
    

4. **Float** - Float in the range [0 - $\infty$] representing the time elapsed in seconds.

#### ENVIRONMENT ACTIONS:

Our implementation of the environment provides you with three different types of actions:

1. **Discrete** - Integer in the range [0 - 3] representing the four actions possible in the environment as follows: 0 - Right 1 - Left 2 - Up 3 - Down.

2. **Multi-Discrete** - Array of length four where each element takes binary values 0 or 1. Array elements represent if we take a particular action. Array element with index 0 corresponds to the right action, index 1 corresponds to the left action, index 2 corresponds to the up action, and index 3 corresponds to the down action. E.g.,
   action = [1, 0, 0, 0] would result in the agent moving right.
   action = [1, 0, 1, 0] would result in the agent moving right and up.
   action = [0, 1, 0, 1] would result in the agent moving left and down.

3. **Continuous** - Float in the range [-1, 1] determining whether the agent will go left, right, up, or down as follows:

    if -1 <= action <= -0.5:
        Go Right.
    elif -0.5 < action <= 0:
        Go Left.
    elif 0 < action <= 0.5:
        Go Up.
    elif 0.5 < action <= 1:
        Go Down.
        
### YOUR TASK IS TO USE A NEURAL NETWORK TO WORK WITH ALL FOUR TYPES OF OBSERVATIONS AND ALL THREE TYPES OF  ACTIONS.
### Note: You don't have to train your agent/neural network. You just have to build the neural network structure that takes the observation as input and produces the desired output with the initial weights.

#### You can use libraries such as PyTorch/TensorFlow/Keras to build your neural networks.

#### <span style="color:red">You cannot use RL libraries that already provide the neural network to you such as Stable-baselines3, Keras-RL, TF agents, Ray RLLib etc.</span>

<img src="./images/wumpus_world_environment.jpg" width="600" height="600">

# START COMPLETING YOUR ASSIGNMENT HERE

## Observation Type - Integer, Action Type - Discrete

The part of the assignment requires you to create a sequential dense neural network with 1 hidden layer having 64 neurons and the output layer having 4 neurons. The input to the neural network is an integer (refer to environment observations type 1). The output of the neural network is an array represeting the Q-values from which you will choose an action (refer to environment actions type 1).

The following figure shows the network structure you will have to use:

<img src="./images/neural_network_structures/neural_network_1_64_4.png">

In [4]:
import tensorflow as tf
from tensorflow.keras import layers

In [5]:
"""TO DO: Create a neural network, pass it the observation from the environment
and get the predicted Q-values for the four actions. Print the observation and the Q-values."""

environment = WumpusWorldEnvironment(observation_type='integer', action_type='discrete')
observation, info = environment.reset()

# BEGIN_YOUR_CODE

model = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(1,)),
    layers.Dense(4, activation='linear')
])

model.compile(optimizer='adam', loss='mse')
model.summary()

observation = np.array([observation]).reshape(-1, 1)
predicted_q_values = model.predict(observation)

print("Observation:", observation)
print("Predicted Q-values:", predicted_q_values.flatten())

# END_YOUR_CODE

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                128       
                                                                 
 dense_1 (Dense)             (None, 4)                 260       
                                                                 
Total params: 388 (1.52 KB)
Trainable params: 388 (1.52 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Observation: [[0]]
Predicted Q-values: [0. 0. 0. 0.]


## Observation Type - Vector (2.1), Action Type - Discrete

The part of the assignment requires you to create a sequential dense neural network with 1 hidden layer having 64 neurons and the output layer having 4 neurons. The input to the neural network is a vector of length 2 (refer to environment observations type 2.1). The output of the neural network is an array represeting the Q-values from which you will choose an action (refer to environment actions type 1).

The following figure shows the network structure you will have to use:

<img src="./images/neural_network_structures/neural_network_2_64_4.png">

In [6]:
"""TO DO: Create a neural network, pass it the observation from the environment
and get the predicted Q-values for the four actions. Print the observation and the Q-values."""

environment = WumpusWorldEnvironment(observation_type='vector', action_type='discrete')
observation, info = environment.reset()

# BEGIN_YOUR_CODE

model = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(2,)),
    layers.Dense(4, activation='linear')
])

model.compile(optimizer='adam', loss='mse')
model.summary()

observation = np.array([observation])

predicted_q_values = model.predict(observation)

print("Observation:", observation)
print("Predicted Q-values:", predicted_q_values.flatten())

# END_YOUR_CODE

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 64)                192       
                                                                 
 dense_3 (Dense)             (None, 4)                 260       
                                                                 
Total params: 452 (1.77 KB)
Trainable params: 452 (1.77 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Observation: [[0 0]]
Predicted Q-values: [0. 0. 0. 0.]


## Observation Type - Vector (2.2), Action Type - Discrete

The part of the assignment requires you to create a sequential dense neural network with 1 hidden layer having 64 neurons and the output layer having 4 neurons. The input to the neural network is a vector of length 36 (refer to environment observations type 2.2). The output of the neural network is an array represeting the Q-values from which you will choose an action (refer to environment actions type 1).

**HINT:** Use the integer observation and convert it to a one-hot encoded vector.

The following figure shows the network structure you will have to use:

<img src="./images/neural_network_structures/neural_network_36_64_4.png">

In [7]:
"""TO DO: Create a neural network, pass it the observation from the environment
and get the predicted Q-values for the four actions. Print the observation and the Q-values."""

environment = WumpusWorldEnvironment(observation_type='integer', action_type='discrete')
observation, info = environment.reset()
one_hot_observation = to_categorical(observation, num_classes=36)

# BEGIN_YOUR_CODE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Initialize the Sequential model
model = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(36,)),
    layers.Dense(4, activation='linear')
])
model.compile(optimizer='adam', loss='mse')
model.summary()


observation = np.array([one_hot_observation])
predicted_q_values = model.predict(observation)

print("Observation:", observation)
print("Predicted Q-values:", predicted_q_values.flatten())

# END_YOUR_CODE

NameError: name 'to_categorical' is not defined

## Observation Type - Image, Action Type - Discrete

The part of the assignment requires you to create a convolutional neural network with one convolutional layer having 128 filters of size 3 x 3, one hidden layer having 64 neurons, and the output layer having 4 neurons. The input to the neural network is an image of size 84 * 84 (refer to environment observations type 3). The output of the neural network is an array represeting the Q-values from which you will choose an action (refer to environment actions type 1).

The following figure shows the network structure you will have to use:

<img src="./images/neural_network_structures/convolutional_neural_network_84x84_128_64_4.png">

In [None]:
"""TO DO: Create a neural network, pass it the observation from the environment
and get the predicted Q-values for the four actions. Print the observation and the Q-values."""

environment = WumpusWorldEnvironment(observation_type='image', action_type='discrete')
observation, info = environment.reset()

# BEGIN_YOUR_CODE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense

model = tf.keras.Sequential([
    layers.Conv2D(128, (3, 3), activation='relu', input_shape=(84, 84, 1)),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(4, activation='linear')
])


model.compile(optimizer='adam', loss='mse')
model.summary()

observation = np.array([observation])
predicted_q_values = model.predict(observation)

print("Observation:", observation)
print("Predicted Q-values:", predicted_q_values.flatten())

# END_YOUR_CODE

## Observation Type - Float, Action Type - Discrete

The part of the assignment requires you to create a sequential dense neural network with 1 hidden layer having 256 neurons and the output layer having 4 neurons. The input to the neural network is a float (refer to environment observations type 4). The output of the neural network is an array represeting the Q-values from which you will choose an action (refer to environment actions type 1).

The following figure shows the network structure you will have to use:

<img src="./images/neural_network_structures/neural_network_1_64_4.png">

In [None]:
"""TO DO: Create a neural network, pass it the observation from the environment
and get the predicted Q-values for the four actions. Print the observation and the Q-values."""

environment = WumpusWorldEnvironment(observation_type='float', action_type='discrete')
observation, info = environment.reset()

# BEGIN_YOUR_CODE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Initialize the Sequential model
model = tf.keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(1,)),
    layers.Dense(4, activation='linear')
])

model.compile(optimizer='adam', loss='mse')
model.summary()

observation = np.array([observation])
predicted_q_values = model.predict(observation)

print("Observation:", observation)
print("Predicted Q-values:", predicted_q_values.flatten())

# END_YOUR_CODE

## Observation Type - Vector (2.2), Action Type - Multi-Discrete

The part of the assignment requires you to create a sequential dense neural network with 1 hidden layer having 64 neurons and the output layer having 4 neurons. The input to the neural network is a vector of length 36 (refer to environment observations type 2.2). The output of the neural network is an array representing the probability of choosing the actions. (If the value of the array element is >=0.5 you will perform the action.) (refer to environment actions type 2).

**HINT:** Use the integer observation and convert it to a one-hot encoded vector.

The following figure shows the network structure you will have to use:

<img src="./images/neural_network_structures/neural_network_36_64_4_sigmoid.png">

In [None]:
"""TO DO: Create a neural network, pass it the observation from the environment
and get the predicted action probabilities for the four actions. Print the observation and the action probabilities."""

environment = WumpusWorldEnvironment(observation_type='integer', action_type='multi_discrete')
observation, info = environment.reset()

# BEGIN_YOUR_CODE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

one_hot_input = to_categorical(observation, num_classes=36)

model = Sequential([
    Dense(64, activation='relu', input_shape=(1,)),
    Dense(4, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy')
model.summary()

observation = np.array([observation])
predicted_q_values = model.predict(observation)

print("Observation:", observation)
print("Predicted Q-values:", predicted_q_values.flatten())

# END_YOUR_CODE

## Observation Type - Vector (2.2), Action Type - Continuous

The part of the assignment requires you to create a sequential dense neural network with 1 hidden layer having 64 neurons and the output layer having 1 neuron. The input to the neural network is a vector of length 36 (refer to environment observations type 2.2). The output of the neural network is an float in the range [-1, 1] determining the action which will be taken. (refer to environment actions type 3).

**HINT:** Use the integer observation and convert it to a one-hot encoded vector and use the TanH activation function to get the output in the range [-1, 1].

The following figure shows the network structure you will have to use:

<img src="./images/neural_network_structures/neural_network_36_64_1.png">

In [None]:
"""TO DO: Create a neural network, pass it the observation from the environment
and get the predicted action. Print the observation and the predicted action."""

environment = WumpusWorldEnvironment(observation_type='integer', action_type='multi_discrete')
observation, info = environment.reset()

# BEGIN_YOUR_CODE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

one_hot_observation = to_categorical(observation, num_classes=36)

model = Sequential([
    Dense(64, activation='relu', input_shape=(36,)),
    Dense(1, activation='tanh')
])

model.compile(optimizer='adam', loss='mse')
model.summary()


observation = np.array([one_hot_observation])
predicted_q_values = model.predict(observation)

print("Observation:", observation)
print("Predicted Q-values:", predicted_q_values.flatten())
# END_YOUR_CODE