In [1]:
# Import necessary libraries
import gym
import random
import numpy as np
from gym import Env
from gym.spaces import Discrete, Box 
from helper import transition_rules, reward_rules

In [2]:

# Custom grid environment built on top of the OpenAI Gym Env base class
class MouseGrid(Env):
  """Grid world in which a mouse moves from a start state to a goal state.

  State transitions and rewards are delegated to the imported helpers
  ``transition_rules(state, action)`` and ``reward_rules(state, prev_state)``.
  An episode begins in ``START_STATE`` and ends once ``GOAL_STATE`` is reached.

  Note that ``step`` returns a 3-tuple ``(state, reward, done)`` rather than
  Gym's usual 4-tuple, and that ``reward`` is the running total accumulated
  since the last reset, not the reward of the individual step.
  """

  # State the mouse occupies at the beginning of every episode
  START_STATE = 1

  # State that terminates the episode when reached
  GOAL_STATE = 16

  def __init__(self):
    """Set up the spaces and place the mouse on the start state."""

    # 16 discrete observations, one per grid state.
    # NOTE(review): Discrete(16) spans 0..15 while the states used here run
    # from 1 up to 16 - confirm whether an offset is intended.
    self.observation_space = Discrete(16)

    # 4 discrete actions, i.e. up, left, right and down
    self.action_space = Discrete(4)

    # No action has been taken yet; defined here so the attribute always exists
    self.action = None

    # The mouse begins on the start state with no reward collected
    self.state = self.START_STATE
    self.reward = 0

  def step(self, action):
    """Apply ``action`` and return the updated environment status.

    Args:
      action: The action to apply; forwarded unchanged to ``transition_rules``.

    Returns:
      A tuple ``(state, reward, done)`` where ``state`` is the new state,
      ``reward`` is the cumulative reward of the current episode and ``done``
      is True once the goal state has been reached.
    """

    # Remember the action that was taken
    self.action = action

    # Keep the state we are leaving; the reward depends on both states
    prev_state = self.state

    # Move to the next state according to the transition rules
    self.state = transition_rules(self.state, self.action)

    # Add the reward earned by this transition to the running total
    self.reward = self.reward + reward_rules(self.state, prev_state)

    # Reaching the goal ends the episode. As in the original design, the
    # reward for the final transition is applied a second time at the goal.
    if self.state == self.GOAL_STATE:
      self.reward = self.reward + reward_rules(self.state, prev_state)
      done = True

    # Any other state means the episode continues
    else:
      done = False

    return self.state, self.reward, done


  def reset(self):
    """Start a new episode and return the initial state.

    Returns:
      The start state, following the classic Gym convention that ``reset``
      hands back the first observation of the new episode.
    """

    # Put the mouse back on the start state
    self.state = self.START_STATE

    # Clear the reward accumulated during the previous episode
    self.reward = 0

    return self.state


## SET THE AGENT TO TEST THE ENVIRONMENT

In [3]:
# Instantiate the custom grid environment
env = MouseGrid()

# Number of episodes to run (any integer between 10 and 50)
episodes = 10

# Run one complete episode per iteration
for i in range(episodes):

  # A new episode always starts out unfinished
  done = False

  # Keep stepping until the environment reports that the episode is over
  while not done:

    # Pick a random action from the environment's action space
    action = env.action_space.sample()

    # Advance the environment; `reward` is the running total for the episode
    state, reward, done = env.step(action)

  # Restore the environment to its starting configuration for the next episode
  env.reset()

  # Report the total reward collected in the episode that just ended
  print("The reward of this episode is:",reward)

The reward of this episode is: -142
The reward of this episode is: -14
The reward of this episode is: -300
The reward of this episode is: -61
The reward of this episode is: -876
The reward of this episode is: 205
The reward of this episode is: -7
The reward of this episode is: 109
The reward of this episode is: 48
The reward of this episode is: -37


### ⏸ Here we are using random sampling to pick an action for a given state. However, if you had a policy, which part of the exercise would you change to incorporate it?

#### A. The step() method of the MouseGrid class.
#### B. self.action within the __init__() method of the MouseGrid class.
#### C. Getting an action for each step within the for loop over all episodes.
#### D. Call to the step() method in the last cell. 


In [4]:
### edTest(test_chow1) ###
# Submit an answer choice as a string below (e.g., if you choose option A, put 'A')
answer1 = 'C'


### ⏸ Which of the following would be an issue if the reset() method is not called at the end of each episode?

#### A. The next episode will not run as the state is 16.
#### B. The action sampled next will be biased by the previous value.
#### C. The reward of the new episode will be combined with the reward of the previous episode.
#### D. The reset() method does not affect anything.

In [5]:
### edTest(test_chow2) ###
# Submit an answer choice as a string below.
# There can be multiple correct answers; separate the options with a hyphen.
# For example, if you think the correct choices are A and D, then type 'A-D'.
answer2 = 'A-C'