In [1]:
import numpy as np

In [2]:
class GridWorldEnvironment:
    """Deterministic 3x3 grid world with one goal cell and one pit cell.

    States are numbered 0-8 in row-major order::

        0 1 2
        3 4 5
        6 7 8

    Every episode starts in state 0. Moving into state 8 (the goal) yields a
    reward of +10 and moving into state 5 (the pit) yields -5; both end the
    episode. All other moves yield 0. A move that would leave the grid keeps
    the agent in its current cell.
    """

    # Action names, listed in the order of their integer indices.
    ACTIONS = ('up', 'down', 'left', 'right')

    # state -> action name -> next state. Built once at class level instead of
    # on every step() call; treat it as read-only.
    TRANSITIONS = {
        0: {'up': 0, 'down': 3, 'left': 0, 'right': 1},
        1: {'up': 1, 'down': 4, 'left': 0, 'right': 2},
        2: {'up': 2, 'down': 5, 'left': 1, 'right': 2},
        3: {'up': 0, 'down': 6, 'left': 3, 'right': 4},
        4: {'up': 1, 'down': 7, 'left': 3, 'right': 5},
        5: {'up': 2, 'down': 8, 'left': 4, 'right': 5},
        6: {'up': 3, 'down': 6, 'left': 6, 'right': 7},
        7: {'up': 4, 'down': 7, 'left': 6, 'right': 8},
        8: {'up': 5, 'down': 8, 'left': 7, 'right': 8}
    }

    def __init__(self):
        """Create the environment positioned at the start state (0)."""
        self.num_states = len(self.TRANSITIONS)
        self.num_actions = len(self.ACTIONS)
        self.current_state = 0  # Initial state
        self.goal_state = 8
        self.pit_state = 5
        self.done = False

    def step(self, action):
        """Apply one action and advance the environment.

        Args:
            action: An action name ('up', 'down', 'left', 'right') or its
                integer index in ``ACTIONS`` (0-3).

        Returns:
            A ``(next_state, reward, done, info)`` tuple; ``info`` is always
            an empty dict.

        Raises:
            ValueError: If the episode has already ended.
            KeyError: If ``action`` is not a known name or a valid index.
                The environment state is left untouched in that case.
        """
        if self.done:
            raise ValueError("Episode is already done. Call reset() to start a new episode.")

        # Translate an integer index into the corresponding action name.
        if isinstance(action, (int, np.integer)):
            if not 0 <= action < len(self.ACTIONS):
                raise KeyError(
                    f"Unknown action index {action!r}; expected 0..{len(self.ACTIONS) - 1}."
                )
            action = self.ACTIONS[action]

        moves = self.TRANSITIONS[self.current_state]
        if action not in moves:
            raise KeyError(
                f"Unknown action {action!r}; expected one of {self.ACTIONS} "
                f"or an index in 0..{len(self.ACTIONS) - 1}."
            )

        # Perform the action and record the new position.
        next_state = moves[action]
        self.current_state = next_state

        # Reaching the goal or falling into the pit terminates the episode.
        if next_state == self.goal_state:
            reward = 10
            self.done = True
        elif next_state == self.pit_state:
            reward = -5
            self.done = True
        else:
            reward = 0

        return next_state, reward, self.done, {}

    def reset(self):
        """Start a new episode and return the start state (0)."""
        self.current_state = 0
        self.done = False
        return self.current_state


In [57]:
# Create the environment
env = GridWorldEnvironment()

# Action names and the Q-table column each one maps to
action_index = {'up': 0, 'down': 1, 'left': 2, 'right': 3}
actions = list(action_index)

# Initialize the Q-table with zeros: one row per state, one column per action
q_table = np.zeros((env.num_states, env.num_actions))

# Set hyperparameters
learning_rate = 0.5
discount_factor = 0.9
num_episodes = 1000
epsilon = 1  # exploration probability; 1 means explore only (pure random walk)
seed = 0     # fixed seed so a re-run reproduces the same training trajectory
rng = np.random.default_rng(seed)

# Q-learning algorithm
for episode in range(num_episodes):
    state = env.reset()  # Reset the environment for a new episode
    done = False

    while not done:
        # Choose an action with an epsilon-greedy policy
        if rng.random() < epsilon:
            action = rng.choice(actions)
        else:
            # Break ties randomly among the actions sharing the maximum Q-value
            q_row = q_table[state]
            best_columns = np.flatnonzero(q_row == q_row.max())
            action = actions[rng.choice(best_columns)]

        # Take the chosen action and observe the new state and reward
        next_state, reward, done, _ = env.step(action)

        # Q-learning update: move Q(s, a) toward the bootstrapped target.
        # Rows of terminal states are never updated, so their max stays 0 and
        # the target for a terminating move reduces to the reward itself.
        column = action_index[action]
        td_target = reward + discount_factor * np.max(q_table[next_state])
        q_table[state, column] += learning_rate * (td_target - q_table[state, column])

        # Move to the next state
        state = next_state
print(q_table)

[[ 6.561  7.29   6.561  7.29 ]
 [ 7.29   8.1    6.561  6.561]
 [ 6.561 -5.     7.29   6.561]
 [ 6.561  8.1    7.29   8.1  ]
 [ 7.29   9.     7.29  -5.   ]
 [ 0.     0.     0.     0.   ]
 [ 7.29   8.1    8.1    9.   ]
 [ 8.1    9.     8.1   10.   ]
 [ 0.     0.     0.     0.   ]]


In [58]:
# Follow the trained Q-table greedily from the start state, breaking ties
# between equally valued actions at random, and record the visited states.
action_names = ['up', 'down', 'left', 'right']
max_steps = 100
state = env.reset()  # Reset the environment
path = [state]
while True:
    # Stop once the episode has terminated or the step budget is used up
    if env.done or len(path) >= max_steps:
        break
    state = int(state)
    best_actions = np.where(q_table[state, :] == np.max(q_table[state, :]))[0]
    action = action_names[np.random.choice(best_actions)]
    next_state, _, _, _ = env.step(action)
    path.append(next_state)
    state = next_state

print("Optimal Path:", path)


Optimal Path: [0, 3, 4, 7, 8]


In [59]:
# Run the greedy policy multiple times and store the unique paths it produces
def greedy_path(env, q_table, max_steps=100):
    """Roll out the greedy policy of ``q_table`` for one episode.

    The environment is reset first. At every step the action with the highest
    Q-value for the current state is taken, with ties broken uniformly at
    random, until the episode terminates or the path holds ``max_steps``
    states.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and ``done``.
        q_table: Array indexed as ``q_table[state, action_column]`` with the
            columns ordered up, down, left, right.
        max_steps: Upper bound on the number of states in the returned path.

    Returns:
        The list of visited states, starting with the reset state.
    """
    action_names = ['up', 'down', 'left', 'right']
    state = env.reset()
    path = [state]
    while not env.done and len(path) < max_steps:
        q_row = q_table[int(state), :]
        best_actions = np.where(q_row == np.max(q_row))[0]
        action = action_names[np.random.choice(best_actions)]
        state, _reward, _done, _info = env.step(action)
        path.append(state)
    return path


num_runs = 100
max_steps = 100

# Paths are stored as tuples so the set can deduplicate them
unique_paths = {tuple(greedy_path(env, q_table, max_steps)) for _ in range(num_runs)}

# Print the number of unique paths explored
print("Number of Unique Paths Explored:", len(unique_paths))

# List the unique paths in sorted order so the report is stable across runs
for idx, unique_path in enumerate(sorted(unique_paths), start=1):
    print(f"Unique Path {idx}: {unique_path}")


Number of Unique Paths Explored: 3
Unique Path 1: (0, 1, 4, 7, 8)
Unique Path 2: (0, 3, 6, 7, 8)
Unique Path 3: (0, 3, 4, 7, 8)
