In [1]:
import numpy as np

# Define the maze layout
maze = np.array([
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 0, 1, 0, 0, 0, 1, 0, 0, 1],
    [1, 0, 1, 0, 1, 0, 1, 0, 1, 1],
    [1, 0, 0, 0, 1, 0, 0, 0, 0, 1],
    [1, 1, 1, 0, 1, 1, 1, 1, 0, 1],
    [1, 0, 1, 'G', 0, 0, 0, 1, 0, 1],
    [1, 0, 0, 0, 1, 1, 0, 0, 0, 1],
    [1, 1, 1, 0, 1, 0, 1, 1, 0, 1],
    [1, 0, 0, 0, 0, 0, 1, 'E', 0, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
])

# Define starting position and goals
start = (1, 1)  # Coordinates of 'S'
goal_sub = (5, 3)  # Coordinates of 'G'
goal_end = (8, 7)  # Coordinates of 'E'


In [2]:
import random

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate
num_episodes = 1000

# Initialize Q-table
q_table = np.zeros((10, 10, 4))  # 4 actions: up, down, left, right

# Action mapping
actions = {
    0: (-1, 0),  # Up
    1: (1, 0),   # Down
    2: (0, -1),  # Left
    3: (0, 1)    # Right
}

def is_valid_move(maze, position):
    x, y = position
    return 0 <= x < maze.shape[0] and 0 <= y < maze.shape[1] and maze[x, y] != 1

def get_reward(position):
    if position == goal_sub:
        return 10  # Reward for reaching the sub-goal
    elif position == goal_end:
        return 100  # Reward for reaching the end goal
    return -1  # Penalty for each step

# Training the Q-learning agent
for episode in range(num_episodes):
    state = start
    while True:
        # Epsilon-greedy action selection
        if random.uniform(0, 1) < epsilon:
            action = random.choice(range(4))  # Explore
        else:
            action = np.argmax(q_table[state[0], state[1]])  # Exploit

        # Take action and observe new state and reward
        next_state = (state[0] + actions[action][0], state[1] + actions[action][1])
        
        if is_valid_move(maze, next_state):
            reward = get_reward(next_state)
            # Update Q-value
            q_table[state[0], state[1], action] += alpha * (reward + gamma * np.max(q_table[next_state[0], next_state[1]]) - q_table[state[0], state[1], action])
            state = next_state
        else:
            reward = -1  # Penalty for invalid move


KeyboardInterrupt: 

In [None]:
steps_to_goal_sub = []
steps_to_goal_end = []

for episode in range(num_episodes):
    state = start
    steps = 0
    reached_sub_goal = False
    
    while True:
        action = np.argmax(q_table[state[0], state[1]])
        next_state = (state[0] + actions[action][0], state[1] + actions[action][1])
        
        if is_valid_move(maze, next_state):
            steps += 1
            if not reached_sub_goal and next_state == goal_sub:
                reached_sub_goal = True
            if reached_sub_goal and next_state == goal_end:
                steps_to_goal_end.append(steps)
                break
            state = next_state
        else:
            steps += 1  # Count penalties for invalid moves


In [5]:
import numpy as np
import random
import matplotlib.pyplot as plt
import time  # Import time library

# Define the maze layout
maze = np.array([
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 0, 1, 0, 0, 0, 1, 0, 0, 1],
    [1, 0, 1, 0, 1, 0, 1, 0, 1, 1],
    [1, 0, 0, 0, 1, 0, 0, 0, 0, 1],
    [1, 1, 1, 0, 1, 1, 1, 1, 0, 1],
    [1, 0, 1, 'G', 0, 0, 0, 1, 0, 1],
    [1, 0, 0, 0, 1, 1, 0, 0, 0, 1],
    [1, 1, 1, 0, 1, 0, 1, 1, 0, 1],
    [1, 0, 0, 0, 0, 0, 1, 'E', 0, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
])

# Define starting position and goals
start = (1, 1)  # Coordinates of 'S'
goal_sub = (5, 3)  # Coordinates of 'G'
goal_end = (8, 7)  # Coordinates of 'E'

# Hyperparameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate
num_episodes = 1000
time_limit = 5  # Time limit in seconds for each episode

# Initialize Q-table
q_table = np.zeros((10, 10, 4))  # 4 actions: up, down, left, right

# Action mapping
actions = {
    0: (-1, 0),  # Up
    1: (1, 0),   # Down
    2: (0, -1),  # Left
    3: (0, 1)    # Right
}

def is_valid_move(maze, position):
    x, y = position
    return 0 <= x < maze.shape[0] and 0 <= y < maze.shape[1] and maze[x, y] != 1

def get_reward(position):
    if position == goal_sub:
        return 10  # Reward for reaching the sub-goal
    elif position == goal_end:
        return 100  # Reward for reaching the end goal
    return -1  # Penalty for each step

# Training the Q-learning agent
for episode in range(num_episodes):
    state = start
    steps = 0  # Track steps taken in this episode
    reached_sub_goal = False
    start_time = time.time()  # Start timing the episode

    while True:
        # Epsilon-greedy action selection
        if random.uniform(0, 1) < epsilon:
            action = random.choice(range(4))  # Explore
        else:
            action = np.argmax(q_table[state[0], state[1]])  # Exploit

        # Take action and observe new state and reward
        next_state = (state[0] + actions[action][0], state[1] + actions[action][1])
        
        if is_valid_move(maze, next_state):
            reward = get_reward(next_state)
            # Update Q-value
            q_table[state[0], state[1], action] += alpha * (reward + gamma * np.max(q_table[next_state[0], next_state[1]]) - q_table[state[0], state[1], action])
            state = next_state
            
            steps += 1  # Count steps
            if next_state == goal_sub:
                reached_sub_goal = True
            if next_state == goal_end:
                print(f"Episode {episode + 1} completed in {steps} steps.")
                break
        else:
            reward = -1  # Penalty for invalid move
            steps += 1  # Count penalty for invalid moves

        # Check for time limit
        if time.time() - start_time > time_limit:
            print(f"Episode {episode + 1} reached time limit.")
            break

# Function to visualize the maze and agent's path
def visualize_maze(maze, path=None):
    plt.figure(figsize=(6, 6))
    plt.imshow(maze, cmap='gray_r')
    if path is not None:
        path_x, path_y = zip(*path)
        plt.plot(path_y, path_x, color='yellow', linewidth=3)  # Path taken by the agent
    plt.scatter(start[1], start[0], c='blue', label='Start (S)', s=100)
    plt.scatter(goal_sub[1], goal_sub[0], c='green', label='Sub-goal (G)', s=100)
    plt.scatter(goal_end[1], goal_end[0], c='red', label='End goal (E)', s=100)
    plt.xticks(np.arange(10))
    plt.yticks(np.arange(10))
    plt.gca().invert_yaxis()  # Invert y axis to match maze layout
    plt.grid()
    plt.legend()
    plt.title('Maze Navigation with Q-learning')
    plt.show()

# Find the path taken to the end goal
def find_path(q_table, start):
    state = start
    path = [state]
    while state != goal_end:
        action = np.argmax(q_table[state[0], state[1]])
        next_state = (state[0] + actions[action][0], state[1] + actions[action][1])
        if is_valid_move(maze, next_state):
            path.append(next_state)
            state = next_state
        else:
            break  # Break if no valid moves (though this shouldn't happen)
    return path

# Find the path taken to the end goal
final_path = find_path(q_table, start)

# Visualize the maze with the agent's final path
visualize_maze(maze, path=final_path)

# Visualizing Q-values
def visualize_q_values(q_table):
    plt.figure(figsize=(12, 8))
    for i in range(10):
        for j in range(10):
            if maze[i, j] == 1:  # Wall positions
                continue
            plt.subplot(10, 10, i * 10 + j + 1)
            plt.bar(range(4), q_table[i, j])
            plt.ylim(0, np.max(q_table))
            plt.title(f'({i},{j})')
            plt.xticks(range(4), ['Up', 'Down', 'Left', 'Right'])
            plt.grid()
    plt.tight_layout()
    plt.suptitle('Q-values for Each State (Action Value Estimates)', y=1.05)
    plt.show()

# Visualize the Q-values
visualize_q_values(q_table)




Episode 1 reached time limit.
Episode 2 reached time limit.
Episode 3 reached time limit.
