In [1]:
import pygame

import random
from collections import deque

pygame 2.6.1 (SDL 2.28.4, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
import numpy as np

# Parameters
grid_size = 5  # Side length of the local window around the player (offsets span 5 cells)
num_enemies = 1  # Number of enemies
visited_states = 2  # Visited status can be 0 or 1
wall_states = 2  # Wall status can be 0 (no wall) or 1 (wall)

# Candidate moves as index deltas. value_iteration() adds them to the indices of
# reward_maze[x][y], and main() hands the chosen pair to move_entity(), which treats it as
# (row, col). With draw_maze() drawing rows vertically, the on-screen directions are:
movements = np.array([
    [0, 1],   # col + 1 -> drawn as a step right
    [1, 0],   # row + 1 -> drawn as a step down
    [-1, 0],  # row - 1 -> drawn as a step up
    [0, -1]   # col - 1 -> drawn as a step left
])

# Dict-backed tables shared with value_iteration(): state-key tuple -> value / best action.
estimated_value_grid = {}
policy = {}

# Update base_shape to include wall_states
# base_shape = [grid_size, grid_size, visited_states, wall_states]
# enemy_shape = [grid_size, grid_size] * num_enemies  # Each enemy has x and y positions
# full_shape = base_shape + enemy_shape

# # Generate estimated_value_grid filled with zeros
# estimated_value_grid = np.zeros(full_shape, dtype=float)

# # Generate random indices into the movements array
# policy_indices = np.random.randint(0, 4, size=full_shape)

# # Create the policy array by indexing into movements
# policy = movements[policy_indices]

# # Now, policy has shape full_shape + (2,), where the last dimension stores (dx, dy)
# print("Policy Shape:", policy.shape)  # For verification

# # Generate reward_states: 40x40 grid of random 1s and 2s
# reward_states = np.random.choice([1, 2], size=(grid_size, grid_size))

# # Output shapes for verification
# print("Estimated Value Grid Shape:", estimated_value_grid.shape)
# print("Reward States Shape:", reward_states.shape)
def optimized_relative_distances(maze_size, player_pos, enemy_positions, grid_size=5):
    """Return the wrap-around offsets of the enemies that sit inside the player's local window.

    Args:
        maze_size: ``(rows, cols)`` of the full maze; offsets wrap modulo these.
        player_pos: ``(x, y)`` position the offsets are measured from.
        enemy_positions: iterable of ``(ex, ey)`` enemy positions.
        grid_size: side length of the local window; offsets are kept when both
            components lie in ``[-grid_size // 2, grid_size // 2]``.

    Returns:
        A list of ``(dx, dy)`` tuples, one per enemy inside the window, in the
        order the enemies were given. Enemies outside the window are omitted.
    """
    px, py = player_pos
    n_rows, n_cols = maze_size
    reach = grid_size // 2

    def wrapped_offset(target, origin, span):
        # Forward distance on the torus; anything beyond the window's reach is
        # re-expressed as a (negative) backward distance.
        forward = (target - origin + span) % span
        return forward - (span if forward > reach else 0)

    nearby = []
    for ex, ey in enemy_positions:
        d_row = wrapped_offset(ex, px, n_rows)
        d_col = wrapped_offset(ey, py, n_cols)
        if -reach <= d_row <= reach and -reach <= d_col <= reach:
            nearby.append((d_row, d_col))

    return nearby

def value_iteration(s, estimated_value_grid, policy, reward_maze, enemies, discount=0.9, threshold=0.2,
                    max_iterations=200, verbose=True):
    """Run in-place value-iteration sweeps over the maze and return the player's next move.

    Every cell ``(x, y)`` is mapped to a state key
    ``(2, 2, food, wall, *relative_enemy_offsets)`` and the tables are updated
    under that key with a one-step greedy backup over ``movements``.

    NOTE(review): the key does not contain the cell's own coordinates, so many
    cells share one table entry. That is why the sweep may never reach
    ``threshold``; ``max_iterations`` bounds the loop instead of spinning forever.

    Args:
        s: ``(x, y)`` current player position.
        estimated_value_grid: dict mapping state key -> value; updated in place.
        policy: dict mapping state key -> best ``(dx, dy)``; updated in place.
        reward_maze: 2-D grid where 1 is a wall and 2 is food (assumed square).
        enemies: list of dicts, each with a ``"pos"`` entry.
        discount: discount factor applied to the successor state's value.
        threshold: sweeps stop once the largest value change is <= this.
        max_iterations: hard cap on the number of sweeps per call.
        verbose: when true, print the sweep number and delta as before.

    Returns:
        The ``(dx, dy)`` stored in ``policy`` for the current state when that
        move does not lead into a wall; otherwise a random non-wall move, or
        ``(0, 0)`` when every neighbour is a wall.
    """
    grid_size = len(reward_maze[0])  # Assuming square grid
    maze_shape = (grid_size, grid_size)
    # Enemy positions do not change during the sweeps, so read them once.
    enemy_positions = [enemy["pos"] for enemy in enemies]
    # Plain-int copies of the module-level move table.
    actions = [(int(dx), int(dy)) for dx, dy in movements]

    def state_key(x, y):
        # Same key layout for the current state and for successor states.
        food = int(reward_maze[x][y] == 2)
        wall = int(reward_maze[x][y] == 1)
        nearby = optimized_relative_distances(maze_shape, (x, y), enemy_positions, grid_size=5)
        return (2, 2, food, wall) + tuple(pos for enemy in nearby for pos in enemy)

    delta = float('inf')
    iteration = 0
    while delta > threshold and iteration < max_iterations:
        delta = 0
        iteration += 1
        if verbose:
            print(f"Iteration {iteration}")

        for x in range(grid_size):
            for y in range(grid_size):
                indices_s = state_key(x, y)
                old_value = estimated_value_grid.get(indices_s, 0)

                # Greedy one-step backup over the four moves.
                max_value = float('-inf')
                best_action = None
                for dx, dy in actions:
                    nextx, nexty = (x + dx) % grid_size, (y + dy) % grid_size
                    indices_s_bar = state_key(nextx, nexty)

                    cell_value = reward_maze[nextx][nexty]
                    if any(pos == (nextx, nexty) for pos in enemy_positions):
                        reward = -200  # Collision with enemy
                    elif cell_value == 1:  # Wall: not skipped, only penalised
                        reward = -60
                    elif cell_value == 2:  # Food
                        reward = 20
                    else:
                        reward = -1  # Empty space

                    V_s_bar = estimated_value_grid.get(indices_s_bar, 0)
                    V_s = reward + discount * V_s_bar

                    if V_s > max_value:
                        max_value = V_s
                        best_action = (dx, dy)

                estimated_value_grid[indices_s] = max_value
                policy[indices_s] = best_action

                delta = max(delta, abs(old_value - max_value))

        if verbose:
            print(f"Delta: {delta}")

    # Choose the move for the actual player position.
    x, y = s
    valid_actions = [
        (dx, dy)
        for dx, dy in actions
        # Wall test uses the maze that was passed in, not an unrelated global array.
        if reward_maze[(x + dx) % grid_size][(y + dy) % grid_size] != 1
    ]

    best_action = policy.get(state_key(x, y))
    if best_action is not None and tuple(best_action) in valid_actions:
        return best_action
    if valid_actions:
        # The shared table entry may point into a wall from this cell; fall back to any legal move.
        return random.choice(valid_actions)
    return (0, 0)  # No valid moves, stay in place


# Initialize all pygame modules once, at cell execution time.
pygame.init()

# Constants
WIDTH, HEIGHT = 400,400  # Window size in pixels
TILE_SIZE = 10  # Side length of one maze tile in pixels
ROWS, COLS = HEIGHT // TILE_SIZE, WIDTH // TILE_SIZE  # Maze dimensions in tiles
FPS = 30  # Frame cap passed to clock.tick() in main()

# Colors (RGB)
BLACK = (0, 0, 0)
WHITE = (255, 255, 255)
BLUE = (0, 0, 255)
RED = (255, 0, 0)
GREEN = (0, 255, 0)
YELLOW = (255, 255, 0)

# Re-declared here with the same value as in the parameter block above.
num_enemies=1

# Key -> (d_row, d_col) step. Only the values are used in this file, as the
# four neighbour offsets for the BFS helpers.
DIRECTIONS = {
    pygame.K_w: (-1, 0),  # Up
    pygame.K_s: (1, 0),   # Down
    pygame.K_a: (0, -1),  # Left
    pygame.K_d: (0, 1)    # Right
}
def evaluate_policy(maze, policy, value_grid, reward_grid, discount=0.9, threshold=0.01):
    """Iteratively evaluate a fixed policy until the values stop changing.

    Each sweep builds a fresh grid from the previous one (synchronous update):
    for every non-wall cell the new value is the reward of the cell the policy
    moves to plus the discounted previous value of that cell.

    Args:
        maze: 2-D grid; cells equal to 1 are walls and are left untouched.
        policy: 2-D grid of ``(d_row, d_col)`` actions, one per cell.
        value_grid: 2-D grid of starting values; not modified.
        reward_grid: 2-D grid of rewards, looked up at the destination cell.
        discount: discount factor for the destination cell's value.
        threshold: stop once the largest change in a sweep is below this.

    Returns:
        The value grid produced by the final sweep.
    """
    values = value_grid
    converged = False

    while not converged:
        largest_change = 0
        refreshed = [line[:] for line in values]  # copy each row so this sweep reads only old values

        for r in range(ROWS):
            for c in range(COLS):
                if maze[r][c] == 1:
                    continue  # walls keep their value

                nr, nc = move_entity((r, c), policy[r][c])
                backed_up = reward_grid[nr][nc] + discount * values[nr][nc]
                refreshed[r][c] = backed_up

                change = abs(values[r][c] - backed_up)
                if change > largest_change:
                    largest_change = change

        values = refreshed
        converged = largest_change < threshold

    return values


def create_maze(rows, cols):
    """Build a random Pac-Man-style maze with open borders and food on every open cell.

    Steps, in order: open interior inside a wall border; random wall segments on
    every fourth row and column; random removal of wall chunks; borders forced
    open; unreachable open cells walled off; remaining open cells filled with food.

    Returns:
        A ``rows`` x ``cols`` list of lists containing 1 (wall) or 2 (food).
    """
    # Wall border around an empty interior.
    grid = [
        [1 if r in (0, rows - 1) or c in (0, cols - 1) else 0 for c in range(cols)]
        for r in range(rows)
    ]

    # Horizontal wall segments on every fourth row (70% per cell).
    for wall_row in range(2, rows - 2, 4):
        for c in range(1, cols - 1):
            if random.random() < 0.7:
                grid[wall_row][c] = 1

    # Vertical wall segments on every fourth column (70% per cell).
    for wall_col in range(2, cols - 2, 4):
        for r in range(1, rows - 1):
            if random.random() < 0.7:
                grid[r][wall_col] = 1

    # Punch random openings into the walls.
    remove_wall_chunks(grid, chunk_size=5, remove_fraction=0.6)

    # Open the outer ring so entities can wrap around.
    for c in range(cols):
        grid[0][c] = 0
        grid[rows - 1][c] = 0
    for r in range(rows):
        grid[r][0] = 0
        grid[r][cols - 1] = 0

    # Wall off anything that cannot be reached.
    ensure_accessible(grid)

    # Every remaining open cell carries food.
    return [[2 if cell == 0 else cell for cell in line] for line in grid]

def partition_maze_optimized(maze, s):
    """Return the 5x5 window of ``maze`` centred on ``s``, wrapping at the edges.

    Args:
        maze: 2-D grid (list of rows).
        s: ``(x, y)`` centre of the window, ``x`` indexing rows.

    Returns:
        A numpy array of shape (5, 5) whose centre element is ``maze[x][y]``.
    """
    center_row, center_col = s
    n_rows = len(maze)
    n_cols = len(maze[0])
    radius = 5 // 2
    offsets = range(-radius, radius + 1)

    window = []
    for d_row in offsets:
        source_row = maze[(center_row + d_row) % n_rows]
        window.append([source_row[(center_col + d_col) % n_cols] for d_col in offsets])
    return np.array(window)

def remove_wall_chunks(maze, chunk_size, remove_fraction):
    """
    Clear a random selection of square chunks of the maze (set their cells to 0).
    :param maze: The maze grid; modified in place.
    :param chunk_size: Side length of each chunk (e.g., 5 for 5x5).
    :param remove_fraction: Fraction of chunks to clear (0 to 1).
    """
    total_rows = len(maze)
    total_cols = len(maze[0])

    # Top-left corner of every whole chunk, in row-major order.
    origins = []
    for block_r in range(total_rows // chunk_size):
        for block_c in range(total_cols // chunk_size):
            origins.append((block_r * chunk_size, block_c * chunk_size))

    how_many = int(len(origins) * remove_fraction)

    for top, left in random.sample(origins, how_many):
        bottom = min(top + chunk_size, total_rows)
        right = min(left + chunk_size, total_cols)
        for r in range(top, bottom):
            for c in range(left, right):
                maze[r][c] = 0  # open this cell



def ensure_accessible(maze):
    """Wall off every open cell that cannot be reached from cell (1, 1).

    A breadth-first search starts at (1, 1) and steps to the four neighbours
    with wrap-around at the edges (the same movement rule as ``move_entity``),
    visiting only cells equal to 0. Any cell equal to 0 that the search never
    reaches is turned into a wall (1). The maze is modified in place.

    The grid's own dimensions are used, so this works for any maze size rather
    than only for the module-level ROWS x COLS.

    Args:
        maze: 2-D list of lists; 0 is open space, 1 is a wall.
    """
    rows = len(maze)
    cols = len(maze[0]) if rows else 0
    if rows == 0 or cols == 0:
        return  # nothing to check

    steps = ((-1, 0), (1, 0), (0, -1), (0, 1))
    start = (1 % rows, 1 % cols)  # (1, 1) whenever the maze is at least 2x2
    visited = {start}
    queue = deque([start])

    while queue:
        row, col = queue.popleft()
        for d_row, d_col in steps:
            neighbor = ((row + d_row) % rows, (col + d_col) % cols)
            if neighbor not in visited and maze[neighbor[0]][neighbor[1]] == 0:
                visited.add(neighbor)
                queue.append(neighbor)

    # Any open cell that was never visited is unreachable: make it a wall.
    for row in range(rows):
        for col in range(cols):
            if maze[row][col] == 0 and (row, col) not in visited:
                maze[row][col] = 1


def draw_maze(maze, player_pos, enemies):
    """Draw the maze, player, food, and enemies onto the display surface.

    The existing display surface is reused; a window is only (re)created when
    none exists yet or its size differs from ``(WIDTH, HEIGHT)``. Calling
    ``set_mode`` on every frame would re-create the window each time.

    Args:
        maze: 2-D grid; 1 is drawn as a blue wall tile, 2 as a yellow food dot.
        player_pos: ``(row, col)`` of the player, drawn as a green tile.
        enemies: list of dicts whose ``"pos"`` is ``(row, col)``, drawn as red tiles.
    """
    screen = pygame.display.get_surface()
    if screen is None or screen.get_size() != (WIDTH, HEIGHT):
        screen = pygame.display.set_mode((WIDTH, HEIGHT))

    screen.fill(BLACK)
    for row in range(ROWS):
        for col in range(COLS):
            if maze[row][col] == 1:  # Wall
                pygame.draw.rect(screen, BLUE, (col * TILE_SIZE, row * TILE_SIZE, TILE_SIZE, TILE_SIZE))
            elif maze[row][col] == 2:  # Food
                pygame.draw.circle(screen, YELLOW, (col * TILE_SIZE + TILE_SIZE // 2, row * TILE_SIZE + TILE_SIZE // 2), TILE_SIZE // 4)
    # Draw player (screen x comes from the column, screen y from the row)
    pygame.draw.rect(screen, GREEN, (player_pos[1] * TILE_SIZE, player_pos[0] * TILE_SIZE, TILE_SIZE, TILE_SIZE))
    # Draw enemies
    for enemy in enemies:
        pygame.draw.rect(screen, RED, (enemy["pos"][1] * TILE_SIZE, enemy["pos"][0] * TILE_SIZE, TILE_SIZE, TILE_SIZE))


def move_entity(pos, direction):
    """Return ``pos`` shifted by ``direction``, wrapping around the ROWS x COLS grid.

    Both arguments are indexed as ``(row, col)``; the result is a 2-tuple.
    """
    wrapped_row = (pos[0] + direction[0]) % ROWS  # leaving the top/bottom re-enters on the other side
    wrapped_col = (pos[1] + direction[1]) % COLS  # same for left/right
    return wrapped_row, wrapped_col


def is_valid_move(pos, maze):
    """Return True when the maze cell at ``pos`` (row, col) is anything but a wall (1)."""
    row, col = pos[0], pos[1]
    cell = maze[row][col]
    return cell != 1



def find_far_target(maze, start, distance_threshold=15):
    """Pick a random walkable cell at least ``distance_threshold`` steps from ``start``.

    Any cell that is not a wall (value 1) is a candidate, so food cells (2)
    count as well as empty cells (0); this matches ``is_valid_move``. Accepting
    only empty cells would leave no candidates in a fresh maze, where every open
    cell holds food. Distance is plain Manhattan distance (no wrap-around).

    Args:
        maze: 2-D grid; its own dimensions are scanned.
        start: ``(row, col)`` position to measure from.
        distance_threshold: minimum Manhattan distance of a candidate.

    Returns:
        A randomly chosen ``(row, col)`` candidate, or ``start`` when none exists.
    """
    far_targets = [
        (row, col)
        for row, cells in enumerate(maze)
        for col, cell in enumerate(cells)
        if cell != 1 and abs(start[0] - row) + abs(start[1] - col) >= distance_threshold
    ]
    return random.choice(far_targets) if far_targets else start  # stay put when nothing is far enough


def bfs_pathfinding(maze, start, target):
    """Find a shortest path from ``start`` to ``target`` using breadth-first search.

    Neighbours come from ``move_entity`` (so movement wraps around the grid) and
    are accepted when ``is_valid_move`` allows them. Instead of copying the whole
    path for every queued cell, each cell remembers the cell it was reached from
    and the path is rebuilt once at the end.

    Args:
        maze: 2-D grid passed through to ``is_valid_move``.
        start: ``(row, col)`` starting position (hashable).
        target: ``(row, col)`` goal position.

    Returns:
        The list of positions after ``start`` up to and including ``target``;
        an empty list when ``start == target`` or when no path exists.
    """
    came_from = {start: None}  # doubles as the visited set
    queue = deque([start])

    while queue:
        current = queue.popleft()
        if current == target:
            # Walk the predecessor chain back to the start, then flip it.
            path = []
            while came_from[current] is not None:
                path.append(current)
                current = came_from[current]
            path.reverse()
            return path

        for direction in DIRECTIONS.values():
            neighbor = move_entity(current, direction)
            if neighbor not in came_from and is_valid_move(neighbor, maze):
                came_from[neighbor] = current
                queue.append(neighbor)

    return []  # Return empty path if no valid path found

def move_enemies(enemies, maze):
    """Advance every enemy one step along a BFS path toward its far-away target.

    Each enemy dict is updated in place: ``"target"`` is (re)assigned when it is
    missing or already reached, and ``"pos"`` moves to the first cell of the path
    when a path exists.
    """
    for ghost in enemies:
        needs_target = ghost["target"] is None or ghost["pos"] == ghost["target"]
        if needs_target:
            ghost["target"] = find_far_target(maze, ghost["pos"])

        route = bfs_pathfinding(maze, ghost["pos"], ghost["target"])
        if not route:
            continue  # no path (or already there): stay put this frame
        ghost["pos"] = route[0]

def get_reward(state, maze):
    """Return the reward for standing on the maze cell at ``state`` (row, col).

    Food (2) gives 10, a wall (1) gives -1, an ``'E'`` marker gives -100 and
    anything else gives -0.1.
    """
    r, c = state
    if maze[r][c] == 2:  # Food
        return 10
    if maze[r][c] == 1:  # Wall
        return -1
    if maze[r][c] == 'E':
        return -100
    return -0.1
def print_maze_with_entities(maze, player_pos, enemies):
    """
    Print one text frame of the maze with the player and enemies overlaid.
    Player: 'P'
    Enemies: 'E'
    Walls: '#'
    Food: 'o'
    Empty space: '.'
    The maze itself is not modified; enemies are drawn after the player.
    """
    def glyph(cell):
        # Same precedence as the legend above; anything unknown is empty space.
        if cell == 1:
            return "#"
        if cell == 2:
            return "o"
        if cell == 'P':
            return "P"
        if cell == 'E':
            return "E"
        return "."

    # Work on a copy so the overlay never leaks into the real maze.
    canvas = [list(line) for line in maze]

    canvas[player_pos[0]][player_pos[1]] = 'P'

    for foe in enemies:
        foe_row, foe_col = foe["pos"]
        canvas[foe_row][foe_col] = 'E'

    for line in canvas:
        print("".join(glyph(cell) for cell in line))
    print("\n" + "=" * 50 + "\n")  # Separator for frames

def extract_view(maze, player_pos, size=4):
    """Return a ``size`` x ``size`` window of the maze around the player.

    The window covers rows ``player_row - size // 2`` up to (but excluding)
    ``player_row - size // 2 + size`` and the matching column range, so it
    always has exactly ``size`` rows and columns. Cells that fall outside the
    maze are filled with ``'#'`` at the position where they would be, which
    keeps the player at the same place in the view near every edge.

    Args:
        maze: 2-D grid (list of rows); not modified.
        player_pos: ``(row, col)`` of the player.
        size: side length of the returned view.

    Returns:
        A new list of ``size`` lists, each with ``size`` cells.
    """
    view_range = size // 2  # cells shown above / to the left of the player
    player_row, player_col = player_pos

    maze_height = len(maze)
    maze_width = len(maze[0]) if maze_height else 0

    top = player_row - view_range
    left = player_col - view_range

    view = []
    for row in range(top, top + size):
        line = []
        for col in range(left, left + size):
            inside = 0 <= row < maze_height and 0 <= col < maze_width
            line.append(maze[row][col] if inside else '#')  # out-of-maze cells read as walls
        view.append(line)
    return view

def main():
    """Run the game loop: the value-iteration agent plays until the window is closed.

    Each frame, in this order: handle window events, ask value_iteration() for the
    player's move and apply it if the destination is not a wall, move the enemies,
    restart on a collision or when no food is left, dump the maze to the console,
    then draw and cap the frame rate. pygame is shut down when the loop ends.
    """
    clock = pygame.time.Clock()
    running = True

    # Generate the maze once; every restart copies this layout instead of rebuilding it
    static_maze = create_maze(ROWS, COLS)

    # Build a fresh game state from the stored maze
    def restart_game():
        player_pos = (1, 1)
        enemies = [
            {"pos": (ROWS - 12, COLS // 2), "target": None},
        ]
        # Use the pre-generated static maze
        maze = [row[:] for row in static_maze]  # Copy every row so eaten food does not alter static_maze
        return maze, player_pos, enemies, 0  # last item is the starting score

    # Initialize the maze, player, enemies, and score
    maze, player_pos, enemies, score = restart_game()

    # Main game loop
    while running:
        # Handle events (e.g., quitting the game); the rest of the current frame still runs
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                running = False

        # Handle player movement: the action comes from value_iteration(), not the keyboard.
        # estimated_value_grid and policy are the module-level tables, updated in place.
        current_state = player_pos
        action = value_iteration(current_state,estimated_value_grid,policy,maze,enemies,discount=0.9)
        print(action)
        new_player_pos = move_entity(player_pos, action)
        if is_valid_move(new_player_pos, maze):
            if maze[new_player_pos[0]][new_player_pos[1]] == 2:  # Collect food
                score += 1
                maze[new_player_pos[0]][new_player_pos[1]] = 0
            player_pos = new_player_pos


        # Move enemies
        move_enemies(enemies, maze)

        # Check for collisions (only after both sides have moved)
        for enemy in enemies:
            if enemy["pos"] == player_pos:
                print("Game Over! Restarting...")
                maze, player_pos, enemies, score = restart_game()

        # Check if all food is collected
        if all(maze[row][col] != 2 for row in range(ROWS) for col in range(COLS)):
            print("You Win! Restarting...")
            maze, player_pos, enemies, score = restart_game()

        # Print the maze with entities to the console
        print("#############################")
        maze_copy = [[cell for cell in row] for row in maze]
        maze_copy[player_pos[0]][player_pos[1]] = 'P'

        # Place the enemies (drawn over the player marker if they share a cell)
        for enemy in enemies:
            row, col = enemy["pos"]
            maze_copy[row][col] = 'E'

        # Print the whole maze, one row per line, using the raw cell values (0/1/2/'P'/'E')
        for row in maze_copy:
            print("".join(str(cell) for cell in row))

        print("#############################")
        # Draw everything
        draw_maze(maze, player_pos, enemies)
        pygame.display.flip()
        clock.tick(FPS)

    pygame.quit()

# Start the game when this cell/script is executed directly.
if __name__ == "__main__":
    main()


Iteration 1
Delta: 199.99999999999991
Iteration 2
Delta: 80.0
Iteration 3
Delta: 80.0
Iteration 4
Delta: 80.0
Iteration 5
Delta: 80.0
Iteration 6
Delta: 80.0
Iteration 7
Delta: 80.0
Iteration 8
Delta: 80.0
Iteration 9
Delta: 80.0
Iteration 10
Delta: 80.0
Iteration 11
Delta: 80.0
Iteration 12
Delta: 80.0
Iteration 13
Delta: 80.0
Iteration 14
Delta: 80.0
Iteration 15
Delta: 80.0
Iteration 16
Delta: 80.0
Iteration 17
Delta: 80.0
Iteration 18
Delta: 80.0
Iteration 19
Delta: 80.0
Iteration 20
Delta: 80.0
Iteration 21
Delta: 80.0
Iteration 22
Delta: 80.0
Iteration 23
Delta: 80.0
Iteration 24
Delta: 80.0
Iteration 25
Delta: 80.0
Iteration 26
Delta: 80.0
Iteration 27
Delta: 80.0
Iteration 28
Delta: 80.0
Iteration 29
Delta: 80.0
Iteration 30
Delta: 80.0
Iteration 31
Delta: 80.0
Iteration 32
Delta: 80.0
Iteration 33
Delta: 80.0
Iteration 34
Delta: 80.0
Iteration 35
Delta: 80.0
Iteration 36
Delta: 80.0
Iteration 37
Delta: 80.0
Iteration 38
Delta: 80.0
Iteration 39
Delta: 80.0
Iteration 40
Delta: 

KeyboardInterrupt: 

In [None]:
# NOTE(review): dumps the entire container. The saved output below is a numpy array of
# zeros, i.e. it came from the ndarray built in the later setup cell, not from the dict
# created in the first code cell -- the result depends on which cell ran last.
print(estimated_value_grid)

[[[[[[0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]]

    [[0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]]]


   [[[0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]]

    [[0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]]]]



  [[[[0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]]

    [[0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]]]


   [[[0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]]

    [[0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]]]]



  [[[[0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
     [0. 0. 0. 0. 0.]
    

In [None]:
# Scratch values for experimenting with state keys; ROWS/COLS match the 40x40 maze
# implied by the 400px window and 10px tiles defined with the game constants.
ROWS=40
COLS =40
# Two sample enemies in the same dict layout that move_enemies() expects.
enemies = [
            {"pos": (ROWS - 12, COLS // 2), "target": None},
            {"pos": (ROWS - 13, COLS // 2), "target": None},
        ]
s=(12,2)# player position / current state

# With 2 enemies on a 40x40 maze plus a visited / not-visited flag, the full state space is
# 40 x 40 x 2 x (1600 x 1600) = 8.192e9 states.
# Attempt 2: store each enemy's position relative to the player:
# 40 x 40 x 2 = 3200 states x 1600^2 -- fails, it doesn't change the number of states.
# A local 5x5 grid: 25 x 2 (visited) = 50, times 25 x 25 enemy positions, is 31,250 states -- much more manageable.
def get_state_key(x, y, visited, enemy_positions):
    """Build a hashable dictionary key from the player cell, visited flag and enemy positions.

    Each enemy position is converted to a tuple, and the collection of them to a
    tuple as well, so lists of lists can be used as part of a dict key.
    """
    frozen_enemies = tuple(map(tuple, enemy_positions))
    return (x, y, visited, frozen_enemies)
# Fresh, empty lookup tables. Running this cell rebinds the module-level `policy` and
# `estimated_value_grid` that main() passes to value_iteration(), discarding earlier values.
policy = {}
estimated_value_grid = {}
reward_states = {}



In [None]:
import numpy as np

# Parameters
grid_size = 5  # Grid size for player and enemies (positions range from 0 to 4)
num_enemies = 1  # Number of enemies
visited_states = 2  # Visited status can be 0 or 1
wall_states = 2  # Wall status can be 0 (no wall) or 1 (wall)

# Candidate moves as index deltas. move_entity() treats such a pair as (row, col), and
# draw_maze() draws rows vertically, so the on-screen directions are:
movements = np.array([
    [0, 1],   # col + 1 -> drawn as a step right
    [1, 0],   # row + 1 -> drawn as a step down
    [-1, 0],  # row - 1 -> drawn as a step up
    [0, -1]   # col - 1 -> drawn as a step left
])

# Generate walls: 0 for no wall, 1 for wall (a grid_size x grid_size array, i.e. 5x5 here)
walls = np.random.choice([0, 1], size=(grid_size, grid_size), p=[0.8, 0.2])  # 20% chance of wall

# State shape: player x, player y, visited flag, wall flag, then (x, y) per enemy
base_shape = [grid_size, grid_size, visited_states, wall_states]
enemy_shape = [grid_size, grid_size] * num_enemies  # Each enemy has x and y positions
full_shape = base_shape + enemy_shape

# Generate estimated_value_grid filled with zeros
# NOTE(review): this rebinds estimated_value_grid (and policy below) to numpy arrays, while
# value_iteration() calls .get() on them as dicts -- running this cell before main() breaks it.
estimated_value_grid = np.zeros(full_shape, dtype=float)

# Generate random indices into the movements array
policy_indices = np.random.randint(0, 4, size=full_shape)

# Create the policy array by indexing into movements
policy = movements[policy_indices]

# Now, policy has shape full_shape + (2,), where the last dimension stores (dx, dy)
print("Policy Shape:", policy.shape)  # For verification

# Generate reward_states: grid_size x grid_size (5x5) grid of random 1s and 2s
reward_states = np.random.choice([1, 2], size=(grid_size, grid_size))

# Output shapes for verification
print("Estimated Value Grid Shape:", estimated_value_grid.shape)
print("Reward States Shape:", reward_states.shape)

Policy Shape: (5, 5, 2, 2, 5, 5, 2)
Estimated Value Grid Shape: (5, 5, 2, 2, 5, 5)
Reward States Shape: (5, 5)
