In [94]:
import numpy as np
import random

# Create the environment
class GridWorld:
    """Rectangular grid environment containing bombs, gold and walls.

    Cells are addressed as ``(x, y)`` with ``0 <= x < width`` and
    ``0 <= y < height``. Row ``y == 0`` is printed first, so "up" means a
    smaller ``y``.
    """

    # Displacement (dx, dy) per action code; the tuple index is the action:
    # 0 = up, 1 = down, 2 = left, 3 = right.
    _ACTION_DELTAS = ((0, -1), (0, 1), (-1, 0), (1, 0))

    def __init__(self, width, height, bomb_positions, gold_positions, wall_positions):
        """Store the grid size and the collections of special cells.

        Args:
            width: number of columns.
            height: number of rows.
            bomb_positions: collection of ``(x, y)`` cells that end an episode with -100.
            gold_positions: collection of ``(x, y)`` cells that end an episode with +100.
            wall_positions: collection of ``(x, y)`` cells that cannot be entered.
        """
        self.width, self.height = width, height
        self.bomb_positions = bomb_positions
        self.gold_positions = gold_positions
        self.wall_positions = wall_positions

    def is_valid(self, x, y):
        """Return True when ``(x, y)`` is inside the grid and is not a wall."""
        if (x, y) in self.wall_positions:
            return False
        inside = 0 <= x < self.width and 0 <= y < self.height
        return bool(inside)

    def step(self, position, action):
        """Apply ``action`` from ``position`` and report the outcome.

        An action code outside 0-3, or a move into a wall or off the grid,
        leaves the agent where it is.

        Returns:
            ``((x, y), reward, done)`` where reward is -100 on a bomb and 100
            on gold (both with ``done=True``), otherwise -5 when the agent
            did not move and -1 for an ordinary move.
        """
        old_x, old_y = position

        # Look the action up by equality so any int-like action code works.
        dx, dy = 0, 0
        for code, delta in enumerate(self._ACTION_DELTAS):
            if action == code:
                dx, dy = delta
                break

        landing = (old_x + dx, old_y + dy)
        if not self.is_valid(*landing):
            landing = (old_x, old_y)

        # Staying put (blocked move or unknown action) costs more than moving.
        base_reward = -5 if position == landing else -1

        # Terminal cells override the movement reward; bombs are checked first.
        if landing in self.bomb_positions:
            return landing, -100, True
        if landing in self.gold_positions:
            return landing, 100, True
        return landing, base_reward, False

    def print_grid(self, position):
        """Print the grid as text, one row per line, followed by a blank line.

        ``A`` marks ``position``, ``B`` a bomb, ``G`` gold, ``|`` a wall and
        ``.`` any other cell. When several apply, the earliest in that order
        is shown.
        """
        def symbol(cell):
            if cell == position:
                return 'A'
            if cell in self.bomb_positions:
                return 'B'
            if cell in self.gold_positions:
                return 'G'
            if cell in self.wall_positions:
                return '|'
            return '.'

        for row_index in range(self.height):
            print(''.join(symbol((col_index, row_index)) for col_index in range(self.width)))
        print()


# Initialize the Q-table
def init_q_table(width, height, num_actions):
    """Return a zero-filled Q-table indexed as ``[x, y, action]``."""
    table_shape = (width, height, num_actions)
    return np.zeros(table_shape)

def update_q_table(q_table, learning_rate, discount_factor, state, action, new_state, reward):
    """Apply one Q-learning update to ``q_table`` in place.

    Moves ``q_table[x, y, action]`` towards
    ``reward + discount_factor * max(q_table[new_x, new_y])`` by a fraction
    ``learning_rate`` of the difference.

    Args:
        q_table: array indexed as ``[x, y, action]``; modified in place.
        learning_rate: step size applied to the difference.
        discount_factor: weight of the best value in the successor cell.
        state: ``(x, y)`` cell the action was taken from.
        action: index of the action taken.
        new_state: ``(x, y)`` cell reached after the action.
        reward: reward received for the transition.
    """
    sx, sy = state
    nx, ny = new_state
    entry = (sx, sy, action)

    old_estimate = q_table[entry]
    td_target = reward + discount_factor * q_table[nx, ny].max()
    q_table[entry] = q_table[entry] + learning_rate * (td_target - old_estimate)

# Q-learning algorithm
def q_learning(env, q_table, num_episodes, learning_rate, discount_factor, exploration_rate, exploration_decay, max_steps_per_episode, start_position):
    """Train ``q_table`` in place with epsilon-greedy tabular Q-learning.

    Prints the grid with the agent at ``start_position`` before training and
    prints the final table after training.

    Args:
        env: environment providing ``print_grid(position)`` and
            ``step(position, action) -> (new_position, reward, done)``.
        q_table: array indexed as ``[x, y, action]``; updated in place.
        num_episodes: number of episodes to run.
        learning_rate: step size passed to ``update_q_table``.
        discount_factor: discount passed to ``update_q_table``.
        exploration_rate: initial probability of taking a random action.
        exploration_decay: factor the exploration rate is multiplied by after
            every episode.
        max_steps_per_episode: step limit for one episode.
        start_position: ``(x, y)`` cell every episode starts from.

    Returns:
        The same ``q_table`` object that was passed in.
    """
    env.print_grid(start_position)

    # Read the action count from the table rather than hard-coding 4, so
    # exploration always samples from exactly the actions the table stores.
    num_actions = q_table.shape[-1]

    for _episode in range(num_episodes):
        x, y = start_position
        for _step in range(max_steps_per_episode):
            # Epsilon-greedy: explore with probability exploration_rate,
            # otherwise exploit the current estimates.
            if np.random.rand() < exploration_rate:
                action = np.random.randint(num_actions)
            else:
                action = np.argmax(q_table[x, y])

            new_position, reward, done = env.step((x, y), action)
            update_q_table(q_table, learning_rate, discount_factor, (x, y), action, new_position, reward)
            x, y = new_position

            if done:
                break

        # Explore less as training progresses.
        exploration_rate *= exploration_decay

    print(q_table)
    return q_table

           
# Test the trained agent
def test_agent(env, q_table, start_position, max_steps):
    x, y = start_position
    path = [start_position]

    for step in range(max_steps):
        action = np.argmax(q_table[x, y])
        new_position, reward, done = env.step((x, y), action)
        new_x, new_y = new_position

        path.append(new_position)
        x, y = new_x, new_y

        if done:
            break

    return path

In [95]:
# Set up the environment and Q-learning parameters
# q_learning draws from NumPy's global RNG; seed it so a fresh run of this
# cell reproduces the same training trajectory and path.
SEED = 42
np.random.seed(SEED)

width, height = 20, 20
bomb_positions = [(1,1),(1,0),(1,2),(1,3),(1,4),(1,5),(10, 10), (12, 12)]
gold_positions = [(19, 19)]
wall_positions = [(11, 11)]

env = GridWorld(width, height, bomb_positions, gold_positions, wall_positions)
q_table = init_q_table(width, height, 4)

num_episodes = 5000
learning_rate = 0.3
discount_factor = 0.99
max_steps_per_episode = 500

# Exploration starts fully random and is multiplied by exploration_decay
# after every episode. (A stale `exploration_rate = 0.1` that was immediately
# overwritten has been removed.)
exploration_rate = 1.0
exploration_decay = 0.9995
start_position = (0, 0)
q_table = q_learning(env, q_table, num_episodes, learning_rate, discount_factor, exploration_rate, exploration_decay, max_steps_per_episode, start_position)

# Test the trained agent
max_test_steps = 1000
path = test_agent(env, q_table, start_position, max_test_steps)

# Print the path
print("Path taken by the agent from the starting position:")
for position in path:
    print(position)

AB..................
.B..................
.B..................
.B..................
.B..................
.B..................
....................
....................
....................
....................
..........B.........
...........|........
............B.......
....................
....................
....................
....................
....................
....................
...................G

[[[  32.510919     37.88981717   32.510919   -100.        ]
  [  36.510919     39.28264361   33.88981717 -100.        ]
  [  37.88981717   40.689539     35.28264361 -100.        ]
  ...
  [  -7.99924968   -7.96848766  -10.8196589    -8.21196062]
  [  -8.06927478   -8.01061475  -10.55239576   -7.87551128]
  [  -8.02823339  -10.49759108  -10.48605541   -8.21647166]]

 [[   0.            0.            0.            0.        ]
  [   0.            0.            0.            0.        ]
  [   0.            0.            0.            0.        ]
  ...
  [  -7.83306494   -7.944