# In [1]:
import numpy as np

# Define the grid world environment
class GridWorld:
    """Rectangular grid environment with a single goal cell.

    The current position is kept in ``state`` as a 2-tuple. The first
    coordinate is moved by "up"/"down" and kept within ``[0, width - 1]``;
    the second is moved by "left"/"right" and kept within ``[0, height - 1]``.
    """

    def __init__(self, width, height, start, goal):
        """Store the grid dimensions, start and goal, and place the agent at start."""
        self.width, self.height = width, height
        self.start, self.goal = start, goal
        self.state = start

    def _next_position(self, action):
        """Return the position reached from the current state by *action*.

        Moves that would leave the grid are clamped at the border; an action
        that is not one of the four known names leaves the position unchanged.
        """
        row, col = self.state

        if action == "up":
            return max(0, row - 1), col
        if action == "down":
            return min(self.width - 1, row + 1), col
        if action == "left":
            return row, max(0, col - 1)
        if action == "right":
            return row, min(self.height - 1, col + 1)
        return row, col

    def step(self, action):
        """Apply *action* and return ``(state, reward, done)``.

        The reward is 1.0 and ``done`` is True when the new state equals the
        goal; otherwise the reward is 0.0 and ``done`` is False.
        """
        self.state = self._next_position(action)
        reached_goal = bool(self.state == self.goal)
        return self.state, (1.0 if reached_goal else 0.0), reached_goal

    def reset(self):
        """Move the agent back to the start position and return it."""
        self.state = self.start
        return self.state

# Define a simple Q-learning agent
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values live in ``q_table`` as ``{state: {action: value}}``. A state gets
    a row (0.0 for every action) the first time it is passed to
    ``update_q_table`` as the current state.

    Args:
        actions: Indexable sequence of hashable actions.
        learning_rate: Step size applied to each temporal-difference update.
        discount_factor: Weight given to the best Q-value of the next state.
        exploration_prob: Probability of picking a uniformly random action
            instead of a greedy one.
    """

    def __init__(self, actions, learning_rate=0.1, discount_factor=0.9, exploration_prob=0.2):
        self.actions = actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        self.q_table = {}

    @staticmethod
    def _random_choice(candidates):
        """Return a uniformly random element of the sequence *candidates*.

        Indexing with a random integer hands back the original object, so
        actions keep their own type and may be any hashable value (tuples
        included), rather than being converted to NumPy scalars.
        """
        return candidates[np.random.randint(len(candidates))]

    def select_action(self, state):
        """Pick an action for *state* using the epsilon-greedy rule.

        With probability ``exploration_prob`` a random action is returned.
        Otherwise the action with the highest Q-value is returned; states
        without a Q-table row fall back to a random action.
        """
        if np.random.uniform(0, 1) < self.exploration_prob:
            return self._random_choice(self.actions)

        q_values = self.q_table.get(state)
        if not q_values:
            return self._random_choice(self.actions)

        # Break ties at random. Every row starts at all zeros, so always
        # taking the first maximal action would deterministically favour
        # whichever action happens to be listed first.
        best_value = max(q_values.values())
        best_actions = [a for a, q in q_values.items() if q == best_value]
        return self._random_choice(best_actions)

    def update_q_table(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        ``Q(s, a) += lr * (reward + gamma * max_a' Q(s', a') - Q(s, a))``,
        where the bootstrap term is 0.0 when *next_state* has no row yet.

        Raises:
            KeyError: If *action* is not one of the agent's actions.
        """
        if state not in self.q_table:
            self.q_table[state] = {a: 0.0 for a in self.actions}

        next_q_values = self.q_table.get(next_state)
        max_q_value = max(next_q_values.values()) if next_q_values else 0.0

        row = self.q_table[state]
        td_target = reward + self.discount_factor * max_q_value
        row[action] += self.learning_rate * (td_target - row[action])

# Define the main training loop
def train_agent(agent, environment, num_episodes):
    """Run *num_episodes* episodes, updating the agent after every step.

    Each episode starts from ``environment.reset()`` and continues until
    ``environment.step`` reports that the episode is done. Every transition
    is passed to ``agent.update_q_table``.
    """
    for _ in range(num_episodes):
        current = environment.reset()

        while True:
            chosen = agent.select_action(current)
            following, reward, finished = environment.step(chosen)
            agent.update_q_table(current, chosen, reward, following)
            if finished:
                break
            current = following

# Create a grid world environment
grid_width = 5
grid_height = 5
start_state = (0, 0)
goal_state = (4, 4)
environment = GridWorld(grid_width, grid_height, start_state, goal_state)

# Create a Q-learning agent
actions = ["up", "down", "left", "right"]
agent = QLearningAgent(actions)

# Train the agent
train_agent(agent, environment, num_episodes=1000)

# Test the trained agent.
# Exploration is switched off so the rollout follows the learned Q-values
# rather than random moves, and the rollout is capped so the check below can
# actually report failure instead of looping until the goal is reached.
max_test_steps = 4 * grid_width * grid_height
trained_exploration_prob = agent.exploration_prob
agent.exploration_prob = 0.0

state = environment.reset()
done = False

for _step in range(max_test_steps):
    action = agent.select_action(state)
    next_state, _reward, done = environment.step(action)
    state = next_state
    if done:
        break

# Restore the training-time exploration rate so the agent can keep learning.
agent.exploration_prob = trained_exploration_prob

print(f"Agent reached the goal: {state == goal_state}")

# Output: Agent reached the goal: True
