In [2]:
import numpy as np
import random

class GridWorld:
    """Deterministic rectangular grid environment with blocked cells.

    States are ``(x, y)`` tuples with ``0 <= x < width`` and
    ``0 <= y < height``.  An action is an index into ``self.actions``; each
    entry is a ``(dx, dy)`` offset added to the current state.  A move that
    would leave the grid or land on an obstacle leaves the agent where it is.
    Reaching ``goal`` yields reward 1 and ends the episode; every other step
    yields reward 0.
    """

    def __init__(self, width, height, start, goal, obstacles):
        """Store the grid layout and place the agent on ``start``.

        Args:
            width: Number of valid values for the first state coordinate.
            height: Number of valid values for the second state coordinate.
            start: ``(x, y)`` state the agent occupies after ``reset()``.
            goal: ``(x, y)`` state that terminates an episode.
            obstacles: Iterable of ``(x, y)`` cells the agent cannot enter.
        """
        self.width = width
        self.height = height
        self.start = start
        self.goal = goal
        self.obstacles = obstacles
        # Snapshot of the obstacles as hashable tuples: O(1) membership tests
        # in step(), and cells supplied as lists (e.g. [[1, 1]]) still block.
        # NOTE: mutating ``self.obstacles`` afterwards does not update this.
        self._blocked = frozenset(tuple(cell) for cell in obstacles)
        # (dx, dy) offsets.  Labelled right, left, down, up, i.e. reading the
        # first state coordinate as the row and the second as the column.
        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]
        self.state = start

    def reset(self):
        """Move the agent back to ``start`` and return that state."""
        self.state = self.start
        return self.state

    def step(self, action):
        """Apply the action with index ``action`` and report the outcome.

        Args:
            action: Index into ``self.actions``.

        Returns:
            A ``(state, reward, done)`` tuple: the state after the move
            (unchanged if the move was blocked), ``1`` if that state is the
            goal else ``0``, and whether that state is the goal.

        Raises:
            IndexError: If ``action`` is outside the range of
                ``self.actions``.
        """
        x, y = self.state
        dx, dy = self.actions[action]
        new_x, new_y = x + dx, y + dy
        in_bounds = 0 <= new_x < self.width and 0 <= new_y < self.height
        if in_bounds and (new_x, new_y) not in self._blocked:
            self.state = (new_x, new_y)
        done = self.state == self.goal
        reward = 1 if done else 0
        return self.state, reward, done

    def get_state(self):
        """Return the agent's current ``(x, y)`` state."""
        return self.state

def epsilon_greedy(Q, state, epsilon):
    """Choose an action index for ``state`` using an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action index is returned;
    otherwise the index of the largest entry of ``Q[state]`` is returned
    (the first such index on ties, as ``np.argmax`` does).

    Args:
        Q: Table indexable by ``state`` that yields one value per action.
        state: Key/index identifying the current state in ``Q``.
        epsilon: Exploration probability.

    Returns:
        The chosen action index as a plain ``int``.
    """
    action_values = Q[state]
    if random.random() < epsilon:
        return random.randrange(len(action_values))
    # np.argmax yields a NumPy integer; convert so both branches return the
    # same built-in type.
    return int(np.argmax(action_values))
def q_learning(grid_world, num_episodes=1000, alpha=0.1, gamma=0.9,
               epsilon=0.1, max_steps=None):
    """Learn a tabular action-value function with Q-learning.

    Each episode resets ``grid_world``, then repeatedly picks an
    epsilon-greedy action, steps the environment, and moves
    ``Q[state][action]`` towards ``reward + gamma * max(Q[next_state])``.

    Args:
        grid_world: Environment exposing ``width``, ``height``, ``actions``,
            ``reset()``, ``get_state()`` and ``step(action)`` returning
            ``(next_state, reward, done)``.
        num_episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration probability passed to ``epsilon_greedy``.
        max_steps: Optional cap on steps per episode.  ``None`` (default)
            runs each episode until ``done``, which never terminates if the
            goal cannot be reached.

    Returns:
        NumPy array of shape ``(width, height, len(actions))`` holding the
        learned Q-values.
    """
    Q = np.zeros((grid_world.width, grid_world.height, len(grid_world.actions)))
    for _ in range(num_episodes):
        grid_world.reset()
        state = grid_world.get_state()
        steps = 0
        while max_steps is None or steps < max_steps:
            action = epsilon_greedy(Q, state, epsilon)
            next_state, reward, done = grid_world.step(action)
            # A terminal state has no future return, so do not bootstrap
            # from its Q-values.
            future = 0.0 if done else np.max(Q[next_state])
            td_target = reward + gamma * future
            Q[state][action] += alpha * (td_target - Q[state][action])
            state = next_state
            steps += 1
            if done:
                break
    return Q

def sarsa(grid_world, num_episodes=1000, alpha=0.1, gamma=0.9,
          epsilon=0.1, max_steps=None):
    """Learn a tabular action-value function with SARSA.

    Each episode resets ``grid_world`` and picks an initial epsilon-greedy
    action.  After every environment step the next action is chosen with the
    same policy and ``Q[state][action]`` is moved towards
    ``reward + gamma * Q[next_state][next_action]``.

    Args:
        grid_world: Environment exposing ``width``, ``height``, ``actions``,
            ``reset()``, ``get_state()`` and ``step(action)`` returning
            ``(next_state, reward, done)``.
        num_episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Exploration probability passed to ``epsilon_greedy``.
        max_steps: Optional cap on steps per episode.  ``None`` (default)
            runs each episode until ``done``, which never terminates if the
            goal cannot be reached.

    Returns:
        NumPy array of shape ``(width, height, len(actions))`` holding the
        learned Q-values.
    """
    Q = np.zeros((grid_world.width, grid_world.height, len(grid_world.actions)))
    for _ in range(num_episodes):
        grid_world.reset()
        state = grid_world.get_state()
        action = epsilon_greedy(Q, state, epsilon)
        steps = 0
        while max_steps is None or steps < max_steps:
            next_state, reward, done = grid_world.step(action)
            # Sampled on every step (even the last) so the random stream is
            # consumed the same way regardless of where the episode ends.
            next_action = epsilon_greedy(Q, next_state, epsilon)
            # A terminal state has no future return, so do not bootstrap
            # from its Q-values.
            future = 0.0 if done else Q[next_state][next_action]
            td_target = reward + gamma * future
            Q[state][action] += alpha * (td_target - Q[state][action])
            state = next_state
            action = next_action
            steps += 1
            if done:
                break
    return Q

# Example usage: a 5x5 grid with three obstacles on the diagonal, trained
# once with each algorithm using the default hyperparameters.
width = 5
height = 5
start = (0, 0)
goal = (4, 4)
obstacles = [(1, 1), (2, 2), (3, 3)]
grid_world = GridWorld(width, height, start, goal, obstacles)

# Off-policy run: learn and display the Q-learning table.
q_values = q_learning(grid_world)
print("Q-values for Q-learning:", q_values, sep="\n")

# On-policy run on the same environment object: learn and display the
# SARSA table.
sarsa_values = sarsa(grid_world)
print("Q-values for SARSA:", sarsa_values, sep="\n")

Q-values for Q-learning:
[[[0.4782969  0.42184206 0.27814079 0.38852832]
  [0.531441   0.41053009 0.44627215 0.46034103]
  [0.59049    0.45105452 0.28261363 0.48698545]
  [0.6561     0.50201009 0.62380423 0.5618447 ]
  [0.5632708  0.5660291  0.729      0.64690317]]

 [[0.         0.03208159 0.         0.3950708 ]
  [0.         0.         0.         0.        ]
  [0.54121129 0.         0.12163338 0.        ]
  [0.72876817 0.04772711 0.0753964  0.        ]
  [0.68585552 0.5910182  0.81       0.60694406]]

 [[0.         0.         0.         0.        ]
  [0.         0.         0.         0.        ]
  [0.         0.         0.         0.        ]
  [0.78218593 0.         0.         0.        ]
  [0.77874892 0.61012212 0.9        0.68388267]]

 [[0.         0.         0.         0.        ]
  [0.         0.         0.         0.        ]
  [0.         0.         0.         0.        ]
  [0.         0.         0.         0.        ]
  [0.85760867 0.85753143 1.         0.78106537]]

 [[0.  