In [1]:
import numpy as np

class GridWorld:
    """Small deterministic grid environment with fixed obstacles and one goal.

    Positions are ``(row, col)`` tuples. The agent always starts at ``(0, 0)``
    and an episode ends when the agent stands on the goal cell.
    """

    def __init__(self, shape):
        """Create a grid of the given ``(rows, cols)`` shape."""
        self.shape = shape
        # Cells the agent can never enter.
        self.obstacles = [(1, 1), (2, 2)]
        # Reaching this cell yields reward 1 and ends the episode.
        self.goal = (2, 3)
        # Current agent position; the start cell is the top-left corner.
        self.agent = (0, 0)

    def reset(self):
        """Put the agent back on the start cell and return that position."""
        self.agent = (0, 0)
        return self.agent

    def step(self, action):
        """Apply ``action`` and return ``(position, reward, done)``.

        ``action`` indexes the move table: 0 = right, 1 = down, 2 = left,
        3 = up. A move that would leave the grid or enter an obstacle is
        ignored and the agent stays where it is. The reward is 1 when the
        agent is on the goal after the move, otherwise 0.
        """
        d_row, d_col = ((0, 1), (1, 0), (0, -1), (-1, 0))[action]
        row = self.agent[0] + d_row
        col = self.agent[1] + d_col
        candidate = (row, col)

        inside_grid = 0 <= row < self.shape[0] and 0 <= col < self.shape[1]
        if inside_grid and candidate not in self.obstacles:
            self.agent = candidate

        reached_goal = self.agent == self.goal
        return self.agent, int(reached_goal), reached_goal

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values live in a dict keyed by ``(state, action)``; pairs that have
    never been updated are treated as having value 0.
    """

    def __init__(self, num_actions, alpha=0.1, gamma=0.9, epsilon=0.1):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            num_actions: Number of discrete actions, numbered from 0.
            alpha: Learning rate applied to each update.
            gamma: Discount factor for the next state's best value.
            epsilon: Probability of picking a random action.
        """
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.Q = {}

    def _action_values(self, state):
        """Return the Q-values of every action in ``state`` (0 if unseen)."""
        return [self.Q.get((state, a), 0) for a in range(self.num_actions)]

    def choose_action(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule."""
        explore = np.random.uniform(0, 1) < self.epsilon
        if explore:
            return np.random.choice(self.num_actions)
        # argmax breaks ties in favour of the lowest action index.
        return np.argmax(self._action_values(state))

    def update_Q_value(self, state, action, next_state, reward):
        """Apply one Q-learning update for the observed transition."""
        key = (state, action)
        best_next = max(self._action_values(next_state))
        old_value = self.Q.get(key, 0)
        self.Q[key] = old_value + self.alpha * (
            reward + self.gamma * best_next - old_value
        )

def run_experiment(environment, num_episodes=1000):
    """Train a fresh Q-learning agent on ``environment`` and return its Q-table.

    Args:
        environment: Environment to train on. It must provide ``reset()``
            returning the initial state and ``step(action)`` returning
            ``(next_state, reward, done)``.
        num_episodes: Number of episodes to run; each episode lasts until
            the environment reports ``done``.

    Returns:
        The agent's Q-table: a dict mapping ``(state, action)`` to a value.
    """
    # Use the environment supplied by the caller. Building a new GridWorld
    # here would make every experiment run on the static grid, whatever
    # was passed in.
    agent = QLearningAgent(4)  # four actions: right, down, left, up

    for _ in range(num_episodes):
        state = environment.reset()
        done = False

        while not done:
            action = agent.choose_action(state)
            next_state, reward, done = environment.step(action)
            agent.update_Q_value(state, action, next_state, reward)
            state = next_state

    return agent.Q


static_Q_values = run_experiment(GridWorld((4, 4)))
print("Static environment Q-values:")
print(static_Q_values)


class DynamicGridWorld(GridWorld):
    """GridWorld with one extra obstacle that jumps to a random cell each step.

    Before every agent move the obstacle is placed on one of a fixed set of
    cells, chosen uniformly at random, and blocks that cell for that move
    only. One of the candidate cells is the goal, so the goal can be
    unreachable on a given step.
    """

    # Cells the moving obstacle can occupy.
    _MOVING_OBSTACLE_CELLS = ((1, 2), (2, 3), (1, 3))

    def __init__(self, shape):
        """Create the grid and place the moving obstacle on its start cell."""
        super().__init__(shape)
        self.moving_obstacle = (1, 3)  # Initial position of the moving obstacle

    def step(self, action):
        """Relocate the moving obstacle, then move the agent.

        Returns the same ``(position, reward, done)`` tuple as
        ``GridWorld.step``. The agent is never displaced if the obstacle
        lands on its current cell; only entering the obstacle's cell is
        prevented.
        """
        cells = self._MOVING_OBSTACLE_CELLS
        # np.random.choice only accepts 1-D input, so draw an index instead
        # of passing the list of coordinate tuples (which raises ValueError).
        self.moving_obstacle = cells[np.random.choice(len(cells))]

        # Treat the moving obstacle as an obstacle for this move only, then
        # restore the static obstacle list so it never accumulates entries.
        static_obstacles = self.obstacles
        self.obstacles = [*static_obstacles, self.moving_obstacle]
        try:
            return super().step(action)
        finally:
            self.obstacles = static_obstacles


dynamic_Q_values = run_experiment(DynamicGridWorld((4, 4)))
print("\nDynamic environment Q-values:")
print(dynamic_Q_values)

Static environment Q-values:
{((0, 0), 0): 0.6560999999999979, ((0, 1), 0): 0.7289999999999983, ((0, 2), 3): 0.6235151038632805, ((0, 2), 0): 0.8099999999999987, ((0, 3), 0): 0.7256546050168583, ((0, 3), 2): 0.6588508624327397, ((0, 3), 3): 0.7519017360186914, ((0, 3), 1): 0.899999999999999, ((1, 3), 0): 0.82807511138066, ((1, 3), 1): 0.9999999999999996, ((0, 0), 2): 0.5509307835269304, ((0, 1), 2): 0.5639623535137005, ((0, 0), 1): 0.3641106741962231, ((1, 0), 0): 0.0, ((1, 0), 2): 0.0, ((1, 0), 1): 0.0, ((2, 0), 0): 0.0, ((2, 1), 2): 0.0, ((2, 1), 1): 0.0, ((3, 1), 0): 0.0008100000000000002, ((3, 2), 0): 0.025200000000000004, ((3, 3), 0): 0.0, ((3, 3), 1): 0.0, ((3, 3), 3): 0.271, ((1, 0), 3): 0.5324734100868916, ((0, 2), 2): 0.6061674441867534, ((1, 3), 3): 0.727440608577938, ((0, 1), 1): 0.6222698295064646, ((0, 2), 1): 0.6928968314407693, ((1, 2), 0): 0.8958033291733596, ((0, 0), 3): 0.5493236918212985, ((0, 1), 3): 0.6118172731722019, ((1, 3), 2): 0.7219332731036018, ((1, 2), 2): 