In [None]:
# One Grid World

import numpy as np

class BridgeGridEnv:
    def __init__(self, grid_size=(5, 5), bridge_location=(2, 0), bridge_length=3, goal_state=(4, 4), fall_penalty=-10, goal_reward=10):
        self.grid_size = grid_size
        self.bridge_location = bridge_location
        self.bridge_length = bridge_length
        self.goal_state = goal_state
        self.fall_penalty = fall_penalty
        self.goal_reward = goal_reward
        self.actions = ['up', 'down', 'left', 'right']
        self.n_states = grid_size[0] * grid_size[1]
        self.n_actions = len(self.actions)
        self.transitions, self.rewards = self.build_environment()
    
    def build_environment(self):
        transition_prob = np.zeros((self.n_states, self.n_actions, self.n_states))
        rewards = np.full((self.n_states, self.n_actions), -1)
        def state_to_index(row, col):
            return row * self.grid_size[1] + col
        def is_off_bridge(row, col):
            return (row != self.bridge_location[0] or col >= self.bridge_location[1] + self.bridge_length) and row < self.grid_size[0] - 1
        for row in range(self.grid_size[0]):
            for col in range(self.grid_size[1]):
                state = state_to_index(row, col)
                for action_idx, action in enumerate(self.actions):
                    if (row, col) == self.goal_state:
                        rewards[state, action_idx] = self.goal_reward
                        transition_prob[state, action_idx, state] = 1.0
                        continue
                    next_row, next_col = row, col
                    if action == 'up' and row > 0:
                        next_row = row - 1
                    elif action == 'down' and row < self.grid_size[0] - 1:
                        next_row = row + 1
                    elif action == 'left' and col > 0:
                        next_col = col - 1
                    elif action == 'right' and col < self.grid_size[1] - 1:
                        next_col = col + 1
                    next_state = state_to_index(next_row, next_col)
                    if (next_row, next_col) == self.goal_state:
                        rewards[state, action_idx] = self.goal_reward
                    elif is_off_bridge(next_row, next_col):
                        rewards[state, action_idx] = self.fall_penalty
                    transition_prob[state, action_idx, next_state] = 1.0
        return transition_prob, rewards

    def state_index_to_coordinates(self, state_index):
        row = state_index // self.grid_size[1]
        col = state_index % self.grid_size[1]
        return row, col


def value_iteration_bridge(env, gamma=0.9, theta=1e-6):
    V = np.zeros(env.n_states)
    while True:
        delta = 0
        for s in range(env.n_states):
            v = V[s]
            Q_sa = np.zeros(env.n_actions)
            for a in range(env.n_actions):
                Q_sa[a] = sum([env.transitions[s, a, s_prime] * (env.rewards[s, a] + gamma * V[s_prime]) for s_prime in range(env.n_states)])
            V[s] = max(Q_sa)
            delta = max(delta, abs(v - V[s]))
        if delta < theta:
            break
    policy = np.zeros(env.n_states, dtype=int)
    for s in range(env.n_states):
        Q_sa = np.zeros(env.n_actions)
        for a in range(env.n_actions):
            Q_sa[a] = sum([env.transitions[s, a, s_prime] * (env.rewards[s, a] + gamma * V[s_prime])
                           for s_prime in range(env.n_states)])
        policy[s] = np.argmax(Q_sa)
    return V, policy

env = BridgeGridEnv()
V, policy = value_iteration_bridge(env)
print("Optimal Values:")
print(V.reshape(env.grid_size))
print("\nOptimal Policy:")
print(policy.reshape(env.grid_size))

Optimal Values:
[[27.70775002 32.89750102 38.66389102 37.78099102 45.79999102]
 [41.89750102 47.66389102 54.07099102 53.08999102 61.99999102]
 [47.66389102 54.07099102 61.18999102 70.09999102 79.99999102]
 [62.17099102 70.18999102 79.09999102 88.99999102 99.99999102]
 [70.18999102 79.09999102 88.99999102 99.99999102 99.99999102]]

Optimal Policy:
[[1 1 1 1 1]
 [1 1 1 1 1]
 [3 3 1 1 1]
 [1 1 1 1 1]
 [3 3 3 3 0]]


In [2]:
# Two Grid World

import numpy as np
import random

class BridgeGrid:
    def __init__(self):
        self.grid = np.zeros((5, 5))
        self.goal = (0, 4)
        self.start = (4, 0)
        self.bridge = [(2, 1), (2, 2), (2, 3)]
        self.state = self.start
        self.actions = ['up', 'down', 'left', 'right']
    
    def reset(self):
        self.state = self.start
        return self.state
    
    def step(self, action):
        row, col = self.state
        if action == 'up':
            row = max(0, row - 1)
        elif action == 'down':
            row = min(4, row + 1)
        elif action == 'left':
            col = max(0, col - 1)
        elif action == 'right':
            col = min(4, col + 1)
        next_state = (row, col)
        if next_state == self.goal:
            reward = 1 
            done = True
        elif next_state in self.bridge:
            reward = 0
            done = False
        elif row == 2 and col not in [1, 2, 3]:
            reward = -1
            done = True
        else:
            reward = -0.1 
            done = False
        self.state = next_state
        return next_state, reward, done

    def render(self):
        grid_copy = np.copy(self.grid)
        grid_copy[self.goal] = 1
        grid_copy[self.start] = 0.5
        for bridge_part in self.bridge:
            grid_copy[bridge_part] = 0.3
        print(grid_copy)

class QLearningAgent:
    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma 
        self.epsilon = epsilon
        self.q_table = {}

    def get_q_value(self, state, action):
        return self.q_table.get((state, action), 0.0)

    def update_q_value(self, state, action, reward, next_state):
        max_next_q_value = max([self.get_q_value(next_state, a) for a in self.env.actions])
        current_q_value = self.get_q_value(state, action)
        new_q_value = current_q_value + self.alpha * (reward + self.gamma * max_next_q_value - current_q_value)
        self.q_table[(state, action)] = new_q_value

    def choose_action(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(self.env.actions)
        else:
            q_values = [self.get_q_value(state, action) for action in self.env.actions]
            max_q = max(q_values)
            max_actions = [action for action, q_value in zip(self.env.actions, q_values) if q_value == max_q]
            return random.choice(max_actions)

    def train(self, episodes=1000):
        for episode in range(episodes):
            state = self.env.reset()
            done = False
            while not done:
                action = self.choose_action(state)
                next_state, reward, done = self.env.step(action)
                self.update_q_value(state, action, reward, next_state)
                state = next_state
            if episode % 100 == 0:
                print(f"Episode {episode} complete")

env = BridgeGrid()
agent = QLearningAgent(env)
agent.train(episodes=1000)
state = env.reset()
env.render()
done = False
while not done:
    action = agent.choose_action(state)
    next_state, reward, done = env.step(action)
    state = next_state
    env.render()
    print(f"Action: {action}, Reward: {reward}")

Episode 0 complete
Episode 100 complete
Episode 200 complete
Episode 300 complete
Episode 400 complete
Episode 500 complete
Episode 600 complete
Episode 700 complete
Episode 800 complete
Episode 900 complete
[[0.  0.  0.  0.  1. ]
 [0.  0.  0.  0.  0. ]
 [0.  0.3 0.3 0.3 0. ]
 [0.  0.  0.  0.  0. ]
 [0.5 0.  0.  0.  0. ]]
[[0.  0.  0.  0.  1. ]
 [0.  0.  0.  0.  0. ]
 [0.  0.3 0.3 0.3 0. ]
 [0.  0.  0.  0.  0. ]
 [0.5 0.  0.  0.  0. ]]
Action: right, Reward: -0.1
[[0.  0.  0.  0.  1. ]
 [0.  0.  0.  0.  0. ]
 [0.  0.3 0.3 0.3 0. ]
 [0.  0.  0.  0.  0. ]
 [0.5 0.  0.  0.  0. ]]
Action: up, Reward: -0.1
[[0.  0.  0.  0.  1. ]
 [0.  0.  0.  0.  0. ]
 [0.  0.3 0.3 0.3 0. ]
 [0.  0.  0.  0.  0. ]
 [0.5 0.  0.  0.  0. ]]
Action: up, Reward: 0
[[0.  0.  0.  0.  1. ]
 [0.  0.  0.  0.  0. ]
 [0.  0.3 0.3 0.3 0. ]
 [0.  0.  0.  0.  0. ]
 [0.5 0.  0.  0.  0. ]]
Action: up, Reward: -0.1
[[0.  0.  0.  0.  1. ]
 [0.  0.  0.  0.  0. ]
 [0.  0.3 0.3 0.3 0. ]
 [0.  0.  0.  0.  0. ]
 [0.5 0.  0.  0.  0.