In [1]:
import numpy as np
import random

class GridworldEnv:
    """Deterministic grid world with one goal cell and optional trap cells.

    States are ``(row, col)`` tuples. Actions are integer indices into
    ``self.actions``: 0 = up, 1 = down, 2 = left, 3 = right. A move that
    would leave the grid is clamped, so the agent stays on the border cell.
    Every episode starts at ``(0, 0)``.
    """

    def __init__(self, grid_size=(4, 4), goal_state=(3, 3), trap_states=None, goal_reward=10, trap_penalty=-10, step_penalty=-1):
        """Store the grid layout and reward settings.

        Args:
            grid_size: ``(n_rows, n_cols)`` of the grid.
            goal_state: ``(row, col)`` cell that ends the episode with
                ``goal_reward``.
            trap_states: collection of ``(row, col)`` cells that end the
                episode with ``trap_penalty``. ``None`` means no traps.
            goal_reward: reward returned when the goal is entered.
            trap_penalty: reward returned when a trap is entered.
            step_penalty: reward returned for every other move.
        """
        self.grid_size = grid_size
        self.goal_state = goal_state
        # A literal ``[]`` default would be a single list shared by every
        # instance; build a fresh one per environment instead.
        self.trap_states = [] if trap_states is None else trap_states
        self.goal_reward = goal_reward
        self.trap_penalty = trap_penalty
        self.step_penalty = step_penalty
        self.actions = ['up', 'down', 'left', 'right']
        self.n_actions = len(self.actions)
        # Define the position up front so step()/get_state() do not fail
        # with AttributeError when called before the first reset().
        self.agent_position = (0, 0)

    def reset(self):
        """Move the agent back to ``(0, 0)`` and return that state."""
        self.agent_position = (0, 0)
        return self.agent_position

    def step(self, action):
        """Apply one action and return ``(next_state, reward, done)``.

        Args:
            action: integer action index (0 = up, 1 = down, 2 = left,
                3 = right).

        Returns:
            Tuple of the new ``(row, col)`` position, the reward for the
            move, and ``True`` when the new position is the goal or a trap.

        Raises:
            ValueError: if ``action`` is not one of the four known indices.
                The agent position is left unchanged in that case.
        """
        row, col = self.agent_position
        if action == 0:  # up
            next_position = (max(row - 1, 0), col)
        elif action == 1:  # down
            next_position = (min(row + 1, self.grid_size[0] - 1), col)
        elif action == 2:  # left
            next_position = (row, max(col - 1, 0))
        elif action == 3:  # right
            next_position = (row, min(col + 1, self.grid_size[1] - 1))
        else:
            raise ValueError(
                f"invalid action {action!r}; expected an index in range({self.n_actions})"
            )
        reward = self.step_penalty
        done = False
        # The goal is checked first, so a cell listed as both goal and trap
        # counts as the goal.
        if next_position == self.goal_state:
            reward = self.goal_reward
            done = True
        elif next_position in self.trap_states:
            reward = self.trap_penalty
            done = True
        self.agent_position = next_position
        return next_position, reward, done

    def get_state(self):
        """Return the agent's current ``(row, col)`` position."""
        return self.agent_position

    def action_space(self):
        """Return the number of available actions."""
        return self.n_actions

def q_learning(env, episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1, seed=None, max_steps=None):
    """Train a tabular Q-function with epsilon-greedy Q-learning.

    Args:
        env: environment exposing ``grid_size`` (rows, cols), ``n_actions``,
            ``reset() -> (row, col)`` and
            ``step(action) -> ((row, col), reward, done)``.
        episodes: number of training episodes.
        alpha: learning rate used to blend the old estimate with the
            TD target.
        gamma: discount factor applied to the next state's best value.
        epsilon: probability of taking a uniformly random action instead of
            the greedy one.
        seed: optional seed for a private random generator so a run can be
            reproduced. With ``None`` the global ``random`` module state is
            used.
        max_steps: optional cap on steps per episode. ``None`` means an
            episode only ends when the environment reports ``done``.

    Returns:
        ``numpy`` array of shape ``(rows, cols, n_actions)`` holding the
        learned action values.
    """
    # A dedicated generator keeps seeded runs independent of whatever else
    # touches the global random state; unseeded runs keep using the module.
    rng = random if seed is None else random.Random(seed)
    Q = np.zeros((env.grid_size[0], env.grid_size[1], env.n_actions))
    for _ in range(episodes):
        state = env.reset()
        done = False
        steps = 0
        while not done and (max_steps is None or steps < max_steps):
            row, col = state
            if rng.uniform(0, 1) < epsilon:
                action = rng.choice(range(env.n_actions))
            else:
                action = int(np.argmax(Q[row, col]))
            next_state, reward, done = env.step(action)
            next_row, next_col = next_state
            # No future return exists after a terminal transition, so the
            # bootstrap term is dropped instead of reading the terminal
            # cell's Q-values.
            future_value = 0.0 if done else np.max(Q[next_row, next_col])
            td_target = reward + gamma * future_value
            Q[row, col, action] = (1 - alpha) * Q[row, col, action] + alpha * td_target
            state = next_state
            steps += 1
    return Q


def test_q_learning(Q, env):
    state = env.reset()
    done = False
    steps = 0
    total_reward = 0
    print("Testing the learned policy...")
    while not done:
        row, col = state
        action = np.argmax(Q[row, col])
        next_state, reward, done = env.step(action)
        total_reward += reward
        steps += 1
        print(f"Step {steps}: State {state}, Action {env.actions[action]}, Reward {reward}")
        state = next_state
    print(f"Total Reward: {total_reward}, Steps Taken: {steps}")

# Epsilon-greedy exploration draws from the global `random` generator, so it
# is seeded here to make the learned Q-table and the printed rollout
# repeatable across Restart & Run All.
SEED = 42
random.seed(SEED)

# 4x4 grid, goal in the bottom-right corner, a single trap at (1, 1).
env = GridworldEnv(grid_size=(4, 4), goal_state=(3, 3), trap_states=[(1, 1)], goal_reward=10, trap_penalty=-10)
Q = q_learning(env, episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1)
test_q_learning(Q, env)


Testing the learned policy...
Step 1: State (0, 0), Action right, Reward -1
Step 2: State (0, 1), Action right, Reward -1
Step 3: State (0, 2), Action down, Reward -1
Step 4: State (1, 2), Action right, Reward -1
Step 5: State (1, 3), Action down, Reward -1
Step 6: State (2, 3), Action down, Reward 10
Total Reward: 5, Steps Taken: 6
