In [1]:
import numpy as np
import random

class GridworldEnv:
    def __init__(self, grid_size=(4, 4), goal_state=(3, 3), trap_states=[], goal_reward=10, trap_penalty=-10, step_penalty=-1):
        self.grid_size = grid_size
        self.goal_state = goal_state
        self.trap_states = trap_states
        self.goal_reward = goal_reward
        self.trap_penalty = trap_penalty
        self.step_penalty = step_penalty
        self.actions = ['up', 'down', 'left', 'right']
        self.n_actions = len(self.actions)

    def reset(self):
        self.agent_position = (0, 0)
        return self.agent_position

    def step(self, action):
        row, col = self.agent_position
        if action == 0:  
            next_position = (max(row - 1, 0), col)
        elif action == 1: 
            next_position = (min(row + 1, self.grid_size[0] - 1), col)
        elif action == 2:
            next_position = (row, max(col - 1, 0))
        elif action == 3:  
            next_position = (row, min(col + 1, self.grid_size[1] - 1))
        reward = self.step_penalty
        done = False
        if next_position == self.goal_state:
            reward = self.goal_reward
            done = True
        elif next_position in self.trap_states:
            reward = self.trap_penalty
            done = True
        self.agent_position = next_position
        return next_position, reward, done

    def action_space(self):
        return self.n_actions

class ApproximateQLearningAgent:
    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=0.1, feature_dim=8):
        self.env = env
        self.alpha = alpha 
        self.gamma = gamma 
        self.epsilon = epsilon
        self.feature_dim = feature_dim  
        self.weights = np.zeros(self.feature_dim)  

    def extract_features(self, state, action):
        row, col = state
        feature_vector = np.zeros(self.feature_dim)
        feature_vector[0] = row
        feature_vector[1] = col
        feature_vector[2] = row * col
        feature_vector[3] = action
        feature_vector[4] = row - col
        feature_vector[5] = row + col
        feature_vector[6] = 1 if state == self.env.goal_state else 0
        feature_vector[7] = 1 if state in self.env.trap_states else 0
        return feature_vector

    def q_value(self, state, action):
        features = self.extract_features(state, action)
        return np.dot(self.weights, features)

    def select_action(self, state):
        if random.uniform(0, 1) < self.epsilon:
            return random.choice(range(self.env.n_actions))
        else:
            q_values = [self.q_value(state, a) for a in range(self.env.n_actions)]
            return np.argmax(q_values)

    def update(self, state, action, reward, next_state, done):
        if done:
            target = reward
        else:
            next_q_values = [self.q_value(next_state, a) for a in range(self.env.n_actions)]
            target = reward + self.gamma * np.max(next_q_values)
        current_q = self.q_value(state, action)
        td_error = target - current_q
        features = self.extract_features(state, action)
        self.weights += self.alpha * td_error * features

    def train(self, episodes=1000):
        for episode in range(episodes):
            state = self.env.reset()
            done = False
            while not done:
                action = self.select_action(state)
                next_state, reward, done = self.env.step(action)
                self.update(state, action, reward, next_state, done)
                state = next_state

    def test(self):
        state = self.env.reset()
        done = False
        total_reward = 0
        steps = 0
        print("Testing the learned policy...")
        while not done:
            action = self.select_action(state)
            next_state, reward, done = self.env.step(action)
            total_reward += reward
            steps += 1
            print(f"Step {steps}: State {state}, Action {self.env.actions[action]}, Reward {reward}")
            state = next_state
        print(f"Total Reward: {total_reward}, Steps Taken: {steps}")

env = GridworldEnv(grid_size=(4, 4), goal_state=(3, 3), trap_states=[(1, 1)], goal_reward=10, trap_penalty=-10)
agent = ApproximateQLearningAgent(env)
agent.train(episodes=1000)
agent.test()

Testing the learned policy...
Step 1: State (0, 0), Action up, Reward -1
Step 2: State (0, 0), Action up, Reward -1
Step 3: State (0, 0), Action up, Reward -1
Step 4: State (0, 0), Action up, Reward -1
Step 5: State (0, 0), Action up, Reward -1
Step 6: State (0, 0), Action up, Reward -1
Step 7: State (0, 0), Action up, Reward -1
Step 8: State (0, 0), Action up, Reward -1
Step 9: State (0, 0), Action up, Reward -1
Step 10: State (0, 0), Action up, Reward -1
Step 11: State (0, 0), Action up, Reward -1
Step 12: State (0, 0), Action up, Reward -1
Step 13: State (0, 0), Action up, Reward -1
Step 14: State (0, 0), Action down, Reward -1
Step 15: State (1, 0), Action up, Reward -1
Step 16: State (0, 0), Action up, Reward -1
Step 17: State (0, 0), Action up, Reward -1
Step 18: State (0, 0), Action up, Reward -1
Step 19: State (0, 0), Action down, Reward -1
Step 20: State (1, 0), Action up, Reward -1
Step 21: State (0, 0), Action up, Reward -1
Step 22: State (0, 0), Action up, Reward -1
Step 23