In [4]:
import numpy as np
import random

In [5]:
class SmallGridEnv:
    """Deterministic square grid world with a fixed start and goal.

    The agent starts at ``(0, 0)`` and the goal is the opposite corner,
    ``(size - 1, size - 1)``. Positions are ``(row, col)`` tuples.

    Attributes:
        size: Side length of the grid.
        grid: ``size x size`` array of zeros (not read by any method here).
        start: Start cell, ``(0, 0)``.
        goal: Goal cell, ``(size - 1, size - 1)``.
        position: Current cell of the agent.
        actions: ``(d_row, d_col)`` offsets indexed by action id:
            0 = right, 1 = down, 2 = left, 3 = up.
    """

    def __init__(self, size=10):
        self.size = size
        self.grid = np.zeros((size, size))
        self.start = (0, 0)
        self.goal = (size - 1, size - 1)
        self.position = self.start
        # Offsets are applied as (row + d_row, col + d_col).
        self.actions = [(0, 1), (1, 0), (0, -1), (-1, 0)]

    def reset(self):
        """Put the agent back on the start cell and return that cell."""
        self.position = self.start
        return self.position

    def step(self, action):
        """Apply ``action`` and return ``(position, reward, done)``.

        A move that would leave the grid is ignored (the agent stays put).
        Standing on the goal after the move yields reward ``1`` and
        ``done=True``; otherwise the reward is ``-0.01`` and ``done=False``.
        """
        d_row, d_col = self.actions[action]
        row = self.position[0] + d_row
        col = self.position[1] + d_col

        # Only commit the move when the target cell lies inside the grid.
        if 0 <= row < self.size and 0 <= col < self.size:
            self.position = (row, col)

        at_goal = self.position == self.goal
        return (self.position, 1, True) if at_goal else (self.position, -0.01, False)


class ValueIterationAgent:
    """Plans on a grid environment with in-place value iteration.

    The environment is used as a one-step model: for every cell the agent
    sets ``env.position``, calls ``env.step(action)`` and reads back
    ``(next_pos, reward, done)``.

    Args:
        env: Environment exposing ``size``, ``actions``, ``position`` and
            ``step(action)``.
        gamma: Discount factor applied to the value of the next cell.
        theta: Convergence threshold; sweeps stop once the largest change
            of any cell value in a sweep is below it.

    Attributes:
        value_table: ``env.size x env.size`` array of state values,
            initialised to zero and filled by :meth:`value_iteration`.
    """

    def __init__(self, env, gamma=0.9, theta=1e-3):
        self.env = env
        self.gamma = gamma
        self.theta = theta
        self.value_table = np.zeros((env.size, env.size))

    def value_iteration(self):
        """Sweep all cells until the value table changes by less than ``theta``.

        Each cell is set to the best one-step backup over all actions.
        A transition reported as ``done`` contributes only its immediate
        reward: the episode ends there, so nothing is bootstrapped from the
        terminal cell. Without this the goal's value would feed on itself
        and inflate every value toward ``reward / (1 - gamma)``.

        The environment's ``position`` is restored afterwards, so planning
        does not disturb an episode that is in progress.
        """
        env = self.env
        saved_position = env.position
        n_actions = len(env.actions)
        try:
            while True:
                delta = 0.0
                for i in range(env.size):
                    for j in range(env.size):
                        old_value = self.value_table[i, j]
                        best_value = -np.inf
                        for action in range(n_actions):
                            # step() mutates env.position, so re-seat the
                            # environment on (i, j) before every simulated move.
                            env.position = (i, j)
                            next_pos, reward, done = env.step(action)
                            target = reward
                            if not done:
                                target += self.gamma * self.value_table[next_pos]
                            best_value = max(best_value, target)
                        self.value_table[i, j] = best_value
                        delta = max(delta, abs(old_value - best_value))

                if delta < self.theta:
                    break
        finally:
            env.position = saved_position


class MonteCarloControlAgent:
    def __init__(self, env, gamma=0.9, epsilon=0.1):
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = np.zeros((env.size, env.size, 4))
        self.returns_sum = np.zeros_like(self.q_table)
        self.returns_count = np.zeros_like(self.q_table)

    def select_action(self, state):
        if random.random() < self.epsilon:
            return random.randint(0, 3)  # explore
        else:
            return np.argmax(self.q_table[state[0], state[1]])  # exploit

    def run_episode(self):
        episode = []
        state = self.env.reset()
        done = False

        while not done:
            action = self.select_action(state)
            next_state, reward, done = self.env.step(action)
            episode.append((state, action, reward))
            state = next_state

        return episode

    def update_q_table(self, episode):
        G = 0
        visited = set()
        for state, action, reward in reversed(episode):
            G = self.gamma * G + reward
            if (state, action) not in visited:
                self.returns_sum[state[0], state[1], action] += G
                self.returns_count[state[0], state[1], action] += 1
                self.q_table[state[0], state[1], action] = (
                    self.returns_sum[state[0], state[1], action] /
                    self.returns_count[state[0], state[1], action]
                )
                visited.add((state, action))

    def train(self, num_episodes=50):
        for _ in range(num_episodes):
            episode = self.run_episode()
            self.update_q_table(episode)


def evaluate_agent(agent, env, episodes=5, max_steps=10000):
    """Run greedy rollouts of ``agent`` in ``env`` and return the mean episode reward.

    The greedy action depends on what the agent exposes:

    * an agent with a ``q_table`` attribute acts on ``argmax`` of its
      Q-values for the current cell;
    * any other agent is expected to expose ``value_table`` and moves to
      the in-bounds neighbouring cell with the highest value (off-grid
      moves are scored ``-inf``).

    Args:
        agent: Agent exposing ``q_table`` or ``value_table``.
        env: Environment exposing ``reset()``, ``step(action)``, and, for
            value-table agents, ``size`` and ``actions``.
        episodes: Number of evaluation episodes to average over.
        max_steps: Per-episode step cap. A deterministic greedy policy that
            cycles or keeps bumping a wall would otherwise never terminate.
            Pass ``None`` to disable the cap.

    Returns:
        The sum of rewards per episode, averaged over ``episodes``.
    """
    use_q_table = hasattr(agent, "q_table")
    total_reward = 0
    for _ in range(episodes):
        state = env.reset()
        done = False
        episode_reward = 0
        steps = 0
        while not done and (max_steps is None or steps < max_steps):
            if use_q_table:
                action = int(np.argmax(agent.q_table[state[0], state[1]]))
            else:
                # One score per entry of env.actions, so the argmax index
                # is directly the action id.
                neighbour_values = [
                    agent.value_table[state[0] + dx, state[1] + dy]
                    if 0 <= state[0] + dx < env.size and 0 <= state[1] + dy < env.size
                    else -np.inf
                    for dx, dy in env.actions
                ]
                action = int(np.argmax(neighbour_values))
            state, reward, done = env.step(action)
            episode_reward += reward
            steps += 1
        total_reward += episode_reward
    return total_reward / episodes




In [6]:
# Build the 30x30 grid world shared by the agents below.
environment = SmallGridEnv(size=30)

# Plan with value iteration, then score the resulting greedy policy.
vi_agent = ValueIterationAgent(env=environment)
vi_agent.value_iteration()
vi_avg_reward = evaluate_agent(agent=vi_agent, env=environment)



In [None]:
# Monte Carlo Agent
# Seed Python's RNG first: the agent's epsilon-greedy exploration draws from
# the `random` module, so without a seed every re-run of this cell learns a
# different Q-table and reports a different average reward.
random.seed(42)
mc_agent = MonteCarloControlAgent(environment)
mc_agent.train()
mc_avg_reward = evaluate_agent(mc_agent, environment)

In [None]:
# Report the mean evaluation reward of each agent.
print(
    "MDP (Value Iteration) Average Reward:",
    vi_avg_reward,
)
print(
    "Monte Carlo Control Average Reward:",
    mc_avg_reward,
)
