In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import random
import copy
from IPython.display import clear_output
import pandas as pd

def set_seed(seed):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``."""
    # Same order as before: the stdlib generator first, then NumPy's.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

class PDWorld: # Two-agent pickup/drop-off (PD) grid world
    """Grid world in which agents 'F' and 'M' move blocks from pickup to drop-off cells.

    Positions are 1-indexed ``(row, col)`` tuples: 'N'/'S' change the row by
    -1/+1 and 'E'/'W' change the column by +1/-1. Any agent key other than
    ``'F'`` is treated as agent M.

    Attributes read by callers: ``grid_size``, ``pickup_locs``,
    ``dropoff_locs``, ``pickup_blocks``, ``dropoff_blocks``, ``agent_f_pos``,
    ``agent_m_pos``, ``agent_f_holding`` and ``agent_m_holding``.
    """

    DEFAULT_PICKUP_LOCS = ((3, 5), (4, 2))
    DEFAULT_DROPOFF_LOCS = ((1, 1), (1, 5), (3, 3), (5, 5))
    INITIAL_PICKUP_BLOCKS = 10  # blocks available at each pickup cell after a reset
    DROPOFF_CAPACITY = 5        # a drop-off cell accepts no more than this many blocks
    STEP_REWARD = -1            # reward for a move or for a rejected action
    TRANSFER_REWARD = 13        # reward for a successful pickup or drop-off
    # Insertion order matters: it fixes the order of the list returned by
    # get_valid_actions, which callers sample from with random.choice.
    MOVES = {'N': (-1, 0), 'S': (1, 0), 'E': (0, 1), 'W': (0, -1)}

    def __init__(self, grid_size=5, pickup_locs=None, dropoff_locs=None, agent_f_pos=(1,3), agent_m_pos=(5,3)):
        """Create the world.

        Args:
            grid_size: Side length of the square grid.
            pickup_locs: Iterable of pickup cells; defaults to
                ``DEFAULT_PICKUP_LOCS``.
            dropoff_locs: Iterable of drop-off cells; defaults to
                ``DEFAULT_DROPOFF_LOCS``.
            agent_f_pos: Start cell of agent F.
            agent_m_pos: Start cell of agent M.
        """
        self.grid_size = grid_size
        # Copy into fresh lists so the world never aliases a caller's list
        # (or a default shared between instances).
        self.pickup_locs = list(self.DEFAULT_PICKUP_LOCS if pickup_locs is None else pickup_locs)
        self.dropoff_locs = list(self.DEFAULT_DROPOFF_LOCS if dropoff_locs is None else dropoff_locs)

        self.pickup_blocks = {loc: self.INITIAL_PICKUP_BLOCKS for loc in self.pickup_locs}
        self.dropoff_blocks = {loc: 0 for loc in self.dropoff_locs}

        self.agent_f_pos = agent_f_pos
        self.agent_m_pos = agent_m_pos

        self.agent_f_holding = False
        self.agent_m_holding = False

        self.initial_pickup_locs = list(self.pickup_locs)
        self.initial_agent_f_pos = agent_f_pos
        self.initial_agent_m_pos = agent_m_pos

    def reset(self, keep_qtable=True):
        """Refill pickups, empty drop-offs and put both agents back at their start cells.

        The *current* ``pickup_locs`` are used, so a caller may reassign
        ``pickup_locs`` before resetting. ``keep_qtable`` is accepted for
        compatibility and ignored: the world holds no Q-tables.
        """
        self.pickup_blocks = {loc: self.INITIAL_PICKUP_BLOCKS for loc in self.pickup_locs}
        self.dropoff_blocks = {loc: 0 for loc in self.dropoff_locs}
        self.agent_f_pos = self.initial_agent_f_pos
        self.agent_m_pos = self.initial_agent_m_pos
        self.agent_f_holding = False
        self.agent_m_holding = False

    def is_terminal(self):
        """Return True when every drop-off cell holds exactly ``DROPOFF_CAPACITY`` blocks."""
        return all(blocks == self.DROPOFF_CAPACITY for blocks in self.dropoff_blocks.values())

    def get_state(self, agent='F'):
        """Return ``(own position, own holding flag, other agent's position)`` for ``agent``."""
        if agent == 'F':
            return (self.agent_f_pos, self.agent_f_holding, self.agent_m_pos)
        return (self.agent_m_pos, self.agent_m_holding, self.agent_f_pos)

    def _in_grid(self, pos):
        """Return True if ``pos`` lies inside the 1-indexed grid."""
        return 1 <= pos[0] <= self.grid_size and 1 <= pos[1] <= self.grid_size

    def get_valid_actions(self, agent='F'):
        """List the actions ``agent`` may take now, ordered N, S, E, W, P, D.

        A move is valid when it stays on the grid and does not enter the other
        agent's cell. 'P' needs an empty-handed agent on a pickup cell that
        still has blocks; 'D' needs a carrying agent on a drop-off cell that is
        not yet full.
        """
        pos, holding, other_pos = self.get_state(agent)

        actions = []
        for move, (d_row, d_col) in self.MOVES.items():
            target = (pos[0] + d_row, pos[1] + d_col)
            if self._in_grid(target) and target != other_pos:
                actions.append(move)

        if not holding and pos in self.pickup_locs and self.pickup_blocks.get(pos, 0) > 0:
            actions.append('P')

        if holding and pos in self.dropoff_locs and self.dropoff_blocks.get(pos, 0) < self.DROPOFF_CAPACITY:
            actions.append('D')

        return actions

    def execute_action(self, agent, action):
        """Apply ``action`` for ``agent`` and return the reward.

        Actions not currently offered by ``get_valid_actions`` (moving off the
        grid or onto the other agent, picking up while carrying, dropping
        while empty-handed, unknown actions) leave the world unchanged and
        cost ``STEP_REWARD``.

        Returns:
            ``TRANSFER_REWARD`` for a successful pickup or drop-off, otherwise
            ``STEP_REWARD``.
        """
        if action not in self.get_valid_actions(agent):
            return self.STEP_REWARD

        is_f = agent == 'F'
        pos = self.agent_f_pos if is_f else self.agent_m_pos

        if action in self.MOVES:
            d_row, d_col = self.MOVES[action]
            new_pos = (pos[0] + d_row, pos[1] + d_col)
            if is_f:
                self.agent_f_pos = new_pos
            else:
                self.agent_m_pos = new_pos
            return self.STEP_REWARD

        # Only 'P' and 'D' remain; validity was established above.
        now_holding = action == 'P'
        if now_holding:
            self.pickup_blocks[pos] -= 1
        else:
            self.dropoff_blocks[pos] += 1
        if is_f:
            self.agent_f_holding = now_holding
        else:
            self.agent_m_holding = now_holding
        return self.TRANSFER_REWARD

    def visualize(self):
        """Show the grid with matplotlib.

        Cell codes: 0 empty, 1 pickup with blocks left, 2 drop-off, 3 agent F,
        4 agent M. Later codes overwrite earlier ones on the same cell.
        """
        grid = np.zeros((self.grid_size, self.grid_size))

        for loc in self.pickup_locs:
            if self.pickup_blocks.get(loc, 0) > 0:
                grid[loc[0]-1, loc[1]-1] = 1

        for loc in self.dropoff_locs:
            grid[loc[0]-1, loc[1]-1] = 2

        grid[self.agent_f_pos[0]-1, self.agent_f_pos[1]-1] = 3
        grid[self.agent_m_pos[0]-1, self.agent_m_pos[1]-1] = 4

        plt.figure(figsize=(6, 6))
        plt.imshow(grid, cmap='Set3')
        plt.title('PD-World State')
        plt.colorbar(ticks=[0, 1, 2, 3, 4], label='0:Empty, 1:Pickup, 2:Dropoff, 3:AgentF, 4:AgentM')
        plt.grid(True)
        plt.show()

class QLearningAgent: # Tabular Q-learning / SARSA learner
    """Tabular learner that supports the Q-learning and SARSA update rules.

    ``q_table`` maps ``state -> action -> Q-value``. Both levels are
    ``defaultdict``s, so merely reading a Q-value through ``get_q_value``
    creates a 0.0 entry for that (state, action) pair.
    """

    POLICIES = ('PRANDOM', 'PGREEDY', 'PEXPLOIT')
    EXPLOIT_PROBABILITY = 0.8  # chance that PEXPLOIT takes a greedy action

    def __init__(self, agent_name, alpha=0.3, gamma=0.5, algorithm='Q-learning'):
        """Store the hyperparameters and start with an empty Q-table.

        Args:
            agent_name: Label for the agent (stored, not otherwise used here).
            alpha: Learning rate.
            gamma: Discount factor.
            algorithm: ``'Q-learning'`` selects the off-policy update; any
                other value selects the SARSA update.
        """
        self.agent_name = agent_name
        self.alpha = alpha
        self.gamma = gamma
        self.algorithm = algorithm
        self.q_table = defaultdict(lambda: defaultdict(float))

    def get_q_value(self, state, action):
        """Return Q(state, action); unseen pairs are inserted with value 0.0."""
        return self.q_table[state][action]

    def _greedy_action(self, state, valid_actions):
        """Return a highest-valued valid action, breaking ties uniformly at random."""
        q_values = {action: self.get_q_value(state, action) for action in valid_actions}
        max_q = max(q_values.values())
        best_actions = [action for action, q in q_values.items() if q == max_q]
        return random.choice(best_actions)

    def choose_action(self, state, valid_actions, policy='PRANDOM'):
        """Pick an action from ``valid_actions`` according to ``policy``.

        Under every policy, 'P' is taken if available, otherwise 'D' if
        available. Beyond that:

        * ``'PRANDOM'``: uniform random choice.
        * ``'PGREEDY'``: highest Q-value, ties broken at random.
        * ``'PEXPLOIT'``: greedy with probability ``EXPLOIT_PROBABILITY``,
          otherwise uniform random.

        Returns:
            The chosen action, or ``None`` when ``valid_actions`` is empty.

        Raises:
            ValueError: If ``policy`` is not one of ``POLICIES``. (Previously
                an unknown name silently returned ``None``.)
        """
        if not valid_actions:
            return None

        if policy not in self.POLICIES:
            raise ValueError(f"Unknown policy {policy!r}; expected one of {self.POLICIES}")

        for forced in ('P', 'D'):
            if forced in valid_actions:
                return forced

        if policy == 'PRANDOM':
            return random.choice(valid_actions)

        if policy == 'PGREEDY':
            return self._greedy_action(state, valid_actions)

        # PEXPLOIT: the random draw happens first, exactly one per call.
        if random.random() < self.EXPLOIT_PROBABILITY:
            return self._greedy_action(state, valid_actions)
        return random.choice(valid_actions)

    def update_q_value(self, state, action, reward, next_state, next_action, valid_next_actions):
        """Apply one temporal-difference update to Q(state, action).

        Q-learning bootstraps from the maximum Q-value over
        ``valid_next_actions`` (0 if there are none); SARSA bootstraps from
        Q(next_state, next_action) (0 if ``next_action`` is ``None``).
        """
        current_q = self.get_q_value(state, action)

        if self.algorithm == 'Q-learning':
            if valid_next_actions:
                bootstrap = max(self.get_q_value(next_state, a) for a in valid_next_actions)
            else:
                bootstrap = 0
        else:  # SARSA
            if next_action is not None:
                bootstrap = self.get_q_value(next_state, next_action)
            else:
                bootstrap = 0

        td_target = reward + self.gamma * bootstrap
        self.q_table[state][action] = current_q + self.alpha * (td_target - current_q)

def _take_turn(world, agent_key, learner, policy):
    """Let one agent act once and learn from it; return the reward, or None if it could not act."""
    state = world.get_state(agent_key)
    valid_actions = world.get_valid_actions(agent_key)
    if not valid_actions:
        return None

    action = learner.choose_action(state, valid_actions, policy)
    reward = world.execute_action(agent_key, action)
    next_state = world.get_state(agent_key)
    next_valid_actions = world.get_valid_actions(agent_key)
    # NOTE: this action is sampled only to feed the update; the action the
    # agent actually executes on its next turn is chosen afresh at that time.
    next_action = learner.choose_action(next_state, next_valid_actions, policy) if next_valid_actions else None

    learner.update_q_value(state, action, reward, next_state, next_action, next_valid_actions)
    return reward


def run_experiment(world, agent_f, agent_m, total_steps, policy_schedule, experiment_name, pickup_change_config=None, seed=42): # This is the function we use to run our experiments
    """Run both agents in ``world`` for ``total_steps`` steps, learning online.

    Agent F acts first, then agent M; each turn counts as one step. When the
    world is terminal at the top of an iteration, the episode's statistics
    are recorded and the world is reset (Q-tables are kept). An episode that
    becomes terminal on the final step is therefore not recorded.

    Args:
        world: Environment providing ``is_terminal``, ``reset``, ``get_state``,
            ``get_valid_actions``, ``execute_action`` and ``pickup_locs``.
        agent_f: Learner for agent 'F'.
        agent_m: Learner for agent 'M'.
        total_steps: Total number of agent turns to simulate.
        policy_schedule: Non-empty list of ``(start_step, policy_name)``
            pairs in ascending ``start_step`` order; the first entry's policy
            is used from step 0.
        experiment_name: Label used in progress output.
        pickup_change_config: Optional ``(terminal_number, new_pickup_locs)``;
            when that many terminal states have been reached, the pickup
            locations are replaced before the reset.
        seed: Seed passed to ``set_seed`` before the run.

    Returns:
        Dict with ``rewards_history`` (per-turn rewards), ``steps_per_episode``,
        ``total_rewards_per_episode``, ``terminal_count``, and shallow ``dict``
        copies of both agents' Q-tables.

    Raises:
        ValueError: If ``policy_schedule`` is empty.
    """
    if not policy_schedule:
        raise ValueError("policy_schedule must contain at least one (start_step, policy) entry")

    set_seed(seed)
    rewards_history = []
    steps_per_episode = []
    total_rewards_per_episode = []
    terminal_count = 0
    current_step = 0
    episode_steps = 0
    episode_reward = 0

    current_policy = policy_schedule[0][1]
    policy_idx = 0
    turn_order = (('F', agent_f), ('M', agent_m))  # F always moves before M

    print(f"Running {experiment_name} (Seed: {seed})")

    while current_step < total_steps:
        if world.is_terminal():
            terminal_count += 1
            steps_per_episode.append(episode_steps)
            total_rewards_per_episode.append(episode_reward)

            print(f"Terminal state {terminal_count} reached at step {current_step}")
            print(f"Episode steps: {episode_steps}, Episode reward: {episode_reward}")

            if pickup_change_config and terminal_count == pickup_change_config[0]: # Only relevant for Experiment 4
                world.pickup_locs = pickup_change_config[1]
                print(f"Pickup locations changed to: {world.pickup_locs}")

            world.reset()
            episode_steps = 0
            episode_reward = 0

        # `while` rather than `if`: apply every schedule entry whose start
        # step has already been passed, not just the next one.
        while policy_idx + 1 < len(policy_schedule) and current_step >= policy_schedule[policy_idx + 1][0]:
            policy_idx += 1
            current_policy = policy_schedule[policy_idx][1]
            print(f"\nSwitching to policy {current_policy} at step {current_step}")

        for agent_key, learner in turn_order:
            if current_step >= total_steps:
                break  # step budget ran out between F's and M's turns

            reward = _take_turn(world, agent_key, learner, current_policy)
            if reward is not None:
                episode_reward += reward
                rewards_history.append(reward)

            # A turn is counted even when the agent had no valid action.
            current_step += 1
            episode_steps += 1

        if current_step % 1000 == 0:
            print(f"Step {current_step}/{total_steps} - Terminals: {terminal_count}")
            print("\n")

    print(f"\nExperiment completed:")
    print(f"Total terminal states reached: {terminal_count}")
    print(f"Average steps per episode: {np.mean(steps_per_episode) if steps_per_episode else 0:.2f}")
    print(f"Average reward per episode: {np.mean(total_rewards_per_episode) if total_rewards_per_episode else 0:.2f}")

    return {
        'rewards_history': rewards_history,
        'steps_per_episode': steps_per_episode,
        'total_rewards_per_episode': total_rewards_per_episode,
        'terminal_count': terminal_count,
        'agent_f_qtable': dict(agent_f.q_table),
        'agent_m_qtable': dict(agent_m.q_table)
    }

def plot_results(results, title): # Four-panel summary figure for one experiment
    """Draw a 2x2 figure summarising one experiment's ``results`` dict.

    Panels: cumulative reward per step, total reward per episode, steps per
    episode, and a 100-step moving average of the per-step reward. The two
    per-episode panels are left blank when no episode finished; the moving
    average is left blank when fewer than 100 rewards were recorded.
    """
    step_rewards = results['rewards_history']

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle(title, fontsize=16)

    def draw_panel(ax, series, x_label, y_label, panel_title):
        # Every panel is a single line plot with labelled axes and a grid.
        ax.plot(series)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(panel_title)
        ax.grid(True)

    draw_panel(axes[0, 0], np.cumsum(step_rewards),
               'Step', 'Cumulative Reward', 'Cumulative Rewards Over Time')

    episode_rewards = results['total_rewards_per_episode']
    if episode_rewards:
        draw_panel(axes[0, 1], episode_rewards,
                   'Episode', 'Total Reward', 'Total Rewards Per Episode')

    episode_lengths = results['steps_per_episode']
    if episode_lengths:
        draw_panel(axes[1, 0], episode_lengths,
                   'Episode', 'Steps', 'Steps Per Episode')

    window = 100
    if len(step_rewards) >= window:
        kernel = np.ones(window)/window
        smoothed = np.convolve(step_rewards, kernel, mode='valid')
        draw_panel(axes[1, 1], smoothed,
                   'Step', 'Average Reward', f'Moving Average Reward (window={window})')

    plt.tight_layout()
    plt.show()

def print_qtable_sample(qtable, agent_name, num_states=10):
    """Print the Q-values of the first ``num_states`` states in ``qtable``.

    ``qtable`` maps state -> {action: q_value}; states are shown in the
    mapping's iteration order, numbered from 1.
    """
    print(f"{agent_name} Q-Table Sample (first {num_states} states):")

    sampled = list(qtable.items())[:num_states]
    for position, entry in enumerate(sampled, start=1):
        state, action_values = entry
        print(f"\nState {position}: {state}")
        for action in action_values:
            print(f"  Action {action}: {action_values[action]:.4f}")

def calculate_manhattan_distance(pos1, pos2):
    """Return the Manhattan distance between the first two coordinates of ``pos1`` and ``pos2``."""
    first_axis_gap = abs(pos1[0] - pos2[0])
    second_axis_gap = abs(pos1[1] - pos2[1])
    return first_axis_gap + second_axis_gap

In [None]:
# Experiment 1a: Q-learning agents driven by PRANDOM for the entire run.
print("Experiment 1a: PRANDOM for all 8000 steps")

policy_schedule_1a = [(0, 'PRANDOM')]
world_1a = PDWorld()
agent_f_1a, agent_m_1a = (
    QLearningAgent(agent_key, alpha=0.3, gamma=0.5, algorithm='Q-learning')
    for agent_key in ('F', 'M')
)

results_1a = run_experiment(
    world_1a, agent_f_1a, agent_m_1a,
    8000, policy_schedule_1a, "Experiment 1a",
    seed=42,
)
plot_results(results_1a, "Experiment 1a: PRANDOM")

In [None]:
# Experiment 1b: 500 PRANDOM steps, then PGREEDY for the remainder.
print("Experiment 1b: PRANDOM (500 steps) then PGREEDY (7500 steps)")

policy_schedule_1b = [(0, 'PRANDOM'), (500, 'PGREEDY')]
world_1b = PDWorld()
agent_f_1b, agent_m_1b = (
    QLearningAgent(agent_key, alpha=0.3, gamma=0.5, algorithm='Q-learning')
    for agent_key in ('F', 'M')
)

results_1b = run_experiment(
    world_1b, agent_f_1b, agent_m_1b,
    8000, policy_schedule_1b, "Experiment 1b",
    seed=42,
)
plot_results(results_1b, "Experiment 1b: PRANDOM then PGREEDY after 500 steps")

In [None]:
# Experiment 1c: 500 PRANDOM steps, then PEXPLOIT for the remainder.
print("Experiment 1c: PRANDOM (500 steps) then PEXPLOIT (7500 steps)")

policy_schedule_1c = [(0, 'PRANDOM'), (500, 'PEXPLOIT')]
world_1c = PDWorld()
agent_f_1c, agent_m_1c = (
    QLearningAgent(agent_key, alpha=0.3, gamma=0.5, algorithm='Q-learning')
    for agent_key in ('F', 'M')
)

results_1c = run_experiment(
    world_1c, agent_f_1c, agent_m_1c,
    8000, policy_schedule_1c, "Experiment 1c",
    seed=42,
)
plot_results(results_1c, "Experiment 1c: PRANDOM then PEXPLOIT after 500 steps")

# Show a slice of what agent F learned.
print_qtable_sample(results_1c['agent_f_qtable'], "Agent F", num_states=10)

In [None]:
# Experiment 2: same schedule as 1c, but both agents use the SARSA update.
print("Experiment 2: SARSA with PRANDOM (500 steps) then PEXPLOIT (7500 steps)")

policy_schedule_2 = [(0, 'PRANDOM'), (500, 'PEXPLOIT')]
world_2 = PDWorld()
agent_f_2, agent_m_2 = (
    QLearningAgent(agent_key, alpha=0.3, gamma=0.5, algorithm='SARSA')
    for agent_key in ('F', 'M')
)

results_2 = run_experiment(
    world_2, agent_f_2, agent_m_2,
    8000, policy_schedule_2, "Experiment 2 (SARSA)",
    seed=42,
)
plot_results(results_2, "Experiment 2: SARSA with PRANDOM then PEXPLOIT after 500 steps")

# Show a slice of what agent F learned.
print_qtable_sample(results_2['agent_f_qtable'], "Agent F", num_states=10)

In [None]:
# Experiment 3: repeat the 1c setup with a lower (3a) and a higher (3b) learning rate.
print("Experiment 3a: Q-learning with alpha=0.15")

policy_schedule_3 = [(0, 'PRANDOM'), (500, 'PEXPLOIT')]

world_3a = PDWorld()
agent_f_3a, agent_m_3a = (
    QLearningAgent(agent_key, alpha=0.15, gamma=0.5, algorithm='Q-learning')
    for agent_key in ('F', 'M')
)
results_3a = run_experiment(
    world_3a, agent_f_3a, agent_m_3a,
    8000, policy_schedule_3, "Experiment 3a (alpha=0.15)",
    seed=42,
)
plot_results(results_3a, "Experiment 3a: alpha=0.15")

print("\n")
print("Experiment 3b: Q-learning with alpha=0.45")

world_3b = PDWorld()
agent_f_3b, agent_m_3b = (
    QLearningAgent(agent_key, alpha=0.45, gamma=0.5, algorithm='Q-learning')
    for agent_key in ('F', 'M')
)
results_3b = run_experiment(
    world_3b, agent_f_3b, agent_m_3b,
    8000, policy_schedule_3, "Experiment 3b (alpha=0.45)",
    seed=42,
)
plot_results(results_3b, "Experiment 3b: alpha=0.45")

In [None]:
# Experiment 4: the 1c setup, with the pickup cells replaced once the third
# terminal state has been reached.
print("Experiment 4: Pickup locations change after 3rd terminal state")

policy_schedule_4 = [(0, 'PRANDOM'), (500, 'PEXPLOIT')]
pickup_change = (3, [(1,2), (4,5)])  # (terminal count that triggers it, new pickup cells)

world_4 = PDWorld()
agent_f_4, agent_m_4 = (
    QLearningAgent(agent_key, alpha=0.3, gamma=0.5, algorithm='Q-learning')
    for agent_key in ('F', 'M')
)

results_4 = run_experiment(
    world_4, agent_f_4, agent_m_4,
    8000, policy_schedule_4, "Experiment 4",
    pickup_change_config=pickup_change,
    seed=42,
)
plot_results(results_4, "Experiment 4: Adaptation to Pickup Location Changes")

In [None]:
# Side-by-side comparison of the per-episode rewards of all experiments,
# followed by a text summary.
print("Comparison of All Experiments")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
ax1, ax2, ax3, ax4 = axes.ravel()

# Each entry: (axis, panel title, [(results dict, legend label), ...]).
overlay_panels = [
    (ax1, 'Experiment 1: Policy Comparison', [
        (results_1a, '1a: PRANDOM'),
        (results_1b, '1b: PGREEDY'),
        (results_1c, '1c: PEXPLOIT'),
    ]),
    (ax2, 'Experiment 2: Q-learning vs SARSA', [
        (results_1c, 'Q-learning'),
        (results_2, 'SARSA'),
    ]),
    (ax3, 'Experiment 3: Learning Rate Comparison', [
        (results_1c, 'alpha=0.30'),
        (results_3a, 'alpha=0.15'),
        (results_3b, 'alpha=0.45'),
    ]),
]

for panel, panel_title, curves in overlay_panels:
    for curve_results, curve_label in curves:
        episode_totals = curve_results['total_rewards_per_episode']
        if episode_totals:  # skip runs in which no episode finished
            panel.plot(episode_totals, label=curve_label, alpha=0.7)
    panel.set_xlabel('Episode')
    panel.set_ylabel('Total Reward')
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True)

# Experiment 4 gets markers plus a vertical line at episode index 3.
adaptation_totals = results_4['total_rewards_per_episode']
if adaptation_totals:
    ax4.plot(adaptation_totals, label='Exp 4: Adaptation', alpha=0.7, marker='o')
    ax4.axvline(x=3, color='r', linestyle='--', label='Pickup locations changed')

ax4.set_xlabel('Episode')
ax4.set_ylabel('Total Reward')
ax4.set_title('Experiment 4: Adaptation to Environment Changes')
ax4.legend()
ax4.grid(True)

plt.tight_layout()
plt.show()

print("\nSummary Statistics")

experiments = {
    '1a (PRANDOM)': results_1a,
    '1b (PGREEDY)': results_1b,
    '1c (PEXPLOIT)': results_1c,
    '2 (SARSA)': results_2,
    '3a (alpha=0.15)': results_3a,
    '3b (alpha=0.45)': results_3b,
    '4 (Adaptation)': results_4
}

for name, results in experiments.items():
    episode_lengths = results['steps_per_episode']
    print(f"\n{name}:")
    print(f"  Terminal states reached: {results['terminal_count']}")
    if episode_lengths:
        mean_steps = np.mean(episode_lengths)
        mean_reward = np.mean(results['total_rewards_per_episode'])
        print(f"  Avg steps per episode: {mean_steps:.2f}")
        print(f"  Avg reward per episode: {mean_reward:.2f}")
    print(f"  Total cumulative reward: {sum(results['rewards_history']):.2f}")

In [None]:
# Re-run experiments 1c, 2, 3a and 4 with seed 123 and compare them against
# the seed-42 runs. Reuses the policy schedules and `pickup_change` defined
# by the earlier experiment cells.
print("Running experiments again with different random seed for comparison")

world_1c_r2 = PDWorld()
agent_f_1c_r2, agent_m_1c_r2 = (
    QLearningAgent(agent_key, alpha=0.3, gamma=0.5, algorithm='Q-learning')
    for agent_key in ('F', 'M')
)
results_1c_r2 = run_experiment(
    world_1c_r2, agent_f_1c_r2, agent_m_1c_r2,
    8000, policy_schedule_1c, "Experiment 1c Run 2",
    seed=123,
)

world_2_r2 = PDWorld()
agent_f_2_r2, agent_m_2_r2 = (
    QLearningAgent(agent_key, alpha=0.3, gamma=0.5, algorithm='SARSA')
    for agent_key in ('F', 'M')
)
results_2_r2 = run_experiment(
    world_2_r2, agent_f_2_r2, agent_m_2_r2,
    8000, policy_schedule_2, "Experiment 2 Run 2",
    seed=123,
)

world_3a_r2 = PDWorld()
agent_f_3a_r2, agent_m_3a_r2 = (
    QLearningAgent(agent_key, alpha=0.15, gamma=0.5, algorithm='Q-learning')
    for agent_key in ('F', 'M')
)
results_3a_r2 = run_experiment(
    world_3a_r2, agent_f_3a_r2, agent_m_3a_r2,
    8000, policy_schedule_3, "Experiment 3a Run 2",
    seed=123,
)

world_4_r2 = PDWorld()
agent_f_4_r2, agent_m_4_r2 = (
    QLearningAgent(agent_key, alpha=0.3, gamma=0.5, algorithm='Q-learning')
    for agent_key in ('F', 'M')
)
results_4_r2 = run_experiment(
    world_4_r2, agent_f_4_r2, agent_m_4_r2,
    8000, policy_schedule_4, "Experiment 4 Run 2",
    pickup_change_config=pickup_change,
    seed=123,
)

print("\n")
print("Compare: Run 1 vs Run 2")

comparisons = [
    ("1c (PEXPLOIT)", results_1c, results_1c_r2),
    ("2 (SARSA)", results_2, results_2_r2),
    ("3a (alpha=0.15)", results_3a, results_3a_r2),
    ("4 (Adaptation)", results_4, results_4_r2)
]

for name, run1, run2 in comparisons:
    print(f"\n{name}:")
    for run_label, run_results in (("Run 1", run1), ("Run 2", run2)):
        episode_totals = run_results['total_rewards_per_episode']
        # Report 0 when no episode finished, instead of averaging an empty list.
        mean_reward = np.mean(episode_totals) if episode_totals else 0
        print(f"  {run_label} - Terminals: {run_results['terminal_count']}, Avg reward: {mean_reward:.2f}")