In [10]:
import numpy as np
import matplotlib.pyplot as plt

In [11]:
grid_size = 5
initial_state = np.array([0,0])
loc_a = np.array([0,1])
loc_a_ = np.array([4,1])
rew_a = 5
loc_b = np.array([0,3])
loc_b_ = np.array([4,3])
rew_b = 5

In [12]:
def next_state(state, action):
    if state[0] == loc_a[0] and state[1] == loc_a[1]:
                    return loc_a_, rew_a
    elif state[0] == loc_b[0] and state[1] == loc_b[1]:
                    return loc_b_, rew_b
    else:
        if action == 'N':
            if state[0] == 0:
                return state, -1
            else:
                state = state + np.array([-1,0])
                return state, 0
        if action == 'S':
            if state[0] == grid_size - 1:
                return state, -1
            else:
                state = state + np.array([1,0])
                return state, 0
        if action == 'W':
            if state[1] == 0:
                return state, -1
            else:
                state = state + np.array([0,-1])
                return state, 0
        if action == 'E':
            if state[1] == grid_size -1 :
                return state, -1
            else:
                state = state + np.array([0,1])
                return state, 0
            

In [13]:
gamma = 0.9
max_steps = 50   # Max steps per episode


In [14]:
def epsilon_greedy_policy(state, Q, epsilon):
    if np.random.uniform(0, 1) < epsilon:
        return np.random.choice(['N', 'S', 'W', 'E']) 
    else:
        best_actions = np.where(Q[state,:] == np.max(Q[state,:]))[0]
        return np.random.choice(best_actions)    # Exploit

In [None]:
def mc_off_policy(episodes, len_episode, epsilon_soft_start, epsilon_decay):
    grid_size = 5
    n_actions = 4
    
    Q = np.zeros((grid_size, grid_size, n_actions))
    C = np.zeros((grid_size, grid_size, n_actions))

    target_policy = 1/n_actions * np.ones((grid_size, grid_size, n_actions))

    total_rewards = np.zeros(episodes)

    #Monte-Carlo Prediction
    for episode in range(episodes):

        state = np.array([2,2])
        rewards = np.zeros(len_episode + 1)
        states = np.zeros((len_episode + 1, 2))
        states[0, :] = state #Save the initial state
        actions = []

        epsilon_soft = max(0.1, epsilon_soft_start * (epsilon_decay ** episode))
        print(epsilon_soft)

        for iter in range(len_episode):
            current_action = epsilon_greedy_policy(state, Q, epsilon_soft)
            new_state, reward = next_state(state, current_action)

            state = new_state
            states[iter+1, :] = state

            if current_action == 'N': 
                current_action = 0
            elif current_action == 'S': 
                current_action = 1
            elif current_action == 'W': 
                current_action = 2
            elif current_action == 'E': 
                current_action = 3

            actions.append(current_action)
            rewards[iter+1] = reward

        G = 0
        W = 1
        total_rewards[episode] = np.sum(rewards)

        for t in range(len_episode , 0, -1):
            
            current_state_x = int(states[t-1, 0])
            current_state_y = int(states[t-1, 1])

            G = G * gamma + rewards[t] 
        
            C[current_state_x, current_state_y, actions[t-1]] += W
            Q[current_state_x, current_state_y, actions[t-1]] += (W / C[current_state_x, current_state_y, actions[t-1]]) * (G - Q[current_state_x, current_state_y, actions[t-1]])

            best_actions = np.where(Q[current_state_x, current_state_y, :] == np.max(Q[current_state_x, current_state_y, :]))[0]
            
            target_policy[current_state_x, current_state_y, :] = 0
            target_policy[current_state_x, current_state_y, best_actions] = 1/len(best_actions)

            W *= 1/(1/(n_actions))
        
    return target_policy, Q


In [16]:
def visualize_action_value_with_numbers(Q):
    Q = np.round(Q, 2)
    n_rows, n_cols, n_dirs = Q.shape

    fig, ax = plt.subplots(figsize=(n_cols, n_rows))
    ax.set_xlim(-0.5, n_cols - 0.5)
    ax.set_ylim(-0.5, n_rows - 0.5)
    ax.set_xticks(np.arange(-0.5, n_cols, 1))
    ax.set_yticks(np.arange(-0.5, n_rows, 1))
    ax.grid(True)

    for i in range(n_rows):
        for j in range(n_cols):
            ax.text(j, i - 0.2, f"{Q[i, j, 0]}", ha='center', va='center', fontsize=8, color="red")  # North
            ax.text(j, i + 0.2, f"{Q[i, j, 1]}", ha='center', va='center', fontsize=8, color="green")  # South
            ax.text(j - 0.2, i, f"{Q[i, j, 2]}", ha='center', va='center', fontsize=8, color="blue")  # West
            ax.text(j + 0.2, i, f"{Q[i, j, 3]}", ha='center', va='center', fontsize=8, color="orange")  # East

    # Reverse the y-axis to align with matrix indexing
    ax.invert_yaxis()
    plt.title("Action Value Visualization")
    plt.show()

In [17]:
def visualize_policy(policy):
    n_rows, n_cols, n_dirs = policy.shape

    fig, ax = plt.subplots(figsize=(n_cols, n_rows))
    ax.set_xlim(-0.5, n_cols - 0.5)
    ax.set_ylim(-0.5, n_rows - 0.5)
    ax.set_xticks(np.arange(-0.5, n_cols, 1))
    ax.set_yticks(np.arange(-0.5, n_rows, 1))
    ax.grid(True)
    
    # Arrow parameters
    arrow_params = {
        "head_width": 0.2,
        "head_length": 0.2,
        "length_includes_head": True,
        "color": "blue",
    }
    
    for i in range(n_rows):
        for j in range(n_cols):
            # Check each direction
            if policy[i, j, 0] > 0:  # North
                ax.arrow(j, i, 0, -0.4, **arrow_params)
            if policy[i, j, 1] > 0:  # South
                ax.arrow(j, i, 0, 0.4, **arrow_params)
            if policy[i, j, 2] > 0:  # West
                ax.arrow(j, i, -0.4, 0, **arrow_params)
            if policy[i, j, 3] > 0:  # East
                ax.arrow(j, i, 0.4, 0, **arrow_params)
    
    # Reverse the y-axis to align with matrix indexing
    ax.invert_yaxis()
    plt.title("Policy Visualization")
    plt.show()


In [18]:
target_policy, Q = mc_off_policy(5000, 50, 1, 0.99)
visualize_policy(target_policy)
visualize_action_value_with_numbers(Q)

TypeError: cannot unpack non-iterable NoneType object