In [7]:
import numpy as np
from rl_envs.grid_world_env import GridWorldEnv
from agents.policy_iteration_agent import TruncatedPolicyIterationAgent
# rl_envs.grid_world_env import GridWorldEnv

%load_ext autoreload 
# %aimport rl_envs.grid_world_env

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Taken from Policy Evaluation Exercise!

def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    """
    Evaluate a policy given an environment and a full description of the environment's dynamics.
    
    Args:
        policy: [S, A] shaped matrix representing the policy.
        env: OpenAI env. env.P represents the transition probabilities of the environment.
            env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
            env.nS is a number of states in the environment. 
            env.nA is a number of actions in the environment.
        theta: We stop evaluation once our value function change is less than theta for all states.
        discount_factor: Gamma discount factor.
    
    Returns:
        Vector of length env.nS representing the value function.
    """
    # Start with a random (all 0) value function
    V = np.zeros(env.nS)
    while True:
        delta = 0
        # For each state, perform a "full backup"
        for s in range(env.nS):
            v = 0
            # Look at the possible next actions
            for a, action_prob in enumerate(policy[s]):
                # For each action, look at the possible next states...
                for  prob, next_state, reward, done in env.P[s][a]:
                    # Calculate the expected value
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            # How much our value function changed (across any states)
            delta = max(delta, np.abs(v - V[s]))
            V[s] = v
        # Stop evaluating once our value function change is below a threshold
        if delta < theta:
            break
    return np.array(V)

In [9]:
def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=1.0):
    """
    Policy Improvement Algorithm. Iteratively evaluates and improves a policy
    until an optimal policy is found.
    
    Args:
        env: The OpenAI environment.
        policy_eval_fn: Policy Evaluation function that takes 3 arguments:
            policy, env, discount_factor.
        discount_factor: gamma discount factor.
        
    Returns:
        A tuple (policy, V). 
        policy is the optimal policy, a matrix of shape [S, A] where each state s
        contains a valid probability distribution over actions.
        V is the value function for the optimal policy.
        
    """

    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value for all action in a given state.
        
        Args:
            state: The state to consider (int)
            V: The value to use as an estimator, Vector of length env.nS
        
        Returns:
            A vector of length env.nA containing the expected value of each action.
        """
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A
    
    # Start with a random policy
    policy = np.ones([env.nS, env.nA]) / env.nA
    
    while True:
        # Evaluate the current policy
        V = policy_eval_fn(policy, env, discount_factor)
        
        # Will be set to false if we make any changes to the policy
        policy_stable = True
        
        # For each state...
        for s in range(env.nS):
            # The best action we would take under the current policy
            chosen_a = np.argmax(policy[s])
            
            # Find the best action by one-step lookahead
            # Ties are resolved arbitarily
            action_values = one_step_lookahead(s, V)
            best_a = np.argmax(action_values)
            
            # Greedily update the policy
            if chosen_a != best_a:
                policy_stable = False
            policy[s] = np.eye(env.nA)[best_a]
        
        # If the policy is stable we've found an optimal policy. Return it
        if policy_stable:
            return policy, V

In [15]:
def print_actions_policy(policy, env):
    index = 0
    for i in range(env.height):
        print("[", end=" ")
        for j in range(env.width):
            action = np.argmax(policy[index])
            index+=1
            print(env.action_mappings[action], end=" ")
        print("]")
def print_actions_index(agent, env):
    index = 0
    for i in range(env.height):
        print("[", end=" ")
        for j in range(env.width):
            action = agent.get_action(index)
            print(env.action_mappings[action], end=" ")
            index += 1
        print("]")
def print_actions(agent, env):
    for i in range(env.height):
        print("[", end=" ")
        for j in range(env.width):
            action = agent.get_action((i,j))
            print(env.action_mappings[action], end=" ")
        print("]")


In [10]:
env = GridWorldEnv(5, 5, forbidden_grids=[(1,1),(1,2), (2,2),(3,1),(3,3),(4,1)], target_grids=[(3,2)], forbidden_reward=-1, hit_wall_reward=-1)
# env = GridWorldEnv(2, 2, forbidden_grids=[(0,1)], target_grids=[(1,1)])
env.init_model_based_transitions()

In [19]:

policy, V = policy_improvement(env, discount_factor=0.9)
# print("Policy Probability Distribution:")
# print(policy)
# print("")

print("Value Function:")
print(V)
print("")

print_actions_policy(policy, env)

Value Function:
[5.83191665 5.57991665 6.19991665 6.47992498 5.83193248 6.47991665
 7.19991665 7.99991665 7.19992498 6.47993248 7.19991665 7.99991665
 9.99991665 7.99992498 7.19993248 7.99991665 9.99991665 9.99991665
 9.99992498 7.99993248 7.19992498 8.99992498 9.99992498 8.99993248
 8.09993924]

[  ↓   →   ↓   ↓   ↓  ]
[  ↓   ↓   ↓   ↓   ↓  ]
[  →   →   ↓   ↓   ↓  ]
[  →   →   ↺   ←   ←  ]
[  ↑   →   ↑   ←   ←  ]


In [16]:
agent = TruncatedPolicyIterationAgent(action_space_n=env.possible_actions, discounted_factor=0.9, threshold=0.001)
agent.initialize_policy()

agent.RUN(env)
print_actions(agent, env)
print(agent.v.values())

[  ↓   →   ↓   ↓   ↓  ]
[  ↓   ↓   ↓   ↓   ↓  ]
[  →   →   ↓   ↓   ↓  ]
[  →   →   ↺   ←   ←  ]
[  ↑   →   ↑   ←   ←  ]
dict_values([5.832425190730032, 5.580425190730033, 6.200425190730033, 6.480425190730033, 5.832425190730032, 6.480425190730033, 7.200425190730033, 8.000425190730033, 7.200425190730033, 6.480425190730033, 7.200425190730033, 8.000425190730033, 10.000425190730033, 8.000425190730033, 7.200425190730033, 8.000425190730033, 10.000425190730033, 10.000425190730033, 10.000425190730033, 8.000425190730033, 7.200425190730033, 9.000425190730033, 10.000425190730033, 9.000425190730033, 8.100425190730032])


In [20]:
print("\nUnmatch state value below:\n")
myV = list(agent.v.values())
for i in range(len(V)):
    if V[i] != myV[i]:
        print(myV[i], V[i], V[i]-myV[i])


Unmatch state value below:

5.832425190730032 5.831916647515824 -0.0005085432142086788
5.580425190730033 5.579916647515823 -0.0005085432142095669
6.200425190730033 6.199916647515823 -0.0005085432142095669
6.480425190730033 6.479924982764241 -0.0005002079657918657
5.832425190730032 5.831932484487817 -0.0004927062422153128
6.480425190730033 6.479916647515823 -0.0005085432142095669
7.200425190730033 7.199916647515823 -0.0005085432142095669
8.000425190730033 7.999916647515823 -0.0005085432142095669
7.200425190730033 7.199924982764241 -0.0005002079657918657
6.480425190730033 6.479932484487817 -0.000492706242216201
7.200425190730033 7.199916647515823 -0.0005085432142095669
8.000425190730033 7.999916647515823 -0.0005085432142095669
10.000425190730033 9.999916647515823 -0.0005085432142095669
8.000425190730033 7.999924982764242 -0.0005002079657909775
7.200425190730033 7.199932484487817 -0.0004927062422153128
8.000425190730033 7.999916647515823 -0.0005085432142095669
10.000425190730033 9.999916