# MonteCarlo Control

Control is the process of finding the best policy for a given environment. We follow generalised policy iteration and alternate between policy evaluation and policy improvement.


In [1]:
import numpy as np
from gridworld import GridWorld
np.set_printoptions(precision=3,suppress=True)

Return computation function now takes as input a list containing a 3-tuple (state,action,reward) and the discount factor gamma, the output is a value representing the return from that state.

In [2]:
def get_return(state_list, gamma):
    counter = 0
    return_value = 0
    for visit in state_list:
        reward = visit[2]
        return_value += reward * np.power(gamma, counter)
        counter += 1
    return return_value

Get a policy and improve it by taking a greedy action.

In [13]:
def improve_policy(episode_list, policy_matrix, state_action_matrix):
    
    for visit in episode_list:
        observation = visit[0]
        col = observation[1] + (observation[0]*4)
        if(policy_matrix[observation[0], observation[1]] != -1):      
            policy_matrix[observation[0], observation[1]] = np.argmax(state_action_matrix[:,col])
    return policy_matrix


1.Lets create a grid world, the states marked 1 are terminal state and those marked -1 contain obstacles. <br>
2.The agent receives a reward of -0.04 for every move from non-terminal states <br>
3.The  actions are UP(0), RIGHT(1), DOWN(2) and LEFT(3)

In [4]:

env = GridWorld(3, 4)

#Define the state matrix
state_matrix = np.zeros((3,4))
state_matrix[0, 3] = 1
state_matrix[1, 3] = 1
state_matrix[1, 1] = -1
print("State Matrix:")
print(state_matrix)

#Define the reward matrix
reward_matrix = np.full((3,4), -0.04)
reward_matrix[0, 3] = 1
reward_matrix[1, 3] = -1
print("Reward Matrix:")
print(reward_matrix)

#Define the transition matrix
transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                              [0.1, 0.8, 0.1, 0.0],
                              [0.0, 0.1, 0.8, 0.1],
                              [0.1, 0.0, 0.1, 0.8]])

#Random policy
policy_matrix = np.random.randint(low=0, high=4, size=(3, 4)).astype(np.float32)
policy_matrix[1,1] = np.NaN #NaN for the obstacle at (1,1)
policy_matrix[0,3] = policy_matrix[1,3] = -1 #No action for the terminal states

#Set the matrices in the world
env.setStateMatrix(state_matrix)
env.setRewardMatrix(reward_matrix)
env.setTransitionMatrix(transition_matrix)

State Matrix:
[[ 0.  0.  0.  1.]
 [ 0. -1.  0.  1.]
 [ 0.  0.  0.  0.]]
Reward Matrix:
[[-0.04 -0.04 -0.04  1.  ]
 [-0.04 -0.04 -0.04 -1.  ]
 [-0.04 -0.04 -0.04 -0.04]]


In [14]:

state_action_matrix = np.random.random_sample((4,12)) # Q
#init with 1.0e-10 to avoid division by zero
running_mean_matrix = np.full((4,12), 1.0e-10) 
gamma = 0.999
tot_epoch = 100000
print_epoch = 20

for epoch in range(tot_epoch):
    #Starting a new episode
    episode_list = list()
    #Reset and return the first observation and reward
    observation = env.reset(exploring_starts=True)
    #action = np.random.choice(4, 1)
    #action = policy_matrix[observation[0], observation[1]]
    #episode_list.append((observation, action, reward))
    is_starting = True
    for _ in range(1000):
        #Take the action from the action matrix
        action = policy_matrix[observation[0], observation[1]]
        #If the episode just started then it is
            #necessary to choose a random action (exploring starts)
        if(is_starting): 
            action = np.random.randint(0, 4)
            is_starting = False      
        #Move one step in the environment and get obs and reward
        new_observation, reward, done = env.step(action)
        #Append the visit in the episode list
        episode_list.append((observation, action, reward))
        observation = new_observation
        if done: break
    #The episode is finished, now estimating the utilities
    counter = 0
    #Checkup to identify if it is the first visit to a state
    checkup_matrix = np.zeros((4,12))
    #This cycle is the implementation of First-Visit MC.
    #For each state stored in the episode list check if it
    #is the rist visit and then estimate the return.
    for visit in episode_list:
        observation = visit[0]
        action = visit[1]
        col = int(observation[1] + (observation[0]*4))
        row = int(action)
        if(checkup_matrix[row, col] == 0):
            return_value = get_return(episode_list[counter:], gamma)
            running_mean_matrix[row, col] += 1
            state_action_matrix[row, col] += return_value
            checkup_matrix[row, col] = 1
        counter += 1
    #Policy Update
    policy_matrix = improve_policy(episode_list, 
                                  policy_matrix, 
                                  state_action_matrix/running_mean_matrix)
    #Printing
    if(epoch % print_epoch == 0):
        print("")
        #print("State-Action matrix after " + str(epoch+1) + " iterations:") 
        #print(state_action_matrix / running_mean_matrix)
        print("Policy matrix after " + str(epoch+1) + " iterations:") 
        print(policy_matrix)
        
#Time to check the value  matrix obtained
print("value matrix after " + str(tot_epoch) + " iterations:")
print(state_action_matrix / running_mean_matrix)

print("Policy matrix after " + str(epoch+1) + " iterations:") 
print(policy_matrix)



Policy matrix after 1 iterations:
[[ 0.  2.  2. -1.]
 [ 2. nan  0. -1.]
 [ 2.  0.  0.  3.]]

Policy matrix after 21 iterations:
[[ 1.  0.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  1.]]

Policy matrix after 41 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  0.]]

Policy matrix after 61 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  0.]]

Policy matrix after 81 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  0.]]

Policy matrix after 101 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  0.]]

Policy matrix after 121 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 141 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 181 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 201 iterations:
[[ 1.  1.  1. -1.]



Policy matrix after 2161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 2181 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 2201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 2221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 2241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 2261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 2281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 2301 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 2321 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 2341 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  2.  0.  3.]]

Policy matrix after 2361 iterations:
[[


Policy matrix after 4221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 4241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 4261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 4281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 4301 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 4321 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 4341 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 4361 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 4381 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 4401 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 4421 iterations:
[[


Policy matrix after 6261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 6281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 6301 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 6321 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 6341 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 6361 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 6381 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 6401 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 6421 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 6441 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 6461 iterations:
[[


Policy matrix after 8161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 8181 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 8201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 8221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 8241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 8261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 8281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 8301 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 8321 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 8341 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 8361 iterations:
[[


Policy matrix after 9981 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 10001 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 10021 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 10041 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 10061 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 10081 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 10101 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 10121 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 10141 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 10161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 10181 iter


Policy matrix after 11801 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 11821 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 11841 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 11861 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 11881 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 11901 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 11921 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 11941 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 11961 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 11981 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 12001 ite


Policy matrix after 13681 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 13701 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 13721 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 13741 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 13761 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 13781 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 13801 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 13821 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 13841 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 13861 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 13881 ite


Policy matrix after 15581 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 15601 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 15621 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 15641 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 15661 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 15681 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 15701 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 15721 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 15741 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 15761 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 15781 ite


Policy matrix after 17501 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 17521 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 17541 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 17561 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 17581 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 17601 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 17621 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 17641 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 17661 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 17681 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 17701 ite


Policy matrix after 19461 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 19481 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 19501 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 19521 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 19541 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 19561 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 19581 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 19601 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 19621 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 19641 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 19661 ite


Policy matrix after 21361 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 21381 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 21401 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 21421 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 21441 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 21461 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 21481 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 21501 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 21521 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 21541 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 21561 ite


Policy matrix after 23101 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 23121 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 23141 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 23161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 23181 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 23201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 23221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 23241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 23261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 23281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 23301 ite


Policy matrix after 25101 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 25121 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 25141 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 25161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 25181 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 25201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 25221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 25241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 25261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 25281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 25301 ite


Policy matrix after 27141 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 27161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 27181 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 27201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 27221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 27241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 27261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 27281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 27301 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 27321 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  0.  3.]]

Policy matrix after 27341 ite


Policy matrix after 28901 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 28921 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 28941 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 28961 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 28981 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 29001 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 29021 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 29041 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 29061 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 29081 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 29101 ite


Policy matrix after 31041 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 31061 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 31081 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 31101 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 31121 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 31141 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 31161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 31181 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 31201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 31221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 31241 ite


Policy matrix after 33221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 33241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 33261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 33281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 33301 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 33321 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 33341 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 33361 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 33381 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 33401 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 33421 ite


Policy matrix after 35461 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 35481 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 35501 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 35521 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 35541 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 35561 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 35581 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 35601 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 35621 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 35641 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 35661 ite


Policy matrix after 37701 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 37721 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 37741 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 37761 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 37781 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 37801 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 37821 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 37841 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 37861 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 37881 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 37901 ite


Policy matrix after 39681 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 39701 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 39721 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 39741 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 39761 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 39781 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 39801 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 39821 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 39841 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 39861 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 39881 ite


Policy matrix after 41741 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 41761 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 41781 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 41801 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 41821 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 41841 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 41861 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 41881 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 41901 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 41921 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 41941 ite


Policy matrix after 43801 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 43821 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 43841 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 43861 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 43881 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 43901 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 43921 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 43941 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 43961 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 43981 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 44001 ite


Policy matrix after 45981 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 46001 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 46021 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 46041 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 46061 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 46081 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 46101 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 46121 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 46141 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 46161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 46181 ite


Policy matrix after 48101 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 48121 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 48141 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 48161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 48181 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 48201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 48221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 48241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 48261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 48281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 48301 ite


Policy matrix after 50201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 50221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 50241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 50261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 50281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 50301 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 50321 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 50341 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 50361 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 50381 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 50401 ite


Policy matrix after 52261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 52281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 52301 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 52321 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 52341 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 52361 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 52381 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 52401 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 52421 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 52441 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 52461 ite


Policy matrix after 53961 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 53981 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 54001 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 54021 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 54041 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 54061 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 54081 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 54101 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 54121 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 54141 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 54161 ite


Policy matrix after 56161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 56181 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 56201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 56221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 56241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 56261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 56281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 56301 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 56321 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 56341 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 56361 ite


Policy matrix after 58341 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 58361 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 58381 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 58401 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 58421 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 58441 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 58461 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 58481 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 58501 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 58521 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 58541 ite


Policy matrix after 60501 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 60521 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 60541 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 60561 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 60581 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 60601 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 60621 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 60641 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 60661 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 60681 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 60701 ite


Policy matrix after 62241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 62261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 62281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 62301 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 62321 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 62341 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 62361 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 62381 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 62401 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 62421 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 62441 ite


Policy matrix after 63941 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 63961 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 63981 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 64001 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 64021 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 64041 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 64061 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 64081 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 64101 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 64121 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 64141 ite


Policy matrix after 65701 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 65721 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 65741 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 65761 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 65781 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 65801 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 65821 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 65841 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 65861 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 65881 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 65901 ite


Policy matrix after 67441 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 67461 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 67481 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 67501 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 67521 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 67541 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 67561 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 67581 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 67601 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 67621 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 67641 ite


Policy matrix after 69601 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 69621 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 69641 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 69661 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 69681 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 69701 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 69721 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 69741 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 69761 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 69781 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 69801 ite


Policy matrix after 71761 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 71781 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 71801 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 71821 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 71841 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 71861 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 71881 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 71901 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 71921 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 71941 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 71961 ite


Policy matrix after 73501 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 73521 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 73541 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 73561 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 73581 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 73601 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 73621 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 73641 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 73661 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 73681 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 73701 ite


Policy matrix after 75521 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 75541 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 75561 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 75581 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 75601 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 75621 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 75641 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 75661 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 75681 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 75701 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 75721 ite


Policy matrix after 77641 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 77661 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 77681 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 77701 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 77721 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 77741 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 77761 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 77781 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 77801 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 77821 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 77841 ite


Policy matrix after 79381 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 79401 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 79421 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 79441 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 79461 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 79481 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 79501 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 79521 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 79541 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 79561 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 79581 ite


Policy matrix after 81141 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 81161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 81181 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 81201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 81221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 81241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 81261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 81281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 81301 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 81321 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 81341 ite


Policy matrix after 82861 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 82881 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 82901 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 82921 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 82941 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 82961 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 82981 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 83001 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 83021 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 83041 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 83061 ite


Policy matrix after 85061 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 85081 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 85101 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 85121 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 85141 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 85161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 85181 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 85201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 85221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 85241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 85261 ite


Policy matrix after 86881 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 86901 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 86921 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 86941 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 86961 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 86981 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 87001 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 87021 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 87041 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 87061 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 87081 ite


Policy matrix after 88621 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 88641 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 88661 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 88681 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 88701 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 88721 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 88741 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 88761 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 88781 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 88801 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 88821 ite


Policy matrix after 90741 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 90761 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 90781 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 90801 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 90821 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 90841 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 90861 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 90881 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 90901 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 90921 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 90941 ite


Policy matrix after 92761 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 92781 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 92801 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 92821 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 92841 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 92861 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 92881 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 92901 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 92921 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 92941 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 92961 ite


Policy matrix after 95021 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 95041 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 95061 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 95081 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 95101 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 95121 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 95141 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 95161 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 95181 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 95201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 95221 ite


Policy matrix after 97201 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 97221 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 97241 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 97261 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 97281 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 97301 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 97321 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 97341 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 97361 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 97381 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 97401 ite


Policy matrix after 99441 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 99461 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 99481 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 99501 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 99521 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 99541 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 99561 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 99581 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 99601 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 99621 iterations:
[[ 1.  1.  1. -1.]
 [ 0. nan  0. -1.]
 [ 0.  3.  3.  3.]]

Policy matrix after 99641 ite