In [175]:
import numpy as np
import pandas as pd
import random

### Import files

In [164]:
def load_matrices(file_f, n_states=81):
    """Read a sparse transition table from a text file into a dense matrix.

    Every non-blank line is split on whitespace and read as three fields:
    ``s  s'  value``, where ``s`` and ``s'`` are 1-based state numbers.
    The value is stored at ``[s' - 1, s - 1]``, so the returned matrix is
    indexed ``[s', s]``. Entries not listed in the file stay 0; if a pair
    appears more than once, the last occurrence wins.

    Parameters
    ----------
    file_f : str
        Path of the text file to read.
    n_states : int, optional
        Number of states; the result has shape ``(n_states, n_states)``.
        Defaults to 81, the size used throughout this notebook.

    Returns
    -------
    numpy.ndarray
        Dense float matrix of shape ``(n_states, n_states)``.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three fields or names a state
        larger than ``n_states``.
    ValueError
        If a field cannot be parsed as a number.
    """
    matrix = np.zeros([n_states, n_states])
    with open(file_f, 'r') as f:
        # Iterate the file lazily instead of materializing it with readlines().
        for raw_line in f:
            fields = raw_line.split()
            if not fields:
                # Blank or whitespace-only lines (e.g. a trailing empty line)
                # carry no entry; skip them instead of failing on fields[1].
                continue
            s = int(fields[0])
            s_next = int(fields[1])
            matrix[s_next - 1, s - 1] = float(fields[2])  # [s', s]
    return matrix

In [165]:
# One transition matrix per action file, loaded in the order a1..a4.
# load_matrices returns each matrix indexed [s', s].
a1, a2, a3, a4 = map(load_matrices, (
    'hw9_prob_a1.txt',
    'hw9_prob_a2.txt',
    'hw9_prob_a3.txt',
    'hw9_prob_a4.txt',
))

# Reward values read from file (presumably one per state -- confirm against the data file).
rewards = np.loadtxt('hw9_rewards.txt')

# Discount factor (the gamma of the value equation used by policy_eval).
gamma = 0.9925

In [270]:
# Action labels. The position of a label in this list is the column index
# used when the greedy step picks the best action with argmax.
actions = ['W', 'N', 'E', 'S']  # west, north, east, south (same order as the list)

### 9.3 (a) Compute optimal policy $\pi^{*}(s)$ and optimal value function $V^*(s)$ using policy iteration
- Policy improvement: $\pi'(s) = \arg\max_{a}\left[\sum_{s'}P(s'|s,a)V^{\pi}(s')\right]$
- Policy evaluation: $V^{\pi}(s)=R(s)+\gamma\sum_{s'}P(s'|s,\pi(s))V^{\pi}(s')$

In [264]:
# Lookup from action label to its transition matrix.
action_dict = {
    'W': a1,  # west
    'N': a2,  # north
    'E': a3,  # east
    'S': a4,  # south
}

In [265]:
def policy_eval(policy):
    """Evaluate a fixed policy exactly by solving a linear system.

    Solves ``(I - gamma * P_pi) V = rewards`` for ``V``, where row ``s`` of
    ``P_pi`` holds the transition probabilities out of state ``s`` under the
    action ``policy[s]``.

    Reads the notebook-level ``action_dict`` (label -> matrix indexed
    ``[s', s]``), ``gamma`` and ``rewards``.

    Parameters
    ----------
    policy : sequence
        One action label (a key of ``action_dict``) per state. Its length
        sets the number of states.

    Returns
    -------
    numpy.ndarray
        Solution of the linear system, i.e. the value of every state under
        ``policy`` (same shape as ``rewards``).

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``I - gamma * P_pi`` is singular.
    """
    n_states = len(policy)

    # Row s of P_pi is P(. | s, policy[s]). The stored matrices are indexed
    # [s', s], so that row is column s of the chosen action's matrix.
    p_pi = np.zeros([n_states, n_states])
    for s in range(n_states):
        p_pi[s, :] = action_dict[policy[s]][:, s]

    # Solve the system directly rather than forming the explicit inverse:
    # same solution, less work and better numerical behaviour.
    return np.linalg.solve(np.identity(n_states) - gamma * p_pi, rewards)

In [266]:
def greedy(old_policy, v_list):
    """Return the greedy policy with respect to a state-value estimate.

    For every state ``s`` and action ``a`` this computes
    ``sum_{s'} P(s'|s,a) * v_list[s']`` and picks the action with the
    largest score. Ties go to the action that comes first in ``actions``.

    Reads the notebook-level ``actions`` (ordered labels) and ``action_dict``
    (label -> matrix indexed ``[s', s]``).

    Parameters
    ----------
    old_policy : any
        Unused; kept so existing callers keep working.
    v_list : sequence of float
        Value estimate per state. Its length sets the number of states and
        must match the dimension of the transition matrices.

    Returns
    -------
    list
        One action label per state.
    """
    values = np.asarray(v_list, dtype=float)

    # pi_vals[s, a] = expected next-state value of taking action a in state s.
    pi_vals = np.full([len(values), len(actions)], -np.inf)
    for col, label in enumerate(actions):
        # Matrices are stored [s', s]; transposing gives rows indexed by s,
        # so one matrix-vector product scores every state for this action.
        pi_vals[:, col] = np.dot(action_dict[label].T, values)

    # argmax returns the first maximum, so ties resolve by order in `actions`.
    best_columns = np.argmax(pi_vals, axis=1)
    return [actions[k] for k in best_columns]

In [267]:
def generate_policy(n_states=81):
    """Build a random initial policy.

    Draws one label from the notebook-level ``actions`` list for each state
    using ``random.choice``; seed the ``random`` module beforehand for a
    reproducible start.

    Parameters
    ----------
    n_states : int, optional
        Number of states to assign an action to. Defaults to 81, the size
        used throughout this notebook.

    Returns
    -------
    list
        ``n_states`` action labels.
    """
    return [random.choice(actions) for _ in range(n_states)]

In [268]:
def policy_iteration():
    """Run policy iteration from a random start and return the final policy.

    Alternates policy improvement (``greedy``) and exact policy evaluation
    (``policy_eval``), starting from ``generate_policy()``. Iteration stops
    when the improved policy equals the current one, or when the values of
    the improved policy match the current values up to a tight tolerance.
    The tolerance replaces an exact floating-point equality test, which can
    fail to trigger if round-off makes two equally good policies alternate.

    Returns
    -------
    numpy.ndarray
        Square array of action labels. The flat policy is reshaped row-major
        and then transposed, so consecutive state indices run down each
        column (state ``k`` lands at ``[k % side, k // side]``).
        Only the policy is returned; the value function is not.

    Raises
    ------
    ValueError
        If the number of states is not a perfect square.
    """
    # Start from a random policy.
    policy = generate_policy()
    state_value_func = policy_eval(policy)

    # Repeat until convergence.
    while True:
        new_policy = greedy(policy, state_value_func)

        # A policy that is greedy with respect to its own values is a fixed
        # point: nothing can change from here on, so skip the extra solve.
        if list(new_policy) == list(policy):
            break

        new_state_value_func = policy_eval(new_policy)

        # Equal values (up to round-off) mean no real improvement was made.
        if np.allclose(new_state_value_func, state_value_func,
                       rtol=1e-10, atol=1e-10):
            break

        policy = new_policy
        state_value_func = new_state_value_func

    # Lay the flat policy out as a square grid (9 x 9 for 81 states).
    side = int(round(len(new_policy) ** 0.5))
    return np.array(new_policy).reshape(side, side).T

In [269]:
# Call form of print: prints the same on Python 2, and is valid on Python 3,
# where the bare `print x` statement is a SyntaxError.
print(policy_iteration())

[['W' 'W' 'W' 'W' 'W' 'W' 'W' 'W' 'W']
 ['W' 'E' 'E' 'S' 'W' 'W' 'S' 'W' 'W']
 ['E' 'N' 'W' 'S' 'W' 'W' 'S' 'W' 'W']
 ['W' 'W' 'S' 'W' 'W' 'W' 'S' 'W' 'W']
 ['W' 'W' 'S' 'W' 'W' 'W' 'S' 'W' 'W']
 ['W' 'S' 'W' 'W' 'W' 'W' 'S' 'W' 'W']
 ['W' 'S' 'W' 'E' 'E' 'E' 'E' 'E' 'W']
 ['W' 'E' 'E' 'N' 'W' 'E' 'E' 'N' 'W']
 ['W' 'W' 'W' 'W' 'W' 'W' 'W' 'W' 'W']]
