In [7]:
# %%
# import numpy

# %% [markdown]
#     Environment setup

# %%


class Environment:
    """Deterministic square grid world stored as a transition table.

    States are numbered row by row, starting at 0 in the top-left corner and
    ending at ``grid_size ** 2 - 1`` in the bottom-right corner.  The last
    state is absorbing: every action taken there keeps the agent in place,
    yields reward 0 and is flagged ``'Terminated': True``.

    For every other state:
      * an action that would leave the grid keeps the agent in place and
        yields reward -5;
      * any other action moves one cell in the chosen direction and yields
        reward -1.

    Attributes:
        nS: number of states (``grid_size ** 2``).
        nA: number of actions (always 4).
        states: list of state indices ``0 .. nS - 1``.
        actions: mapping from action index to a human-readable name.
        env: ``env[state][action]`` is a dict with the keys ``'s_Prob'``,
            ``'n_State'``, ``'s_Reward'`` and ``'Terminated'`` (inserted in
            that order).
    """

    def __init__(self, grid_size=4):
        """Build the transition table.

        Args:
            grid_size: side length of the square grid.  Defaults to 4, which
                gives the 16-state world.

        Raises:
            ValueError: if ``grid_size`` is smaller than 1.
        """
        if grid_size < 1:
            raise ValueError("grid_size must be a positive integer")

        self.nS = grid_size * grid_size
        self.nA = 4
        self.states = list(range(self.nS))
        self.actions = {0: 'Left', 1: 'Up', 2: 'Right', 3: 'Down'}

        # Change in state index caused by each action when the move is legal.
        offsets = {0: -1, 1: -grid_size, 2: 1, 3: grid_size}
        goal = self.nS - 1
        last = grid_size - 1

        self.env = {}
        for state in self.states:
            row, col = divmod(state, grid_size)
            # True for the actions that would push the agent off the grid.
            hits_wall = {0: col == 0, 1: row == 0, 2: col == last, 3: row == last}

            transitions = {}
            for action in self.actions:
                if state == goal:
                    # Absorbing goal state: stay put, no cost, episode over.
                    next_state, reward, done = state, 0, True
                elif hits_wall[action]:
                    # Bumping into the border: stay put and pay the penalty.
                    next_state, reward, done = state, -5, False
                else:
                    next_state, reward, done = state + offsets[action], -1, False
                # Key order is kept as s_Prob, n_State, s_Reward, Terminated
                # because consumers may unpack ``.values()`` positionally.
                transitions[action] = {
                    's_Prob': 1, 'n_State': next_state,
                    's_Reward': reward, 'Terminated': done}
            self.env[state] = transitions


# %%
def reward_funtion(prob_matrix, envirnoment):
    """Weight each (state, action) reward by the matching probability.

    Args:
        prob_matrix: array-like of shape ``(nS, nA)`` holding one probability
            per (state, action) pair.  Nested lists are accepted as well as
            NumPy arrays.
        envirnoment: object exposing ``nS``, ``nA`` and the transition table
            ``env[state][action]['s_Reward']``.  (The misspelt parameter name
            is kept so existing keyword callers continue to work.)

    Returns:
        numpy.ndarray of shape ``(nS, nA)``: the elementwise product of the
        reward table and ``prob_matrix``.
    """
    # Local import: the first notebook cell has its numpy import commented
    # out, so the function brings the dependency into scope itself.
    import numpy as np

    # Size the table from the environment instead of hard-coding 16 x 4.
    state_reward = np.array([
        [envirnoment.env[state][action]['s_Reward']
         for action in range(envirnoment.nA)]
        for state in range(envirnoment.nS)
    ])
    # Converting both operands avoids the TypeError raised by ``list * list``
    # when prob_matrix is passed as a plain nested list.
    return state_reward * np.asarray(prob_matrix)

# %%


def value_funtion(environment, values, reward, prob_matrix, gamma):
    """Return ``reward + gamma * (prob_matrix * reward)``.

    Args:
        environment: accepted for interface compatibility; not read.
        values: accepted for interface compatibility; the incoming value is
            not read.
        reward: reward term (scalar or array).
        prob_matrix: probability weights multiplied into ``reward``.
        gamma: discount factor applied to the weighted reward.

    Returns:
        The combined value, with the same type/shape the arithmetic yields.

    NOTE(review): this looks like it was meant to be a Bellman backup, but
    the previous ``values`` never enter the formula -- confirm the intent.
    """
    weighted_reward = prob_matrix * reward
    discounted_term = gamma * weighted_reward
    return reward + discounted_term


# %% [markdown]
#     main function

# # %%
# def main():
#     environment = Environment()
#     print("Environment --------")
#     print(f'\nnumber of states : {environment.nS}')
#     print(f'number of action at each state : {environment.nA}')
#     print(f'states {environment.states}')
#     print(f'action {environment.actions.values()}')
#     print('\n\n')
#     # print(f'env {environment.env}')
#     prob_matrix_states = [[environment.env[state][action]['s_Prob']
#                            for action in range(4)] for state in range(16)]
#     # print(prob_matrix)

#     # start from a uniform random policy
#     action_prob = numpy.ones((environment.nS, environment.nA)) / environment.nA
#     print(f"policy : \n {action_prob}")
#     # prob matrix to reach state {s'} from {s} given action 'a'
#     prob_matrix = prob_matrix_states * action_prob
#     # print(prob_matrix)
#     # print(prob_matrix.shape)
#     reward = reward_funtion(prob_matrix=prob_matrix, envirnoment=environment)
#     print(f"reward : \n {reward}")

#     values = numpy.zeros((environment.nS, environment.nA))

#     values = value_funtion(environment=environment, values=values,
#                            reward=reward, prob_matrix=prob_matrix, gamma=1)

#     print(f"values : \n {values}")


# # %%
# if __name__ == '__main__':
#     main()

# # %%
# environment = Environment()
# print(environment.env)

# # %%


In [10]:
# %%
import numpy as np

# %%


def one_step_lookahead(environment, state, V, discount_factor):
    """Compute the value of every action available in ``state``.

    For each action the single transition stored in
    ``environment.env[state][action]`` is evaluated as
    ``s_Prob * (s_Reward + discount_factor * V[n_State])``.

    Args:
        environment: object exposing ``nA`` and the transition table ``env``.
        state: index of the state being evaluated.
        V: indexable collection of current state-value estimates.
        discount_factor: weight applied to the successor state's value.

    Returns:
        numpy.ndarray of length ``environment.nA`` with one value per action.
    """
    action_values = np.zeros(environment.nA)

    for action in range(environment.nA):
        transition = environment.env[state][action]
        # Read the fields by key rather than unpacking ``.values()``: the
        # positional form silently produces wrong numbers whenever the dict
        # was built with its keys in a different order.
        probability = transition['s_Prob']
        next_state = int(transition['n_State'])
        reward = transition['s_Reward']

        action_values[action] += probability * \
            (reward + discount_factor * V[next_state])

    return action_values


# %%
def value_iteration(environment, discount_factor=1.0, theta=1e-9, max_iterations=1e9):
    """Run value iteration and derive a greedy policy from the result.

    Each sweep visits the states in index order and overwrites ``V[state]``
    with the best one-step lookahead value, so later states in the same sweep
    already see the updated estimates.  Sweeping stops once the largest change
    in a sweep falls below ``theta`` or after ``max_iterations`` sweeps.

    Args:
        environment: object exposing ``nS``, ``nA`` and the transition table
            consumed by ``one_step_lookahead``.
        discount_factor: weight applied to successor state values.
        theta: convergence threshold on the largest per-sweep value change.
        max_iterations: upper bound on the number of sweeps (cast to ``int``).

    Returns:
        tuple ``(policy, V)`` where ``policy`` is a ``(nS, nA)`` NumPy array
        with a single 1.0 per row marking the greedy action, and ``V`` is a
        list of ``nS`` state values.
    """
    # Size the value table from the environment; a hard-coded 16 breaks (or
    # leaves stale entries) for any environment with a different state count.
    V = [0 for _ in range(environment.nS)]

    for i in range(int(max_iterations)):
        # Largest value change seen during this sweep.
        delta = 0

        for state in range(environment.nS):
            action_values = one_step_lookahead(
                environment, state, V, discount_factor)
            best_action_value = np.max(action_values)

            delta = max(delta, abs(V[state] - best_action_value))
            V[state] = best_action_value

        if delta < theta:
            print('Value iteration converged at iteration #%d' % i)
            break

    # Extract the greedy policy with respect to the final value estimates.
    policy = np.zeros((environment.nS, environment.nA))
    for state in range(environment.nS):
        action_values = one_step_lookahead(
            environment, state, V, discount_factor)
        # np.argmax returns the first maximum, so ties go to the lowest index.
        best_action = np.argmax(action_values)
        policy[state][best_action] = 1.0

    return policy, V


# %%

# environment = md.Environment()
# dic = environment.env[0][0]
# print(f"dict : {dic}")
# print(tuple(dic.values()))

# state = 1
# V = [0 for i in range(16)]
# discount_factor = 0.5
# action_values = one_step_lookahead(environment=environment, state=state,
#                                    V=V, discount_factor=discount_factor)

# print(f"Action values : \n {action_values}")

# Build the grid world and solve it with a discount of 0.8.
environment = Environment()
discount_factor = 0.8
policy, value = value_iteration(environment, discount_factor)

# Show the greedy policy (one row per state) followed by the state values.
print(f"policy : \n{policy}")
print(f"value : \n{value}")


# %%



Value iteration converged at iteration #6
policy : 
[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]
value : 
[-3.6892800000000006, -3.3616000000000006, -2.9520000000000004, -2.4400000000000004, -3.3616000000000006, -2.9520000000000004, -2.4400000000000004, -1.8, -2.9520000000000004, -2.4400000000000004, -1.8, -1.0, -2.4400000000000004, -1.8, -1.0, 0.0]
