**Author:** [Tayyib Ul Hassan](https://github.com/tayyibgondal)

In [1]:
import numpy as np

In [2]:
# parameters
num_states = 2 # (0=rocky, 1=Ridge)
num_actions = 3 # (0=drilling, 1=digging, 2=push_debris)
discount_factor = 0.9

In [3]:
# initializing transition probabilities and rewards
transitions = np.zeros((num_states, num_actions, num_states))
rewards = np.zeros((num_states, num_actions, num_states))

In [4]:
# Mdp definition
# transition dynamics/uncertainities
transitions[0, 0, 0] = 0.30  # (rocky, drilling, rocky)
transitions[0, 1, 0] = 0.70  # (rocky, digging, rocky)
transitions[0, 2, 0] = 0.45  # (rocky, pushing, rocky)
transitions[0, 0, 1] = 0.70  # (rocky, drilling, ridge)
transitions[0, 1, 1] = 0.25  # (rocky, digging, ridge)
transitions[0, 2, 1] = 0.55  # (rocky, pushing, ridge)
transitions[1, 0, 0] = 0.60  # (ridge, drilling, rocky)
transitions[1, 1, 0] = 0.00  # (ridge, digging, rocky)
transitions[1, 2, 0] = 0.20   # (ridge, pushing, rocky)
transitions[1, 0, 1] = 0.40  # (ridge, drilling, ridge)
transitions[1, 1, 1] = 0.00  # (ridge, digging, ridge)
transitions[1, 2, 1] = 0.80  # (ridge, pushing, ridge)

# rewards
rewards[0, 0, 0] = 5  # (rocky, drilling, rocky)
rewards[0, 1, 0] = 7  # (rocky, digging, rocky)
rewards[0, 2, 0] = 9  # (rocky, pushing, rocky)
rewards[0, 0, 1] = 1  # (rocky, drilling, ridge)
rewards[0, 1, 1] = 7  # (rocky, digging, ridge)
rewards[0, 2, 1] = 5  # (rocky, pushing, ridge)
rewards[1, 0, 0] = 6  # (ridge, drilling, rocky)
rewards[1, 1, 0] = 0  # (ridge, digging, rocky)
rewards[1, 2, 0] = 10   # (ridge, pushing, rocky)
rewards[1, 0, 1] = 2  # (ridge, drilling, ridge)
rewards[1, 1, 1] = 0  # (ridge, digging, ridge)
rewards[1, 2, 1] = 2  # (ridge, pushing, ridge)

## Deterministic policy evaluation

In [5]:
# policy iteration
def policy_evaluation(policy, transitions, rewards, discount_factor, tol=1e-6):
    values = np.zeros(num_states)

    while True:
        delta = 0
        for s in range(num_states):
            v = values[s]
            action = policy[s]

            values[s] = sum(transitions[s, action, s_next] * (rewards[s, action, s_next] + discount_factor * values[s_next]) for s_next in range(num_states))

            delta = max(delta, abs(v - values[s]))

        if delta < tol:
            break

    return values

In [6]:
policy = np.zeros(num_states, dtype=int)
# deterministic policy
policy[0] = 0 
policy[1] = 2

values = policy_evaluation(policy, transitions, rewards, discount_factor)

In [7]:
# Results
print("The values for the states are: ")
print(values)

The values for the states are: 
[31.69230062 33.23076235]


## Stochastic Policy evaluation

In [8]:
import numpy as np

def stochastic_policy_evaluation(transitions, rewards, tolerance):
    values = np.zeros(num_states)

    while True:
        # Track the maximum change in values in this iteration
        delta = 0
        for s in range(num_states):
            # Store the current value of state s
            v = values[s]
            
            # Calculate the updated value for state s
            update = 0
            for action in range(num_actions):
                update += (1 / num_actions) * sum(
                    transitions[s, action, s_next] * (rewards[s, action, s_next] + discount_factor * values[s_next])
                    for s_next in range(num_states)
                )
            values[s] = update  # Set the new value for state s

            # Calculate the maximum change
            delta = max(delta, abs(v - values[s]))
        
        # Stop if the values have converged
        if delta < tolerance:
            break
    
    return values


In [9]:
# Evaluate the uniform random stochastic policy
values = stochastic_policy_evaluation(transitions, rewards, discount_factor)
print("The values for the states under the random stochastic policy are: ")
print(values)

The values for the states under the random stochastic policy are: 
[16.14838604  9.96899243]


### Gpt-ed

In [10]:
# Uniform random stochastic policy (each action is chosen with equal probability)
def random_stochastic_policy_evaluation(transitions, rewards, discount_factor, tol=1e-6):
    values = np.zeros(num_states)
    while True:
        delta = 0
        for s in range(num_states):
            v = values[s]
            values[s] = 0
            for a in range(num_actions):
                values[s] += (1 / num_actions) * sum(
                    transitions[s, a, s_next] * (rewards[s, a, s_next] + discount_factor * values[s_next]) 
                    for s_next in range(num_states)
                )
            delta = max(delta, abs(v - values[s]))
        if delta < tol:
            break
    return values

In [11]:
# Evaluate the uniform random stochastic policy
values = random_stochastic_policy_evaluation(transitions, rewards, discount_factor)
print("The values for the states under the random stochastic policy are: ")
print(values)

The values for the states under the random stochastic policy are: 
[8.7497855  5.49660592]
