<a href="https://colab.research.google.com/github/uday1257/RL1/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Step 1: Define the MDP
import numpy as np
# Transition model of a small 4-state chain MDP.
# P[state][action] is a list of (probability, next_state, reward, done)
# outcomes; every transition here is deterministic (probability 1.0).
P = {
    0: {0: [(1.0, 0, 0, False)], 1: [(1.0, 1, 0, False)]},
    1: {0: [(1.0, 0, 0, False)], 1: [(1.0, 2, 1, False)]},
    # Action 1 from state 2 pays 10 and ends the episode in state 3 (goal).
    2: {0: [(1.0, 1, 0, False)], 1: [(1.0, 3, 10, True)]},
    # State 3 only loops onto itself with zero reward and done=True.
    3: {0: [(1.0, 3, 0, True)], 1: [(1.0, 3, 0, True)]},
}

# Problem sizes are read off the table above (4 states, 2 actions each).
n_states = len(P)
n_actions = len(P[0])

# Discount factor applied to the value of the successor state.
gamma = 0.9

#Step 2: Value Iteration
def value_iteration(P, n_states, n_actions, gamma=0.9, theta=1e-6):
    """Solve a tabular MDP with in-place value iteration.

    Args:
        P: Transition model where ``P[state][action]`` is an iterable of
            ``(probability, next_state, reward, done)`` tuples.
        n_states: Number of states; states are indexed ``0 .. n_states - 1``.
        n_actions: Number of actions; actions are indexed
            ``0 .. n_actions - 1``.
        gamma: Discount factor applied to the successor state's value.
        theta: Convergence threshold. Sweeps stop once the largest change of
            any state value within one sweep is below ``theta``.

    Returns:
        A ``(policy, V)`` tuple: ``policy`` is an integer array holding the
        greedy action for every state (ties go to the lowest action index)
        and ``V`` is the array of converged state values.

    Raises:
        ValueError: If ``theta`` is not strictly positive. The stopping test
            is ``delta < theta``, which could never succeed otherwise.
    """
    if theta <= 0:
        raise ValueError("theta must be positive")

    V = np.zeros(n_states)

    def action_values(s):
        # One-step lookahead for every action in state ``s``.  ``not done``
        # zeroes the bootstrap term for transitions that end the episode.
        return [
            sum(prob * (reward + gamma * V[next_state] * (not done))
                for prob, next_state, reward, done in P[s][a])
            for a in range(n_actions)
        ]

    while True:
        delta = 0
        for s in range(n_states):
            v = V[s]
            # In-place update: later states in this sweep already see it.
            V[s] = max(action_values(s))
            delta = max(delta, abs(v - V[s]))
        if delta < theta:
            break

    # Derive the greedy policy from the converged values.
    policy = np.zeros(n_states, dtype=int)
    for s in range(n_states):
        policy[s] = np.argmax(action_values(s))
    return policy, V

#Step 3: Policy Iteration
def policy_iteration(P, n_states, n_actions, gamma=0.9, theta=1e-6):
    """Solve a tabular MDP with policy iteration.

    Alternates iterative policy evaluation (in-place sweeps) with greedy
    policy improvement until no state changes its action.

    Args:
        P: Transition model where ``P[state][action]`` is an iterable of
            ``(probability, next_state, reward, done)`` tuples.
        n_states: Number of states; states are indexed ``0 .. n_states - 1``.
        n_actions: Number of actions; actions are indexed
            ``0 .. n_actions - 1``.
        gamma: Discount factor applied to the successor state's value.
        theta: Tolerance. Policy evaluation stops once the largest value
            change in a sweep is below ``theta``, and an action is replaced
            only when the alternative is better by more than ``theta``.

    Returns:
        A ``(policy, V)`` tuple: ``policy`` is an integer array with the
        chosen action per state and ``V`` holds the values of that policy.

    Raises:
        ValueError: If ``theta`` is not strictly positive. The evaluation
            stopping test is ``delta < theta``, which could never succeed
            otherwise.
    """
    if theta <= 0:
        raise ValueError("theta must be positive")

    policy = np.zeros(n_states, dtype=int)
    V = np.zeros(n_states)

    def q_value(s, a):
        # Expected return of taking ``a`` in ``s`` and bootstrapping from V.
        # ``not done`` zeroes the bootstrap term for terminal transitions.
        return sum(prob * (reward + gamma * V[next_state] * (not done))
                   for prob, next_state, reward, done in P[s][a])

    while True:
        # Policy evaluation: sweep until the values of the current policy
        # stop moving by more than theta.
        while True:
            delta = 0
            for s in range(n_states):
                v = V[s]
                V[s] = q_value(s, policy[s])
                delta = max(delta, abs(v - V[s]))
            if delta < theta:
                break

        # Policy improvement: act greedily with respect to the new values.
        is_policy_stable = True
        for s in range(n_states):
            action_values = [q_value(s, a) for a in range(n_actions)]
            best_action = np.argmax(action_values)
            # Keep the current action unless another one is better by more
            # than the evaluation tolerance.  Switching on ties or on
            # differences that are only evaluation noise can make the policy
            # flip between equally good actions and never become stable.
            if action_values[best_action] > action_values[policy[s]] + theta:
                policy[s] = best_action
                is_policy_stable = False

        if is_policy_stable:
            return policy, V

# Solve the MDP defined above with value iteration and report the result.
vi_policy, vi_value = value_iteration(
    P, n_states=n_states, n_actions=n_actions, gamma=gamma
)
print("Value Iteration Policy:", vi_policy)
print("Value Iteration Value Function:", vi_value)

# Solve the same MDP with policy iteration so the two outputs can be compared.
pi_policy, pi_value = policy_iteration(
    P, n_states=n_states, n_actions=n_actions, gamma=gamma
)
print("Policy Iteration Policy:", pi_policy)
print("Policy Iteration Value Function:", pi_value)

Value Iteration Policy: [1 1 1 0]
Value Iteration Value Function: [ 9. 10. 10.  0.]
Policy Iteration Policy: [1 1 1 0]
Policy Iteration Value Function: [ 9. 10. 10.  0.]
