<a href="https://colab.research.google.com/github/syma-afsha/Complete_Reinforcement_Learning/blob/main/MDP_PolicyIteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:

class MDP(object):
    """Tram problem: get from block 1 to block N by walking or riding a tram."""

    def __init__(self, N):
        # Number of blocks (states); block N is the goal.
        self.N = N

    def StartState(self):
        """Return the initial state, which is always block 1."""
        return 1

    def isEnd(self, state):
        """Return True when ``state`` is the terminal (goal) block N."""
        return state == self.N

    def Actions(self, state):
        """Return the actions whose destination does not go past block N.

        "walk" targets ``state + 1`` and "tram" targets ``state * 2``; the
        result keeps that order.
        """
        destinations = (("walk", state + 1), ("tram", state * 2))
        return [name for name, block in destinations if block <= self.N]

    def TransitionProb(self, state, action):
        """Return ``(next_state, probability, reward)`` triples for ``action``.

        Any action other than "walk" or "tram" yields an empty list.
        """
        if action == "walk":
            # Walking always reaches the next block and costs 1.
            return [(state + 1, 1, -1)]
        if action == "tram":
            # The tram costs 2 either way: half the time it doubles the
            # block number, half the time it fails and the state is unchanged.
            return [(state * 2, 0.5, -2), (state, 0.5, -2)]
        return []

    def discount(self):
        """Return the discount factor (gamma) applied to future rewards."""
        return 0.8

    def states(self):
        """Return the range of all states, 1 through N inclusive."""
        return range(1, self.N + 1)

# Perform Policy Iteration to compute optimal state values and policy
def PolicyIteration(mdp, epsilon=1e-10):
    """Run policy iteration on ``mdp`` and print the resulting policy and values.

    The MDP is accessed only through its methods, so any object that
    provides them can be solved:

    * ``states()`` -- iterable of all states
    * ``isEnd(state)`` -- whether ``state`` is terminal
    * ``Actions(state)`` -- sequence of actions available in ``state``
    * ``TransitionProb(state, action)`` -- iterable of
      ``(next_state, probability, reward)`` triples
    * ``discount()`` -- discount factor

    Args:
        mdp: The problem to solve (see the required methods above).
        epsilon: Convergence threshold. Policy evaluation stops once the
            largest value change in a sweep is below it, and during policy
            improvement an action replaces the current one only when its
            Q-value is better by more than this amount.

    Returns:
        None. The final policy and value of every state are printed.

    Raises:
        ValueError: If a non-terminal state offers no actions.
    """
    # Initial policy: the first available action in every non-terminal state.
    # Asking the MDP (instead of assuming a "walk" action and an ``N``
    # attribute) keeps the solver usable with any MDP exposing the interface.
    policy = {}
    for state in mdp.states():
        if mdp.isEnd(state):
            policy[state] = None  # Terminal states have no action
            continue
        actions = mdp.Actions(state)
        if not actions:
            raise ValueError(
                f"non-terminal state {state!r} has no available actions")
        policy[state] = actions[0]
    V = {state: 0 for state in mdp.states()}

    # Expected return of taking `action` in `state` and then following V.
    def Q(state, action):
        return sum(prob * (reward + mdp.discount() * V[newState])
                   for newState, prob, reward in mdp.TransitionProb(state, action))

    while True:
        # Policy evaluation: sweep until the values under the current policy
        # stop changing. Each sweep reads the previous sweep's values only
        # (synchronous update), hence the copy.
        while True:
            delta = 0
            newV = V.copy()
            for state in mdp.states():
                if mdp.isEnd(state):
                    newV[state] = 0
                else:
                    newV[state] = Q(state, policy[state])

                # Track the largest change seen in this sweep
                delta = max(delta, abs(V[state] - newV[state]))

            V = newV
            if delta < epsilon:
                break

        # Policy improvement: make the policy greedy with respect to V.
        policy_stable = True
        for state in mdp.states():
            if mdp.isEnd(state):
                continue

            # Keep the current action unless another one is better by more
            # than epsilon. Switching on ties (or on differences below the
            # evaluation accuracy) can make two equally good actions swap
            # back and forth forever.
            best_action = policy[state]
            best_q = Q(state, best_action)
            for action in mdp.Actions(state):
                q = Q(state, action)
                if q > best_q + epsilon:
                    best_q, best_action = q, action

            if best_action != policy[state]:
                policy_stable = False
                policy[state] = best_action

        # A stable policy is greedy w.r.t. its own values: we are done.
        if policy_stable:
            break

    # Print the final results
    print("Optimal Policy and State Values:")
    for state in mdp.states():
        if mdp.isEnd(state):
            print(f"State: {state}, Optimal Policy: None, Optimal Value: {V[state]:.2f}")
        else:
            print(f"State: {state}, Optimal Policy: {policy[state]}, Optimal Value: {V[state]}")

# Solve the 10-block problem; the policy and state values are printed.
mdp = MDP(10)
PolicyIteration(mdp)


Optimal Policy and State Values:
State: 1, Optimal Policy: walk, Optimal Value: -4.317333333384255
State: 2, Optimal Policy: walk, Optimal Value: -4.146666666692127
State: 3, Optimal Policy: walk, Optimal Value: -3.933333333346064
State: 4, Optimal Policy: walk, Optimal Value: -3.666666666673032
State: 5, Optimal Policy: tram, Optimal Value: -3.3333333333365163
State: 6, Optimal Policy: walk, Optimal Value: -2.9520000000000004
State: 7, Optimal Policy: walk, Optimal Value: -2.4400000000000004
State: 8, Optimal Policy: walk, Optimal Value: -1.8
State: 9, Optimal Policy: walk, Optimal Value: -1.0
State: 10, Optimal Policy: None, Optimal Value: 0.00
