<a href="https://colab.research.google.com/github/syma-afsha/Complete_Reinforcement_Learning/blob/main/MDP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Define a Markov Decision Process (MDP) with
𝑁 states (blocks) and two possible actions for moving between states:


"walk": Moves the state forward by 1 (deterministic, costs
−
1
−1).

"tram": Attempts to double the state (50% chance of success, 50% chance of failure, costs
−
2
−2).

The goal is to find the optimal policy and state values to minimize the total penalty (maximize rewards) starting from state 1 and ending at the terminal state
𝑁.

In [65]:
class MDP(object):
    # Initialize the MDP with N number of states (blocks)
    def __init__(self, N):
        self.N = N  # Number of blocks (states)

    # Define the start state (always starts at state 1)
    def StartState(self):
        return 1

    # Check if the given state is the terminal state (goal state)
    def isEnd(self, state):
        return state == self.N

    # Define possible actions for a given state
    def Actions(self, state):
        result = []
        # If walking to the next state is valid, add "walk" action
        if state + 1 <= self.N:
            result.append("walk")
        # If using the tram to double the state is valid, add "tram" action
        if state * 2 <= self.N:
            result.append("tram")
        return result

    # Define transition probabilities, rewards, and next states for each action
    def TransitionProb(self, state, action):
        result = []
        if action == "walk":
            # "walk" always transitions to state+1 with probability 1 and a reward of -1
            result.append((state + 1, 1, -1))
        if action == "tram":  # action == "tram"
            # "tram" has a 50% chance to double the state and a 50% chance to fail
            result.append((state * 2, 0.5, -2))  # Successful tram
            result.append((state, 0.5, -2))  # Failed tram (stay in the same state)
        return result

    # Define the discount factor (γ), which reduces the value of future rewards
    def discount(self):
        return 1.0

    # Return a list of all possible states (1 to N)
    def states(self):
        return range(1, self.N + 1)

# Perform Value Iteration to compute optimal state values and policy
def ValueIteration(mdp):
    # Initialize state values (V) to 0 for all states
    V = {}
    for state in mdp.states():
        V[state] = 0

    # Function to compute Q-value for a given state and action
    def Q(state, action):
        # Q(s, a) = Σ (probability * (reward + γ * V(new state)))
        return sum(prob * (reward + mdp.discount() * V[newState])
                   for newState, prob, reward in mdp.TransitionProb(state, action))

    while True:
        # Initialize a new dictionary for updated state values
        newV = {}
        for state in mdp.states():
            if mdp.isEnd(state):
                # Terminal state has a value of 0
                newV[state] = 0
            else:
                # Update state value as the maximum Q-value across all actions
                newV[state] = max(Q(state, action) for action in mdp.Actions(state))

        # Check for convergence (difference between old and new values is small)
        if max(abs(V[state] - newV[state]) for state in mdp.states()) < 1e-10:
            break

        # Update state values
        V = newV

        # Compute the optimal policy based on the updated values
        policy = {}
        for state in mdp.states():
            if mdp.isEnd(state):
                # Terminal state has no action
                policy[state] = None
            else:
                # Optimal action is the one with the highest Q-value
                policy[state] = max((Q(state, action), action) for action in mdp.Actions(state))[1]

        # Print the current state, optimal action, and value for each state
        for states in mdp.states():
            print(f"State:{states}, Optimal Policy: {policy[states]}, Optimal Value: {V[states]}")
        print("Done")
# Create an MDP with 10 states (blocks)
mdp = MDP(N=10)
for state in mdp.states():
    print(f"State: {state}")
    actions = mdp.Actions(state)
    print(f"  Actions: {actions}")
    for action in actions:
        transitions = mdp.TransitionProb(state, action)
        print(f"Action: {action}, Transition Probabilities: {transitions}")





State: 1
  Actions: ['walk', 'tram']
Action: walk, Transition Probabilities: [(2, 1, -1)]
Action: tram, Transition Probabilities: [(2, 0.5, -2), (1, 0.5, -2)]
State: 2
  Actions: ['walk', 'tram']
Action: walk, Transition Probabilities: [(3, 1, -1)]
Action: tram, Transition Probabilities: [(4, 0.5, -2), (2, 0.5, -2)]
State: 3
  Actions: ['walk', 'tram']
Action: walk, Transition Probabilities: [(4, 1, -1)]
Action: tram, Transition Probabilities: [(6, 0.5, -2), (3, 0.5, -2)]
State: 4
  Actions: ['walk', 'tram']
Action: walk, Transition Probabilities: [(5, 1, -1)]
Action: tram, Transition Probabilities: [(8, 0.5, -2), (4, 0.5, -2)]
State: 5
  Actions: ['walk', 'tram']
Action: walk, Transition Probabilities: [(6, 1, -1)]
Action: tram, Transition Probabilities: [(10, 0.5, -2), (5, 0.5, -2)]
State: 6
  Actions: ['walk']
Action: walk, Transition Probabilities: [(7, 1, -1)]
State: 7
  Actions: ['walk']
Action: walk, Transition Probabilities: [(8, 1, -1)]
State: 8
  Actions: ['walk']
Action: wal

In [66]:
# Perform Value Iteration to find the optimal policy and state values
ValueIteration(mdp)

State:1, Optimal Policy: walk, Optimal Value: -1.0
State:2, Optimal Policy: walk, Optimal Value: -1.0
State:3, Optimal Policy: walk, Optimal Value: -1.0
State:4, Optimal Policy: walk, Optimal Value: -1.0
State:5, Optimal Policy: walk, Optimal Value: -1.0
State:6, Optimal Policy: walk, Optimal Value: -1.0
State:7, Optimal Policy: walk, Optimal Value: -1.0
State:8, Optimal Policy: walk, Optimal Value: -1.0
State:9, Optimal Policy: walk, Optimal Value: -1.0
State:10, Optimal Policy: None, Optimal Value: 0
Done
State:1, Optimal Policy: walk, Optimal Value: -2.0
State:2, Optimal Policy: walk, Optimal Value: -2.0
State:3, Optimal Policy: walk, Optimal Value: -2.0
State:4, Optimal Policy: walk, Optimal Value: -2.0
State:5, Optimal Policy: walk, Optimal Value: -2.0
State:6, Optimal Policy: walk, Optimal Value: -2.0
State:7, Optimal Policy: walk, Optimal Value: -2.0
State:8, Optimal Policy: walk, Optimal Value: -2.0
State:9, Optimal Policy: walk, Optimal Value: -1.0
State:10, Optimal Policy: No