<a href="https://colab.research.google.com/github/rahul-727/Reinforcement-Learning-/blob/main/2348544_Lab3_RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

# Define the MDP Environment
class MDP:
    """Finite Markov decision process that can be solved by value iteration.

    The constructor stores its arguments unchanged:

    * ``states`` / ``actions``: sequences; the solver only uses their lengths
      and addresses states and actions by position.
    * ``transition_prob``: 3-D array-like indexed as ``P[s, a, s']``.
    * ``rewards``: 3-D array-like indexed as ``R[s, a, s']``.
    * ``gamma``: discount factor applied to next-state values.
    """

    def __init__(self, states, actions, transition_prob, rewards, gamma=0.9):
        self.states = states
        self.actions = actions
        self.transition_prob = transition_prob  # Probabilities of moving from one state to another, given an action
        self.rewards = rewards  # Reward matrix R[s, a, s']
        self.gamma = gamma  # Discount factor

    def _model_array(self, values, label):
        """Return ``values`` as a float array trimmed to ``(|S|, |A|, |S|)``.

        Raises:
            ValueError: if ``values`` is not 3-D or is smaller than the
                declared number of states/actions along any axis.
        """
        n_states, n_actions = len(self.states), len(self.actions)
        required = (n_states, n_actions, n_states)
        # asarray lets callers pass plain nested lists as well as ndarrays.
        array = np.asarray(values, dtype=float)
        too_small = any(have < need for have, need in zip(array.shape, required))
        if array.ndim != 3 or too_small:
            raise ValueError(
                f"{label} must be a 3-D array of shape at least {required}, "
                f"got shape {array.shape}"
            )
        # Only the first |S| x |A| x |S| entries are part of the model; any
        # surplus entries are ignored.
        return array[:n_states, :n_actions, :n_states]

    # Value Iteration Algorithm
    def value_iteration(self, threshold=0.001, max_iterations=None):
        """Run synchronous value iteration from an all-zero value function.

        Each sweep applies the Bellman optimality backup
        ``V(s) <- max_a sum_s' P[s, a, s'] * (R[s, a, s'] + gamma * V(s'))``
        to every state, using the values of the previous sweep.

        Args:
            threshold: stop once the largest per-state change in a sweep is
                strictly below this value.
            max_iterations: optional cap on the number of sweeps. ``None``
                (the default) means no cap, in which case the loop may never
                end if the values do not settle (e.g. ``gamma >= 1``). At
                least one sweep is always performed.

        Returns:
            ``(V, policy)``: the value of each state and the index of the
            greedy action for each state, both as 1-D NumPy arrays.

        Raises:
            ValueError: if the transition or reward arrays are not 3-D or are
                too small for the declared states and actions.
        """
        n_states = len(self.states)
        V = np.zeros(n_states)
        policy = np.zeros(n_states, dtype=int)  # Optimal policy

        if n_states == 0:
            # Nothing to back up: a single (empty) sweep changes nothing.
            print("Converged after 1 iterations.")
            return V, policy

        P = self._model_array(self.transition_prob, "transition_prob")
        R = self._model_array(self.rewards, "rewards")

        # The expected immediate reward of (s, a) does not depend on V, so it
        # is computed once instead of on every sweep.
        expected_reward = np.sum(P * R, axis=2)

        iteration = 0
        while True:
            # Q[s, a] = E[R | s, a] + gamma * sum_s' P[s, a, s'] * V[s']
            action_values = expected_reward + self.gamma * (P @ V)
            new_V = action_values.max(axis=1)
            # argmax keeps the lowest action index on ties.
            policy[:] = action_values.argmax(axis=1)

            # Largest change of any state's value in this sweep.
            delta = np.max(np.abs(new_V - V))
            V = new_V
            iteration += 1

            # Stop if the change in value function is below the threshold
            if delta < threshold:
                print(f"Converged after {iteration} iterations.")
                break
            if max_iterations is not None and iteration >= max_iterations:
                print(
                    f"Stopped after {iteration} iterations without converging "
                    f"(delta={delta:.6g})."
                )
                break

        return V, policy


# ---------------------------------------------------------------------------
# Example problem: a 3-state, 2-action MDP solved with value iteration.
# ---------------------------------------------------------------------------
states = list(range(3))   # state indices 0, 1, 2
actions = list(range(2))  # action indices 0, 1

# P[s, a, s']: probability of landing in s' after taking action a in state s.
# Each innermost row is a distribution over the three next states.
transition_prob = np.array(
    [
        # state 0
        [
            [0.7, 0.3, 0],  # action 0
            [0.4, 0.6, 0],  # action 1
        ],
        # state 1
        [
            [0.1, 0.8, 0.1],  # action 0
            [0.5, 0.5, 0],    # action 1
        ],
        # state 2
        [
            [0, 0.2, 0.8],  # action 0
            [0.3, 0.7, 0],  # action 1
        ],
    ]
)

# R[s, a, s']: reward received for the transition s --a--> s'.
rewards = np.array(
    [
        # state 0
        [
            [5, 10, 0],  # action 0
            [1, 3, 0],   # action 1
        ],
        # state 1
        [
            [0, 2, 1],  # action 0
            [1, 2, 0],  # action 1
        ],
        # state 2
        [
            [0, 1, 10],  # action 0
            [1, 5, 0],   # action 1
        ],
    ]
)

# Build the model with a discount factor of 0.9 and solve it.
mdp = MDP(
    states=states,
    actions=actions,
    transition_prob=transition_prob,
    rewards=rewards,
    gamma=0.9,
)
V, policy = mdp.value_iteration()

# Report the state values and the greedy action chosen for every state.
print("Optimal Value Function:", V)
print("Optimal Policy (action to take at each state):", policy)


Converged after 82 iterations.
Optimal Value Function: [48.52841551 42.43085454 56.5597744 ]
Optimal Policy (action to take at each state): [0 1 0]
