<a href="https://colab.research.google.com/github/vidhi-sys/Machine-Learning-Journey/blob/main/MDP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# **Understanding Markov Decision Processes (MDPs) in Reinforcement Learning**


In [8]:

# Umbrella MDP definition.
#   States  (first axis):  0 = rainy, 1 = cloudy, 2 = sunny
#   Actions (second axis): 0 = umbrella, 1 = no umbrella

# rewards[s, a]: immediate reward for taking action a in state s.
rewards = np.array([
    [-1, -10],  # rainy
    [0, 0],     # cloudy
    [-2, 5]     # sunny
])

# transition[s, a, s_next]: probability of moving from state s to state s_next
# after taking action a. Both actions share the same row for every state, i.e.
# the weather dynamics here do not depend on the umbrella choice.
transition = np.array([
    [[0.8, 0.1, 0.1],  # from rainy with umbrella
     [0.8, 0.1, 0.1]], # from rainy without umbrella

    [[0.2, 0.6, 0.2],  # from cloudy with umbrella
     [0.2, 0.6, 0.2]], # from cloudy without umbrella

    [[0.1, 0.2, 0.7],  # from sunny with umbrella
     [0.1, 0.2, 0.7]]  # from sunny without umbrella
])

# Sanity checks: catch a mistyped matrix here rather than as a silently wrong
# value function further down the notebook.
assert transition.shape == rewards.shape + (rewards.shape[0],), \
    "transition must have shape (num_states, num_actions, num_states)"
assert np.allclose(transition.sum(axis=-1), 1.0), \
    "each transition[s, a, :] must sum to 1"

print("Reward Matrix:")
display(rewards)
print("\nTransition Probability Matrix:")
display(transition)

Reward Matrix:


array([[ -1, -10],
       [  0,   0],
       [ -2,   5]])


Transition Probability Matrix:


array([[[0.8, 0.1, 0.1],
        [0.8, 0.1, 0.1]],

       [[0.2, 0.6, 0.2],
        [0.2, 0.6, 0.2]],

       [[0.1, 0.2, 0.7],
        [0.1, 0.2, 0.7]]])

In [9]:
def value_iteration(rewards, transition, discount_factor=0.9, threshold=1e-6,
                    max_iterations=None):
    """Solve a finite MDP with value iteration.

    Parameters
    ----------
    rewards : array-like, shape (num_states, num_actions)
        ``rewards[s, a]`` is the immediate reward for taking action ``a`` in
        state ``s``.
    transition : array-like, shape (num_states, num_actions, num_states)
        ``transition[s, a, s_next]`` is the probability of landing in
        ``s_next`` after taking action ``a`` in state ``s``.
    discount_factor : float, default 0.9
        Weight applied to the expected value of the next state.
    threshold : float, default 1e-6
        Iteration stops once the largest change of any state value within one
        sweep is below this number.
    max_iterations : int or None, default None
        Upper bound on the number of sweeps. ``None`` keeps the original
        behaviour of iterating until ``threshold`` is met, which may never
        happen for non-contracting problems (e.g. ``discount_factor >= 1``
        with a recurring non-zero reward).

    Returns
    -------
    V : numpy.ndarray, shape (num_states,)
        Estimated optimal state values.
    policy : numpy.ndarray of int, shape (num_states,)
        Index of the greedy action for each state with respect to ``V``.

    Raises
    ------
    ValueError
        If ``rewards`` is not 2-D or ``transition`` does not have shape
        ``(num_states, num_actions, num_states)``.
    """
    rewards = np.asarray(rewards, dtype=float)
    transition = np.asarray(transition, dtype=float)

    if rewards.ndim != 2:
        raise ValueError(
            "rewards must be 2-D (num_states, num_actions), got shape "
            f"{rewards.shape}"
        )
    num_states, num_actions = rewards.shape
    expected_shape = (num_states, num_actions, num_states)
    if transition.shape != expected_shape:
        raise ValueError(
            f"transition must have shape {expected_shape}, got {transition.shape}"
        )

    def action_values_for(values):
        # Bellman backup for every (state, action) pair at once:
        # Q[s, a] = R[s, a] + gamma * sum_s' P[s, a, s'] * V[s']
        return rewards + discount_factor * (transition * values).sum(axis=-1)

    V = np.zeros(num_states)
    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        V_new = action_values_for(V).max(axis=1)
        # initial=0.0 keeps the reduction defined for an MDP with zero states.
        delta = np.abs(V_new - V).max(initial=0.0)
        V = V_new
        iteration += 1
        if delta < threshold:
            break

    # Extract the policy from the value function that is actually returned,
    # so V and policy are always consistent (also when the cap was hit).
    policy = action_values_for(V).argmax(axis=1).astype(int)

    return V, policy

# Example usage: solve the umbrella MDP defined by `rewards` and `transition`.

# Action names indexed by action id. Defined in this cell so that it runs on a
# fresh kernel; without it the name lookup below raises NameError.
action = ['Umbrella', 'No umbrella']

optimal_V, optimal_policy = value_iteration(rewards, transition)

print("Optimal Value Function:")
display(optimal_V)
print("\nOptimal Policy (0: Umbrella, 1: No umbrella):")
display(optimal_policy)

# Map each state's action index to its human-readable name.
optimal_policy_names = [action[a] for a in optimal_policy]
print("\nOptimal Policy (Action Names):")
display(optimal_policy_names)

Optimal Value Function:


array([ 6.133107  , 10.21640631, 19.97549165])


Optimal Policy (0: Umbrella, 1: No umbrella):


array([0, 0, 1])


Optimal Policy (Action Names):


['Umbrella', 'Umbrella', 'No umbrella']

In [12]:
# NOTE: this cell re-defines `value_iteration`; when the notebook is run top to
# bottom this definition shadows the one from the earlier cell.
def value_iteration(rewards, transition, discount_factor=0.9, threshold=1e-6,
                    max_iterations=None):
    """Compute optimal state values and a greedy policy for a finite MDP.

    Parameters
    ----------
    rewards : array-like, shape (num_states, num_actions)
        Immediate reward ``rewards[s, a]`` for action ``a`` in state ``s``.
    transition : array-like, shape (num_states, num_actions, num_states)
        ``transition[s, a, s_next]`` is the probability of reaching
        ``s_next`` from ``s`` under action ``a``.
    discount_factor : float, default 0.9
        Weight applied to the expected value of the next state.
    threshold : float, default 1e-6
        Stop when no state value changes by this much or more in one sweep.
    max_iterations : int or None, default None
        Optional cap on the number of sweeps. With ``None`` the loop runs
        until ``threshold`` is met, which may never happen for
        non-contracting problems (e.g. ``discount_factor >= 1`` with a
        recurring non-zero reward).

    Returns
    -------
    V : numpy.ndarray, shape (num_states,)
        Estimated optimal state values.
    policy : numpy.ndarray of int, shape (num_states,)
        Greedy action index per state with respect to ``V``.

    Raises
    ------
    ValueError
        If ``rewards`` is not 2-D or ``transition`` does not have shape
        ``(num_states, num_actions, num_states)``.
    """
    rewards = np.asarray(rewards, dtype=float)
    transition = np.asarray(transition, dtype=float)

    if rewards.ndim != 2:
        raise ValueError(
            "rewards must be 2-D (num_states, num_actions), got shape "
            f"{rewards.shape}"
        )
    num_states, num_actions = rewards.shape
    expected_shape = (num_states, num_actions, num_states)
    if transition.shape != expected_shape:
        raise ValueError(
            f"transition must have shape {expected_shape}, got {transition.shape}"
        )

    def bellman_backup(values):
        # Q[s, a] = R[s, a] + gamma * sum_s' P[s, a, s'] * V[s'], for all (s, a).
        return rewards + discount_factor * (transition * values).sum(axis=-1)

    V = np.zeros(num_states)
    sweeps_done = 0
    while max_iterations is None or sweeps_done < max_iterations:
        V_new = bellman_backup(V).max(axis=1)
        # initial=0.0 keeps the reduction defined when there are no states.
        delta = np.abs(V_new - V).max(initial=0.0)
        V = V_new
        sweeps_done += 1
        if delta < threshold:
            break

    # Derive the policy from the returned value function so both results agree
    # even when the sweep cap ended the loop early.
    policy = bellman_backup(V).argmax(axis=1).astype(int)

    return V, policy

# Human-readable label for each action index (0 -> umbrella, 1 -> no umbrella).
action = ["Umbrella", "No umbrella"]

# Run value iteration on the umbrella MDP.
optimal_V, optimal_policy = value_iteration(rewards=rewards, transition=transition)

print("Optimal Value Function:")
display(optimal_V)
print("\nOptimal Policy (0: Umbrella, 1: No umbrella):")
display(optimal_policy)

# Translate the per-state action indices into their labels.
optimal_policy_names = [action[action_index] for action_index in optimal_policy]
print("\nOptimal Policy (Action Names):")
display(optimal_policy_names)

Optimal Value Function:


array([ 6.133107  , 10.21640631, 19.97549165])


Optimal Policy (0: Umbrella, 1: No umbrella):


array([0, 0, 1])


Optimal Policy (Action Names):


['Umbrella', 'Umbrella', 'No umbrella']

In [14]:
# Summarise the solution as one row per weather state: its optimal value, the
# index of the chosen action, and that action's name.
results_df = pd.DataFrame.from_dict(
    {
        "State": ["Rainy", "Cloudy", "Sunny"],
        "Optimal Value Function": optimal_V,
        "Optimal Policy (Index)": optimal_policy,
        "Optimal Policy (Action)": optimal_policy_names,
    },
    orient="columns",
)

print("Optimal Value Function and Policy:")
display(results_df)

Optimal Value Function and Policy:


Unnamed: 0,State,Optimal Value Function,Optimal Policy (Index),Optimal Policy (Action)
0,Rainy,6.133107,0,Umbrella
1,Cloudy,10.216406,0,Umbrella
2,Sunny,19.975492,1,No umbrella
