In [24]:
import pandas as pd

# MDP definition. Per the original notebook the numbers come from an MDP diagram
# (not included here); each tuple below is one possible outcome of taking an
# action in a state.

# Discount factor applied to the value of the successor state.
gamma = 1.0

# One row per outcome: (state, action, probability, next state, reward).
# The probabilities of the rows sharing a (state, action) pair sum to 1.
transition_rows = [
    ('a', 'x', 0.5,  'b', 0),
    ('a', 'x', 0.5,  'c', 0),
    ('a', 'y', 0.25, 'b', 0),
    ('a', 'y', 0.75, 'c', 0),
    ('b', 'x', 1.0,  'e', 3),
    ('b', 'y', 0.5,  'e', 6),
    ('b', 'y', 0.5,  'e', 2),
    ('c', 'x', 1.0,  'f', 8),
]
column_names = ['State', 'Action', 'Transition Probability', 'Next State', 'Reward']

# Pivot the rows into the column-oriented dict that the DataFrame is built from.
data = {
    column: list(values)
    for column, values in zip(column_names, zip(*transition_rows))
}

# Transition table queried by the later cells.
mdp_df = pd.DataFrame(data)

print(mdp_df)




  State Action  Transition Probability Next State  Reward
0     a      x                    0.50          b       0
1     a      x                    0.50          c       0
2     a      y                    0.25          b       0
3     a      y                    0.75          c       0
4     b      x                    1.00          e       3
5     b      y                    0.50          e       6
6     b      y                    0.50          e       2
7     c      x                    1.00          f       8


In [25]:
# Value table for the fixed policy "always take action 'x'"; every state starts at 0.
# Only 'a', 'b' and 'c' are updated below, so 'e' and 'f' keep their initial 0.
state_values_policy_x = {'a': 0, 'b': 0, 'c': 0, 'e': 0, 'f': 0}

# Fixed number of evaluation sweeps (the printed trace shows when the table settles).
iterations = 5

# The outcomes of action 'x' do not change between sweeps, so select them once.
policy_x_rows = mdp_df[mdp_df['Action'] == 'x']

# Iterative policy evaluation: V(s) <- sum over outcomes of p * (r + gamma * V(s')).
for iteration in range(iterations):
    # Synchronous backup: read from the previous table, write into a copy.
    new_values = state_values_policy_x.copy()
    for state in ['a', 'b', 'c']:
        outcomes = policy_x_rows[policy_x_rows['State'] == state]
        new_values[state] = sum(
            p * (r + gamma * state_values_policy_x[successor])
            for p, successor, r in zip(
                outcomes['Transition Probability'],
                outcomes['Next State'],
                outcomes['Reward'],
            )
        )

    # Publish this sweep's table and log it.
    state_values_policy_x = new_values

    print(f"Iteration {iteration + 1}: {state_values_policy_x}")


state_values_policy_x

# Expected result: {'a': 5.5, 'b': 3.0, 'c': 8.0, 'e': 0, 'f': 0}



Iteration 1: {'a': 0.0, 'b': 3.0, 'c': 8.0, 'e': 0, 'f': 0}
Iteration 2: {'a': 5.5, 'b': 3.0, 'c': 8.0, 'e': 0, 'f': 0}
Iteration 3: {'a': 5.5, 'b': 3.0, 'c': 8.0, 'e': 0, 'f': 0}
Iteration 4: {'a': 5.5, 'b': 3.0, 'c': 8.0, 'e': 0, 'f': 0}
Iteration 5: {'a': 5.5, 'b': 3.0, 'c': 8.0, 'e': 0, 'f': 0}


{'a': 5.5, 'b': 3.0, 'c': 8.0, 'e': 0, 'f': 0}

In [26]:
# Optimal state values V*(s), all starting at 0. Only 'a', 'b' and 'c' are
# updated below, so 'e' and 'f' keep their initial value of 0.
optimal_state_values = {'a': 0, 'b': 0, 'c': 0, 'e': 0, 'f': 0}


iterations = 100  # Upper bound on the number of value-iteration sweeps

# Value iteration:
#   V(s) <- max over actions of  sum over outcomes of  p * (r + gamma * V(s'))
# The max ranges only over actions that actually have transitions in mdp_df.
for sweep in range(iterations):
    # Synchronous backup: read from the previous table, write into a copy.
    new_values = optimal_state_values.copy()
    for state in ['a', 'b', 'c']:
        # All outcomes available from this state, for any action.
        state_actions = mdp_df[mdp_df['State'] == state]

        # Expected return of each action that is defined in this state.
        action_values = []
        for action in ['x', 'y']:
            action_df = state_actions[state_actions['Action'] == action]
            if action_df.empty:
                # Undefined action (e.g. 'y' in state 'c'). Scoring it as 0 would
                # let it win the max whenever every real action has a negative
                # return, so it is left out instead.
                continue

            value = 0
            for prob, next_state, reward in zip(
                action_df['Transition Probability'],
                action_df['Next State'],
                action_df['Reward'],
            ):
                value += prob * (reward + gamma * optimal_state_values[next_state])
            action_values.append(value)

        # A state with no defined action keeps its previous value
        # (max() of an empty list would raise ValueError).
        if action_values:
            new_values[state] = max(action_values)

    # Compare before overwriting. Stopping only at an exact fixed point means
    # further sweeps would be no-ops, so the result equals running all sweeps.
    converged = new_values == optimal_state_values
    optimal_state_values = new_values
    if converged:
        break

optimal_state_values


{'a': 7.0, 'b': 4.0, 'c': 8.0, 'e': 0, 'f': 0}

In [29]:
def one_step_q(state, action):
    """Return the one-step lookahead value of taking `action` in `state`.

    Sums p * (r + gamma * V*(s')) over the rows of `mdp_df` matching the pair,
    reading V* from `optimal_state_values`. If no row matches, the sum is
    empty and the result is the integer 0.
    """
    outcomes = mdp_df[(mdp_df['State'] == state) & (mdp_df['Action'] == action)]
    return sum(
        p * (r + gamma * optimal_state_values[successor])
        for p, successor, r in zip(
            outcomes['Transition Probability'],
            outcomes['Next State'],
            outcomes['Reward'],
        )
    )


# Optimal Q-values keyed by (state, action), states in the outer loop.
# NOTE: ('c', 'y') has no transitions in mdp_df, so its entry is the empty-sum
# placeholder 0 rather than a real Q-value.
optimal_q_values = {
    (state, action): one_step_q(state, action)
    for state in ['a', 'b', 'c']
    for action in ['x', 'y']
}

optimal_q_values


{('a', 'x'): 6.0,
 ('a', 'y'): 7.0,
 ('b', 'x'): 3.0,
 ('b', 'y'): 4.0,
 ('c', 'x'): 8.0,
 ('c', 'y'): 0}