# Monte Carlo Test

In [1]:
import numpy as np

In [2]:
# Policy for choosing a path when starting from state "A".
# pi_A[0]: probability of choosing path "A-B-C"
# pi_A[1]: probability of choosing path "A-C"
pi_A = np.array([1 / 3, 2 / 3])

# Return (total reward) collected along each path, listed in the same order
# as pi_A: path "A-B-C" yields 2, path "A-C" yields 1.
path_returns = np.array([2.0, 1.0])

# Total number of episodes (experiments) to run
num_episode = 20000

# Run the Monte Carlo simulation.
# All episodes are sampled with a single vectorized call instead of one
# np.random.choice call per episode: each entry of chosen_path is the index
# (0 or 1) of the path drawn for that episode according to pi_A.
chosen_path = np.random.choice(len(pi_A), size=num_episode, p=pi_A)

# Array storing the return obtained in each episode (float64, one entry per
# episode), looked up from the index of the sampled path.
G_array = path_returns[chosen_path]

In [3]:
# Monte Carlo estimate of the value of state "A": the sample mean of the
# returns, i.e. the total return over all episodes divided by their count.
total_return = np.sum(G_array)
v_a = total_return / num_episode
print(v_a)

1.33495


In [4]:
# Recompute the same estimate incrementally: the returns are folded in one at
# a time, carrying only the running mean from one episode to the next.
v_a_updated = 0
for n in range(1, num_episode + 1):
    # n is the number of returns seen so far, including the current one.
    G = G_array[n - 1]
    # Incremental-mean rule: new_mean = old_mean + (1 / n) * (G - old_mean)
    step_size = 1 / n
    v_a_updated = v_a_updated + step_size * (G - v_a_updated)

print(v_a_updated)

1.3349499999999959
