In [17]:
import gym
import numpy as np

In [28]:
# FrozenLake with `is_slippery=False`, i.e. the slipping option of the
# environment is switched off (see the Gym FrozenLake documentation).
env = gym.make(
    "FrozenLake-v1",
    is_slippery=False,
)

In [41]:
# Reproducibility: seed NumPy's global RNG (used by the epsilon-greedy draw
# and by `np.random.choice` when sampling trajectories) and the action
# space's own RNG (used by `env.action_space.sample()`).
SEED = 0
np.random.seed(SEED)
env.action_space.seed(SEED)

# Tabular action-value function: one row per state, one column per action.
Q = np.zeros((env.observation_space.n, env.action_space.n))
# Learning rate, discount factor, and exploration probability.
alpha, gamma, epsilon = 0.1, 0.99, 0.1
# Number of training episodes for Q-learning.
n_episodes = 2000

In [64]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
for episode in range(n_episodes):
    s, _ = env.reset()
    done = False
    while not done:
        if np.random.rand() < epsilon:
            # Explore: uniformly random action from the action space.
            a = env.action_space.sample()
        else:
            # Exploit, breaking ties uniformly at random. `np.argmax` returns
            # the first maximal index, so on an all-zero row it would always
            # pick action 0 and the agent would barely explore until a reward
            # has been observed.
            best_actions = np.flatnonzero(Q[s] == Q[s].max())
            a = int(np.random.choice(best_actions))
        next_s, r, terminated, truncated, _ = env.step(a)
        done = terminated or truncated
        # Bootstrap only from non-terminal successors. A truncated episode did
        # not reach a true terminal state, so it still bootstraps from next_s.
        target = r + gamma * np.max(Q[next_s]) * (not terminated)
        Q[s, a] += alpha * (target - Q[s, a])
        s = next_s

In [83]:
# Target policy: deterministic and greedy with respect to Q, stored as a
# one-hot row per state (probability 1.0 on the first maximal action).
greedy_actions = np.argmax(Q, axis=1)
pi_t = np.zeros_like(Q)
pi_t[np.arange(env.observation_space.n), greedy_actions] = 1.0

In [88]:
# Behaviour policy: uniform over actions in every state (same shape as Q).
pi_b = np.full_like(Q, 1.0 / env.action_space.n)

In [104]:
def generate_trajectories(env, policy, n_trajectories=100, rng=None):
    """Roll out episodes by sampling actions from a tabular stochastic policy.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(a)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        policy: Array indexed by state; ``policy[s]`` is the vector of action
            probabilities used in state ``s``.
        n_trajectories: Number of episodes to collect.
        rng: Optional random source exposing ``choice(n, p=...)``, e.g. a
            ``numpy.random.Generator``. When ``None`` (the default) NumPy's
            global ``np.random`` is used, as before.

    Returns:
        A list with one ``(S, A, R, P)`` tuple per episode, where ``S`` holds
        the visited states (initial state included, so ``len(S) == len(A) + 1``),
        ``A`` the actions taken, ``R`` the rewards received, and ``P`` the
        probability the policy assigned to each taken action.
    """
    # Fall back to the global NumPy RNG so existing callers behave as before.
    sampler = np.random if rng is None else rng
    trajectories = []
    for _ in range(n_trajectories):
        s, _ = env.reset()
        S, A, R, P = [s], [], [], []
        done = False
        while not done:
            a_prob = policy[s]
            # Store native Python scalars so trajectories print and compare
            # the same way regardless of NumPy's scalar representation.
            a = int(sampler.choice(len(a_prob), p=a_prob))
            next_s, r, terminated, truncated, _ = env.step(a)
            done = terminated or truncated
            S.append(next_s)
            A.append(a)
            R.append(r)
            P.append(float(a_prob[a]))
            s = next_s
        trajectories.append((S, A, R, P))
    return trajectories

In [119]:
# Dataset of 200 episodes collected by acting with the behaviour policy pi_b.
D = generate_trajectories(env, policy=pi_b, n_trajectories=200)

In [132]:
def monte_carlo_eval(env, policy, n_episodes=100, gamma=0.99):
    """Estimate the discounted return of acting greedily w.r.t. ``policy``.

    In every state the action with the largest entry in ``policy[state]`` is
    taken (``np.argmax``), so the policy is executed deterministically.

    Args:
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(a)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        policy: Array indexed by state; each row scores the actions.
        n_episodes: Number of episodes to average over.
        gamma: Discount factor applied per time step.

    Returns:
        The mean of the per-episode discounted returns.
    """
    episode_returns = []
    for _ in range(n_episodes):
        state, _ = env.reset()
        discounted_return = 0
        step = 0
        while True:
            action = np.argmax(policy[state])
            state, reward, terminated, truncated, _ = env.step(action)
            # Reward received at time step `step` is weighted by gamma**step.
            discounted_return += (gamma**step) * reward
            step += 1
            if terminated or truncated:
                break
        episode_returns.append(discounted_return)
    return np.mean(episode_returns)

# Reference value of the target policy, obtained by rolling it out directly.
J_true = monte_carlo_eval(env, policy=pi_t)

In [142]:
# Show the first three collected trajectories, one labelled list per line.
for i, traj in enumerate(D[:3]):
    S, A, R, P = traj
    print(f"Траектория {i+1}:")
    for label, values in (("S:", S), ("A:", A), ("R:", R), ("P:", P)):
        print(label, values)
    print()


Траектория 1:
S: [0, 0, 0, 0, 0, 1, 2, 1, 2, 6, 10, 11]
A: [0, 0, 3, 3, 2, 2, 0, 2, 1, 1, 2]
R: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
P: [0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25, 0.25]

Траектория 2:
S: [0, 1, 5]
A: [2, 1]
R: [0.0, 0.0]
P: [0.25, 0.25]

Траектория 3:
S: [0, 1, 5]
A: [2, 1]
R: [0.0, 0.0]
P: [0.25, 0.25]

