In [None]:
import numpy as np
import gym

In [None]:
def monte_carlo_es(env, num_episodes, epsilon, gamma):
    """Estimate action values with first-visit Monte Carlo control.

    Episodes are generated with an epsilon-greedy policy over the current
    ``Q`` table and every first visit to a (state, action) pair updates a
    running average of the observed returns.

    Note: despite the name, no exploring starts are performed. Every episode
    begins with ``env.reset()`` and exploration comes solely from the
    epsilon-greedy action selection.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``,
            ``action_space.sample()``, ``reset()`` and ``step(action)``.
            ``reset`` may return the state or a ``(state, info)`` tuple;
            ``step`` may return ``(state, reward, done, info)`` or
            ``(state, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to sample.
        epsilon: Probability of taking ``env.action_space.sample()`` instead
            of a greedy action.
        gamma: Discount factor applied when accumulating returns.

    Returns:
        Tuple ``(Q, policy)``. ``Q`` maps ``(state, action)`` to the estimated
        return; ``policy`` maps each state to a greedy action, with ties
        broken uniformly at random.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # State-action value estimates and first-visit counts, all starting at zero.
    Q = {(s, a): 0.0 for s in range(n_states) for a in range(n_actions)}
    N = {(s, a): 0 for s in range(n_states) for a in range(n_actions)}

    def greedy_action(state):
        """Return a maximising action for ``state``, breaking ties at random."""
        q_values = np.array([Q[(state, a)] for a in range(n_actions)])
        return np.random.choice(np.where(q_values == q_values.max())[0])

    def policy(state):
        """Epsilon-greedy behaviour policy; reads the live ``Q`` table."""
        if np.random.uniform(0, 1) < epsilon:
            return env.action_space.sample()
        return greedy_action(state)

    for _ in range(num_episodes):
        # Accept both reset() conventions: bare state or (state, info).
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False
        episode_data = []

        # Generate one full episode.
        while not done:
            action = policy(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            episode_data.append((state, action, reward))
            state = next_state

        # Index of the first occurrence of every (state, action) pair. Built
        # once so the first-visit test below is O(1) instead of rescanning the
        # episode prefix at every time step.
        first_visit = {}
        for t, (s, a, _) in enumerate(episode_data):
            first_visit.setdefault((s, a), t)

        # Walk the episode backwards, accumulating the discounted return.
        G = 0
        for t in range(len(episode_data) - 1, -1, -1):
            s, a, reward = episode_data[t]
            G = gamma * G + reward
            if first_visit[(s, a)] == t:
                N[(s, a)] += 1
                alpha = 1 / N[(s, a)]
                Q[(s, a)] += alpha * (G - Q[(s, a)])

    # Greedy policy with respect to the final estimates.
    policy_table = {state: greedy_action(state) for state in range(n_states)}

    return Q, policy_table

  and should_run_async(code)


In [None]:
def on_policy_mc_control(env, num_episodes, epsilon, gamma):
    """On-policy first-visit Monte Carlo control with an epsilon-soft policy.

    Each episode is sampled from an epsilon-soft policy derived from the
    current ``Q`` table. Afterwards, every first visit to a (state, action)
    pair updates a running average of the returns observed from that visit.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n``,
            ``reset()`` and ``step(action)``. ``reset`` may return the state
            or a ``(state, info)`` tuple; ``step`` may return
            ``(state, reward, done, info)`` or
            ``(state, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to sample.
        epsilon: Exploration mass spread uniformly over all actions; the
            remaining ``1 - epsilon`` goes to one greedy action.
        gamma: Discount factor applied when accumulating returns.

    Returns:
        Tuple ``(Q, policy)``. ``Q`` maps ``(state, action)`` to the estimated
        return; ``policy`` maps each state to ``np.argmax`` over its action
        values (ties resolve to the lowest action index).
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # State-action value estimates and first-visit counts, all starting at zero.
    Q = {(s, a): 0.0 for s in range(n_states) for a in range(n_actions)}
    N = {(s, a): 0 for s in range(n_states) for a in range(n_actions)}

    def policy(state):
        """Sample an action from the epsilon-soft policy for ``state``."""
        probs = np.ones(n_actions, dtype=float) * epsilon / n_actions
        q_values = np.array([Q[(state, a)] for a in range(n_actions)])
        # Ties among maximising actions are broken uniformly at random.
        best_action = np.random.choice(np.where(q_values == q_values.max())[0])
        probs[best_action] += (1.0 - epsilon)
        return np.random.choice(n_actions, p=probs)

    for _ in range(num_episodes):
        # Accept both reset() conventions: bare state or (state, info).
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False
        episode_data = []

        # Generate one full episode.
        while not done:
            action = policy(state)
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            episode_data.append((state, action, reward))
            state = next_state

        # Index of the first occurrence of every (state, action) pair.
        first_visit = {}
        for t, (s, a, _) in enumerate(episode_data):
            first_visit.setdefault((s, a), t)

        # The episode was generated by the very policy being improved, so the
        # returns are averaged directly: no importance-sampling weight is
        # applied and the backward pass is never cut short at a non-greedy
        # action (both belong to off-policy control).
        G = 0
        for t in range(len(episode_data) - 1, -1, -1):
            s, a, reward = episode_data[t]
            G = gamma * G + reward
            if first_visit[(s, a)] == t:
                N[(s, a)] += 1
                alpha = 1 / N[(s, a)]
                Q[(s, a)] += alpha * (G - Q[(s, a)])

    # Greedy policy with respect to the final estimates.
    policy_table = {}
    for state in range(n_states):
        q_values = np.array([Q[(state, a)] for a in range(n_actions)])
        policy_table[state] = np.argmax(q_values)

    return Q, policy_table

In [None]:
# Create the CliffWalking-v0 environment that the experiment cells below pass
# to monte_carlo_es and on_policy_mc_control.
env = gym.make('CliffWalking-v0')

  deprecation(
  deprecation(


In [None]:
# Size of the discrete action space.
# NOTE(review): `actions` is not referenced by any cell visible here — confirm
# whether it is still needed.
actions = env.action_space.n

## Monte Carlo ES

In [None]:
# Monte Carlo ES
# epsilon=1.0 makes every action exploratory (uniform(0, 1) < 1.0 always holds);
# gamma=1.0 leaves returns undiscounted.
num_episodes_mc_es = 500

# Seed both randomness sources so a fresh Restart & Run All reproduces the numbers:
# greedy tie-breaking uses NumPy's global RNG, exploratory actions use
# env.action_space.sample().
np.random.seed(0)
env.action_space.seed(0)

Q_mc_es, policy_mc_es = monte_carlo_es(env, num_episodes=num_episodes_mc_es, epsilon=1.0, gamma=1.0)
print('Monte Carlo ES')
print('Number of episodes:', num_episodes_mc_es)
# The reported quantity is the sum over the whole Q table, not a step count.
print('Sum of Q-values:', np.sum(list(Q_mc_es.values())))
# TODO: print the greedy policy as a grid. An earlier attempt indexed
# env.actions and reshaped to env.shape and was left disabled — confirm which
# attributes this environment actually exposes before re-enabling.

Monte Carlo ES
Number of episodes: 500
Number of steps: -8649054.6012345


## On-Policy First-Visit MC Control

In [None]:
 # On-policy first-visit MC control
# epsilon=0.1 keeps the behaviour policy mostly greedy; gamma=1.0 leaves
# returns undiscounted.
num_episodes_on_policy = 500

# Seed NumPy's global RNG (used for action sampling and tie-breaking inside
# on_policy_mc_control) so a fresh Restart & Run All reproduces the numbers.
np.random.seed(0)
env.action_space.seed(0)

Q_on_policy_mc, policy_on_policy_mc = on_policy_mc_control(env, num_episodes=num_episodes_on_policy, epsilon=0.1, gamma=1.0)
print('On-policy first-visit MC control')
print('Number of episodes:', num_episodes_on_policy)
# The reported quantity is the sum over the whole Q table, not a step count.
print('Sum of Q-values:', np.sum(list(Q_on_policy_mc.values())))
# TODO: print the greedy policy as a grid. An earlier attempt indexed
# env.actions and reshaped to env.shape and was left disabled — confirm which
# attributes this environment actually exposes before re-enabling.

On-policy first-visit MC control
Number of episodes: 500
Number of steps: -1.0
