# Mountain Car

https://gymnasium.farama.org/environments/classic_control/mountain_car/

Classe MountainCarAgent

Esta classe encapsula o agente de Q-learning, definindo a Q-table inicial (com valores aleatórios entre –2 e 0), hiperparâmetros como taxa de aprendizado (α), fator de desconto (γ) e parâmetros de exploração (ε, ε_min e decaimento). Métodos internos permitem escolher ações seguindo a política ε-greedy, atualizar os valores de Q e reduzir gradualmente ε ao longo dos episódios.

In [None]:
import gymnasium as gym
import numpy as np
import os
import shutil
from moviepy.editor import VideoFileClip, concatenate_videoclips

class MountainCarAgent:
    """Tabular Q-learning agent that follows an epsilon-greedy policy.

    The Q-table has one axis per discretized state dimension followed by a
    final axis holding one value per action. It is initialised with values
    drawn uniformly at random from [-2, 0).
    """

    def __init__(
        self,
        num_states,
        num_actions,
        learning_rate=0.1,
        discount_factor=0.95,
        exploration_rate=1.0,
        exploration_decay_rate=0.999,
        min_exploration_rate=0.01,
    ):
        """Store the hyperparameters and build the initial Q-table.

        Args:
            num_states: Iterable with the number of buckets per state
                dimension (for example ``(20, 20)``).
            num_actions: Number of discrete actions.
            learning_rate: Step size (alpha) of the Q-value update.
            discount_factor: Discount (gamma) applied to future value.
            exploration_rate: Initial probability (epsilon) of acting randomly.
            exploration_decay_rate: Factor epsilon is multiplied by on every
                call to ``decay_exploration_rate``.
            min_exploration_rate: Lower bound for epsilon.
        """
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay_rate = exploration_decay_rate
        self.min_exploration_rate = min_exploration_rate

        self.q_table = np.random.uniform(low=-2, high=0, size=(*num_states, num_actions))

    def choose_action(self, state):
        """Return an action index for ``state`` using the epsilon-greedy rule.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest Q-value is returned.

        Args:
            state: Tuple of bucket indices addressing one Q-table cell row.

        Returns:
            The chosen action as a plain ``int``.
        """
        if np.random.random() < self.exploration_rate:
            return np.random.randint(0, self.num_actions)
        # int() so both branches return the same Python type.
        return int(np.argmax(self.q_table[state]))

    def update_q_table(self, state, action, reward, next_state, terminated=False):
        """Apply one Q-learning update for the given transition.

        Args:
            state: Discretized state the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Discretized state reached after the action.
            terminated: Set to True when ``next_state`` is terminal. No future
                value is bootstrapped in that case. Defaults to False, which
                keeps the behaviour of callers that do not pass it.
        """
        current_q = self.q_table[state][action]
        # A terminal state has no successor, so its future value is zero.
        max_future_q = 0.0 if terminated else np.max(self.q_table[next_state])

        # Standard update: Q <- Q + alpha * (r + gamma * max_a' Q(s', a') - Q)
        new_q = current_q + self.learning_rate * (
            reward + self.discount_factor * max_future_q - current_q
        )
        self.q_table[state][action] = new_q

    def decay_exploration_rate(self):
        """Multiply epsilon by the decay rate, never going below the minimum."""
        self.exploration_rate = max(
            self.min_exploration_rate,
            self.exploration_rate * self.exploration_decay_rate,
        )



  if event.key is 'enter':



Classe EnvironmentWrapper

Como o estado do MountainCar é contínuo (posição e velocidade), essa classe realiza sua discretização em uma grade de “buckets” (padrão de 20×20). No construtor, ela coleta os limites do ambiente e gera intervalos (np.linspace) para cada dimensão. O método discretize_state converte um estado contínuo em índices inteiros de bucket usando np.digitize, enquanto get_num_actions e get_state_space_size expõem, respectivamente, o número de ações possíveis e o formato do espaço de estados discretizados.

In [None]:
class EnvironmentWrapper:
    """Discretizes the continuous observations of a Gymnasium environment.

    One array of bin edges is built per observation dimension from the
    environment's observation-space bounds. ``discretize_state`` maps a
    continuous observation to a tuple of integer bucket indices.
    """

    def __init__(self, env_name, buckets=(20, 20)):
        """Read the observation bounds of ``env_name`` and build the bin edges.

        Args:
            env_name: Environment id passed to ``gym.make``.
            buckets: Number of edges generated per observation dimension;
                also reported as the discretized state-space shape.
        """
        self.env = gym.make(env_name)

        self.buckets = buckets

        # One evenly spaced array of `num` edges per observation dimension.
        self.state_bins = [
            np.linspace(low, high, num)
            for low, high, num in zip(
                self.env.observation_space.low,
                self.env.observation_space.high,
                self.buckets,
            )
        ]
        # The environment is closed here, but the object is kept because
        # get_num_actions() still reads its action space.
        self.env.close()

    def discretize_state(self, state):
        """Convert a continuous observation into a tuple of bucket indices.

        Each index is clamped to ``[0, len(edges) - 1]``. Without the clamp a
        value below the lowest edge would map to -1, which silently wraps
        around to the last bucket when used as an array index.

        Args:
            state: Sequence with one continuous value per dimension.

        Returns:
            Tuple of ``int`` bucket indices, one per dimension.
        """
        discretized = []
        for i, value in enumerate(state):
            edges = self.state_bins[i]
            bin_index = int(np.digitize(value, edges)) - 1
            discretized.append(min(max(bin_index, 0), len(edges) - 1))
        return tuple(discretized)

    def get_num_actions(self):
        """Return the number of discrete actions of the wrapped environment."""
        return self.env.action_space.n

    def get_state_space_size(self):
        """Return the bucket counts per dimension (the discretized state shape)."""
        return self.buckets




Função train(agent, env_wrapper, episodes)



Responsável pelo ciclo de aprendizado, a função cria o ambiente em cada execução e, para cada episódio, reinicia o estado, discretiza-o e então entra em um loop de interação até o término do episódio. A cada passo, seleciona uma ação (ε-greedy), aplica env.step(), recebe a recompensa e o próximo estado, atualiza a Q-table com update_q_table e repete. Após cada episódio, decai a taxa de exploração. A cada 5 000 episódios, exibe no console o progresso e o valor atual de ε para monitorar a convergência.

In [None]:
def train(agent, env_wrapper, episodes=25000):
    """Train ``agent`` with Q-learning for the given number of episodes.

    A fresh environment is created from the id stored in ``env_wrapper``.
    Each episode runs until the environment reports termination or
    truncation; the exploration rate is decayed once per episode and
    progress is printed every 5000 episodes.

    Args:
        agent: Object providing ``choose_action``, ``update_q_table``,
            ``decay_exploration_rate`` and ``exploration_rate``.
        env_wrapper: Wrapper providing ``env.spec.id`` and
            ``discretize_state``.
        episodes: Number of training episodes.
    """
    env = gym.make(env_wrapper.env.spec.id)
    try:
        for episode in range(episodes):
            state_continuous, _ = env.reset()
            state_discrete = env_wrapper.discretize_state(state_continuous)

            terminated = False
            truncated = False

            while not terminated and not truncated:
                action = agent.choose_action(state_discrete)
                next_state_continuous, reward, terminated, truncated, _ = env.step(action)
                next_state_discrete = env_wrapper.discretize_state(next_state_continuous)

                agent.update_q_table(state_discrete, action, reward, next_state_discrete)
                state_discrete = next_state_discrete

            agent.decay_exploration_rate()

            if (episode + 1) % 5000 == 0:
                print(f"Episode: {episode + 1}, Exploration Rate: {agent.exploration_rate:.4f}")
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

In [None]:
def demonstrate(agent, env_name, episodes=5, output_filename="mountaincar_demonstration.mp4"):
    """Run the agent greedily, record every episode and merge the recordings.

    Each episode is recorded into a temporary ``videos_temp`` folder, the
    per-episode files are concatenated in episode order into
    ``output_filename``, and the temporary folder is removed afterwards.

    The agent's exploration rate is set to 0 only while the episodes run and
    is restored before returning, so the caller's agent is left unchanged.

    Args:
        agent: Trained agent providing ``choose_action`` and
            ``exploration_rate``.
        env_name: Environment id passed to ``gym.make``.
        episodes: Number of episodes to record.
        output_filename: Path of the merged video file.
    """
    video_folder = "videos_temp"
    if os.path.exists(video_folder):
        shutil.rmtree(video_folder)

    env = gym.make(env_name, render_mode="rgb_array")
    # The trigger always returns True, so every episode is recorded.
    env = gym.wrappers.RecordVideo(env, video_folder, episode_trigger=lambda e: True, name_prefix="rl-video")

    env_wrapper_demo = EnvironmentWrapper(env_name)

    previous_exploration_rate = agent.exploration_rate
    agent.exploration_rate = 0.0  # act greedily during the demonstration
    try:
        for _ in range(episodes):
            state_continuous, _ = env.reset()
            state_discrete = env_wrapper_demo.discretize_state(state_continuous)

            terminated = False
            truncated = False

            while not terminated and not truncated:
                action = agent.choose_action(state_discrete)
                next_state_continuous, _, terminated, truncated, _ = env.step(action)
                state_discrete = env_wrapper_demo.discretize_state(next_state_continuous)
    finally:
        agent.exploration_rate = previous_exploration_rate
        env.close()

    try:
        # Sort by the number after the last '-' in the file name.
        video_files = sorted(
            [f for f in os.listdir(video_folder) if f.endswith(".mp4")],
            key=lambda x: int(x.split('-')[-1].split('.')[0])
        )

        if not video_files:
            print("No video files found to concatenate.")
            return

        clips = []
        try:
            for file in video_files:
                clips.append(VideoFileClip(os.path.join(video_folder, file)))

            final_clip = concatenate_videoclips(clips)
            final_clip.write_videofile(output_filename, codec="libx264", fps=30)
        finally:
            # Close the readers even when concatenation or encoding fails.
            for clip in clips:
                clip.close()
    finally:
        # Always remove the temporary recordings.
        if os.path.isdir(video_folder):
            shutil.rmtree(video_folder)

    print(f"\nDemonstration video saved to {output_filename}")

In [None]:
if __name__ == "__main__":
    # Environment id shared by the training and demonstration steps.
    ENV_NAME = "MountainCar-v0"

    # The wrapper supplies the discretized state shape and the action count.
    env_wrapper_train = EnvironmentWrapper(ENV_NAME)
    agent = MountainCarAgent(
        env_wrapper_train.get_state_space_size(),
        env_wrapper_train.get_num_actions(),
    )

    print("Training agent...")
    train(agent, env_wrapper_train, 25000)

Training agent...
Episode: 5000, Exploration Rate: 0.0100
Episode: 10000, Exploration Rate: 0.0100
Episode: 15000, Exploration Rate: 0.0100
Episode: 20000, Exploration Rate: 0.0100
Episode: 25000, Exploration Rate: 0.0100


In [None]:
# Show the dimensions of the Q-table first, then its raw values.
print(f"Shape da Q-table: {agent.q_table.shape}")
print(agent.q_table)

In [None]:
# Record five episodes with the trained agent and merge them into a single file.
print("\nGenerating demonstration video...")
demonstrate(agent, ENV_NAME, 5, "mountaincar_demonstration.mp4")


Generating demonstration video...
Moviepy - Building video mountaincar_demonstration.mp4.
Moviepy - Writing video mountaincar_demonstration.mp4






Moviepy - Done !
Moviepy - video ready mountaincar_demonstration.mp4

Demonstration video saved to mountaincar_demonstration.mp4


In [None]:
from IPython.display import Video, display
import os

VIDEO_FILENAME = "mountaincar_demonstration.mp4"

# Report a missing file first; otherwise embed the video in the notebook output.
if not os.path.exists(VIDEO_FILENAME):
    print(f"\nErro: O arquivo de vídeo '{VIDEO_FILENAME}' não foi encontrado.")
else:
    print(f"\nExibindo o vídeo '{VIDEO_FILENAME}':")
    video = Video(VIDEO_FILENAME, embed=True, width=600)
    display(video)




Exibindo o vídeo 'mountaincar_demonstration.mp4':
