In [1]:
import gymnasium as gym
import numpy as np
if not hasattr(np, "bool8"):
    np.bool8 = np.bool_
from stable_baselines3 import DQN

In [None]:
env = gym.make("CartPole-v1")

model = DQN(
    policy="MlpPolicy",
    env=env,
    learning_rate=5e-4,           
    buffer_size=50000,               
    learning_starts=1000,
    batch_size=64,                   
    tau=1.0,
    gamma=0.995,                    
    train_freq=1,
    target_update_interval=500,      
    exploration_fraction=0.05,       
    exploration_final_eps=0.01,      
    max_grad_norm=10,
    policy_kwargs=dict(net_arch=[128, 128]),
    verbose=1
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [3]:

model.learn(total_timesteps=100000)

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 16       |
|    ep_rew_mean      | 16       |
|    exploration_rate | 0.987    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3638     |
|    time_elapsed     | 0        |
|    total_timesteps  | 64       |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 19.5     |
|    ep_rew_mean      | 19.5     |
|    exploration_rate | 0.969    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3595     |
|    time_elapsed     | 0        |
|    total_timesteps  | 156      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 20.6     |
|    ep_rew_mean      | 20.6     |
|    exploration_rate | 0.951    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 3770     |
|    time_elapsed     | 0        |
|    total_timesteps  | 247      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 20.2     |
|    ep_rew_mean      | 20.2     |
|    exploration_rate | 0.936    |
| time/               |          |
|    episodes       

<stable_baselines3.dqn.dqn.DQN at 0x7763dfd57c20>

In [4]:

model.save("dqn_lunarlander")

In [5]:

model = DQN.load("dqn_lunarlander")

In [6]:
# Evaluate the trained agent
env = gym.make("CartPole-v1",render_mode="human")
observation,info = env.reset()

while(True):
    action, _ = model.predict(observation, deterministic=True)
    observation, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    env.render()
    if done:
        break


  from pkg_resources import resource_stream, resource_exists


In [7]:
env = gym.make("CartPole-v1")
for i in range(5):
    obs, info = env.reset()
    done = False
    steps = 0

    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        steps += 1

    print(f"Episode {i+1}: {steps} steps")

env.close()


Episode 1: 500 steps
Episode 2: 500 steps
Episode 3: 500 steps
Episode 4: 500 steps
Episode 5: 500 steps
