# reference

https://d3rlpy.readthedocs.io/en/stable/tutorials/getting_started.html

In [1]:
!pip install d3rlpy



# prepare dataset

In [2]:
from d3rlpy.datasets import get_cartpole # CartPole-v1 dataset
from d3rlpy.datasets import get_pendulum # Pendulum-v1 dataset
from d3rlpy.datasets import get_atari    # Atari 2600 task datasets
from d3rlpy.datasets import get_d4rl     # D4RL datasets

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  from .autonotebook import tqdm as notebook_tqdm


# dataset

In [4]:
# Fetch the CartPole dataset together with the environment object that
# get_cartpole() returns alongside it; both are reused by the cells below.
dataset, env = get_cartpole()

Downloading cartpole.pkl into d3rlpy_data/cartpole_replay_v1.1.0.h5...
[2m2025-09-15 22:16.10[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int32')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(4,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(1,)])[0m
[2m2025-09-15 22:16.10[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.DISCRETE: 2>[0m
[2m2025-09-15 22:16.10[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m2[0m


# Algorithms

## DQN

In [6]:
from d3rlpy.algos import DQNConfig

import torch

# Choose the compute device at runtime instead of hard-coding Apple's "mps:0",
# so the notebook also runs on CUDA machines and on CPU-only machines.
if torch.cuda.is_available():
    device = "cuda:0"  # NVIDIA GPU
elif torch.backends.mps.is_available():
    device = "mps:0"  # Apple-silicon GPU
else:
    device = None  # no accelerator: d3rlpy falls back to CPU

dqn = DQNConfig().create(device=device)

# initialize neural networks with the given observation shape and action size.
# this is not necessary when you directly call fit or fit_online method.
dqn.build_with_dataset(dataset)

# Set up metrics



In [7]:
from d3rlpy.metrics import TDErrorEvaluator

# TD-error metric, evaluated over the episodes of the training dataset
td_error_evaluator = TDErrorEvaluator(
    episodes=dataset.episodes,
)

In [10]:
from d3rlpy.metrics import EnvironmentEvaluator
import gymnasium as gym 
# Wrap the environment in an evaluator object; the same object is later
# passed to dqn.fit() under the 'environment' key.
env_evaluator = EnvironmentEvaluator(env)

# Score the algorithm on the environment. This cell comes before dqn.fit(),
# so the result describes the freshly built, untrained network.
# NOTE(review): despite the plural name, `rewards` presumably holds a single
# aggregate score — confirm against the EnvironmentEvaluator documentation.
rewards = env_evaluator(dqn, dataset=None)

# start training

In [11]:
# Train on the offline dataset for 10000 steps; each evaluator's score is
# reported in the metrics under its keyword name.
dqn.fit(
    dataset,
    n_steps=10000,
    evaluators=dict(td_error=td_error_evaluator, environment=env_evaluator),
)

[2m2025-09-15 22:18.02[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(4,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)[0m
[2m2025-09-15 22:18.02[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DQN_20250915221802[0m
[2m2025-09-15 22:18.02[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [4], 'action_size': 2, 'config': {'type': 'dqn', 'params': {'batch_size': 32, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'none', 'params': {}}, 'compile_graph': False, 'learning_rate': 6.25e-05, 'optim_factory': {'type': 'adam', 'params': {'clip_grad_

Epoch 1/1: 100%|██████████| 10000/10000 [01:00<00:00, 164.07it/s, loss=0.0055]


[2m2025-09-15 22:19.23[0m [[32m[1minfo     [0m] [1mDQN_20250915221802: epoch=1 step=10000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.00023308279514312744, 'time_algorithm_update': 0.005771638035774231, 'loss': 0.005504346831805742, 'time_step': 0.006068303322792053, 'td_error': 0.9951041568644884, 'environment': 11.6}[0m [36mstep[0m=[35m10000[0m
[2m2025-09-15 22:19.23[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DQN_20250915221802/model_10000.d3[0m


[(1,
  {'time_sample_batch': 0.00023308279514312744,
   'time_algorithm_update': 0.005771638035774231,
   'loss': 0.005504346831805742,
   'time_step': 0.006068303322792053,
   'td_error': 0.9951041568644884,
   'environment': 11.6})]

Once the training is done, your algorithm is ready to make decisions.

In [13]:
import numpy as np # added
observation, _ = env.reset()

# greedy action for a batch that holds just this one observation
action = dqn.predict(observation[np.newaxis, ...])

# action-value estimate for that observation/action pair
value = dqn.predict_value(observation[np.newaxis, ...], action)

# Save and load models

d3rlpy provides several ways to save trained models.

In [None]:
import d3rlpy

# save full parameters and configurations in a single file.
dqn.save('dqn.d3')
# load full parameters and build algorithm
dqn2 = d3rlpy.load_learnable("dqn.d3")

# save full parameters only
dqn.save_model('dqn.pt')
# load full parameters with manual setup: create the algorithm from its
# config (a bare `DQN()` is not defined in this notebook), build the networks
# from the dataset, then load the saved weights into them.
dqn3 = d3rlpy.algos.DQNConfig().create()
dqn3.build_with_dataset(dataset)
dqn3.load_model('dqn.pt')

# save the greedy-policy as TorchScript
dqn.save_policy('policy.pt')
# save the greedy-policy as ONNX
dqn.save_policy('policy.onnx')