From c1ac21be22bd9572beeb1ffe4d5d0d6184e3959c Mon Sep 17 00:00:00 2001 From: Stefan Doerr Date: Wed, 27 Mar 2024 16:51:39 +0200 Subject: [PATCH 1/4] support a zip of ckpt files for ensemble models --- torchmdnet/models/model.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/torchmdnet/models/model.py b/torchmdnet/models/model.py index c090f90f..eed99024 100644 --- a/torchmdnet/models/model.py +++ b/torchmdnet/models/model.py @@ -144,7 +144,7 @@ def load_model(filepath, args=None, device="cpu", return_std=False, **kwargs): If a list of paths is given, an :py:mod:`Ensemble` model is returned. Args: - filepath (str or list): Path to the checkpoint file or a list of paths. + filepath (str or list): Path to the checkpoint file or a list of paths or a zip of checkpoints. args (dict, optional): Arguments for the model. Defaults to None. device (str, optional): Device on which the model should be loaded. Defaults to "cpu". return_std (bool, optional): Whether to return the standard deviation of an Ensemble model. Defaults to False. 
@@ -159,6 +159,23 @@ def load_model(filepath, args=None, device="cpu", return_std=False, **kwargs): return_std=return_std, ) + if filepath.endswith(".zip"): + import zipfile + import tempfile + from glob import glob + import os + + with tempfile.TemporaryDirectory() as tmpdir: + with zipfile.ZipFile(filepath, "r") as z: + z.extractall(tmpdir) + + filepath = glob(os.path.join(tmpdir, "*.ckpt")) + + return Ensemble( + [load_model(f, args=args, device=device, **kwargs) for f in filepath], + return_std=return_std, + ) + ckpt = torch.load(filepath, map_location="cpu") if args is None: args = ckpt["hyper_parameters"] From 9ece01b87d04168a5809bf3c660df69bc4b98262 Mon Sep 17 00:00:00 2001 From: Stefan Doerr Date: Wed, 27 Mar 2024 16:57:36 +0200 Subject: [PATCH 2/4] add test --- tests/test_model.py | 60 ++++++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 1dd5e354..f606559e 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -116,26 +116,28 @@ def test_cuda_graph_compatible(model_name): if not torch.cuda.is_available(): pytest.skip("CUDA not available") z, pos, batch = create_example_batch() - args = {"model": model_name, - "embedding_dimension": 128, - "num_layers": 2, - "num_rbf": 32, - "rbf_type": "expnorm", - "trainable_rbf": False, - "activation": "silu", - "cutoff_lower": 0.0, - "cutoff_upper": 5.0, - "max_z": 100, - "max_num_neighbors": 128, - "equivariance_invariance_group": "O(3)", - "prior_model": None, - "atom_filter": -1, - "derivative": True, - "check_errors": False, - "static_shapes": True, - "output_model": "Scalar", - "reduce_op": "sum", - "precision": 32 } + args = { + "model": model_name, + "embedding_dimension": 128, + "num_layers": 2, + "num_rbf": 32, + "rbf_type": "expnorm", + "trainable_rbf": False, + "activation": "silu", + "cutoff_lower": 0.0, + "cutoff_upper": 5.0, + "max_z": 100, + "max_num_neighbors": 128, + 
"equivariance_invariance_group": "O(3)", + "prior_model": None, + "atom_filter": -1, + "derivative": True, + "check_errors": False, + "static_shapes": True, + "output_model": "Scalar", + "reduce_op": "sum", + "precision": 32, + } model = create_model(args).to(device="cuda") model.eval() z = z.to("cuda") @@ -260,3 +262,21 @@ def test_ensemble(): assert neg_dy_std.shape == deriv.shape assert (y_std == 0).all() assert (neg_dy_std == 0).all() + + import zipfile + import tempfile + + with tempfile.TemporaryDirectory() as tmpdir: + ensemble_zip = join(tmpdir, "ensemble.zip") + with zipfile.ZipFile(ensemble_zip, "w") as zipf: + for i, ckpt in enumerate(ckpts): + zipf.write(ckpt, f"model_{i}.ckpt") + ensemble_model = load_model(ensemble_zip, return_std=True) + pred_ensemble, deriv_ensemble, y_std, neg_dy_std = ensemble_model(z, pos, batch) + + torch.testing.assert_close(pred, pred_ensemble, atol=1e-5, rtol=1e-5) + torch.testing.assert_close(deriv, deriv_ensemble, atol=1e-5, rtol=1e-5) + assert y_std.shape == pred.shape + assert neg_dy_std.shape == deriv.shape + assert (y_std == 0).all() + assert (neg_dy_std == 0).all() From 78482a8a91ebbedff4fa461e2bf95ebde342af4a Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Thu, 28 Mar 2024 07:35:33 +0100 Subject: [PATCH 3/4] Update docstring --- torchmdnet/models/model.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/torchmdnet/models/model.py b/torchmdnet/models/model.py index eed99024..9ec852b6 100644 --- a/torchmdnet/models/model.py +++ b/torchmdnet/models/model.py @@ -142,16 +142,21 @@ def create_model(args, prior_model=None, mean=None, std=None): def load_model(filepath, args=None, device="cpu", return_std=False, **kwargs): """Load a model from a checkpoint file. - If a list of paths is given, an :py:mod:`Ensemble` model is returned. + If a list of paths or a path to a zip file is given, an :py:mod:`Ensemble` model is returned. 
Args: - filepath (str or list): Path to the checkpoint file or a list of paths or a zip of checkpoints. + filepath (str or list): Can be any of the following: + + - Path to a checkpoint file. In this case, a :py:mod:`TorchMD_Net` model is returned. + - Path to a zip file containing multiple checkpoint files. In this case, an :py:mod:`Ensemble` model is returned. + - List of paths to checkpoint files. In this case, an :py:mod:`Ensemble` model is returned. + args (dict, optional): Arguments for the model. Defaults to None. device (str, optional): Device on which the model should be loaded. Defaults to "cpu". return_std (bool, optional): Whether to return the standard deviation of an Ensemble model. Defaults to False. **kwargs: Extra keyword arguments for the model. Returns: - nn.Module: An instance of the TorchMD_Net model. + nn.Module: An instance of the TorchMD_Net model or an Ensemble model. """ if isinstance(filepath, (list, tuple)): return Ensemble( From 1a7b2746c48d55124ff5e2fb92fee0f9f85894cb Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Thu, 28 Mar 2024 07:58:26 +0100 Subject: [PATCH 4/4] Move Ensemble loading to a different function --- torchmdnet/models/model.py | 73 ++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 23 deletions(-) diff --git a/torchmdnet/models/model.py b/torchmdnet/models/model.py index 9ec852b6..91369304 100644 --- a/torchmdnet/models/model.py +++ b/torchmdnet/models/model.py @@ -1,8 +1,10 @@ # Copyright Universitat Pompeu Fabra 2020-2023 https://www.compscience.org # Distributed under the MIT License. 
# (See accompanying file README.md file or copy at http://opensource.org/licenses/MIT) - +from glob import glob +import os import re +import tempfile from typing import Optional, List, Tuple, Dict import torch from torch.autograd import grad @@ -13,6 +15,7 @@ from torchmdnet import priors from lightning_utilities.core.rank_zero import rank_zero_warn import warnings +import zipfile def create_model(args, prior_model=None, mean=None, std=None): @@ -139,6 +142,47 @@ def create_model(args, prior_model=None, mean=None, std=None): return model +def load_ensemble(filepath, args=None, device="cpu", return_std=False, **kwargs): + """Load an ensemble of models from a list of checkpoint files or a zip file. + + Args: + filepath (str or list): Can be any of the following: + + - Path to a zip file containing multiple checkpoint files. + - List of paths to checkpoint files. + + args (dict, optional): Arguments for the model. Defaults to None. + device (str, optional): Device on which the model should be loaded. Defaults to "cpu". + return_std (bool, optional): Whether to return the standard deviation of the predictions. Defaults to False. + **kwargs: Extra keyword arguments for the model, will be passed to :py:mod:`load_model`. + + Returns: + nn.Module: An instance of :py:mod:`Ensemble`. + """ + if isinstance(filepath, (list, tuple)): + assert all(isinstance(f, str) for f in filepath), "Invalid filepath list." + model_list = [ + load_model(f, args=args, device=device, **kwargs) for f in filepath + ] + elif filepath.endswith(".zip"): + with tempfile.TemporaryDirectory() as tmpdir: + with zipfile.ZipFile(filepath, "r") as z: + z.extractall(tmpdir) + ckpt_list = glob(os.path.join(tmpdir, "*.ckpt")) + assert len(ckpt_list) > 0, "No checkpoint files found in zip file." + model_list = [ + load_model(f, args=args, device=device, **kwargs) for f in ckpt_list + ] + else: + raise ValueError( + "Invalid filepath. Must be a list of paths or a path to a zip file." 
+ ) + return Ensemble( + model_list, + return_std=return_std, + ) + + def load_model(filepath, args=None, device="cpu", return_std=False, **kwargs): """Load a model from a checkpoint file. @@ -158,29 +202,12 @@ def load_model(filepath, args=None, device="cpu", return_std=False, **kwargs): Returns: nn.Module: An instance of the TorchMD_Net model or an Ensemble model. """ - if isinstance(filepath, (list, tuple)): - return Ensemble( - [load_model(f, args=args, device=device, **kwargs) for f in filepath], - return_std=return_std, + isEnsemble = isinstance(filepath, (list, tuple)) or filepath.endswith(".zip") + if isEnsemble: + return load_ensemble( + filepath, args=args, device=device, return_std=return_std, **kwargs ) - - if filepath.endswith(".zip"): - import zipfile - import tempfile - from glob import glob - import os - - with tempfile.TemporaryDirectory() as tmpdir: - with zipfile.ZipFile(filepath, "r") as z: - z.extractall(tmpdir) - - filepath = glob(os.path.join(tmpdir, "*.ckpt")) - - return Ensemble( - [load_model(f, args=args, device=device, **kwargs) for f in filepath], - return_std=return_std, - ) - + assert isinstance(filepath, str) ckpt = torch.load(filepath, map_location="cpu") if args is None: args = ckpt["hyper_parameters"]