From 48c8409bb5da1d375df84d8750a1646d8fba3ad8 Mon Sep 17 00:00:00 2001
From: Ruslan Kuprieiev
Date: Wed, 1 Jun 2022 01:36:42 +0300
Subject: [PATCH] fs: compute size lazily

We are effectively reading every object in the tree that we parse right
now, which takes a very long time. Before fsspec migration, we used to
not use info/size during walking at all.

Cuts down `dvc exp show -A` time for me from ~50sec down to ~3sec.

Related iterative/dvc#7824
---
 scmrepo/fs.py          | 10 ++++++----
 scmrepo/git/objects.py | 24 +++++++++++++++++-------
 scmrepo/utils.py       | 25 +++++++++++++++++++++++++
 3 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/scmrepo/fs.py b/scmrepo/fs.py
index f16053d9..b66ab46f 100644
--- a/scmrepo/fs.py
+++ b/scmrepo/fs.py
@@ -220,10 +220,12 @@ def _open(
     def info(self, path: str, **kwargs: Any) -> Dict[str, Any]:
         key = self._get_key(path)
         try:
-            return {
-                **self.trie.info(key),
-                "name": path,
-            }
+            # NOTE: to avoid wasting time computing object size, trie.info
+            # will return a LazyDict instance, that will compute size
+            # only when it is accessed.
+            ret = self.trie.info(key)
+            ret["name"] = path
+            return ret
         except KeyError:
             raise FileNotFoundError(
                 errno.ENOENT, os.strerror(errno.ENOENT), path
diff --git a/scmrepo/git/objects.py b/scmrepo/git/objects.py
index 2da033fe..c58d4df3 100644
--- a/scmrepo/git/objects.py
+++ b/scmrepo/git/objects.py
@@ -1,7 +1,7 @@
 import stat
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Iterable, List, Optional
+from typing import Iterable, List, Optional, cast
 
 from pygtrie import Trie
 
@@ -136,13 +136,23 @@ def walk(self, top: tuple, topdown: Optional[bool] = True):
             yield top, dirs, nondirs
 
     def info(self, key: tuple) -> dict:
+        from scmrepo.utils import LazyDict
+
         obj = self.trie[key]
-        return {
-            "size": obj.size,
-            "type": "directory" if stat.S_ISDIR(obj.mode) else "file",
-            "sha": obj.sha,
-            "mode": obj.mode,
-        }
+
+        def size():
+            return obj.size
+
+        ret = LazyDict(
+            {
+                "size": size,
+                "type": "directory" if stat.S_ISDIR(obj.mode) else "file",
+                "sha": obj.sha,
+                "mode": obj.mode,
+            }
+        )
+
+        return cast(dict, ret)
 
 
 @dataclass
diff --git a/scmrepo/utils.py b/scmrepo/utils.py
index fd6cc1cf..d6bf6768 100644
--- a/scmrepo/utils.py
+++ b/scmrepo/utils.py
@@ -1,4 +1,29 @@
 import os
+from collections.abc import MutableMapping
+
+
+class LazyDict(MutableMapping):
+    def __init__(self, values):
+        self._values = values
+
+    def __getitem__(self, item):
+        value = self._values[item]
+        if callable(value):
+            value = value()
+            self._values[item] = value
+        return value
+
+    def __setitem__(self, key, value):
+        self._values[key] = value
+
+    def __delitem__(self, key):
+        del self._values[key]
+
+    def __iter__(self):
+        return iter(self._values)
+
+    def __len__(self):
+        return len(self._values)
 
 
 def relpath(path, start=os.curdir):