diff --git a/intake_dal/dal_catalog.py b/intake_dal/dal_catalog.py
index 753343e..a16cd88 100644
--- a/intake_dal/dal_catalog.py
+++ b/intake_dal/dal_catalog.py
@@ -20,13 +20,16 @@ class DalCatalog(NestedYAMLFileCatalog):
     name = "dal_cat"
     version = pkg_resources.get_distribution("intake-dal").version
 
-    def __init__(self, path, storage_mode=None, autoreload=True, **kwargs):
+    def __init__(self, path=None, catalog_data=None, storage_mode=None, autoreload=True, **kwargs):
         """
         Parameters
         ----------
         path: str
             Location of the file to parse (can be remote)
-        reload : bool
+        catalog_data: dict
+            If the catalog is already in memory, pass it through `catalog_data` to populate the intake catalog.
+            If the catalog lives in a local file or at a URL, use the `path` argument instead.
+        autoreload: bool
             Whether to watch the source file for changes; make False if you want
             an editable Catalog
         storage_mode: str
@@ -49,7 +52,18 @@ def __init__(self, path, storage_mode=None, autoreload=True, **kwargs):
         >>> df = cat.user_events.read()
         """
         self.storage_mode = storage_mode
-        super(DalCatalog, self).__init__(path, autoreload, **kwargs)
+
+        self.is_path = False
+        if catalog_data and not path:
+            # Catalog data was passed without a path: use the in-memory data.
+            self.path_or_catalog = catalog_data
+        else:
+            # A path or URL was passed (or nothing at all).
+            # In this case, catalog_data is ignored.
+            self.path_or_catalog = path
+            self.is_path = True
+
+        super(DalCatalog, self).__init__(self.path_or_catalog, autoreload, **kwargs)
 
     def __getitem__(self, key):
         # TODO(Taleb Zeghmi): Remove once https://github.com/zillow/intake-nested-yaml-catalog/issues/6 is resolved
@@ -59,6 +73,16 @@ def __getitem__(self, key):
         ret = super().__getitem__(key)
         return ret
 
+    def _load(self, reload=False):
+        if self.is_path:
+            # File path or URL: let the parent load and parse it, honoring `reload`.
+            super()._load(reload=reload)
+        else:
+            # In-memory catalog data needs no directory/URL information,
+            # so set self._dir to an empty value before parsing.
+            self._dir = ""
+            self.parse(yaml.dump(self.path_or_catalog))
+
     def parse(self, text):
         data = yaml_load(text)
 
diff --git a/intake_dal/tests/conftest.py b/intake_dal/tests/conftest.py
index 7b7ed0e..4dd3070 100644
--- a/intake_dal/tests/conftest.py
+++ b/intake_dal/tests/conftest.py
@@ -10,6 +10,11 @@ def catalog_path():
     return str(Path(__file__).resolve().parent.joinpath(Path("catalog.yaml")))
 
 
+@pytest.fixture
+def remote_catalog_path():
+    return str(Path(__file__).resolve().parent.joinpath(Path("remote_storage_catalog.yaml")))
+
+
 @pytest.fixture
 def serving_cat(catalog_path: str):
     return DalCatalog(catalog_path, storage_mode="serving")
diff --git a/intake_dal/tests/remote_storage_catalog.yaml b/intake_dal/tests/remote_storage_catalog.yaml
new file mode 100644
index 0000000..9755ccc
--- /dev/null
+++ b/intake_dal/tests/remote_storage_catalog.yaml
@@ -0,0 +1,27 @@
+name: My Sample Catalog
+metadata:
+  hierarchical_catalog: true
+entity:
+  property:
+    user_event:
+      args:
+        default: batch
+        storage:
+          batch: parquet://https://my_storage.com/user_event/date={{date}}/*.parquet
+          golden: parquet://https://my_storage.com/user_event/golden/date={{date}}/*.parquet
+      description: This is user_event description
+      driver: dal
+      metadata:
+        owner_team: my-team
+        public: false
+    user_dataset:
+      args:
+        default: batch
+        storage:
+          batch: parquet://https://my_storage.com/user_dataset/date={{date}}/*.parquet
+          serving: dal-online://https://featurestore.url.net#userid
+      description: This is user_dataset description
+      driver: dal
+      metadata:
+        owner_team: my-team
+        public: false
\ No newline at end of file
diff --git a/intake_dal/tests/test_dal_catalog.py b/intake_dal/tests/test_dal_catalog.py
index 12e71a0..e24a936 100644
--- a/intake_dal/tests/test_dal_catalog.py
+++ b/intake_dal/tests/test_dal_catalog.py
@@ -1,5 +1,5 @@
+import yaml
 import pandas as pd
-
 from intake_dal.dal_catalog import DalCatalog
 
 
@@ -69,3 +69,17 @@ def validate_dataset(ds):
     validate_dataset(cat["entity.user.user_events"])
     validate_dataset(cat.entity["user.user_events"])
     validate_dataset(cat.entity.user["user_events"])
+
+
+def test_dal_catalog_passing_dict(remote_catalog_path):
+    with open(remote_catalog_path, 'r') as f:
+        data = yaml.safe_load(f)
+
+    # Pass the catalog data read from the file instead of passing a path.
+    cat = DalCatalog(catalog_data=data, storage_mode="golden")
+
+    assert cat.entity.property.user_event.default == "golden"
+    assert cat.entity.property.user_dataset.default == "golden"
+
+    assert len(cat.entity.property.user_event.storage) == 2
+    assert len(cat.entity.property.user_dataset.storage) == 2