Feature graph options (#45)

* variable for graph * ncbi_connect config * config graph * config for graph * test graph configs * fix config graph_end * fix path test_tranform * implement legend colours * gradient to single str value * add temp directory where missing * increased timestamp precision for legend file name * ignore config warnings * change config to global accession config * test cases * adapt test cases to new config for accession assets * ran flake
vestalisvirginis · Jan 12, 2024 · 471e298 · 471e298
1 parent 2f03727
commit 471e298
Show file tree

Hide file tree

Showing 31 changed files with 2,350 additions and 196,099 deletions.
diff --git a/Makefile b/Makefile
@@ -15,6 +15,7 @@ clean: # Remove workspace files
 	@rm -rf test/fixtures/assets_testing_folder/blasting/gene_identity
 	@rm -rf test/fixtures/assets_testing_folder/blasting_with_history/gene_identity
 	@rm -rf test/fixtures/assets_testing_folder/synteny/synteny
+	@rm -rf test/fixtures/assets_testing_folder/synteny_var/synteny
 	@rm -rf test/fixtures/assets_testing_folder/transform/fs
 	@rm -rf test/fixtures/assets_testing_folder/transform/tables
 	@rm -rf test/fixtures/assets_testing_folder/transform_2/tables

diff --git a/conftest.py b/conftest.py
@@ -14,11 +14,6 @@ def mock_env_ncbi_download_pos(monkeypatch):
 def mock_env_ncbi_download_neg(monkeypatch):
     monkeypatch.setenv("DATA_DIR", "test/fixtures/ncbi_download/negative")
 
-@pytest.fixture
-def mock_env_ncbi_count(monkeypatch):
-    monkeypatch.setenv("DATABASE", "nuccore")
-    monkeypatch.setenv("KEYWORD", "Bacillus subtilis strain P9_B1")
-
 @pytest.fixture
 def mock_env_ncbi_fetch(monkeypatch):
     monkeypatch.setenv("DATA_DIR", "test/fixtures/ncbi_download/fetch")
@@ -42,7 +37,6 @@ def mock_env_phagy_dir_blasting_with_history(monkeypatch):
 @pytest.fixture
 def mock_env_phagy_dir_transform(monkeypatch):
     monkeypatch.setenv("DATA_DIR", "test/fixtures/assets_testing_folder/transform")
-    monkeypatch.setenv("FILE_SYSTEM", "fs")
 
 @pytest.fixture
 def mock_env_phagy_dir_transform_step3(monkeypatch):
@@ -51,7 +45,6 @@ def mock_env_phagy_dir_transform_step3(monkeypatch):
 @pytest.fixture
 def mock_env_phagy_dir_synteny(monkeypatch):
     monkeypatch.setenv("DATA_DIR", "test/fixtures/assets_testing_folder/synteny")
-    monkeypatch.setenv("SEQUENCE_FILE", "sequences.csv")
 
 @pytest.fixture
 def mock_env_phagy_dir_none(monkeypatch):

diff --git a/legend.svg b/legend.svg
diff --git a/synphage/assets/blaster/n_blaster.py b/synphage/assets/blaster/n_blaster.py
@@ -93,7 +93,7 @@ def genbank_to_fasta(context, standardised_ext_file) -> tuple[List[str], List[st
     )
 
     _path_history = (
-        Path(os.getenv(EnvVar("DATA_DIR")))
+        Path(os.getenv(EnvVar("DATA_DIR"), TEMP_DIR))
         / context.op_config["fs"]
         / "history_fasta_files"
     )

diff --git a/synphage/assets/ncbi_connect/accession.py b/synphage/assets/ncbi_connect/accession.py
@@ -1,4 +1,5 @@
-from dagster import asset, Field, AssetObservation, EnvVar
+from dagster import asset, AssetObservation, EnvVar, Config, ConfigArgumentWarning
+from pydantic import Field
 
 import os
 import tempfile
@@ -8,6 +9,10 @@
 from typing import List
 from pathlib import Path
 from datetime import datetime
+import warnings
+
+
+warnings.filterwarnings("ignore", category=ConfigArgumentWarning)
 
 
 TEMP_DIR = tempfile.gettempdir()
@@ -22,34 +27,47 @@ def _get_ncbi_count_result(result, dbname) -> NucleotideRecord:
     )
 
 
-ncbi_query_config = {
-    "database": Field(str, description="Database identifier", default_value="nuccore"),
-    "use_history": Field(
-        str, description="Yes/No value for history", default_value="y"
-    ),
-    "idtype": Field(str, description="Options for acc", default_value="acc"),
-    "rettype": Field(str, description="File format", default_value="gb"),  # gbwithparts
-}
+class QueryConfig(Config):
+    search_key: str = Field(
+        default="Myoalterovirus", description="Keyword(s) for NCBI query"
+    )
+    database: str = Field(default="nuccore", description="Database identifier")
+    use_history: str = Field(default="y", description="Yes/No value for history")
+    idtype: str = Field(default="acc", description="Options for acc")
+    rettype: str = Field(default="gb", description="File format")  # gbwithparts
+    download_dir: str = Field(default="download", description="Path to download folder")
+
+
+@asset(
+    description="Setup unique config for asset group",
+    compute_kind="Config",
+    metadata={"owner": "Virginie Grosboillot"},
+)
+def setup_query_config(config: QueryConfig) -> QueryConfig:
+    """Source/target dirs and    file output"""
+    return config
 
 
 @asset(
     required_resource_keys={"ncbi_connection"},
-    config_schema=ncbi_query_config,
+    # config_schema=ncbi_query_config,
     description="Getting the number of records matching the keyword(s) in the specified database",
     compute_kind="NCBI",
     io_manager_key="io_manager",
     metadata={"owner": "Virginie Grosboillot"},
 )
-def accession_count(context) -> int:
+def accession_count(context, setup_query_config: QueryConfig) -> int:
     # Search key - default: Myoalterovirus (2 entries in NCBI database Jan 2024)
-    keyword = os.getenv(EnvVar("KEYWORD"), "Myoalterovirus")
+    # keyword = context.op_config["search_key"]
+    keyword = setup_query_config.search_key
     context.log.info(f"Search key(s): {keyword}")
     # Query
     _query = context.resources.ncbi_connection.conn.egquery(term=keyword)
     _result = context.resources.ncbi_connection.conn.read(_query)
     _query.close()
     # Extract number of record for keyword
-    _nucleotide = _get_ncbi_count_result(_result, context.op_config["database"])
+    # _nucleotide = _get_ncbi_count_result(_result, context.op_config["database"])
+    _nucleotide = _get_ncbi_count_result(_result, setup_query_config.database)
     _num_rows = int(_nucleotide.count)
     context.log_event(
         AssetObservation(asset_key="accession_count", metadata={"num_rows": _num_rows})
@@ -70,24 +88,28 @@ def accession_count(context) -> int:
 
 @asset(
     required_resource_keys={"ncbi_connection"},
-    config_schema=ncbi_query_config,
+    # config_schema=ncbi_query_config,
     description="Getting all accession Ids corresponding to keyword(s)",
     compute_kind="NCBI",
     io_manager_key="io_manager",
     metadata={"owner": "Virginie Grosboillot"},
 )
-def accession_ids(context, accession_count) -> dict:
+def accession_ids(context, accession_count, setup_query_config: QueryConfig) -> dict:
     # Search key - default: Myoalterovirus (2 entries in NCBI database Jan 2024)
-    keyword = os.getenv(EnvVar("KEYWORD"), "Myoalterovirus")
+    # keyword = context.op_config["search_key"]
+    keyword = setup_query_config.search_key
     context.log.info(f"Search key(s): {keyword}")
     # Search
     context.log.info("Start NCBI database search")
     _search = context.resources.ncbi_connection.conn.esearch(
-        db=context.op_config["database"],
+        # db=context.op_config["database"],
+        db=setup_query_config.database,
         term=keyword,
         retmax=accession_count,
-        usehistory=context.op_config["use_history"],
-        idtype=context.op_config["idtype"],
+        # usehistory=context.op_config["use_history"],
+        # idtype=context.op_config["idtype"],
+        usehistory=setup_query_config.use_history,
+        idtype=setup_query_config.idtype,
     )
     context.log.info("The searched is finished")
     _result = context.resources.ncbi_connection.conn.read(_search)
@@ -112,27 +134,28 @@ def accession_ids(context, accession_count) -> dict:
     return _result
 
 
-download_folder_config = {
-    "output_directory": Field(
-        str,
-        description="Path to folder",
-        default_value="download",
-    )
-}
+# download_folder_config = {
+#     "output_directory": Field(
+#         str,
+#         description="Path to folder",
+#         default_value="download",
+#     )
+# }
 
 
 @asset(
-    config_schema=download_folder_config,
+    # config_schema=download_folder_config,
     description="In case of multiple search, checked what sequence have already been downloeded",
     compute_kind="python",
     io_manager_key="io_manager",
     metadata={"owner": "Virginie Grosboillot"},
 )
-def downloaded_genomes(context) -> List[str]:
+def downloaded_genomes(context, setup_query_config: QueryConfig) -> List[str]:
     # Download directory
     _download_path = str(
         Path(os.getenv(EnvVar("DATA_DIR"), TEMP_DIR))
-        / context.op_config["output_directory"]
+        # / context.op_config["output_directory"]
+        / setup_query_config.download_dir
     )
     os.makedirs(_download_path, exist_ok=True)
     # List file in download directory
@@ -153,13 +176,15 @@ def downloaded_genomes(context) -> List[str]:
 
 @asset(
     required_resource_keys={"ncbi_connection"},
-    config_schema={**download_folder_config, **ncbi_query_config},
+    # config_schema={**download_folder_config, **ncbi_query_config},
     description="Download records one by one from the ncbi database",
     compute_kind="NCBI",
     io_manager_key="io_manager",
     metadata={"owner": "Virginie Grosboillot"},
 )
-def fetch_genome(context, accession_ids, downloaded_genomes) -> List[str]:
+def fetch_genome(
+    context, accession_ids, downloaded_genomes, setup_query_config: QueryConfig
+) -> List[str]:
     # Exclude already downloaded files
     _A = set(accession_ids["IdList"])
     _B = set(downloaded_genomes)
@@ -168,15 +193,18 @@ def fetch_genome(context, accession_ids, downloaded_genomes) -> List[str]:
     # Path to download
     _download_path = str(
         Path(os.getenv(EnvVar("DATA_DIR"), TEMP_DIR))
-        / context.op_config["output_directory"]
+        # / context.op_config["output_directory"]
+        / setup_query_config.download_dir
     )
     context.log.info(f"Path to download: {_download_path}")
     # Fetch and write files
     for _entry in list(_C):
         _r = context.resources.ncbi_connection.conn.efetch(
-            db=context.op_config["database"],
+            # db=context.op_config["database"],
+            db=setup_query_config.database,
             id=_entry,
-            rettype=context.op_config["rettype"],
+            # rettype=context.op_config["rettype"],
+            rettype=setup_query_config.rettype,
             retmax=1,
             webenv=accession_ids["WebEnv"],
             query_key=accession_ids["QueryKey"],

diff --git a/synphage/assets/ncbi_connect/sequence_quality_assessment.py b/synphage/assets/ncbi_connect/sequence_quality_assessment.py
@@ -34,11 +34,6 @@
         description="Path to folder containing the genbank files",
         default_value="genbank",
     ),
-    "fasta_dir": Field(
-        str,
-        description="Path to folder containing the fasta sequence files",
-        default_value=str(Path("gene_identity") / "fasta"),
-    ),
 }
 
 
@@ -67,7 +62,7 @@
 def sequence_check(context, fetch_genome) -> tuple[List[str], List[str]]:
     # history check
     _path_history = (
-        Path(os.getenv(EnvVar("DATA_DIR")))
+        Path(os.getenv(EnvVar("DATA_DIR"), TEMP_DIR))
         / context.op_config["fs"]
         / "history_transferred_files"
     )