Skip to content

Commit

Permalink
Feature graph options (#45)
Browse files Browse the repository at this point in the history
* variable for graph

* ncbi_connect config

* config graph

* config for graph

* test graph configs

* fix config graph_end

* fix path test_tranform

* implement legend colours

* gradient to single str value

* add temp directory where missing

* increased timestamp precision for legend file name

* ignore config warnings

* change config to global accession config

* test cases

* adapt test cases to new config for accession assets

* ran flake
  • Loading branch information
vestalisvirginis committed Jan 12, 2024
1 parent 2f03727 commit 471e298
Show file tree
Hide file tree
Showing 31 changed files with 2,350 additions and 196,099 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ clean: # Remove workspace files
@rm -rf test/fixtures/assets_testing_folder/blasting/gene_identity
@rm -rf test/fixtures/assets_testing_folder/blasting_with_history/gene_identity
@rm -rf test/fixtures/assets_testing_folder/synteny/synteny
@rm -rf test/fixtures/assets_testing_folder/synteny_var/synteny
@rm -rf test/fixtures/assets_testing_folder/transform/fs
@rm -rf test/fixtures/assets_testing_folder/transform/tables
@rm -rf test/fixtures/assets_testing_folder/transform_2/tables
Expand Down
7 changes: 0 additions & 7 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,6 @@ def mock_env_ncbi_download_pos(monkeypatch):
def mock_env_ncbi_download_neg(monkeypatch):
monkeypatch.setenv("DATA_DIR", "test/fixtures/ncbi_download/negative")

@pytest.fixture
def mock_env_ncbi_count(monkeypatch):
monkeypatch.setenv("DATABASE", "nuccore")
monkeypatch.setenv("KEYWORD", "Bacillus subtilis strain P9_B1")

@pytest.fixture
def mock_env_ncbi_fetch(monkeypatch):
monkeypatch.setenv("DATA_DIR", "test/fixtures/ncbi_download/fetch")
Expand All @@ -42,7 +37,6 @@ def mock_env_phagy_dir_blasting_with_history(monkeypatch):
@pytest.fixture
def mock_env_phagy_dir_transform(monkeypatch):
monkeypatch.setenv("DATA_DIR", "test/fixtures/assets_testing_folder/transform")
monkeypatch.setenv("FILE_SYSTEM", "fs")

@pytest.fixture
def mock_env_phagy_dir_transform_step3(monkeypatch):
Expand All @@ -51,7 +45,6 @@ def mock_env_phagy_dir_transform_step3(monkeypatch):
@pytest.fixture
def mock_env_phagy_dir_synteny(monkeypatch):
monkeypatch.setenv("DATA_DIR", "test/fixtures/assets_testing_folder/synteny")
monkeypatch.setenv("SEQUENCE_FILE", "sequences.csv")

@pytest.fixture
def mock_env_phagy_dir_none(monkeypatch):
Expand Down
33 changes: 33 additions & 0 deletions legend.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion synphage/assets/blaster/n_blaster.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def genbank_to_fasta(context, standardised_ext_file) -> tuple[List[str], List[st
)

_path_history = (
Path(os.getenv(EnvVar("DATA_DIR")))
Path(os.getenv(EnvVar("DATA_DIR"), TEMP_DIR))
/ context.op_config["fs"]
/ "history_fasta_files"
)
Expand Down
96 changes: 62 additions & 34 deletions synphage/assets/ncbi_connect/accession.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from dagster import asset, Field, AssetObservation, EnvVar
from dagster import asset, AssetObservation, EnvVar, Config, ConfigArgumentWarning
from pydantic import Field

import os
import tempfile
Expand All @@ -8,6 +9,10 @@
from typing import List
from pathlib import Path
from datetime import datetime
import warnings


warnings.filterwarnings("ignore", category=ConfigArgumentWarning)


TEMP_DIR = tempfile.gettempdir()
Expand All @@ -22,34 +27,47 @@ def _get_ncbi_count_result(result, dbname) -> NucleotideRecord:
)


ncbi_query_config = {
"database": Field(str, description="Database identifier", default_value="nuccore"),
"use_history": Field(
str, description="Yes/No value for history", default_value="y"
),
"idtype": Field(str, description="Options for acc", default_value="acc"),
"rettype": Field(str, description="File format", default_value="gb"), # gbwithparts
}
class QueryConfig(Config):
search_key: str = Field(
default="Myoalterovirus", description="Keyword(s) for NCBI query"
)
database: str = Field(default="nuccore", description="Database identifier")
use_history: str = Field(default="y", description="Yes/No value for history")
idtype: str = Field(default="acc", description="Options for acc")
rettype: str = Field(default="gb", description="File format") # gbwithparts
download_dir: str = Field(default="download", description="Path to download folder")


@asset(
description="Setup unique config for asset group",
compute_kind="Config",
metadata={"owner": "Virginie Grosboillot"},
)
def setup_query_config(config: QueryConfig) -> QueryConfig:
"""Source/target dirs and file output"""
return config


@asset(
required_resource_keys={"ncbi_connection"},
config_schema=ncbi_query_config,
# config_schema=ncbi_query_config,
description="Getting the number of records matching the keyword(s) in the specified database",
compute_kind="NCBI",
io_manager_key="io_manager",
metadata={"owner": "Virginie Grosboillot"},
)
def accession_count(context) -> int:
def accession_count(context, setup_query_config: QueryConfig) -> int:
# Search key - default: Myoalterovirus (2 entries in NCBI database Jan 2024)
keyword = os.getenv(EnvVar("KEYWORD"), "Myoalterovirus")
# keyword = context.op_config["search_key"]
keyword = setup_query_config.search_key
context.log.info(f"Search key(s): {keyword}")
# Query
_query = context.resources.ncbi_connection.conn.egquery(term=keyword)
_result = context.resources.ncbi_connection.conn.read(_query)
_query.close()
# Extract number of record for keyword
_nucleotide = _get_ncbi_count_result(_result, context.op_config["database"])
# _nucleotide = _get_ncbi_count_result(_result, context.op_config["database"])
_nucleotide = _get_ncbi_count_result(_result, setup_query_config.database)
_num_rows = int(_nucleotide.count)
context.log_event(
AssetObservation(asset_key="accession_count", metadata={"num_rows": _num_rows})
Expand All @@ -70,24 +88,28 @@ def accession_count(context) -> int:

@asset(
required_resource_keys={"ncbi_connection"},
config_schema=ncbi_query_config,
# config_schema=ncbi_query_config,
description="Getting all accession Ids corresponding to keyword(s)",
compute_kind="NCBI",
io_manager_key="io_manager",
metadata={"owner": "Virginie Grosboillot"},
)
def accession_ids(context, accession_count) -> dict:
def accession_ids(context, accession_count, setup_query_config: QueryConfig) -> dict:
# Search key - default: Myoalterovirus (2 entries in NCBI database Jan 2024)
keyword = os.getenv(EnvVar("KEYWORD"), "Myoalterovirus")
# keyword = context.op_config["search_key"]
keyword = setup_query_config.search_key
context.log.info(f"Search key(s): {keyword}")
# Search
context.log.info("Start NCBI database search")
_search = context.resources.ncbi_connection.conn.esearch(
db=context.op_config["database"],
# db=context.op_config["database"],
db=setup_query_config.database,
term=keyword,
retmax=accession_count,
usehistory=context.op_config["use_history"],
idtype=context.op_config["idtype"],
# usehistory=context.op_config["use_history"],
# idtype=context.op_config["idtype"],
usehistory=setup_query_config.use_history,
idtype=setup_query_config.idtype,
)
context.log.info("The searched is finished")
_result = context.resources.ncbi_connection.conn.read(_search)
Expand All @@ -112,27 +134,28 @@ def accession_ids(context, accession_count) -> dict:
return _result


download_folder_config = {
"output_directory": Field(
str,
description="Path to folder",
default_value="download",
)
}
# download_folder_config = {
# "output_directory": Field(
# str,
# description="Path to folder",
# default_value="download",
# )
# }


@asset(
config_schema=download_folder_config,
# config_schema=download_folder_config,
description="In case of multiple search, checked what sequence have already been downloeded",
compute_kind="python",
io_manager_key="io_manager",
metadata={"owner": "Virginie Grosboillot"},
)
def downloaded_genomes(context) -> List[str]:
def downloaded_genomes(context, setup_query_config: QueryConfig) -> List[str]:
# Download directory
_download_path = str(
Path(os.getenv(EnvVar("DATA_DIR"), TEMP_DIR))
/ context.op_config["output_directory"]
# / context.op_config["output_directory"]
/ setup_query_config.download_dir
)
os.makedirs(_download_path, exist_ok=True)
# List file in download directory
Expand All @@ -153,13 +176,15 @@ def downloaded_genomes(context) -> List[str]:

@asset(
required_resource_keys={"ncbi_connection"},
config_schema={**download_folder_config, **ncbi_query_config},
# config_schema={**download_folder_config, **ncbi_query_config},
description="Download records one by one from the ncbi database",
compute_kind="NCBI",
io_manager_key="io_manager",
metadata={"owner": "Virginie Grosboillot"},
)
def fetch_genome(context, accession_ids, downloaded_genomes) -> List[str]:
def fetch_genome(
context, accession_ids, downloaded_genomes, setup_query_config: QueryConfig
) -> List[str]:
# Exclude already downloaded files
_A = set(accession_ids["IdList"])
_B = set(downloaded_genomes)
Expand All @@ -168,15 +193,18 @@ def fetch_genome(context, accession_ids, downloaded_genomes) -> List[str]:
# Path to download
_download_path = str(
Path(os.getenv(EnvVar("DATA_DIR"), TEMP_DIR))
/ context.op_config["output_directory"]
# / context.op_config["output_directory"]
/ setup_query_config.download_dir
)
context.log.info(f"Path to download: {_download_path}")
# Fetch and write files
for _entry in list(_C):
_r = context.resources.ncbi_connection.conn.efetch(
db=context.op_config["database"],
# db=context.op_config["database"],
db=setup_query_config.database,
id=_entry,
rettype=context.op_config["rettype"],
# rettype=context.op_config["rettype"],
rettype=setup_query_config.rettype,
retmax=1,
webenv=accession_ids["WebEnv"],
query_key=accession_ids["QueryKey"],
Expand Down
7 changes: 1 addition & 6 deletions synphage/assets/ncbi_connect/sequence_quality_assessment.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,6 @@
description="Path to folder containing the genbank files",
default_value="genbank",
),
"fasta_dir": Field(
str,
description="Path to folder containing the fasta sequence files",
default_value=str(Path("gene_identity") / "fasta"),
),
}


Expand Down Expand Up @@ -67,7 +62,7 @@
def sequence_check(context, fetch_genome) -> tuple[List[str], List[str]]:
# history check
_path_history = (
Path(os.getenv(EnvVar("DATA_DIR")))
Path(os.getenv(EnvVar("DATA_DIR"), TEMP_DIR))
/ context.op_config["fs"]
/ "history_transferred_files"
)
Expand Down
Loading

0 comments on commit 471e298

Please sign in to comment.