diff --git a/.gitignore b/.gitignore index 70c44ee..68110ef 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # BioProv bioprov/db.json bioprov/provstore_api.txt +**/*.log # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 077fb28..f1354fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,21 @@ * .describe [] * .write_paths_to_file, .copy_files_to_dir(), .link_files_to_dir() +### v0.1.19 +* Debug API endpoint (#23) [x] +* Implement logging [x] + * Implement Workflow logging [x] +* Debug Workflow Steps [x] +* Remove workflow main methods [x] + * Workflows must now be called only from the CLI [x] +* Implement post-workflow actions [x] + * Update db [x] + * Upload to ProvStore [x] + * Write PROVN [x] + * Write PDF [x] +* Add Sample.auto_update_db() methods [x] +* Remove logger call when updating Project in database [x] + ### v0.1.18a * Patch file deserializer bug [x] diff --git a/bioprov/__init__.py b/bioprov/__init__.py index 1dc35d6..3803628 100644 --- a/bioprov/__init__.py +++ b/bioprov/__init__.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ diff --git a/bioprov/bioprov b/bioprov/bioprov index e8d3208..9bf36c1 100644 --- a/bioprov/bioprov +++ b/bioprov/bioprov @@ -3,7 +3,7 @@ __author__ = "Vini Salazar" __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ Executable that goes in $PATH. Code for the command-line is on the bioprov.py module. diff --git a/bioprov/bioprov.py b/bioprov/bioprov.py index 24586e4..f1552d6 100644 --- a/bioprov/bioprov.py +++ b/bioprov/bioprov.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ BioProv command-line application. This module holds the main executable. 
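A behavioural note on the bioprov.py hunk above: a failed `parse_options()` call now re-raises the `KeyError` instead of printing the parser help, which the updated `test_CLI` further down in this diff asserts. A minimal reproduction as a sketch — the `Namespace` mirrors the one used in the test, and the import path for `main` is an assumption, not shown in the patch:

```python
from argparse import Namespace

import pytest

from bioprov.bioprov import main  # assumed import path, as used by the integration tests

args = Namespace(
    show_config=False,
    show_db=False,
    version=False,
    list=False,
    show_provstore=False,
)

# Previously this fell back to the help screen; the KeyError now propagates.
with pytest.raises(KeyError):
    main(args)
```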
@@ -10,16 +10,17 @@ import argparse import sys +from pathlib import Path + import bioprov.src.config as bp_config_module from bioprov.src.config import config +from bioprov.utils import parser_help, dict_to_string from bioprov.workflows import ( WorkflowOptionsParser, genome_annotation, blastn_alignment, KaijuWorkflow, ) -from bioprov.utils import parser_help, dict_to_string -from pathlib import Path def main(args=None): @@ -90,7 +91,7 @@ def main(args=None): try: parser.parse_options(args) # no cover except KeyError: - parser_help(bioprov_parser) + raise class CommandOptionsParser: diff --git a/bioprov/data/__init__.py b/bioprov/data/__init__.py index 192eb54..715870a 100644 --- a/bioprov/data/__init__.py +++ b/bioprov/data/__init__.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ diff --git a/bioprov/data/datasets/genome_annotation_input.tsv b/bioprov/data/datasets/genome_annotation_input.tsv index 31557b5..8c71303 100644 --- a/bioprov/data/datasets/genome_annotation_input.tsv +++ b/bioprov/data/datasets/genome_annotation_input.tsv @@ -1,2 +1,3 @@ sample-id assembly GCF_000010065.1_ASM1006v1_genomic.fna bioprov/data/genomes/GCF_000010065.1_ASM1006v1_genomic.fna +GCF_000007925.1_ASM792v1_genomic.fna bioprov/data/genomes/GCF_000007925.1_ASM792v1_genomic.fna diff --git a/bioprov/programs/__init__.py b/bioprov/programs/__init__.py index 80874c0..1b6afd8 100644 --- a/bioprov/programs/__init__.py +++ b/bioprov/programs/__init__.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" from .programs import prodigal, diamond, blastn, blastp, prokka, kaiju, kaiju2table diff --git a/bioprov/programs/programs.py b/bioprov/programs/programs.py index de7a212..2c854b3 100644 --- a/bioprov/programs/programs.py +++ b/bioprov/programs/programs.py @@ -2,13 +2,15 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ Module for holding preset instances of the Program class. +Module for holding preset instances of the Program class. """ +import logging from os import path from pathlib import Path @@ -117,8 +119,8 @@ def blastp(sample, db, query_tag="query", outformat=6): :param Sample sample: Instance of BioProv.Sample. :param str db: A string pointing to the reference database directory and title. :param str query_tag: A tag for the query file. - :param int outformat: The output format to gather from blastn. - :return: Instance of PresetProgram for BLASTN. + :param int outformat: The output format to gather from blastp. + :return: Instance of PresetProgram for BLASTP. :rtype: BioProv.PresetProgram. :raises AssertionError: Path to the reference database does not exist. """ @@ -192,10 +194,12 @@ def prokka( ) for param in params: - _prokka.add_parameter(param, _print=False) + _prokka.add_parameter(param) if path.isdir(output_path): - print(f"Warning: {output_path} directory exists. Will overwrite.") # no cover + config.logger.warning( + f"Warning: {output_path} directory exists. Will overwrite." + ) # no cover _prokka.add_parameter( Parameter(key="--force", value="", kind="misc") ) # no cover @@ -223,8 +227,7 @@ def prokka( # Input goes here, must be last positionally. 
_prokka.add_parameter( - Parameter(key="", value=str(_sample.files[assembly]), kind="input"), - _print=False, + Parameter(key="", value=str(_sample.files[assembly]), kind="input") ) return _prokka @@ -274,7 +277,7 @@ def kaiju( Parameter(key="-o", value=output_path, kind="output"), ) for p in params: - kaiju_.add_parameter(p, _print=False) + kaiju_.add_parameter(p) if add_param_str: kaiju_.cmd += f" {add_param_str}" # no cover @@ -324,7 +327,7 @@ def kaiju2table( ) for p in params: - kaiju2table_.add_parameter(p, _print=False) + kaiju2table_.add_parameter(p) # Add final parameter: kaiju2table_.cmd += f" {str(_sample.files[kaiju_output])}" diff --git a/bioprov/src/__init__.py b/bioprov/src/__init__.py index 7e9f807..00de961 100644 --- a/bioprov/src/__init__.py +++ b/bioprov/src/__init__.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ diff --git a/bioprov/src/config.py b/bioprov/src/config.py index 6074e91..532b1aa 100644 --- a/bioprov/src/config.py +++ b/bioprov/src/config.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ @@ -12,13 +12,15 @@ """ import os -import bioprov -from bioprov.data import data_dir, genomes_dir +from pathlib import Path + from prov.model import Namespace from provstore.api import Api -from bioprov.utils import serializer, dict_to_sha1, serializer_filter from tinydb import TinyDB -from pathlib import Path + +from bioprov import __file__ as bp_file +from bioprov.data import data_dir, genomes_dir +from bioprov.utils import serializer, dict_to_sha1, serializer_filter, create_logger class Config: @@ -40,7 +42,7 @@ def __init__(self, db_path=None, threads=0): self.db = None self.db_path = None self.threads = threads - self.bioprov_dir = Path(bioprov.__file__).parent + self.bioprov_dir = Path(bp_file).parent self.data = data_dir self.genomes = genomes_dir if db_path is None: @@ -52,6 +54,7 @@ def __init__(self, db_path=None, threads=0): self._provstore_token = None self._provstore_api = None self._provstore_endpoint = "https://openprovenance.org/store/api/v0/" + self._logger = None def __repr__(self): return f"BioProv Config class set in {__file__}" @@ -76,7 +79,7 @@ def provstore_api(self): self._provstore_api = Api( username=self.provstore_user, api_key=self.provstore_token ) - Api.base_url = self._provstore_endpoint + self._provstore_api.base_url = self._provstore_endpoint return self._provstore_api @provstore_api.setter @@ -113,12 +116,22 @@ def provstore_token(self): # no cover def provstore_token(self, value): self._provstore_token = value - def create_provstore_file(self, user=None, token=None): + @property + def logger(self): + if self._logger is None: + self._logger = create_logger() + return self._logger + + @logger.setter + def logger(self, value): + self._logger = value + + def create_provstore_file(self, user=None, token=None): # no cover with open(self.provstore_file, "w") as f: if user is None: - user = input("Please paste your ProvStore user: ") # no cover + user = input("Please paste your ProvStore user: ") if token is None: - token = input("Please paste your ProvStore API token: ") # no cover + token = input("Please paste your ProvStore API token: ") f.write(user + "\n") f.write(token + "\n") diff --git a/bioprov/src/files.py b/bioprov/src/files.py index 532215a..76104f4 100644 --- a/bioprov/src/files.py +++ 
b/bioprov/src/files.py @@ -2,17 +2,20 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ Contains the File and SeqFile classes and related functions. """ -import numpy as np +import logging from dataclasses import dataclass from pathlib import Path + +import numpy as np from Bio import SeqIO, AlignIO + from bioprov.utils import ( get_size, Warnings, @@ -113,12 +116,12 @@ def replace_path(self, old_terms, new, warnings=False): # TODO: replace these print statements for logger warning/debug level if warnings: if not self.exists and old_exists: - print( - f"Warning: file {self.path} was marked as existing but was not found." + logging.warning( + f"File {self.path} was marked as existing but was not found." ) if old_hash and self.sha1 != old_hash and self.exists: # no cover - print( - f"Warning: file {self.path} previous sha1 checksum differs from the current." + logging.warning( + f"File {self.path} previous sha1 checksum differs from the current." ) def serializer(self): @@ -159,7 +162,7 @@ def replace_path(self, old_terms, new, warnings=False): # TODO: replace these print statements for logger warning/debug level if warnings: if not self.exists and old_exists: - print( + logging.warning( f"Warning: file {self.path} was marked as existing but was not found." ) @@ -448,8 +451,8 @@ def seqrecordgenerator(path, format, parser="seq", warnings=False): return records except FileNotFoundError: if warnings: - print(Warnings()["not_exist"](path)) - print( + logging.warning(Warnings()["not_exist"](path)) + logging.warning( "The file was loaded as a BioProv object, but it does not exist on the specified path." ) return None diff --git a/bioprov/src/main.py b/bioprov/src/main.py index 25b2c53..14f527d 100644 --- a/bioprov/src/main.py +++ b/bioprov/src/main.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ @@ -24,23 +24,32 @@ import datetime import json -import pandas as pd +import logging import tempfile -from bioprov import config -from bioprov.utils import Warnings, serializer, serializer_filter, dict_to_sha1 -from bioprov.src.files import File, SeqFile, Directory, deserialize_files_dict -from bioprov.src.config import EnvProv +from collections import OrderedDict from collections import deque -from coolname import generate_slug from os import path from pathlib import Path from subprocess import Popen, PIPE, getoutput from time import time from types import GeneratorType -from collections import OrderedDict + +import pandas as pd +from coolname import generate_slug from prov.model import ProvEntity, ProvBundle, Namespace from tinydb import Query +from bioprov import config +from bioprov.src.config import EnvProv +from bioprov.src.files import File, SeqFile, Directory, deserialize_files_dict +from bioprov.utils import ( + Warnings, + serializer, + serializer_filter, + dict_to_sha1, + create_logger, +) + class Program: """ @@ -94,6 +103,10 @@ def runs(self): self._runs = dict() return self._runs + @runs.setter + def runs(self, value): + self._runs = value + def add_runs(self, runs): """ Sample method to add runs. @@ -112,13 +125,14 @@ def generate_cmd(self): self.cmd = cmd return cmd - def add_parameter(self, parameter, _print=False): + def add_parameter(self, parameter, _generate_cmd=True): """ Adds a parameter to the current instance and updates the command. 
:param parameter: an instance of the Parameter class. - :param _print: whether to print the parameter has been added. - :return: + :param _generate_cmd: Refreshes self.cmd when a Parameter is added. + + :return: Updates self.params and self.cmd if _generate_cmd is True. """ assert isinstance(parameter, Parameter), Warnings()["incorrect_type"]( parameter, Parameter @@ -126,23 +140,21 @@ def add_parameter(self, parameter, _print=False): k, v = parameter.key, parameter.value self.params[k] = parameter self.param_str = generate_param_str(self.params) - self.generate_cmd() - if _print: - print( - f"Added parameter {k} with value '{v}' to program {self.name}" - ) # no cover + config.logger.debug( + f"Added parameter {k} with value '{v}' to program {self.name}" + ) # no cover + if _generate_cmd: + self.generate_cmd() - def run(self, sample=None, _print=True): + def run(self, sample=None): """ Runs the process. :param sample: An instance of the Sample class - :param _print: Argument to pass to Run.run() :return: An instance of Run class. """ - # Creates Run instance with self run_ = Run(self, sample=sample) - run_.run(_sample=sample, _print=True) + run_.run(_sample=sample) self.add_runs(run_) return run_ @@ -320,15 +332,12 @@ def _finished_to_status(finished_status): dict_ = {True: "Finished", False: "Pending"} return dict_[finished_status] - def run(self, _sample=None, _print=True, _print_stdout=False, _print_stderr=False): + def run(self, _sample=None): """ Runs process for the Run instance. Will update attributes accordingly. - :type _print: bool :param _sample: self.sample - :param _print_stdout: Whether to print the stdout of the Program. - :param _print_stderr: Whether to print the stderr of the Program. - :return: self.stdout + :return: self """ if _sample is None: _sample = self.sample @@ -337,34 +346,35 @@ def run(self, _sample=None, _print=True, _print_stdout=False, _print_stderr=Fals assert ( self.program.found ), f"Cannot find program {self.program.name}. Make sure it is on your $PATH." - if _print: - str_ = f"Running program '{self.program.name}'" - if _sample is not None: - str_ += f" for sample {_sample.name}." - else: - str_ += "." - - # Pretty printing of commands - split_ = self.program.cmd.split() - if len(self.program.cmd) > 80: - if len(split_) % 2 == 1: - bin_, *fmt_cmd = split_ - last = "" - else: - bin_, *fmt_cmd, last = split_ # no cover - it = iter(fmt_cmd) - fmt_cmd = zip(it, it) - fmt_cmd = " \\ \n".join( - [bin_] + ["\t" + i[0] + " " + i[1] for i in fmt_cmd] + ["\t" + last] - ) - str_ += f"\nCommand is:\n{fmt_cmd}\n" + + # Print block + str_ = f"Running program '{self.program.name}'" + if _sample is not None: + str_ += f" for sample {_sample.name}." + else: + str_ += "." 
+ + # Pretty printing of commands + split_ = self.program.cmd.split() + if len(self.program.cmd) > 80: + if len(split_) % 2 == 1: + bin_, *fmt_cmd = split_ + last = "" else: - str_ += f"\nCommand is:\n{self.program.cmd}\n" + bin_, *fmt_cmd, last = split_ # no cover + it = iter(fmt_cmd) + fmt_cmd = zip(it, it) + fmt_cmd = " \\ \n".join( + [bin_] + ["\t" + i[0] + " " + i[1] for i in fmt_cmd] + ["\t" + last] + ) + str_ += f"\nCommand is:\n{fmt_cmd}\n" + else: + str_ += f"\nCommand is:\n{self.program.cmd}\n" - str_ = str_.strip() - if str_.endswith("\\"): - str_ = str_[:-1] - print(str_) + str_ = str_.strip() + if str_.endswith("\\"): + str_ = str_[:-1] + config.logger.info(str_) p = Popen(self.program.cmd, shell=True, stdout=PIPE, stderr=PIPE) self.process = p @@ -387,11 +397,9 @@ def run(self, _sample=None, _print=True, _print_stdout=False, _print_stderr=Fals self.finished = True self._status = self._finished_to_status(self.finished) - # These are useful for quick debugging. - if _print_stdout: - print(self.stdout) # no cover - if _print_stderr: - print(self.stderr) # no cover + if not self._auto_suppress_stdout: + config.logger.debug(self.stdout) # no cover + config.logger.debug(self.stderr) # no cover return self @@ -476,7 +484,6 @@ def __init__( self.output_files = output_files self.preffix_tag = preffix_tag self.ready = False - self.generate_cmd() if self.sample is not None: self.create_func(sample=self.sample, preffix_tag=self.preffix_tag) @@ -502,7 +509,7 @@ def _parse_input_files(self): param = Parameter( key=k, value=str(self.sample.files[tag]), kind="input", tag=tag ) - self.add_parameter(param) + self.add_parameter(param, _generate_cmd=False) def _parse_output_files(self): """ @@ -531,7 +538,7 @@ def _parse_output_files(self): param = Parameter( key=key, value=str(self.sample.files[tag]), kind="output", tag=tag ) - self.add_parameter(param, _print=False) + self.add_parameter(param, _generate_cmd=False) except ValueError: raise Exception( "Please check the output files dictionary:\n'{}'\n" @@ -559,8 +566,12 @@ def create_func(self, sample, preffix_tag=None): self._parse_input_files() self._parse_output_files() + # Add self to sample automatically + self.sample.add_programs(self) + # Set ready state self.ready = True + self.generate_cmd() def validate_sample(self): """ @@ -578,13 +589,11 @@ def validate_program(self): """ assert isinstance(self, Program), Warnings()["incorrect_type"](self, Program) - def generate_cmd(self, from_files=True): + def generate_cmd(self): """ TODO: improve this function Generates a wildcard command string, independent of samples. - :param from_files: Generate command from self.input_files and self.output_files (recommended) If False, - will generate from parameter dictionary instead. :return: Updates self.cmd. """ self.validate_program() @@ -597,7 +606,7 @@ def generate_cmd(self, from_files=True): try: parameter.value = str(self.sample.files[f"{parameter.tag}"]) except AttributeError: - print("Warning: no sample associated with program.") + config.logger.warning("Warning: no sample associated with program.") pass # Suppress bug for now. else: pass @@ -608,11 +617,10 @@ def generate_cmd(self, from_files=True): self.cmd = generic_cmd return generic_cmd - def run(self, sample=None, _print=True, preffix_tag=None): + def run(self, sample=None, preffix_tag=None): """ Runs PresetProgram for sample. :param sample: Instance of bioprov.Sample. - :param _print: Whether to print more output. 
:param preffix_tag: Preffix tag to self.create_func() :return: """ @@ -620,11 +628,12 @@ def run(self, sample=None, _print=True, preffix_tag=None): sample = self.sample if preffix_tag is None: preffix_tag = self.preffix_tag - if not self.ready: - self.create_func(sample, preffix_tag) - # Update self._run, run self.run() and update self._run again. - Program.run(self, sample=sample, _print=_print) + self.create_func(sample, preffix_tag) + run_ = Run(self, sample=sample) + run_.run(_sample=sample) + self.add_runs(run_) + return run_ def parse_params(params): @@ -676,7 +685,9 @@ def generate_param_str(params): param_str = str_.strip() else: # TODO: add more parameters options. List of tuples, List of Parameter instances, etc. - print("Must provide either a string or a dictionary for the parameters!") + config.logger.error( + "Must provide either a string or a dictionary for the parameters!" + ) raise TypeError # Add positional arguments split_str = param_str.split() @@ -718,6 +729,8 @@ def _add_programs(object_, programs): for program in programs: object_.programs[program.name] = program + object_.auto_update_db() + def _add_runs(object_, runs): """ @@ -792,11 +805,16 @@ def __init__( attributes = dict() self.attributes = attributes self.programs = OrderedDict() + self.project = None # This is an attribute used by the src.prov module self.namespace_preffix = f"samples:{self.name}" self.files_namespace_preffix = None + def auto_update_db(self): + if self.project is not None: + self.project.auto_update_db() + def __repr__(self): str_ = f"Sample {self.name} with {len(self.files)} file(s)." return str_ @@ -846,33 +864,31 @@ def serializer(self): Custom serializer for Sample class. Serializes runs, programs, and files attributes. :return: """ - keys = ["files_namespace_preffix"] + keys = ["files_namespace_preffix", "project"] return serializer_filter(self, keys) - def run_programs(self, _print=True): + def run_programs(self): """ Runs self._programs in order. :return: """ - _run_programs(self, _print) + _run_programs(self) - def _run_program(self, program, _print=True): + def _run_program(self, program): """ Runs program for self. :param program: bioprov.Program or bioprov.PresetProgram. - :param _print: Whether to print output of Program. :return: Runs Program and updates self. """ - _run_program(self, program, _print) + _run_program(self, program) - def to_json(self, _path=None, _print=True): + def to_json(self, _path=None): """ Exports the Sample as JSON. Similar to Project.to_json() :param _path: JSON output file path. - :param _print: Whether to print if the file was created correctly. :return: """ - return to_json(self, self.serializer(), _path, _print=_print) + return to_json(self, self.serializer(), _path) def to_series(self): """ @@ -883,7 +899,7 @@ def to_series(self): series = {} # Can't apply serializer_filter here. - keys = ["files_namespace_preffix", "namespace_preffix", "_programs"] + keys = ["files_namespace_preffix", "namespace_preffix", "_programs", "project"] modified_dict = self.__dict__.copy() for key in keys: try: @@ -903,34 +919,33 @@ def to_series(self): return pd.Series(series) -def _run_program(_object, program, _print=True): +def _run_program(_object, program): """ Runs program for _object. :param _object: bioprov.Project or bioprov.Sample :param program: bioprov.Program or bioprov.PresetProgram. - :param _print: Whether to print output of Program. :return: Runs Program and updates _object. 
""" if program not in _object.programs.keys(): _object.add_programs(program) - program.run(sample=_object, _print=_print) + program.run(sample=_object) + _object.auto_update_db() -def _run_programs(_object, _print=True): +def _run_programs(_object): """ Runs programs in order. :param _object: bioprov.Project or bioprov.Sample - :param _print: Whether to print output of Program. :return: """ if len(_object.programs) >= 1: for _, p in _object.programs.items(): # noinspection PyProtectedMember - _object._run_program(p, _print=_print) + _object._run_program(p) else: - print(f"No programs to run for {_object}") + config.logger.warning(f"No programs to run for {_object}") class Project: @@ -938,7 +953,14 @@ class Project: Class which holds a dictionary of Sample instances, where each key is the sample name. """ - def __init__(self, samples=None, tag=None, db=None, auto_update=False): + def __init__( + self, + tag=None, + samples=None, + db=None, + auto_update=False, + log_to_file=False, + ): """ Initiates the object by creating a sample dictionary. :param samples: An iterator of Sample objects. @@ -946,6 +968,8 @@ def __init__(self, samples=None, tag=None, db=None, auto_update=False): :param db: path to TinyDB to store project in JSON format. :param auto_update: Whether to auto_update the BioProvDB record. Disabled by default. + :param log_to_file: Whether to log the Project to a File. You can define this later with + the self.start_logging() method. """ if tag is None: tag = generate_slug(2) @@ -976,6 +1000,13 @@ def __init__(self, samples=None, tag=None, db=None, auto_update=False): db = config.db self.db = db + # Log attributes + self.log_to_file = log_to_file + self.log_file = None + self.logger = None + if self.log_to_file: + self.start_logging() + def __len__(self): return len(self._samples) @@ -987,9 +1018,10 @@ def __getitem__(self, item): value = self._samples[item] return value except KeyError: - print(f"Sample {item} not in Project.\n") - print("Check the following keys:") - print(" ", "\n ".join(self.keys)) + config.logger.error( + f"Sample {item} not in Project.\n" f"Check the following keys:" " ", + "\n ".join(self.keys), + ) def __setitem__(self, key, value): self._samples[key] = value @@ -1049,10 +1081,9 @@ def update_db(self, db=None): db = self.db result, query = self.query_db(db) if result: - print(f"Updating project '{self.tag}' at {db.db_path}") db.update(self.serializer(), query.tag == self.tag) else: - print(f"Inserting new project '{self.tag}' in {db.db_path}") + config.logger.info(f"Inserting new project '{self.tag}' in {db.db_path}") db.insert(self.serializer()) def auto_update_db(self): @@ -1094,6 +1125,30 @@ def replace_paths(self, old_terms, new, warnings=False): for _, file in sample.files.items(): file.replace_path(old_terms, new, warnings) + def start_logging( + self, log_file=None, level=logging.INFO, _custom_start_message=None + ): + """ + Starts logging Project to File. + + :param log_file: Path to log file. If None will be defined automatically. + :param level: Logging level. + :param _custom_start_message: Custom starting message to start the log. + :return: Creates logger attributes and refreshes bp.config.logger + """ + if log_file is None: + log_file = f"{self.tag}.log" + self.log_file = log_file + self.logger = config.logger = create_logger(level, self.log_file, self.tag) + if _custom_start_message is None: + _custom_start_message = f"Starting log for project '{self.tag}'." 
+ + self.logger.info(_custom_start_message) + + self.add_files(File(self.log_file, tag="log")) + self.logger.info(f"Writing log to {self.files['log']}") + self.logger.info(f"Loading {len(self)} samples.") + def add_files(self, files): """ Adds Files to self.files. See documentation to bioprov.src.main.add_files(). @@ -1102,7 +1157,6 @@ def add_files(self, files): :return: Updates self.files """ _add_files(self, files) - self.auto_update_db() def add_programs(self, programs): """ @@ -1111,25 +1165,23 @@ def add_programs(self, programs): :return: Updates self.programs """ _add_programs(self, programs) - self.auto_update_db() - def run_programs(self, _print=True): + def run_programs(self): """ Runs all programs in self.programs in order. :return: """ - _run_programs(self, _print) + _run_programs(self) - def _run_program(self, program=None, _print=True): + def _run_program(self, program=None): """ Runs program for self. :param program: bioprov.Program or bioprov.PresetProgram. - :param _print: Whether to print output of Program. :return: Runs Program and updates self. """ - _run_program(self, program, _print) + _run_program(self, program) @staticmethod def is_sample_and_name(sample): @@ -1146,7 +1198,9 @@ def is_sample_and_name(sample): if sample.name is None: slug = generate_slug(2) sample.name = slug - print(f"No sample name set. Setting random name: {sample.name}") + config.logger.warning( + f"No sample name set. Setting random name: {sample.name}" + ) return sample @@ -1167,8 +1221,7 @@ def is_iterator(constructor): return constructor - @staticmethod - def build_sample_dict(constructor): + def build_sample_dict(self, constructor): """ Build sample dictionary from passed constructor. :param constructor: Iterable or NoneType @@ -1184,6 +1237,7 @@ def build_sample_dict(constructor): for sample in constructor: sample = Project.is_sample_and_name(sample) samples[sample.name] = sample + samples[sample.name].project = self return samples @@ -1198,14 +1252,13 @@ def samples(self, value): def serializer(self): return serializer(self) - def to_json(self, _path=None, _print=True): + def to_json(self, _path=None): """ Exports the Project as JSON. Similar to Sample.to_json() :param _path: JSON output file _path. - :param _print: Whether to print if the file was created correctly. :return: """ - return to_json(self, self.serializer(), _path, _print=_print) + return to_json(self, self.serializer(), _path) def to_df(self): """ @@ -1291,11 +1344,13 @@ def _add_files(object_, files): # Here 'files' must be a dictionary of File or Directory instances for k, v in files.items(): if k in object_.files.keys(): - print(f"Updating file {k} with value {v}.") + config.logger.info(f"Updating file {k} with value {v}.") object_.files[k] = v + object_.auto_update_db() + -def to_json(object_, dictionary, _path=None, _print=True): +def to_json(object_, dictionary, _path=None): """ Exports the Sample or Project as JSON. 
:return: Writes JSON output @@ -1307,7 +1362,7 @@ def to_json(object_, dictionary, _path=None, _print=True): if "json" not in object_.files.keys(): object_.add_files({"json": _path}) - return write_json(dictionary, _path, _print=_print) + return write_json(dictionary, _path) def from_json(json_file, kind="Project", replace_path=None, replace_home=False): @@ -1363,7 +1418,7 @@ def from_json(json_file, kind="Project", replace_path=None, replace_home=False): pass # Create Project - project = Project(samples=samples, tag=d["tag"]) + project = Project(tag=d["tag"], samples=samples) # Deserializing and adding project files and programs try: @@ -1403,9 +1458,11 @@ def from_json(json_file, kind="Project", replace_path=None, replace_home=False): project.replace_paths(other_HOME_variables, HOME, warnings=True) if replace_path: - print("Replacing paths:") - print(f"\tOld:\t{replace_path[0][0]}") - print(f"\tNew:\t{replace_path[1]}") + config.logger.info( + "Replacing paths:" + f"\tOld:\t{replace_path[0][0]}" + f"\tNew:\t{replace_path[1]}" + ) project.replace_paths(replace_path[0], replace_path[1], warnings=True) return project @@ -1477,7 +1534,7 @@ def from_df( sample.attributes[attr_] = row[attr_] samples[ix] = sample - samples = Project(samples, tag=tag) + samples = Project(tag=tag, samples=samples) if source_file: samples.add_files({"project_csv": source_file}) @@ -1545,22 +1602,20 @@ def dict_to_sample(json_dict): return sample_ -def write_json(dict_, _path, _print=True): +def write_json(dict_, _path): """ Writes dictionary to JSON file. :param dict_: JSON dictionary. :param _path: String with _path to JSON file. - :param _print: Whether to print if the file was successfully created. :return: Writes JSON file. """ with open(_path, "w") as f: json.dump(dict_, f, indent=3) - if _print: - if Path(_path).exists(): - print(f"Created JSON file at {_path}.") - else: - print(f"Could not create JSON file for {_path}.") + if Path(_path).exists(): + config.logger.info(f"Created JSON file at {_path}.") + else: + config.logger.info(f"Could not create JSON file for {_path}.") def load_project(tag): @@ -1578,7 +1633,7 @@ def load_project(tag): try: result = config.db.search(query.tag == tag)[0] except (IndexError, KeyError): - print(f"Project not found in database at {config.db_path}") + config.logger.error(f"Project not found in database at {config.db_path}") return with tempfile.NamedTemporaryFile() as f: diff --git a/bioprov/src/prov.py b/bioprov/src/prov.py index d0f6105..673f877 100644 --- a/bioprov/src/prov.py +++ b/bioprov/src/prov.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ Module containing base provenance attributes. @@ -10,13 +10,16 @@ This module extracts system-level information, such as user and environment settings, and stores them. It is invoked to export provenance objects. 
""" +import logging from pathlib import Path -from bioprov import Project, Parameter, config -from bioprov.utils import Warnings, build_prov_attributes, serializer_filter -from prov.model import ProvDocument + from prov.dot import prov_to_dot +from prov.model import ProvDocument from requests.exceptions import ConnectionError +from bioprov import Project, Parameter, config +from bioprov.utils import Warnings, build_prov_attributes, serializer_filter + class BioProvDocument: """ @@ -163,10 +166,19 @@ def _iter_envs_and_users(self): ) def _iter_samples(self): - for _, sample in self.project.items(): - self._create_sample_bundle(sample) - self._create_sample_file_entities(sample) - self._create_program_entities(sample) + for _, sample in self.project.samples.items(): + for statement in ( + self._create_sample_bundle(sample), + self._create_sample_file_entities(sample), + self._create_program_entities(sample), + ): + try: + statement + except KeyError: + config.logger.debug( + f"Could not run function '{statement.__name__}' for sample {sample.name}." + ) + pass def _create_sample_bundle(self, object_, kind="Sample"): """ @@ -333,7 +345,7 @@ def _get_IO_from_params(program): def _add_activities_namespace(self): """ - Add activities Namespace to self + Add activities Namespace to self. :return: """ @@ -353,11 +365,11 @@ def upload_to_provstore(self, api=None): if api is None: api = config.provstore_api try: - self.ProvDocument = api.document.create( - self.provstore_document, name=self.project.tag + self.provstore_document = api.document.create( + self.ProvDocument, name=self.project.tag ) except ConnectionError: - print( + logging.error( "Could not create remote document. Please check your internet connection and ProvStore credentials." ) @@ -379,9 +391,9 @@ def write_provn(self, path=None): ), f"Directory '{path.parent}' not found.\nPlease provide a valid directory." if path.exists(): - print(f"Overwriting file at '{path}'") + logging.info(f"Overwriting file at '{path}'") with open(path, "w") as f: f.write(self.provn) - - print(f"Wrote PROVN record to {path}.") + if path.exists(): + logging.info(f"Wrote PROVN record to {path}.") diff --git a/bioprov/src/workflow.py b/bioprov/src/workflow.py index 6331335..0511ff4 100644 --- a/bioprov/src/workflow.py +++ b/bioprov/src/workflow.py @@ -2,21 +2,24 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ Contains the Workflow class and related functions. """ import argparse -import pandas as pd -from glob import glob -from bioprov import from_df, config, PresetProgram -from bioprov.utils import Warnings +import logging from collections import OrderedDict +from glob import glob from os import path + +import pandas as pd from tqdm import tqdm +from bioprov import from_df, config, PresetProgram, BioProvDocument, File +from bioprov.utils import Warnings, create_logger + class Workflow: """ @@ -29,7 +32,7 @@ def __init__( self, name=None, description=None, - input_=None, + input=None, input_type="dataframe", index_col="sample-id", file_columns=None, @@ -40,12 +43,18 @@ def __init__( verbose=None, threads=None, sep="\t", + log=None, + _log_to_file=True, + update_db=False, + upload_to_provstore=False, + write_provn=False, + write_pdf=False, **kwargs, ): """ :param name: Name of the workflow, with no spaces. :param description: A brief (one sentence) description of the workflows. - :param input_: Input of workflow. 
May be a directory or a tab-delimited file. + :param input: Input of workflow. May be a directory or a tab-delimited file. :param input_type: Input type of the workflow. Choose from ('directory', 'dataframe', 'both') :param index_col: Name of index column which will define sample names if input_type is 'dataframe'. :param file_columns: Name of columns containing files if input_type is 'dataframe'. @@ -57,11 +66,17 @@ def __init__( :param verbose: Verbose output of workflow. :param threads: Number of threads in workflow. Defaults to bioprov.config.threads :param sep: Separator if input_type is 'dataframe'. + :param _log_to_file: Whether to write log to file. + :param log: Path of the file to write the log to. Default is f'{workflow.tag}.log'. + :param update_db: Whether to automatically update the BioProv DB when running the workflow. + :param write_provn: Write PROVN output at the end of the workflow. + :param write_pdf: Write graphical output at the end of the workflow. + :param upload_to_provstore: Upload BioProvDocument to ProvStore at the end of the workflow. :param kwargs: Other keyword arguments to be passed to workflow. """ self.name = name self.description = description - self.input = input_ + self.input = input self.input_type = input_type self.index_col = index_col self.file_columns = file_columns @@ -71,6 +86,13 @@ def __init__( OrderedDict() ) # Will only update if isinstance(steps, (list, dict, tuple): + # Post workflow actions + self._bioprovdocument = None + self.update_db = update_db + self.write_provn = write_provn + self.write_pdf = write_pdf + self.upload_to_provstore = upload_to_provstore + # Parse steps arg - dict if isinstance(steps, dict): # no cover for _, step in steps.items(): @@ -83,6 +105,8 @@ def __init__( self.add_step(step) self.parser = parser + if isinstance(tag, str): + tag = tag.replace(" ", "-") self.tag = tag self.verbose = verbose self.threads = threads @@ -92,6 +116,12 @@ def __init__( self.project_csv = None self.parser = None + # Logging configuration (default is set in self.start_logging()) + self._log_to_file = _log_to_file + self.log_file = log + self._log_level = None + self.logger = None + # Only generate project if there is an input and input type if self.input and self.input_type: # no cover _input_types = ("directory", "dataframe") @@ -106,6 +136,44 @@ def __init__( ): self.generate_parser() + def __repr__(self): + return f"bioprov.Workflow '{self.name}'" + + def create_provenance(self): + self._bioprovdocument = BioProvDocument(self.project) + + def _update_db(self): + self.project.update_db() + + @property + def bioprovdocument(self): + if self._bioprovdocument is None: + self.create_provenance() + return self._bioprovdocument + + def _upload_to_provstore(self): + self.bioprovdocument.upload_to_provstore() + + def _write_provn(self): + self.bioprovdocument.write_provn() + + def _write_pdf(self): + self.bioprovdocument.dot.write_pdf(self.project.tag + ".pdf") + + def _post_wf_actions(self): + self.create_provenance() + if self.update_db: + config.logger.info( + f"Updating project '{self.project.tag}' at {config.db_path}" + ) + self._update_db() + if self.upload_to_provstore: + self._upload_to_provstore() + if self.write_provn: + self._write_provn() + if self.write_pdf: + self._write_pdf() + def generate_project(self): """ Generate Project instance from input. 
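A sketch of driving the new post-workflow actions from Python rather than the CLI. This assumes the `genome_annotation` factory forwards its keyword arguments to `Workflow` (as the updated `WorkflowOptionsParser` relies on) and that Prodigal is installed; the tag is made up:

```python
from bioprov.data import genome_annotation_dataset
from bioprov.workflows import genome_annotation

wf = genome_annotation(
    tag="demo annotation",     # stored as 'demo-annotation' (spaces are replaced)
    update_db=True,            # Project.update_db() after the last step
    write_provn=True,          # write a PROVN record from the BioProvDocument
    write_pdf=False,
    upload_to_provstore=False,
)
wf.input = genome_annotation_dataset  # same sheet the test suite uses
wf.run_steps(["prodigal"])            # _post_wf_actions() fires once the steps finish
```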
@@ -116,6 +184,15 @@ def generate_project(self): "directory": self._load_directory_input, } self.project = _generate_project[self.input_type]() + if self.tag is None: + self.tag = self.project.tag + "_" + self.name + else: + self.project.tag = self.tag + + if self.update_db: + self.project.auto_update = True + # Logging starts once the project is loaded. + self.start_logging() def generate_parser(self): parser = argparse.ArgumentParser( @@ -140,19 +217,58 @@ def generate_parser(self): default=config.threads, ) parser.add_argument( + "-v", "--verbose", help="More verbose output", action="store_true", default=False, required=False, ) - parser.add_argument("-t", "--tag", help="A tag for the dataset", required=False) + parser.add_argument("-t", "--tag", help="A tag for the Project", required=False) + parser.add_argument( + "-s", + "--sep", + help="Separator for the tab-delimited file.", + required=False, + default="\t", + ) + parser.add_argument( + "-l", + "--log", + help="Path to write log file to. If not set, will be defined automatically.", + required=False, + default=None, + ) parser.add_argument( "--steps", help=f"A comma-delimited string of which steps will be run in the workflow.\n" f"Possible steps:\n{list(self.steps.keys())}", default=self.default_steps, - ), + ) + parser.add_argument( + "--update_db", + help="Whether to update the Project in the BioProvDB.", + action="store_true", + required=False, + ) + parser.add_argument( + "--upload_to_provstore", + help="Whether to upload the Project to ProvStore at the end of the execution.", + action="store_true", + required=False, + ) + parser.add_argument( + "--write_provn", + help="Whether to write PROVN output at the end of the execution.", + action="store_true", + required=False, + ) + parser.add_argument( + "--write_pdf", + help="Whether to write graphical output at the end of the execution.", + action="store_true", + required=False, + ) self.parser = parser @@ -169,6 +285,19 @@ def add_step(self, step): # Update parser: self.generate_parser() + def start_logging(self): + if self.verbose: + self._log_level = logging.DEBUG + else: + self._log_level = logging.INFO + _custom_start_message = ( + f"Starting '{self.name}' workflow for project '{self.project.tag}'." + ) + self.project.start_logging( + level=self._log_level, _custom_start_message=_custom_start_message + ) + self.logger = self.project.logger + # TODO: implement Project steps def run_steps(self, steps_to_run): """ @@ -186,11 +315,19 @@ def run_steps(self, steps_to_run): if self.project is None: self.generate_project() - for k, step in tqdm(self.steps.items()): + steps_str = " " + " \n".join( + [f"{ix+1}. {name}" for ix, name in enumerate(self.steps.keys())] + ) + self.logger.info(f"Running {len(self.steps)} steps:\n{steps_str}") + + # Start running steps + for k, step in self.steps.items(): if k in steps_to_run: if step.kind == "Sample": + self.logger.info(f"Running '{step.name}' for each sample.") + # Progress bar only for sample steps. for _, sample in tqdm(self.project.items()): - _run = step.run(sample=sample, _print=self.verbose) + step.run(sample=sample) if not step.runs[ str(len(step.runs)) ].stderr: # Add to successes if no standard error. 
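`Workflow.start_logging()` above is a thin wrapper around the `Project.start_logging()` method added earlier in bioprov/src/main.py. A small sketch of calling it directly, under the signatures shown in this diff — the project tag and sample name are made up:

```python
import logging

import bioprov as bp

project = bp.Project(tag="demo-project", samples=[bp.Sample("sample_1")])
project.start_logging(level=logging.DEBUG)  # defaults to '<tag>.log', i.e. demo-project.log

# start_logging() rebinds bp.config.logger, registers the log file under the
# 'log' tag in project.files, and reports how many samples were loaded.
project.logger.info("hello from the project logger")
```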
@@ -198,15 +335,13 @@ def run_steps(self, steps_to_run): # TODO // write this test elif step.kind == "Project": # no cover + self.logger.info(f"Running '{step.name}' for project.") self.project.add_programs(step) self.project.programs[step.name].run() - if not step.runs[ - str(len(step.runs)) - ].stderr: # Add to successes if no standard error. - step.successes += 1 else: # no cover - if self.verbose: - print(f"Skipping step '{step.name}'") + self.logger.info(f"Skipping step '{step.name}'") + + self._post_wf_actions() def _project_from_dataframe(self, df): """ @@ -214,8 +349,6 @@ def _project_from_dataframe(self, df): :param df: Instance of pd.DataFrame. :return: Updates self.project. """ - # Loading samples statement - print(Warnings()["sample_loading"](len(df))) project = from_df( df, index_col=self.index_col, @@ -271,15 +404,15 @@ def _load_dataframe_input(self): """ index_col = self.index_col - input_ = self.input + input = self.input file_columns = self.file_columns # Assert input file exists - assert path.isfile(input_), Warnings()["not_exist"] + assert path.isfile(input), Warnings()["not_exist"] # Read input - df = pd.read_csv(input_, sep=self.sep) - self.project_csv = input_ + df = pd.read_csv(input, sep=self.sep) + self.project_csv = input # Assert index_col exists in df.columns assert ( @@ -312,20 +445,6 @@ def _load_dataframe_input(self): project = self._project_from_dataframe(df) return project - # TODO // this is related to refactoring command-line parsers - def main(self): # no cover - """ - Parses command-line arguments and runs the workflow. - :return: - """ - if self.parser is None: - self.generate_parser() - args = self.parser.parse_args() - self.input = args.input - self.input_type = args.input_type - steps = args.steps - self.run_steps(steps) - class Step(PresetProgram): """ @@ -354,3 +473,6 @@ def __init__( self.description = description self.successes = 0 self.kind = kind + + def __repr__(self): + return f"Step '{self.name}' with {len(self.params)} parameter(s) and kind '{self.kind}'." 
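Earlier in this diff, `Program.run()`, `PresetProgram.run()` and `add_parameter()` dropped their `_print` arguments in favour of `config.logger`. A usage sketch of the resulting call pattern with the bundled `prodigal` preset — it assumes Prodigal is on `$PATH` and that the preset reads the sample file tagged `assembly`:

```python
from bioprov import Sample
from bioprov.data import synechococcus_genome
from bioprov.programs import prodigal

sample = Sample("Synechococcus", files={"assembly": synechococcus_genome})

preset = prodigal(sample)  # create_func() now also registers the program on the sample
print(preset.cmd)          # command resolved against the sample's files

run = preset.run()         # returns a Run; output is reported via config.logger, no _print flag
print(run.finished)
```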
diff --git a/bioprov/tests/__init__.py b/bioprov/tests/__init__.py index c57fe4a..0fe0ba7 100644 --- a/bioprov/tests/__init__.py +++ b/bioprov/tests/__init__.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ diff --git a/bioprov/tests/test_bioprov_imports.py b/bioprov/tests/test_bioprov_imports.py index e1bd6c4..a3e82c9 100644 --- a/bioprov/tests/test_bioprov_imports.py +++ b/bioprov/tests/test_bioprov_imports.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ diff --git a/bioprov/tests/test_bioprov_integration.py b/bioprov/tests/test_bioprov_integration.py index c6edf86..7a98b11 100644 --- a/bioprov/tests/test_bioprov_integration.py +++ b/bioprov/tests/test_bioprov_integration.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ @@ -10,9 +10,11 @@ """ from argparse import Namespace -import bioprov as bp -import pytest from os import remove + +import pytest + +import bioprov as bp from bioprov.data import picocyano_dataset @@ -112,10 +114,11 @@ def test_CLI(): list=False, show_provstore=False, ) - with pytest.raises(AttributeError) as pytest_wrapped_e: + + with pytest.raises(KeyError) as pytest_wrapped_e: main(args) - assert pytest_wrapped_e.type == AttributeError + assert pytest_wrapped_e.type == KeyError args = Namespace( show_config=False, show_db=False, version=False, list=True, show_provstore=False diff --git a/bioprov/tests/test_bioprov_programs.py b/bioprov/tests/test_bioprov_programs.py index f733214..1fcfac2 100644 --- a/bioprov/tests/test_bioprov_programs.py +++ b/bioprov/tests/test_bioprov_programs.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ @@ -63,20 +63,6 @@ def test_blastn(): assert blast_params["-outfmt"]["value"] == "6" -def test_blastn(): - - s = Sample("Synechococcus", files={"query": synechococcus_genome}) - reference_db = "./path_to_a_valid_blastdb" - - blast = blastn(s, reference_db) - blast_params = blast.serializer()["params"] - - expected = ["-db", "-outfmt", "-query", "-out"] - - assert list(blast_params.keys()) == expected - assert blast_params["-outfmt"]["value"] == "6" - - def test_blastp(): s = Sample("Synechococcus", files={"query": synechococcus_genome}) diff --git a/bioprov/tests/test_bioprov_workflows.py b/bioprov/tests/test_bioprov_workflows.py index e0dfcd6..01d09f4 100644 --- a/bioprov/tests/test_bioprov_workflows.py +++ b/bioprov/tests/test_bioprov_workflows.py @@ -2,15 +2,16 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ Testing for the workflows package. """ from os import remove -from bioprov.utils import Warnings +from pathlib import Path from bioprov.data import genome_annotation_dataset +from bioprov.utils import Warnings from bioprov.workflows.blastn import blastn_alignment from bioprov.workflows.genome_annotation import genome_annotation from bioprov.workflows.kaiju import KaijuWorkflow @@ -32,7 +33,7 @@ def test_genome_annotation(): Tests the 'genome_annotation' workflow with the 'prodigal' step. 
:return: """ - workflow = genome_annotation() + workflow = genome_annotation(tag="test-project") workflow.input = genome_annotation_dataset steps = [ "prodigal", @@ -43,6 +44,10 @@ def test_genome_annotation(): for key, file in sample.files.items(): assert file.exists, Warnings()["not_exist"](file.path) + log_file = "test-project.log" + assert Path(log_file).exists() + remove(log_file) + def test_kaiju_workflow(): """ diff --git a/bioprov/tests/test_src_config.py b/bioprov/tests/test_src_config.py index c187398..93c6715 100644 --- a/bioprov/tests/test_src_config.py +++ b/bioprov/tests/test_src_config.py @@ -2,21 +2,23 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ Testing for the Config module. """ -import bioprov as bp -from bioprov.src.config import Config, BioProvDB from os import environ, remove from pathlib import Path -from tinydb import TinyDB, Query -from coolname import generate_slug from tempfile import NamedTemporaryFile +from coolname import generate_slug +from tinydb import TinyDB, Query + +import bioprov as bp +from bioprov.src.config import Config, BioProvDB + def test_Config(): """ diff --git a/bioprov/tests/test_src_file.py b/bioprov/tests/test_src_file.py index 0b32fdd..91de614 100644 --- a/bioprov/tests/test_src_file.py +++ b/bioprov/tests/test_src_file.py @@ -2,19 +2,21 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ Testing for the File module. """ +from pathlib import Path + +from coolname import generate_slug +from prov.model import ProvEntity + import bioprov as bp from bioprov import File, SeqFile, Directory, utils -from bioprov.src.files import seqrecordgenerator -from coolname import generate_slug -from pathlib import Path from bioprov.data import synechococcus_genome -from prov.model import ProvEntity +from bioprov.src.files import seqrecordgenerator def test_File_and_Directory(): diff --git a/bioprov/tests/test_src_main.py b/bioprov/tests/test_src_main.py index d789d8a..5c047e3 100644 --- a/bioprov/tests/test_src_main.py +++ b/bioprov/tests/test_src_main.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ @@ -19,21 +19,12 @@ # TODO: organize this import datetime -import pandas as pd -from bioprov.src.main import ( - generate_param_str, - Parameter, - File, - Directory, - parse_params, - Program, - Run, - dict_to_sample, - json_to_dict, -) -from coolname import generate_slug from os import remove from pathlib import Path + +import pandas as pd +from coolname import generate_slug + from bioprov import ( Sample, Project, @@ -44,9 +35,20 @@ BioProvDocument, BioProvDB, ) -from bioprov.utils import dict_to_sha1, Warnings from bioprov.data import synechococcus_genome, picocyano_dataset from bioprov.programs import prodigal +from bioprov.src.main import ( + generate_param_str, + Parameter, + File, + Directory, + parse_params, + Program, + Run, + dict_to_sample, + json_to_dict, +) +from bioprov.utils import dict_to_sha1, Warnings def test_Program(): diff --git a/bioprov/tests/test_src_prov.py b/bioprov/tests/test_src_prov.py index 872b324..d1a1dd7 100644 --- a/bioprov/tests/test_src_prov.py +++ b/bioprov/tests/test_src_prov.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = 
"https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ @@ -11,12 +11,14 @@ """ from os import environ + +from pydot import Dot + from bioprov import read_csv from bioprov.data import picocyano_dataset -from bioprov.src.prov import BioProvDocument from bioprov.src.config import EnvProv +from bioprov.src.prov import BioProvDocument from bioprov.utils import dict_to_sha1 -from pydot import Dot project = read_csv( picocyano_dataset, sequencefile_cols="assembly", tag="picocyanobacteria" diff --git a/bioprov/utils.py b/bioprov/utils.py index 99aa68f..7f8f55a 100644 --- a/bioprov/utils.py +++ b/bioprov/utils.py @@ -2,16 +2,18 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ Helper functions. """ +import hashlib import io -import sys import json -import hashlib +import logging +import sys from pathlib import Path + from prov.model import Namespace, QualifiedName @@ -237,3 +239,25 @@ def pattern_replacer(pattern, iterable_of_olds, new): pattern = pattern.replace(old, new) return pattern + + +def create_logger(log_level=logging.INFO, log_file=None, tag=None): + if tag is None: + tag = "bioprov" + logger = logging.getLogger(tag) + logger.setLevel(log_level) + + # Console handler + stream_handler = logging.StreamHandler() + simple_fmt = logging.Formatter("%(message)s") + stream_handler.setFormatter(simple_fmt) + logger.addHandler(stream_handler) + + # File handler + if log_file: + timestamp_fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") + fh_handler = logging.FileHandler(log_file, mode="w", delay=True) + fh_handler.setFormatter(timestamp_fmt) + logger.addHandler(fh_handler) + + return logger diff --git a/bioprov/workflows/__init__.py b/bioprov/workflows/__init__.py index 0dbe70b..7770b7a 100644 --- a/bioprov/workflows/__init__.py +++ b/bioprov/workflows/__init__.py @@ -2,10 +2,10 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" +from .blastn import blastn_alignment from .genome_annotation import genome_annotation from .kaiju import KaijuWorkflow -from .blastn import blastn_alignment from .wf_parser import WorkflowOptionsParser diff --git a/bioprov/workflows/blastn.py b/bioprov/workflows/blastn.py index b43e19c..7480e92 100644 --- a/bioprov/workflows/blastn.py +++ b/bioprov/workflows/blastn.py @@ -3,7 +3,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ BLAST nucleotide alignment workflow @@ -14,8 +14,8 @@ with the BioProv CLI application (recommended). """ -from bioprov.src.workflow import Workflow, Step from bioprov.programs import blastn +from bioprov.src.workflow import Workflow, Step def blastn_alignment(**kwargs): @@ -50,8 +50,3 @@ def blastn_alignment(**kwargs): ) return _blastn_alignment - - -if __name__ == "__main__": - workflow = blastn_alignment() - workflow.main() diff --git a/bioprov/workflows/genome_annotation.py b/bioprov/workflows/genome_annotation.py index f84f21c..9106b74 100644 --- a/bioprov/workflows/genome_annotation.py +++ b/bioprov/workflows/genome_annotation.py @@ -3,7 +3,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ Genome annotation workflow module. 
@@ -14,8 +14,8 @@ with the BioProv CLI application (recommended). """ -from bioprov.src.workflow import Workflow, Step from bioprov.programs import prodigal # , prokka +from bioprov.src.workflow import Workflow, Step def genome_annotation(**kwargs): @@ -40,8 +40,3 @@ def genome_annotation(**kwargs): _genome_annotation.add_step(_step) return _genome_annotation - - -if __name__ == "__main__": - workflow = genome_annotation() - workflow.main() diff --git a/bioprov/workflows/kaiju.py b/bioprov/workflows/kaiju.py index 0626021..3c8cde4 100644 --- a/bioprov/workflows/kaiju.py +++ b/bioprov/workflows/kaiju.py @@ -3,7 +3,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ @@ -15,13 +15,16 @@ with the BioProv CLI application (recommended). """ +import argparse +import logging from os import path, getcwd, mkdir -from bioprov import config, from_df + +import pandas as pd +from tqdm import tqdm + +from bioprov import config, from_df, Sample from bioprov.programs import kaiju, kaiju2table from bioprov.utils import Warnings, tax_ranks -from tqdm import tqdm -import argparse -import pandas as pd class KaijuWorkflow: @@ -85,13 +88,14 @@ def main( file_ ), f"File '{file_}' was not found! Make sure all file paths are correct in input file." - print(Warnings()["sample_loading"](len(df))) + logging.warning(Warnings()["sample_loading"](len(df))) # Create BioProv Project ss = from_df(df, index_col="sample-id", file_cols=("R1", "R2"), tag=_tag) success, skip = 0, 0 + sample: Sample for k, sample in tqdm(ss.items()): kaiju_ = kaiju( sample, @@ -124,7 +128,7 @@ def main( names=names, add_param_str=kaiju2table_params, ) - k2t_run = kaiju2table_.run(sample, _print=False) + kaiju2table_.run(sample) all_files_exist = False for k_, v in sample.files.items(): diff --git a/bioprov/workflows/wf_parser.py b/bioprov/workflows/wf_parser.py index bf02177..de7ed14 100644 --- a/bioprov/workflows/wf_parser.py +++ b/bioprov/workflows/wf_parser.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" """ @@ -22,57 +22,48 @@ def __init__(self): pass @staticmethod - def _blastn_alignment(options): + def _blastn_alignment(kwargs, steps): """ Runs blastn alignment workflow :return: """ - main = blastn_alignment(db=options.database) - main.input = options.input - steps = options.steps + main = blastn_alignment(**kwargs) main.run_steps(steps) @staticmethod - def _genome_annotation(options): + def _genome_annotation(kwargs, steps): """ Runs genome annotation workflow :return: """ - main = genome_annotation() - main.input = options.input - steps = options.steps + main = genome_annotation(**kwargs) main.run_steps(steps) @staticmethod - def _kaiju_workflow(options): + def _kaiju_workflow(kwargs, steps): """ Runs Kaiju workflow :return: """ - KaijuWorkflow.main( - input_file=options.input, - output_path=options.output_directory, - kaijudb=options.kaiju_db, - nodes=options.nodes, - names=options.names, - threads=options.threads, - _tag=options.tag, - verbose=options.verbose, - kaiju_params=options.kaiju_params, - kaiju2table_params=options.kaiju2table_params, - ) + _ = steps + KaijuWorkflow.main(**kwargs) def parse_options(self, options): """ Parses options and returns correct workflow. - :param options: - :return: + :type options: argparse.Namespace + :param options: arguments passed by the parser. 
+ :return: Runs the specified subparser in options.subparser_name. """ subparsers = { - "genome_annotation": lambda _options: self._genome_annotation(_options), - "blastn": lambda _options: self._blastn_alignment(_options), - "kaiju": lambda _options: self._kaiju_workflow(_options), + "genome_annotation": lambda _options, _steps: self._genome_annotation( + _options, _steps + ), + "blastn": lambda _options, _steps: self._blastn_alignment(_options, _steps), + "kaiju": lambda _options, _steps: self._kaiju_workflow(_options, _steps), } # Run desired subparser - subparsers[options.subparser_name](options) + kwargs = dict(options._get_kwargs()) + steps = kwargs.pop("steps") + subparsers[options.subparser_name](kwargs, steps) diff --git a/docs/source/conf.py b/docs/source/conf.py index fcefef8..1484035 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,7 +24,7 @@ author = "Vini Salazar" # The full version, including alpha/beta/rc tags -release = "0.1.18a" +release = "0.1.19" # -- General configuration --------------------------------------------------- diff --git a/setup.py b/setup.py index 06ac34a..347097c 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ __license__ = "MIT" __maintainer__ = "Vini Salazar" __url__ = "https://github.com/vinisalazar/bioprov" -__version__ = "0.1.18a" +__version__ = "0.1.19" import setuptools @@ -12,7 +12,7 @@ setuptools.setup( name="bioprov", - version="0.1.18a", + version="0.1.19", author="Vini Salazar", author_email="viniws@gmail.com", description="BioProv - Provenance capture for bioinformatics workflows",
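To make the new dispatch in `WorkflowOptionsParser.parse_options()` concrete, here is roughly what it does with the parsed options. This is a standalone sketch: the `Namespace` values are made up and the workflow factory is not actually invoked here.

```python
from argparse import Namespace

# Options as the CLI subparser would produce them (values are hypothetical).
options = Namespace(
    subparser_name="genome_annotation",
    input="assemblies.tsv",
    steps="prodigal",
    update_db=True,
    write_provn=True,
)

kwargs = dict(options._get_kwargs())  # every CLI option becomes a keyword argument
steps = kwargs.pop("steps")           # 'steps' is handled by Workflow.run_steps(), not the factory

# parse_options() then effectively does:
#     genome_annotation(**kwargs).run_steps(steps)
print(kwargs, steps)
```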