From 7e6382183ea02e086360dba5946a8725a3cdc98d Mon Sep 17 00:00:00 2001 From: "Filipe G. Vieira" <1151762+fgvieira@users.noreply.github.com> Date: Thu, 30 Mar 2023 12:46:52 +0200 Subject: [PATCH] feat: also define overhang on params.extra (#1173) ### Description ### QC * [x] I confirm that: For all wrappers added by this PR, * there is a test case which covers any introduced changes, * `input:` and `output:` file paths in the resulting rule can be changed arbitrarily, * either the wrapper can only use a single core, or the example rule contains a `threads: x` statement with `x` being a reasonable default, * rule names in the test case are in [snake_case](https://en.wikipedia.org/wiki/Snake_case) and somehow tell what the rule is about or match the tool's purpose or name (e.g., `map_reads` for a step that maps reads), * all `environment.yaml` specifications follow [the respective best practices](https://stackoverflow.com/a/64594513/2352071), * wherever possible, command line arguments are inferred and set automatically (e.g. based on file extensions in `input:` or `output:`), * all fields of the example rules in the `Snakefile`s and their entries are explained via comments (`input:`/`output:`/`params:` etc.), * `stderr` and/or `stdout` are logged correctly (`log:`), depending on the wrapped tool, * temporary files are either written to a unique hidden folder in the working directory, or (better) stored where the Python function `tempfile.gettempdir()` points to (see [here](https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir); this also means that using any Python `tempfile` default behavior works), * the `meta.yaml` contains a link to the documentation of the respective tool or command, * `Snakefile`s pass the linting (`snakemake --lint`), * `Snakefile`s are formatted with [snakefmt](https://github.com/snakemake/snakefmt), * Python wrapper scripts are formatted with [black](https://black.readthedocs.io). 
* Conda environments use a minimal number of channels, in the recommended order. E.g. for bioconda, use conda-forge, bioconda, nodefaults (conda-forge should have the highest priority, and the defaults channel is usually not needed because most packages are in conda-forge nowadays). --- bio/star/index/meta.yaml | 4 ++++ bio/star/index/wrapper.py | 14 ++++++-------- meta/bio/star_arriba/test/Snakefile | 9 +++++---- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/bio/star/index/meta.yaml b/bio/star/index/meta.yaml index 4d604d03ea..4477feae15 100644 --- a/bio/star/index/meta.yaml +++ b/bio/star/index/meta.yaml @@ -1,5 +1,6 @@ name: "STAR Index" description: Index fasta sequences with STAR +url: https://github.com/alexdobin/STAR authors: - Thibault Dayris - Tomás Di Domenico @@ -8,3 +9,6 @@ input: - A (multi)fasta formatted file output: - A directory containing the indexed sequence for downstream STAR mapping +params: + - sjdbOverhang: length of the donor/acceptor sequence on each side of the junctions (optional) + - extra: additional program arguments. 
diff --git a/bio/star/index/wrapper.py b/bio/star/index/wrapper.py index 7b2ff20e8a..7f8bdd72d7 100644 --- a/bio/star/index/wrapper.py +++ b/bio/star/index/wrapper.py @@ -10,18 +10,16 @@ from snakemake.utils import makedirs log = snakemake.log_fmt_shell(stdout=True, stderr=True) - extra = snakemake.params.get("extra", "") -sjdb_overhang = snakemake.params.get("sjdbOverhang", "100") -gtf = snakemake.input.get("gtf") -if gtf is not None: - gtf = f"--sjdbGTFfile {gtf}" +sjdb_overhang = snakemake.params.get("sjdbOverhang", "") +if sjdb_overhang: sjdb_overhang = f"--sjdbOverhang {sjdb_overhang}" -else: - gtf = sjdb_overhang = "" -makedirs(snakemake.output) +gtf = snakemake.input.get("gtf", "") +if gtf: + gtf = f"--sjdbGTFfile {gtf}" + with tempfile.TemporaryDirectory() as tmpdir: shell( diff --git a/meta/bio/star_arriba/test/Snakefile b/meta/bio/star_arriba/test/Snakefile index 8ea743c457..c6dc2075b7 100644 --- a/meta/bio/star_arriba/test/Snakefile +++ b/meta/bio/star_arriba/test/Snakefile @@ -1,12 +1,13 @@ rule star_index: input: fasta="resources/genome.fasta", - annotation="resources/genome.gtf", + gtf="resources/genome.gtf", output: directory("resources/star_genome"), threads: 4 params: - extra=lambda wc, input: f"--sjdbGTFfile {input.annotation} --sjdbOverhang 100", + sjdbOverhang=100, + extra="--genomeSAindexNbases 2", log: "logs/star_index_genome.log", cache: True # mark as eligible for between workflow caching @@ -41,7 +42,7 @@ rule star_align: rule arriba: input: - bam="star/{sample}/Aligned.out.bam", + bam=rules.star_align.output.aln, genome="resources/genome.fasta", annotation="resources/genome.gtf", # optional: # A custom tsv containing identified artifacts, such as read-through fusions of neighbouring genes. @@ -54,7 +55,7 @@ rule arriba: params: # required if blacklist or known_fusions is set genome_build="GRCh38", - default_blacklist=True, + default_blacklist=False, default_known_fusions=True, extra="", log: