vanheeringen-lab · simonvh · Mar 20, 2023 · Mar 20, 2023 · Mar 20, 2023 · Mar 20, 2023
@@ -9,6 +9,8 @@ All changed fall under either one of these types: `Added`, `Changed`, `Deprecate
 
 ## [Unreleased]
 
+## [1.2.0] - 2023-09-18
+
 ### Changed
 
 - DESeq2 now uses more samples to estimate count dispersions
@@ -826,7 +828,8 @@ Many minor bug- and quality of life fixes.
 
 First release of seq2science!
 
-[Unreleased]: https://github.com/vanheeringen-lab/seq2science/compare/v1.1.0...develop
+[Unreleased]: https://github.com/vanheeringen-lab/seq2science/compare/v1.2.0...develop
+[1.2.0]: https://github.com/vanheeringen-lab/seq2science/compare/v1.1.0...v1.2.0
 [1.1.0]: https://github.com/vanheeringen-lab/seq2science/compare/v1.0.4...v1.1.0
 [1.0.4]: https://github.com/vanheeringen-lab/seq2science/compare/v1.0.3...v1.0.4
 [1.0.3]: https://github.com/vanheeringen-lab/seq2science/compare/v1.0.2...v1.0.3

@@ -1,4 +1,4 @@
 from . import util
 
 __all__ = ["util"]
-__version__ = "1.1.0"
+__version__ = "1.2.0"
@@ -11,6 +11,7 @@ import copy
 import json
 import requests
 import yaml
+from subprocess import check_output, STDOUT, CalledProcessError
 
 import xdg
 import pandas as pd
@@ -44,6 +45,18 @@ DAG.warn_about_changes = lambda *args: None
 CACHE_DIR = os.path.join(xdg.XDG_CACHE_HOME, "seq2science", seq2science.__version__)
 
 
+def is_valid_url(url):
+    cmd = ["wget", "--spider", url]
+    try:
+        result = check_output(cmd, stderr=STDOUT).decode().strip()
+    except CalledProcessError:
+        return False
+    if result.endswith("exists.") and not "does not exist" in result:
+        return True
+    else:
+        return False
+
+
 def parse_config(cfg):
     # convert all config keys to lower case (upper case = user typo)
     cfg = {k.lower(): v for k, v in cfg.items()}
@@ -467,13 +480,13 @@ def parse_pysradb():
     ena_single_end = [
         run
         for values in sampledict.values()
-        if (values["layout"] == "SINGLE") and values.get("ena_fastq_ftp") is not None
+        if (values["layout"] == "SINGLE") and values.get("ena_fastq_ftp") is not None and is_valid_url(values["ena_fastq_ftp"].replace("era-fasp@fasp", "ftp"))
         for run in values["runs"]
     ]
     ena_paired_end = [
         run
         for values in sampledict.values()
-        if (values["layout"] == "PAIRED") and values.get("ena_fastq_ftp") is not None
+        if (values["layout"] == "PAIRED") and values.get("ena_fastq_ftp") is not None and is_valid_url(values["ena_fastq_ftp"].replace("era-fasp@fasp", "ftp"))
         for run in values["runs"]
     ]
     gsa_or_encode_single_end = [

@@ -1,73 +1,11 @@
 """
 all rules/logic related downloading public fastqs should be here.
 """
-localrules: run2sra, ena2fastq_SE, ena2fastq_PE, gsa_or_encode2fastq_SE, gsa_or_encode2fastq_PE, runs2sample
 
+localrules: ena2fastq_SE, ena2fastq_PE, gsa_or_encode2fastq_SE, gsa_or_encode2fastq_PE, runs2sample
 
-import os
-
-
-rule run2sra:
-    """
-    Download the SRA of a sample by its unique identifier.
-    """
-    output:
-        temp(expand("{sra_dir}/{{run}}/{{run}}/{{run}}.sra", **config)),
-    log:
-        expand("{log_dir}/run2sra/{{run}}.log", **config),
-    benchmark:
-        expand("{benchmark_dir}/run2sra/{{run}}.benchmark.txt", **config)[0]
-    message: EXPLAIN["run2sra"]
-    resources:
-        parallel_downloads=1,
-    params:
-        outdir=lambda wildcards: f"{config['sra_dir']}/{wildcards.run}",
-    conda:
-        "../envs/get_fastq.yaml"
-    wildcard_constraints:
-        run="[DES]RR\d+",
-    retries: 2
-    shell:
-        """
-        # move to output dir since somehow prefetch sometimes puts files in the cwd...
-        # and remove the top level folder since prefetch will assume we are done otherwise
-        mkdir -p {params.outdir}; cd {params.outdir}; rm -r {wildcards.run}
-
-        # TODO: for loop can be removed if we dont see the debug message
-        # three attempts
-        for i in {{1..3}}
-        do
-            # acquire a lock
-            (
-                flock -w 30 200 || continue
-                sleep 2
-            ) 200>{PYSRADB_CACHE_LOCK}
-
-            # dump
-            prefetch --max-size u --output-directory ./ --log-level debug --progress {wildcards.run} \
-            >> {log} 2>&1 && break
-
-            # retry
-            echo "DEBUG: prefetch try ${{i}} of 3 failed" >> {log}
-            sleep 10
-        done
 
-        # TODO: section can be removed if we dont see the debug message
-        # bug report: https://github.com/ncbi/sra-tools/issues/533
-        if [[ -f "{params.outdir}/{wildcards.run}.sra" ]]; then
-            echo "DEBUG: moving output to correct directory" >> {log}
-            mkdir -p {params.outdir}/{wildcards.run}
-            mv {params.outdir}/{wildcards.run}.sra {output}
-        fi
-
-        # TODO: section can be removed if we dont see the debug message
-        # If an sralite file was downloaded instead of a sra file, just rename it
-        if [[ -f "{params.outdir}/{wildcards.run}.sralite" ]]; then
-            echo "DEBUG: renaming SRAlite" >> {log}
-            mkdir -p {params.outdir}/{wildcards.run}
-            mv {params.outdir}/{wildcards.run}.sralite {output}
-        fi
-        """
+import os
 
 
 # ENA > SRA
@@ -83,11 +21,8 @@ rule sra2fastq_SE:
     """
     Downloaded (raw) SRAs are converted to single-end fastq files.
     """
-    input:
-        rules.run2sra.output,
     output:
-        fastq=temp(expand("{fastq_dir}/runs/{{run}}.{fqsuffix}.gz", **config)),
-        tmpdir=temp(directory(expand("{fastq_dir}/runs/tmp/{{run}}", **config))),
+        temp(expand("{fastq_dir}/runs/{{run}}.{fqsuffix}.gz", **config))
     log:
         expand("{log_dir}/sra2fastq_SE/{{run}}.log", **config),
     benchmark:
@@ -97,27 +32,41 @@ rule sra2fastq_SE:
     threads: 8
     conda:
         "../envs/get_fastq.yaml"
-    priority: 1
     retries: 2
     shell:
         """
-        # move to output dir since somehow parallel-fastq-dump sometimes puts files in the cwd...
-        mkdir -p {output.tmpdir}; cd {output.tmpdir}
+        tmpdir=`mktemp -d`
+	    echo $tmpdir        
+	    echo "Using tmpdir $tmpdir" >> {log}
+
+        # move to output dir since somehow prefetch and parallel-fastq-dump sometimes put files in the cwd...
+        cd $tmpdir
 
         # acquire the lock
         (
             flock -w 30 200 || exit 1
             sleep 3
         ) 200>{PYSRADB_CACHE_LOCK}
 
+        # three attempts
+        for i in {{1..3}}
+        do
+            # dump
+            source {workflow.basedir}/../../scripts/download_sra.sh {wildcards.run} {log} && break
+            sleep 10
+        done
+
         # dump to tmp dir
-        parallel-fastq-dump -s {input} -O {output.tmpdir} \
+        parallel-fastq-dump -s {wildcards.run}/{wildcards.run}.sra -O $tmpdir \
         --threads {threads} --split-spot --skip-technical --dumpbase --readids \
         --clip --read-filter pass --defline-seq '@$ac.$si.$sg/$ri' \
         --defline-qual '+' --gzip >> {log} 2>&1
 
         # rename file and move to output dir
-        mv {output.tmpdir}/*.fastq.gz {output.fastq}
+        mv $tmpdir/*fastq.gz {output} >> {log} 2>&1
+
+        # Remove temporary directory
+        rm -rf $tmpdir >> {log} 2>&1
         """
 
 
@@ -126,11 +75,8 @@ rule sra2fastq_PE:
     Downloaded (raw) SRAs are converted to paired-end fastq files.
     Forward and reverse samples will be switched if forward/reverse names are not lexicographically ordered.
     """
-    input:
-        rules.run2sra.output,
     output:
-        fastq=temp(expand("{fastq_dir}/runs/{{run}}_{fqext}.{fqsuffix}.gz", **config)),
-        tmpdir=temp(directory(expand("{fastq_dir}/runs/tmp/{{run}}", **config))),
+        temp(expand("{fastq_dir}/runs/{{run}}_{fqext}.{fqsuffix}.gz", **config))
     log:
         expand("{log_dir}/sra2fastq_PE/{{run}}.log", **config),
     benchmark:
@@ -140,35 +86,52 @@ rule sra2fastq_PE:
         run="|".join(SRA_PAIRED_END) if len(SRA_PAIRED_END) else "$a",  # only try to dump (paired-end) SRA samples
     conda:
         "../envs/get_fastq.yaml"
-    priority: 1
     retries: 2
     shell:
         """
-        # move to output dir since somehow parallel-fastq-dump sometimes puts files in the cwd...
-        mkdir -p {output.tmpdir}; cd {output.tmpdir}
+        tmpdir=`mktemp -d`
+        echo $tmpdir
+	    echo "Using tmpdir $tmpdir" >> {log}
+
+        # move to output dir since somehow prefetch and parallel-fastq-dump sometimes put files in the cwd...
+        cd $tmpdir
 
         # acquire the lock
         (
             flock -w 30 200 || exit 1
             sleep 3
         ) 200>{PYSRADB_CACHE_LOCK}
 
+        # three attempts
+        for i in {{1..3}}
+        do
+            # dump
+            source {workflow.basedir}/../../scripts/download_sra.sh {wildcards.run} {log} && break
+            sleep 10
+        done
+
         # dump to tmp dir
-        parallel-fastq-dump -s {input} -O {output.tmpdir} \
-        --threads {threads} --split-e --skip-technical --dumpbase \
-        --readids --clip --read-filter pass --defline-seq '@$ac.$si.$sg/$ri' \
+        parallel-fastq-dump -s {wildcards.run}/{wildcards.run}.sra -O $tmpdir \
+        --threads {threads} --split-e --skip-technical --dumpbase --readids \
+        --clip --read-filter pass --defline-seq '@$ac.$si.$sg/$ri' \
         --defline-qual '+' --gzip >> {log} 2>&1
-        
+
         # check if the files exist
-        if ! compgen -G "{output.tmpdir}/*_1*" > /dev/null ; then printf "ERROR: Couldn't find read 1.fastq after dumping! Perhaps this is not a paired-end file?\n" >> {log} 2>&1; fi
-        if ! compgen -G "{output.tmpdir}/*_2*" > /dev/null ; then printf "ERROR: Couldn't find read 2.fastq after dumping! Perhaps this is not a paired-end file?\n" >> {log} 2>&1; fi
+        if ! compgen -G $tmpdir/*_1* > /dev/null ; then printf "ERROR: Couldn't find read 1.fastq after dumping! Perhaps this is not a paired-end file?\n" >> {log} 2>&1; fi
+        if ! compgen -G $tmpdir/*_2* > /dev/null ; then printf "ERROR: Couldn't find read 2.fastq after dumping! Perhaps this is not a paired-end file?\n" >> {log} 2>&1; fi
+
+       # rename file and move to output dir
+        if ls $tmpdir/*_1* 1> /dev/null 2>&1; then
+	        mv $tmpdir/*_1* {output[0]} >> {log} 2>&1
+            mv $tmpdir/*_2* {output[1]} >> {log} 2>&1
+        else
+	        echo "No paired end files found!" >> {log}
+        fi
 
-        # rename file and move to output dir
-        mv {output.tmpdir}/*_1* {output.fastq[0]} >> {log} 2>&1
-        mv {output.tmpdir}/*_2* {output.fastq[1]} >> {log} 2>&1
+        # Remove temporary directory
+        rm -rf $tmpdir >> {log} 2>&1
         """
 
-
 rule ena2fastq_SE:
     """
     Download single-end fastq files directly from the ENA.

@@ -0,0 +1,35 @@
+#!/bin/bash/
+set -o nounset -e
+# Downloads an SRA file to the current folder.
+
+RUN=$1
+LOG=$2
+
+# Remove the top level folder since prefetch will assume we are done otherwise
+if [ -e $RUN ]; then
+    rm -r $RUN
+fi
+
+# three attempts
+echo "Downloading $RUN using prefetch" >> $LOG
+prefetch --max-size 999999999999 --output-directory ./ --log-level debug --progress $RUN >> $LOG 2>&1
+
+# TODO: this is the strangest bug, in that on some machines (ocimum) prefetch downloads
+# to a different location. Not sure what causes this, but this should fix that. Could simply
+# be a global setting that we haven't discovered yet...
+# bug report: https://github.com/ncbi/sra-tools/issues/533
+if [[ -f "$RUN.sra" ]]; then
+    mkdir -p $RUN
+    mv $RUN.sra $RUN/
+fi
+
+# If an sralite file was downloaded instead of a sra file, just rename it
+if [[ -f "$RUN.sralite" ]]; then
+    mkdir -p $RUN
+    mv $RUN.sralite $RUN/$RUN.sra
+fi
+
+if [[ ! -f "$RUN/$RUN.sra" ]]; then
+    echo "prefetch failed, file $RUN/$RUN.sra not found!"
+    exit 1
+fi