Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: SRA in tmpdir #963

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ All changed fall under either one of these types: `Added`, `Changed`, `Deprecate

## [Unreleased]

## [1.2.0] - 2023-09-18

### Changed

- DESeq2 now uses more samples to estimate count dispersions
Expand Down Expand Up @@ -826,7 +828,8 @@ Many minor bug- and quality of life fixes.

First release of seq2science!

[Unreleased]: https://github.com/vanheeringen-lab/seq2science/compare/v1.1.0...develop
[Unreleased]: https://github.com/vanheeringen-lab/seq2science/compare/v1.2.0...develop
[1.2.0]: https://github.com/vanheeringen-lab/seq2science/compare/v1.1.0...v1.2.0
[1.1.0]: https://github.com/vanheeringen-lab/seq2science/compare/v1.0.4...v1.1.0
[1.0.4]: https://github.com/vanheeringen-lab/seq2science/compare/v1.0.3...v1.0.4
[1.0.3]: https://github.com/vanheeringen-lab/seq2science/compare/v1.0.2...v1.0.3
Expand Down
2 changes: 1 addition & 1 deletion seq2science/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from . import util

__all__ = ["util"]
__version__ = "1.1.0"
__version__ = "1.2.0"
17 changes: 15 additions & 2 deletions seq2science/rules/configuration_generic.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import copy
import json
import requests
import yaml
from subprocess import check_output, STDOUT, CalledProcessError

import xdg
import pandas as pd
Expand Down Expand Up @@ -44,6 +45,18 @@ DAG.warn_about_changes = lambda *args: None
CACHE_DIR = os.path.join(xdg.XDG_CACHE_HOME, "seq2science", seq2science.__version__)


def is_valid_url(url):
cmd = ["wget", "--spider", url]
try:
result = check_output(cmd, stderr=STDOUT).decode().strip()
except CalledProcessError:
return False
if result.endswith("exists.") and not "does not exist" in result:
return True
else:
return False


def parse_config(cfg):
# convert all config keys to lower case (upper case = user typo)
cfg = {k.lower(): v for k, v in cfg.items()}
Expand Down Expand Up @@ -467,13 +480,13 @@ def parse_pysradb():
ena_single_end = [
run
for values in sampledict.values()
if (values["layout"] == "SINGLE") and values.get("ena_fastq_ftp") is not None
if (values["layout"] == "SINGLE") and values.get("ena_fastq_ftp") is not None and is_valid_url(values["ena_fastq_ftp"].replace("era-fasp@fasp", "ftp"))
for run in values["runs"]
]
ena_paired_end = [
run
for values in sampledict.values()
if (values["layout"] == "PAIRED") and values.get("ena_fastq_ftp") is not None
if (values["layout"] == "PAIRED") and values.get("ena_fastq_ftp") is not None and is_valid_url(values["ena_fastq_ftp"].replace("era-fasp@fasp", "ftp"))
for run in values["runs"]
]
gsa_or_encode_single_end = [
Expand Down
143 changes: 53 additions & 90 deletions seq2science/rules/get_fastq.smk
Original file line number Diff line number Diff line change
@@ -1,73 +1,11 @@
"""
all rules/logic related downloading public fastqs should be here.
"""
localrules: run2sra, ena2fastq_SE, ena2fastq_PE, gsa_or_encode2fastq_SE, gsa_or_encode2fastq_PE, runs2sample

localrules: ena2fastq_SE, ena2fastq_PE, gsa_or_encode2fastq_SE, gsa_or_encode2fastq_PE, runs2sample

import os


rule run2sra:
"""
Download the SRA of a sample by its unique identifier.
"""
output:
temp(expand("{sra_dir}/{{run}}/{{run}}/{{run}}.sra", **config)),
log:
expand("{log_dir}/run2sra/{{run}}.log", **config),
benchmark:
expand("{benchmark_dir}/run2sra/{{run}}.benchmark.txt", **config)[0]
message: EXPLAIN["run2sra"]
resources:
parallel_downloads=1,
params:
outdir=lambda wildcards: f"{config['sra_dir']}/{wildcards.run}",
conda:
"../envs/get_fastq.yaml"
wildcard_constraints:
run="[DES]RR\d+",
retries: 2
shell:
"""
# move to output dir since somehow prefetch sometimes puts files in the cwd...
# and remove the top level folder since prefetch will assume we are done otherwise
mkdir -p {params.outdir}; cd {params.outdir}; rm -r {wildcards.run}

# TODO: for loop can be removed if we dont see the debug message
# three attempts
for i in {{1..3}}
do
# acquire a lock
(
flock -w 30 200 || continue
sleep 2
) 200>{PYSRADB_CACHE_LOCK}

# dump
prefetch --max-size u --output-directory ./ --log-level debug --progress {wildcards.run} \
>> {log} 2>&1 && break

# retry
echo "DEBUG: prefetch try ${{i}} of 3 failed" >> {log}
sleep 10
done

# TODO: section can be removed if we dont see the debug message
# bug report: https://github.com/ncbi/sra-tools/issues/533
if [[ -f "{params.outdir}/{wildcards.run}.sra" ]]; then
echo "DEBUG: moving output to correct directory" >> {log}
mkdir -p {params.outdir}/{wildcards.run}
mv {params.outdir}/{wildcards.run}.sra {output}
fi

# TODO: section can be removed if we dont see the debug message
# If an sralite file was downloaded instead of a sra file, just rename it
if [[ -f "{params.outdir}/{wildcards.run}.sralite" ]]; then
echo "DEBUG: renaming SRAlite" >> {log}
mkdir -p {params.outdir}/{wildcards.run}
mv {params.outdir}/{wildcards.run}.sralite {output}
fi
"""
import os


# ENA > SRA
Expand All @@ -83,11 +21,8 @@ rule sra2fastq_SE:
"""
Downloaded (raw) SRAs are converted to single-end fastq files.
"""
input:
rules.run2sra.output,
output:
fastq=temp(expand("{fastq_dir}/runs/{{run}}.{fqsuffix}.gz", **config)),
tmpdir=temp(directory(expand("{fastq_dir}/runs/tmp/{{run}}", **config))),
temp(expand("{fastq_dir}/runs/{{run}}.{fqsuffix}.gz", **config))
log:
expand("{log_dir}/sra2fastq_SE/{{run}}.log", **config),
benchmark:
Expand All @@ -97,27 +32,41 @@ rule sra2fastq_SE:
threads: 8
conda:
"../envs/get_fastq.yaml"
priority: 1
retries: 2
shell:
"""
# move to output dir since somehow parallel-fastq-dump sometimes puts files in the cwd...
mkdir -p {output.tmpdir}; cd {output.tmpdir}
tmpdir=`mktemp -d`
echo $tmpdir
echo "Using tmpdir $tmpdir" >> {log}

# move to output dir since somehow prefetch and parallel-fastq-dump sometimes put files in the cwd...
cd $tmpdir

# acquire the lock
(
flock -w 30 200 || exit 1
sleep 3
) 200>{PYSRADB_CACHE_LOCK}

# three attempts
for i in {{1..3}}
do
# dump
source {workflow.basedir}/../../scripts/download_sra.sh {wildcards.run} {log} && break
sleep 10
done

# dump to tmp dir
parallel-fastq-dump -s {input} -O {output.tmpdir} \
parallel-fastq-dump -s {wildcards.run}/{wildcards.run}.sra -O $tmpdir \
--threads {threads} --split-spot --skip-technical --dumpbase --readids \
--clip --read-filter pass --defline-seq '@$ac.$si.$sg/$ri' \
--defline-qual '+' --gzip >> {log} 2>&1

# rename file and move to output dir
mv {output.tmpdir}/*.fastq.gz {output.fastq}
mv $tmpdir/*fastq.gz {output} >> {log} 2>&1

# Remove temporary directory
rm -rf $tmpdir >> {log} 2>&1
"""


Expand All @@ -126,11 +75,8 @@ rule sra2fastq_PE:
Downloaded (raw) SRAs are converted to paired-end fastq files.
Forward and reverse samples will be switched if forward/reverse names are not lexicographically ordered.
"""
input:
rules.run2sra.output,
output:
fastq=temp(expand("{fastq_dir}/runs/{{run}}_{fqext}.{fqsuffix}.gz", **config)),
tmpdir=temp(directory(expand("{fastq_dir}/runs/tmp/{{run}}", **config))),
temp(expand("{fastq_dir}/runs/{{run}}_{fqext}.{fqsuffix}.gz", **config))
log:
expand("{log_dir}/sra2fastq_PE/{{run}}.log", **config),
benchmark:
Expand All @@ -140,35 +86,52 @@ rule sra2fastq_PE:
run="|".join(SRA_PAIRED_END) if len(SRA_PAIRED_END) else "$a", # only try to dump (paired-end) SRA samples
conda:
"../envs/get_fastq.yaml"
priority: 1
retries: 2
shell:
"""
# move to output dir since somehow parallel-fastq-dump sometimes puts files in the cwd...
mkdir -p {output.tmpdir}; cd {output.tmpdir}
tmpdir=`mktemp -d`
echo $tmpdir
echo "Using tmpdir $tmpdir" >> {log}

# move to output dir since somehow prefetch and parallel-fastq-dump sometimes put files in the cwd...
cd $tmpdir

# acquire the lock
(
flock -w 30 200 || exit 1
sleep 3
) 200>{PYSRADB_CACHE_LOCK}

# three attempts
for i in {{1..3}}
do
# dump
source {workflow.basedir}/../../scripts/download_sra.sh {wildcards.run} {log} && break
sleep 10
done

# dump to tmp dir
parallel-fastq-dump -s {input} -O {output.tmpdir} \
--threads {threads} --split-e --skip-technical --dumpbase \
--readids --clip --read-filter pass --defline-seq '@$ac.$si.$sg/$ri' \
parallel-fastq-dump -s {wildcards.run}/{wildcards.run}.sra -O $tmpdir \
--threads {threads} --split-e --skip-technical --dumpbase --readids \
--clip --read-filter pass --defline-seq '@$ac.$si.$sg/$ri' \
--defline-qual '+' --gzip >> {log} 2>&1

# check if the files exist
if ! compgen -G "{output.tmpdir}/*_1*" > /dev/null ; then printf "ERROR: Couldn't find read 1.fastq after dumping! Perhaps this is not a paired-end file?\n" >> {log} 2>&1; fi
if ! compgen -G "{output.tmpdir}/*_2*" > /dev/null ; then printf "ERROR: Couldn't find read 2.fastq after dumping! Perhaps this is not a paired-end file?\n" >> {log} 2>&1; fi
if ! compgen -G $tmpdir/*_1* > /dev/null ; then printf "ERROR: Couldn't find read 1.fastq after dumping! Perhaps this is not a paired-end file?\n" >> {log} 2>&1; fi
if ! compgen -G $tmpdir/*_2* > /dev/null ; then printf "ERROR: Couldn't find read 2.fastq after dumping! Perhaps this is not a paired-end file?\n" >> {log} 2>&1; fi

# rename file and move to output dir
if ls $tmpdir/*_1* 1> /dev/null 2>&1; then
mv $tmpdir/*_1* {output[0]} >> {log} 2>&1
mv $tmpdir/*_2* {output[1]} >> {log} 2>&1
else
echo "No paired end files found!" >> {log}
fi

# rename file and move to output dir
mv {output.tmpdir}/*_1* {output.fastq[0]} >> {log} 2>&1
mv {output.tmpdir}/*_2* {output.fastq[1]} >> {log} 2>&1
# Remove temporary directory
rm -rf $tmpdir >> {log} 2>&1
"""


rule ena2fastq_SE:
"""
Download single-end fastq files directly from the ENA.
Expand Down
35 changes: 35 additions & 0 deletions seq2science/scripts/download_sra.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash/
set -o nounset -e
# Downloads an SRA file to the current folder.

RUN=$1
LOG=$2

# Remove the top level folder since prefetch will assume we are done otherwise
if [ -e $RUN ]; then
rm -r $RUN
fi

# three attempts
echo "Downloading $RUN using prefetch" >> $LOG
prefetch --max-size 999999999999 --output-directory ./ --log-level debug --progress $RUN >> $LOG 2>&1

# TODO: this is the strangest bug, in that on some machines (ocimum) prefetch downloads
# to a different location. Not sure what causes this, but this should fix that. Could simply
# be a global setting that we haven't discovered yet...
# bug report: https://github.com/ncbi/sra-tools/issues/533
if [[ -f "$RUN.sra" ]]; then
mkdir -p $RUN
mv $RUN.sra $RUN/
fi

# If an sralite file was downloaded instead of a sra file, just rename it
if [[ -f "$RUN.sralite" ]]; then
mkdir -p $RUN
mv $RUN.sralite $RUN/$RUN.sra
fi

if [[ ! -f "$RUN/$RUN.sra" ]]; then
echo "prefetch failed, file $RUN/$RUN.sra not found!"
exit 1
fi