Skip to content

Commit

Permalink
fix(sra): improve layout extraction from NCBI db (#168)
Browse files: browse the repository at this point in the history
  • Loading branch information
mkatsanto committed Feb 5, 2024
1 parent c55a7a7 commit 4b5a1cc
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 39 deletions.
Binary file added pigz_latest.sif
Binary file not shown.
64 changes: 25 additions & 39 deletions workflow/rules/sra_download.smk
Original file line number | Diff line number | Diff line change
Expand Up @@ -42,12 +42,15 @@ checkpoint get_layout:
),
shell:
"""
(mkdir -p {output.outdir} && \
(mkdir -p {output.outdir}; \
layout=$(efetch -db sra \
-id {wildcards.sample} \
-format runinfo | \
awk -F, 'BEGIN {{ exitval = 1 }} NR>1 && $1 == "{wildcards.sample}" {{ print $16; exitval=0 }} END {{ exit exitval }}') && \
touch {output.outdir}/$layout.info; \
-format runinfo \
-mode xml | \
xtract -pattern Row \
-if Run -equals {wildcards.sample} \
-element LibraryLayout); \
touch {output.outdir}/$layout.info ; \
) 1> {log.stdout} 2> {log.stderr}
"""

Expand Down Expand Up @@ -80,44 +83,27 @@ rule prefetch:


def get_layouts(wildcards):
"""Get the layout of each sample."""

# populate layout dictionary
layouts = {}
for sample in samples[
samples.index.str.contains("^.RR", regex=True, case=True)
].index.tolist():
layouts[sample] = []
layouts = []
for each_sample in samples.index.tolist():
checkpoint_output = checkpoints.get_layout.get(
sample=sample, **wildcards
sample=each_sample, **wildcards
).output.outdir
_files = [
os.path.join(checkpoint_output, _file)
for _file in os.listdir(checkpoint_output)
if _file.endswith(".info")
]
assert len(_files) == 1
layouts[sample] = os.path.splitext(os.path.basename(_files[0]))[0]

# convert layouts to short form
layouts_short = {}
for key, val in layouts.items():
if val == "PAIRED":
layouts_short[key] = "pe"
elif val == "SINGLE":
layouts_short[key] = "se"
layout = glob_wildcards(os.path.join(checkpoint_output, "{layout}.info")).layout
if "PAIRED" in layout:
layouts.append(
os.path.join(
config["outdir"], "compress", each_sample, each_sample + ".pe.tsv"
)
)
elif "SINGLE" in layout:
layouts.append(
os.path.join(
config["outdir"], "compress", each_sample, each_sample + ".se.tsv"
)
)
else:
raise ValueError(f"Layout {val} for sample {key} not recognized.")

# return layouts
layouts = expand(
os.path.join(
config["outdir"], "compress", "{sample}", "{sample}.{seqmode}.tsv"
),
zip,
sample=layouts_short.keys(),
seqmode=layouts_short.values(),
)
pass
print(layouts)
return layouts


Expand Down

0 comments on commit 4b5a1cc

Please sign in to comment.