-
Notifications
You must be signed in to change notification settings - Fork 1
/
Snakefile
executable file
·130 lines (116 loc) · 4.83 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
SAMPLES = ['SRR2090174']
rule all:
input:
expand("outputs/megahit/{sample}.contigs.fa", sample = SAMPLES),
expand("outputs/bcalm/{sample}/GCA_001430905.1_ASM143090v1_genomic.fna.gz.cdbg_ids.reads.fa.unitigs.gfa", sample = SAMPLES)
rule download_reads:
output:
r1='inputs/raw/{sample}_1.fastq.gz',
r2='inputs/raw/{sample}_2.fastq.gz'
shell:'''
wget -O {output.r1} ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR209/004/SRR2090174/SRR2090174_1.fastq.gz
wget -O {output.r2} ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR209/004/SRR2090174/SRR2090174_2.fastq.gz
'''
rule download_cmag:
output: "inputs/cmag/GCA_001430905.1_ASM143090v1_genomic.fna.gz"
shell:'''
wget -O {output} ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/001/430/905/GCA_001430905.1_ASM143090v1/GCA_001430905.1_ASM143090v1_genomic.fna.gz
'''
rule adapter_trim:
input:
r1 = "inputs/raw/{sample}_1.fastq.gz",
r2 = 'inputs/raw/{sample}_2.fastq.gz',
adapters = 'inputs/adapters.fa'
output:
r1 = 'outputs/trim/{sample}_R1.trim.fq.gz',
r2 = 'outputs/trim/{sample}_R2.trim.fq.gz',
o1 = 'outputs/trim/{sample}_o1.trim.fq.gz',
o2 = 'outputs/trim/{sample}_o2.trim.fq.gz'
conda: 'env.yml'
shell:'''
trimmomatic PE {input.r1} {input.r2} \
{output.r1} {output.o1} {output.r2} {output.o2} \
ILLUMINACLIP:{input.adapters}:2:0:15 MINLEN:25 \
LEADING:2 TRAILING:2 SLIDINGWINDOW:4:2
'''
rule kmer_trim_reads:
input:
"outputs/trim/{sample}_R1.trim.fq.gz",
"outputs/trim/{sample}_R2.trim.fq.gz"
output: "outputs/abundtrim/{sample}.abundtrim.fq.gz"
conda: 'env.yml'
shell:'''
interleave-reads.py {input} |
trim-low-abund.py -C 3 -Z 18 -M 30e9 -V - -o {output}
'''
rule sgc_bin_queries:
input:
conf = "inputs/conf/{sample}_conf.yml",
reads = "outputs/abundtrim/{sample}.abundtrim.fq.gz",
cmag = ["inputs/cmag/GCA_001430905.1_ASM143090v1_genomic.fna.gz"]
output: "outputs/{sample}_k31_r1_search_oh0/GCA_001430905.1_ASM143090v1_genomic.fna.gz.cdbg_ids.reads.fa.gz"
shell:'''
python -m spacegraphcats {input.conf} extract_contigs extract_reads --nolock
'''
rule index_cmag:
input: "inputs/cmag/GCA_001430905.1_ASM143090v1_genomic.fna.gz"
output: "inputs/cmag/GCA_001430905.1_ASM143090v1_genomic.fna.gz.bwt"
conda: 'env.yml'
shell:'''
bwa index {input}
'''
rule map_nbhd_reads:
input:
nbhd_reads="outputs/{sample}_k31_r1_search_oh0/GCA_001430905.1_ASM143090v1_genomic.fna.gz.cdbg_ids.reads.fa.gz",
cmag="inputs/cmag/GCA_001430905.1_ASM143090v1_genomic.fna.gz",
index="inputs/cmag/GCA_001430905.1_ASM143090v1_genomic.fna.gz.bwt"
output: "outputs/map_nbhd_reads/{sample}_nbhd.sam"
conda: 'env.yml'
shell:'''
bwa mem -t 4 {input.cmag} {input.nbhd_reads} > {output}
'''
rule sam_unmapped_reads:
input: "outputs/map_nbhd_reads/{sample}_nbhd.sam"
output: "outputs/map_nbhd_reads/{sample}_unmapped.sam"
conda: 'env.yml'
shell:'''
samtools view -f 4 {input} > {output}
'''
rule unmapped_reads_to_fastq:
input: "outputs/map_nbhd_reads/{sample}_unmapped.sam"
output: "outputs/map_nbhd_reads/{sample}_unmapped.fa"
conda: 'env.yml'
shell:'''
samtools fasta {input} > {output}
'''
rule megahit_unmapped_reads:
input: "outputs/map_nbhd_reads/{sample}_unmapped.fa"
output: "outputs/megahit/{sample}.contigs.fa"
conda: 'env.yml'
shell:'''
megahit -r {input} --min-contig-len 142 \
--out-dir {wildcards.sample}_megahit \
--out-prefix {wildcards.sample}
mv {wildcards.sample}_megahit/{wildcards.sample}.contigs.fa {output}
rm -rf {wildcards.sample}_megahit
'''
rule bcalm_nbhd:
input: "outputs/{sample}_k31_r1_search_oh0/GCA_001430905.1_ASM143090v1_genomic.fna.gz.cdbg_ids.reads.fa.gz"
output: "outputs/bcalm/{sample}/GCA_001430905.1_ASM143090v1_genomic.fna.gz.cdbg_ids.reads.fa.unitigs.fa"
params: "outputs/bcalm/{sample}/GCA_001430905.1_ASM143090v1_genomic.fna.gz.cdbg_ids.reads.fa"
shell:'''
bcalm -in {input} -out-dir outputs/bcalm/{wildcards.sample} -kmer-size 31 -abundance-min 1 -out {params}
'''
rule download_convert_to_GFA:
output: "scripts/convertToGFA.py"
shell:'''
wget -O {output} https://raw.githubusercontent.com/spacegraphcats/2018-paper-spacegraphcats/master/pipeline-analyses/variant_snakemake/convertToGFA.py
'''
rule convert_to_gfa:
input:
unitigs = "outputs/bcalm/{sample}/GCA_001430905.1_ASM143090v1_genomic.fna.gz.cdbg_ids.reads.fa.unitigs.fa",
script = "scripts/convertToGFA.py"
output: "outputs/bcalm/{sample}/GCA_001430905.1_ASM143090v1_genomic.fna.gz.cdbg_ids.reads.fa.unitigs.gfa"
shell:'''
python ./scripts/convertToGFA.py {input.unitigs} {output} 31
'''