From 7dcdff88d73d99de2f71d13447b1982355e6e75f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Richard=20A=2E=20Sch=C3=A4fer?= Date: Mon, 24 Jun 2024 23:55:58 -0500 Subject: [PATCH] Error catching routines (#30) * separated samtools, bcftools and realign envs to avoid conflicts * changed scanexitron repo (temporarily) until outfile fixed * update spladder * enhanced UI --- .gitmodules | 2 +- CHANGELOG.md | 10 ++ workflow/envs/bcftools.yml | 6 + workflow/envs/realign.yml | 7 + workflow/envs/samtools.yml | 6 +- workflow/rules/align.smk | 4 +- workflow/rules/all.smk | 0 workflow/rules/altsplicing.smk | 8 +- workflow/rules/common.smk | 146 ++++++++++------- workflow/rules/custom.smk | 2 +- workflow/rules/exitron.smk | 6 +- workflow/rules/germline.smk | 16 +- workflow/rules/hlatyping.smk | 152 +++++++++--------- workflow/rules/indel.smk | 24 +-- workflow/rules/postproc.smk | 71 -------- .../scripts/genotyping/combine_all_alleles.py | 5 + .../combine_optitype_results.py | 5 +- .../genotyping/merge_predicted_mhcI.py | 32 ---- .../scripts/genotyping/optitype_wrapper.py | 42 +++++ workflow/scripts/run_spladder.sh | 1 - 20 files changed, 276 insertions(+), 269 deletions(-) create mode 100644 workflow/envs/bcftools.yml create mode 100644 workflow/envs/realign.yml delete mode 100644 workflow/rules/all.smk delete mode 100644 workflow/rules/postproc.smk rename workflow/scripts/{ => genotyping}/combine_optitype_results.py (88%) delete mode 100644 workflow/scripts/genotyping/merge_predicted_mhcI.py create mode 100644 workflow/scripts/genotyping/optitype_wrapper.py diff --git a/.gitmodules b/.gitmodules index 2f80867..e490640 100644 --- a/.gitmodules +++ b/.gitmodules @@ -3,4 +3,4 @@ url = https://github.com/cauyrd/transIndel.git [submodule "workflow/scripts/scanexitron"] path = workflow/scripts/scanexitron - url = https://github.com/ylab-hi/ScanExitron.git + url = https://github.com/riasc/ScanExitron.git diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ff3411..f41306b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Prioritization of neoantigens is now done separately for each variant type (speeds up the process) - NMD information (e.g., escape rule,...) 
is now also calculated for all variants +## [0.2.7] - 2024-06-23 + +### Fix + +- Separated samtools, bcftools and realign environments to avoid conflicts +- Changed order of genotyping rules to catch errors when no alleles can be found + - Alleles are merged according to nartype (e.g., DNA, RNA) and then combined +- Force concat of VCF files in genotyping to avoid errors when no variants are found +- Added optitype wrapper to avoid errors when empty BAM files are provided / no HLA reads + ## [0.2.6] - 2024-06-20 ### Fix diff --git a/workflow/envs/bcftools.yml b/workflow/envs/bcftools.yml new file mode 100644 index 0000000..3129665 --- /dev/null +++ b/workflow/envs/bcftools.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - bcftools=1.20 diff --git a/workflow/envs/realign.yml b/workflow/envs/realign.yml new file mode 100644 index 0000000..d53b369 --- /dev/null +++ b/workflow/envs/realign.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - bwa=0.7.18 + - samtools=1.20 diff --git a/workflow/envs/samtools.yml b/workflow/envs/samtools.yml index 556860b..906f443 100644 --- a/workflow/envs/samtools.yml +++ b/workflow/envs/samtools.yml @@ -1,6 +1,6 @@ channels: + - conda-forge - bioconda + - nodefaults dependencies: - - samtools=1.9 - - bcftools=1.9 - + - samtools=1.14 diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index 30d1dc9..bf29421 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -201,7 +201,7 @@ rule realign: output: bam="results/{sample}/{seqtype}/align/{group}_final_BWA.bam", conda: - "../envs/basic.yml" + "../envs/realign.yml" log: "logs/{sample}/realign/{seqtype}_{group}.log" threads: config['threads'] @@ -225,7 +225,7 @@ if config['data']['dnaseq_filetype'] in ['.fq','.fastq']: log: "logs/{sample}/bwa_align/dnaseq_{group}.log" conda: - "../envs/basic.yml" + "../envs/realign.yml" params: extra="" threads: config['threads'] diff --git a/workflow/rules/all.smk b/workflow/rules/all.smk deleted file mode 100644 index e69de29..0000000 diff --git a/workflow/rules/altsplicing.smk b/workflow/rules/altsplicing.smk index 6761fe3..eb13eeb 100644 --- a/workflow/rules/altsplicing.smk +++ b/workflow/rules/altsplicing.smk @@ -23,7 +23,7 @@ rule spladder: {params.confidence} \ {params.iteration} \ {params.edgelimit} \ - {log} > 2>&1 + {log} """ rule splicing_to_vcf: @@ -54,7 +54,7 @@ rule sort_altsplicing: log: "logs/{sample}/spladder/{group}_sort.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1 @@ -70,8 +70,8 @@ rule combine_altsplicing: log: "logs/{sample}/exitrons/combine_exitrons.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ - bcftools concat --naive -O z {input} -o - | bcftools sort -O z -o {output} > {log} 2>&1 + bcftools concat --naive-force -O z {input} -o - | bcftools sort -O z -o {output} > {log} 2>&1 """ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index b6d4402..869e024 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -196,84 +196,118 @@ def get_input_hlatyping_PE(wildcards): ) +def get_filtered_reads_hlatyping_SE(wildcards): + bam = [] + idx = [] + + if wildcards.nartype == "DNA": + if len(config["data"]["dnaseq"]) != 0: + for key in config["data"]["dnaseq"].keys(): + bam += expand("results/{sample}/hla/mhc-I/reads/{group}_DNA_flt_SE.bam", + sample=wildcards.sample, + group=key) + idx += 
expand("results/{sample}/hla/mhc-I/reads/{group}_DNA_flt_SE.bam.bai", + sample=wildcards.sample, + group=key) + + + if wildcards.nartype == "RNA": + if len(config["data"]["rnaseq"]) != 0: + for key in config["data"]["rnaseq"].keys(): + bam += expand("results/{sample}/hla/mhc-I/reads/{group}_RNA_flt_SE.bam", + sample=wildcards.sample, + group=key) + idx += expand("results/{sample}/hla/mhc-I/reads/{group}_RNA_flt_SE.bam.bai", + sample=wildcards.sample, + group=key) + + return dict( + zip( + ["bam", "idx"], + [bam, idx] + ) + ) + + +def get_filtered_reads_hlatyping_PE(wildcards): + bam = [] + idx = [] + + if wildcards.nartype == "DNA": + if len(config["data"]["dnaseq"]) != 0: + for key in config["data"]["dnaseq"].keys(): + bam += expand("results/{sample}/hla/mhc-I/reads/{group}_DNA_flt_PE_{readpair}.bam", + sample=wildcards.sample, + group=key, + readpair=wildcards.readpair) + idx += expand("results/{sample}/hla/mhc-I/reads/{group}_DNA_flt_PE_{readpair}.bam.bai", + sample=wildcards.sample, + group=key, + readpair=wildcards.readpair) + + if wildcards.nartype == "RNA": + if len(config["data"]["rnaseq"]) != 0: + for key in config["data"]["rnaseq"].keys(): + bam += expand("results/{sample}/hla/mhc-I/reads/{group}_RNA_flt_PE_{readpair}.bam", + sample=wildcards.sample, + group=key, + readpair=wildcards.readpair) + + idx += expand("results/{sample}/hla/mhc-I/reads/{group}_RNA_flt_PE_{readpair}.bam.bai", + sample=wildcards.sample, + group=key, + readpair=wildcards.readpair) + + return dict( + zip( + ["bam", "idx"], + [bam, idx] + ) + ) + + def aggregate_mhcI_SE(wildcards): checkpoint_output = checkpoints.split_reads_mhcI_SE.get(**wildcards).output[0] - return expand("results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_SE/{no}_result.tsv", + return expand("results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_SE/{no}_result.tsv", sample=wildcards.sample, - group=wildcards.group, nartype=wildcards.nartype, no=glob_wildcards(os.path.join(checkpoint_output, "R_{no}.bam")).no) def aggregate_mhcI_PE(wildcards): checkpoint_output = checkpoints.split_reads_mhcI_PE.get(**wildcards).output[0] - return expand("results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_PE/{no}_result.tsv", + return expand("results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_PE/{no}_result.tsv", sample=wildcards.sample, - group=wildcards.group, nartype=wildcards.nartype, no=glob_wildcards(os.path.join(checkpoint_output, "R1_{no}.bam")).no) -def get_predicted_mhcI_alleles(wildcards): - values = [] - - # routines to genotype from DNA - if "DNA" in config['hlatyping']['MHC-I_mode']: - if config['data']['dnaseq'] is not None: - for key in config['data']['dnaseq'].keys(): - - # exclude normal samples (if specified) - if config['data']['normal'] is not None: - normal = config['data']['normal'].split(' ') - if key in normal: - continue - - values += expand("results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_{readtype}.tsv", - sample = wildcards.sample, - group = key, - nartype = "DNA", - readtype = config['data']['dnaseq_readtype']) # add either SE or PE - else: # if no dnaseq data is specified, but mode is DNA or BOTH, then ignore - print('dnaseq data has not been specified in the config file, but specified mode for hla genotyping in config file is DNA or BOTH -- will be ignored') - - # routines to genotype from RNA - if "RNA" in config['hlatyping']['MHC-I_mode']: - if config['data']['rnaseq'] is not None: - for key in config['data']['rnaseq'].keys(): - - # exclude normal samples (if specified) - if 
config['data']['normal'] is not None: - normal = config['data']['normal'].split(' ') - if key in normal: - continue - - values += expand("results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_{readtype}.tsv", - sample = wildcards.sample, - group = key, - nartype = "RNA", - readtype = config['data']['rnaseq_readtype']) # add either SE or PE) - else: # if no rnaseq data is specified, but mode is RNA or BOTH, then ignore - print('rnaseq data has not been specified in the config file, but specified mode for hla genotyping in config file is RNA or BOTH -- will be ignored') - - if len(values) == 0: - print('No data found. Check config file for correct specification of data and hla genotyping mode') - sys.exit(1) - - return values - def get_all_mhcI_alleles(wildcards): values = [] - if ("DNA" in config['hlatyping']['MHC-I_mode'] or - "RNA" in config['hlatyping']['MHC-I_mode']): - values += expand("results/{sample}/hla/mhc-I/genotyping/mhc-I.tsv", - sample = wildcards.sample) + if "DNA" in config["hlatyping"]["MHC-I_mode"]: + if len(config["data"]["dnaseq"]) != 0: + if config["data"]["dnaseq_readtype"] == "SE": + values += expand("results/{sample}/hla/mhc-I/genotyping/DNA_flt_merged_SE.tsv", + sample=wildcards.sample) + elif config["data"]["dnaseq_readtype"] == "PE": + values += expand("results/{sample}/hla/mhc-I/genotyping/DNA_flt_merged_PE.tsv", + sample=wildcards.sample) + + if "RNA" in config["hlatyping"]["MHC-I_mode"]: + if len(config["data"]["rnaseq"]) != 0: + if config["data"]["rnaseq_readtype"] == "SE": + values += expand("results/{sample}/hla/mhc-I/genotyping/RNA_flt_merged_SE.tsv", + sample=wildcards.sample) + elif config["data"]["rnaseq_readtype"] == "PE": + values += expand("results/{sample}/hla/mhc-I/genotyping/RNA_flt_merged_PE.tsv", + sample=wildcards.sample) if "custom" in config["hlatyping"]["MHC-I_mode"]: values += [config["data"]["custom"]["hlatyping"]["MHC-I"]] if len(values) == 0: - print('No hla data found. Check config file for correct specification of data and hla genotyping mode') + print("No HLA data found. 
Please check the config file for correct specification of data and HLA genotyping mode") sys.exit(1) return values diff --git a/workflow/rules/custom.smk b/workflow/rules/custom.smk index 8c4a245..2253e2d 100644 --- a/workflow/rules/custom.smk +++ b/workflow/rules/custom.smk @@ -28,7 +28,7 @@ rule sort_custom_variants: log: "logs/{sample}/custom/sort_custom_variants.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1 diff --git a/workflow/rules/exitron.smk b/workflow/rules/exitron.smk index 511d7b1..c59340e 100644 --- a/workflow/rules/exitron.smk +++ b/workflow/rules/exitron.smk @@ -116,7 +116,7 @@ rule sort_exitron: log: "logs/exitron_sort_{sample}_{group}.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1 @@ -132,9 +132,9 @@ rule combine_exitrons: log: "logs/{sample}/exitrons/combine_exitrons.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ - bcftools concat --naive -O z {input} -o - | bcftools sort -O z -o {output} > {log} 2>&1 + bcftools concat --naive-force -O z {input} -o - | bcftools sort -O z -o {output} > {log} 2>&1 """ diff --git a/workflow/rules/germline.smk b/workflow/rules/germline.smk index 0cd912f..6759f44 100644 --- a/workflow/rules/germline.smk +++ b/workflow/rules/germline.smk @@ -97,7 +97,7 @@ rule sort_variants_htc_first_round: log: "logs/{sample}/gatk/haplotypecaller/{seqtype}_{group}_1rd_{chr}_sort.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1 @@ -113,7 +113,7 @@ rule index_variants_htc_first_round: log: "logs/{sample}/gatk/haplotypecaller/{seqtype}_{group}_1rd_{chr}_index.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools index -t {input} > {log} 2>&1 @@ -130,7 +130,7 @@ rule merge_variants_htc_first_round: log: "logs/{sample}/gatk/haplotypecaller/{seqtype}_{group}_1rd_merge.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools concat -O z -a {input.vcf} -o {output} > {log} 2>&1 @@ -146,7 +146,7 @@ rule index_merged_variants_htc_first_round: log: "logs/{sample}/gatk/haplotypecaller/{seqtype}_{group}_1rd_index.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools index -t {input} > {log} 2>&1 @@ -346,7 +346,7 @@ rule sort_variants_htc_final_round: log: "logs/{sample}/indel/gatk/haplotypecaller/sort_final_{seqtype}_{group}_{chr}.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1 @@ -362,7 +362,7 @@ rule index_variants_htc_final_round: log: "logs/{sample}/indel/gatk/haplotypecaller/index_final_{seqtype}_{group}_{chr}.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools index -t {input} > {log} 2>&1 @@ -379,7 +379,7 @@ rule merge_variants_htc_final_round: log: "logs/{sample}/indel/gatk/haplotypecaller/merge_final_{seqtype}_{group}.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools concat -O z -a {input.vcf} -o {output} > {log} 2>&1 @@ -395,7 +395,7 @@ rule index_merged_variants_htc_final_round: log: "logs/{sample}/indel/gatk/haplotypecaller/index_final_{seqtype}_{group}.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools index -t {input} > {log} 2>&1 diff --git a/workflow/rules/hlatyping.smk 
b/workflow/rules/hlatyping.smk index 1b123f5..f3b3270 100644 --- a/workflow/rules/hlatyping.smk +++ b/workflow/rules/hlatyping.smk @@ -94,13 +94,8 @@ rule filter_reads_mhcI_SE: "../envs/yara.yml" shell: """ - if [ "{wildcards.nartype}" == "DNA" ]; then - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara_index/{wildcards.nartype} \ + yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara_index/{wildcards.nartype} \ {input.reads} | samtools view -h -F 4 -b1 - | samtools sort - -o {output} > {log} - elif [ "{wildcards.nartype}" == "RNA" ]; then - yara_mapper -t {threads} -e 3 -f bam -u resources/hla/yara_index/{wildcards.nartype} \ - {input.reads} | samtools view -h -F 4 -b1 - | samtools sort - -o {output} > {log} - fi """ rule index_reads_mhcI_SE: @@ -116,22 +111,42 @@ rule index_reads_mhcI_SE: wrapper: "v2.3.0/bio/samtools/index" +rule merge_reads_mhcI_SE: + input: + unpack(get_filtered_reads_hlatyping_SE), + output: + bam="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_SE.bam", + idx="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_SE.bam.bai" + message: + "Merge the filtered {wildcards.nartype}seq reads for hlatyping of sample: {wildcards.sample}" + log: + "logs/{sample}/hla/merge_reads_mhcI_{nartype}_SE.log" + conda: + "../envs/samtools.yml" + threads: 4 + shell: + """ + samtools merge -@ {threads} {output.bam} {input.bam} > {log} 2>&1 + samtools index {output.bam} >> {log} 2>&1 + """ + + checkpoint split_reads_mhcI_SE: input: - reads="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE.bam", - index="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE.bam.bai" + reads="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_SE.bam", + index="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_SE.bam.bai" output: - directory("results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE/") + directory("results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_SE/") message: "Splitting filtered BAM files for HLA typing" log: - "logs/{sample}/hla/split_bam_{group}_{nartype}.log" + "logs/{sample}/hla/split_bam_{nartype}.log" conda: "../envs/gatk.yml" threads: 1 shell: """ - mkdir -p results/{wildcards.sample}/hla/mhc-I/reads/{wildcards.group}_{wildcards.nartype}_flt_SE/ + mkdir -p results/{wildcards.sample}/hla/mhc-I/reads/{wildcards.nartype}_flt_merged_SE/ gatk SplitSamByNumberOfReads \ -I {input.reads} \ --OUTPUT {output} \ @@ -141,44 +156,37 @@ checkpoint split_reads_mhcI_SE: rule hlatyping_mhcI_SE: input: - reads="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_SE/R_{no}.bam", + reads="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_SE/R_{no}.bam", output: - pdf="results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_SE/{no}_coverage_plot.pdf", - tsv="results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_SE/{no}_result.tsv" + pdf="results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_SE/{no}_coverage_plot.pdf", + tsv="results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_SE/{no}_result.tsv" message: "HLA typing from splitted BAM files" log: - "logs/{sample}/optitype/{group}_{nartype}_{no}_call.log" + "logs/{sample}/optitype/merged_{nartype}_{no}_call.log" conda: "../envs/optitype.yml" threads: 64 shell: """ - samtools index {input.reads} - if [ "{wildcards.nartype}" == "DNA" ]; then - OptiTypePipeline.py --input {input.reads} \ - --outdir results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.nartype}_flt_SE/ \ - --prefix {wildcards.no} --dna -v > {log} - elif [ "{wildcards.nartype}" == "RNA" ]; then - 
OptiTypePipeline.py --input {input.reads} \
-                --outdir results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.nartype}_flt_SE/ \
-                --prefix {wildcards.no} --rna -v > {log}
-        fi
+        python3 workflow/scripts/genotyping/optitype_wrapper.py \
+            '{input.reads}' {wildcards.nartype} {wildcards.no} \
+            results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.nartype}_flt_merged_SE/ > {log}
        """

rule combine_mhcI_SE:
    input:
        aggregate_mhcI_SE
    output:
-        "results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_SE.tsv"
+        "results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_SE.tsv"
    log:
-        "logs/{sample}/optitype/{group}_{nartype}_call.log"
+        "logs/{sample}/genotyping/combine_optitype_mhcI_{nartype}_call.log"
    conda:
        "../envs/basic.yml"
    threads: 1
    shell:
        """
-        python3 workflow/scripts/combine_optitype_results.py \
+        python3 workflow/scripts/genotyping/combine_optitype_results.py \
            '{input}' {output}
        """

@@ -219,24 +227,43 @@ rule index_reads_mhcI_PE:
    wrapper:
        "v2.3.0/bio/samtools/index"

+rule merge_reads_mhcI_PE:
+    input:
+        unpack(get_filtered_reads_hlatyping_PE),
+    output:
+        bam="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE_{readpair}.bam",
+        idx="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE_{readpair}.bam.bai"
+    message:
+        "Merge the filtered {wildcards.nartype}seq reads for hlatyping of sample: {wildcards.sample} with readpair: {wildcards.readpair}"
+    log:
+        "logs/{sample}/hla/merge_reads_mhcI_{nartype}_PE_{readpair}.log",
+    conda:
+        "../envs/samtools.yml"
+    threads: 4
+    shell:
+        """
+        samtools merge -@ {threads} {output.bam} {input.bam} > {log} 2>&1
+        samtools index {output.bam} >> {log} 2>&1
+        """
+
checkpoint split_reads_mhcI_PE:
    input:
-        fwd="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_R1.bam",
-        fwd_idx="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_R1.bam.bai",
-        rev="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_R2.bam",
-        rev_idx="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE_R2.bam.bai"
+        fwd="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE_R1.bam",
+        fwd_idx="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE_R1.bam.bai",
+        rev="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE_R2.bam",
+        rev_idx="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE_R2.bam.bai"
    output:
-        directory("results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE/")
+        directory("results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE/")
    message:
        "Splitting filtered BAM files for HLA typing"
    log:
-        "logs/{sample}/hla/split_bam_{group}_{nartype}.log"
+        "logs/{sample}/hla/split_bam_{nartype}.log"
    conda:
        "../envs/gatk.yml"
    threads: 1
    shell:
        """
-        mkdir -p results/{wildcards.sample}/hla/mhc-I/reads/{wildcards.group}_{wildcards.nartype}_flt_PE/
+        mkdir -p results/{wildcards.sample}/hla/mhc-I/reads/{wildcards.nartype}_flt_merged_PE/
        gatk SplitSamByNumberOfReads \
            -I {input.fwd} \
            --OUTPUT {output} \
@@ -252,64 +279,41 @@ rule hlatyping_mhcI_PE:
    input:
-        fwd="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE/R1_{no}.bam",
-        rev="results/{sample}/hla/mhc-I/reads/{group}_{nartype}_flt_PE/R2_{no}.bam",
+        fwd="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE/R1_{no}.bam",
+        rev="results/{sample}/hla/mhc-I/reads/{nartype}_flt_merged_PE/R2_{no}.bam",
    output:
-        pdf="results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_PE/{no}_coverage_plot.pdf",
-        tsv="results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_PE/{no}_result.tsv"
+
pdf="results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_PE/{no}_coverage_plot.pdf", + tsv="results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_PE/{no}_result.tsv" message: "HLA typing from splitted BAM files" log: - "logs/{sample}/optitype/{group}_{nartype}_{no}_call.log" + "logs/{sample}/optitype/merged_{nartype}_{no}_call.log" conda: "../envs/optitype.yml" threads: 64 shell: """ - samtools index {input.fwd} - samtools index {input.rev} - if [ "{wildcards.nartype}" == "DNA" ]; then - OptiTypePipeline.py --input {input.fwd} {input.rev} \ - --outdir results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.nartype}_flt_PE/ \ - --prefix {wildcards.no} --dna -v > {log} - elif [ "{wildcards.nartype}" == "RNA" ]; then - OptiTypePipeline.py --input {input.fwd} {input.rev} \ - --outdir results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.group}_{wildcards.nartype}_flt_PE/ \ - --prefix {wildcards.no} --rna -v > {log} - fi - """ + python3 workflow/scripts/genotyping/optitype_wrapper.py \ + '{input.fwd} {input.rev}' {wildcards.nartype} {wildcards.no} \ + results/{wildcards.sample}/hla/mhc-I/genotyping/{wildcards.nartype}_flt_merged_PE/ > {log} -rule combine_mhcI_PE: - input: - aggregate_mhcI_PE - output: - "results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_PE.tsv" - log: - "logs/{sample}/optitype/{group}_{nartype}_call.log" - conda: - "../envs/basic.yml" - threads: 1 - shell: - """ - python3 workflow/scripts/combine_optitype_results.py \ - '{input}' {output} """ -rule merge_predicted_mhcI_alleles: +rule combine_hlatyping_mhcI_PE: input: - get_predicted_mhcI_alleles + aggregate_mhcI_PE, output: - "results/{sample}/hla/mhc-I/genotyping/mhc-I.tsv", + "results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_PE.tsv", message: - "Merging HLA alleles from different sources" + "Combining HLA alleles from predicted optitype results" log: - "logs/{sample}/optitype/merge_predicted_mhc-I.log" + "logs/{sample}/genotyping/combine_optitype_mhcI_{nartype}_PE.log" conda: "../envs/basic.yml" threads: 1 shell: """ - python workflow/scripts/genotyping/merge_predicted_mhcI.py \ + python3 workflow/scripts/genotyping/combine_optitype_results.py \ '{input}' {output} """ @@ -319,7 +323,7 @@ rule combine_all_mhcI_alleles: output: "results/{sample}/hla/mhc-I.tsv" message: - "Combining HLA alleles from different sources" + "Combining HLA alleles from different sources (e.g., predicted and user-defined alleles)" log: "logs/{sample}/genotyping/combine_all_mhc-I.log" conda: @@ -328,7 +332,7 @@ rule combine_all_mhcI_alleles: shell: """ python workflow/scripts/genotyping/combine_all_alleles.py \ - '{input}' mhc-I {output} > {log} 2>&1\ + '{input}' mhc-I {output} """ diff --git a/workflow/rules/indel.smk b/workflow/rules/indel.smk index 7f20fb4..d252702 100644 --- a/workflow/rules/indel.smk +++ b/workflow/rules/indel.smk @@ -110,7 +110,7 @@ rule longindel_sort_and_compress: log: "logs/{sample}/transindel/{seqtype}_{group}_longindel_sort.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1 @@ -126,10 +126,10 @@ rule combine_longindels: log: "logs/{sample}/transindel/combine_longindels.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ - bcftools concat --naive -O z {input} -o - | bcftools sort -O z -o {output} > {log} 2>&1 + bcftools concat --naive-force -O z {input} -o - | bcftools sort -O z -o {output} > {log} 2>&1 """ ####### MUTECT2 ###### @@ -222,7 +222,7 @@ rule 
sort_short_indels_m2: log: "logs/{sample}/gatk/mutect2/sort_{seqtype}_{group}_{chr}.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1 @@ -238,7 +238,7 @@ rule index_short_indels_m2: log: "logs/{sample}/gatk/mutect2/index_{seqtype}_{group}_{chr}.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools index -t {input} > {log} 2>&1 @@ -255,7 +255,7 @@ rule merge_short_indels_m2: log: "logs/{sample}/gatk/mutect2/merge_{seqtype}_{group}.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools concat -O z -a {input.vcf} -o {output} > {log} 2>&1 @@ -273,7 +273,7 @@ rule index_merged_short_indels_m2: log: "logs/{sample}/gatk/mutect2/index_merged_{seqtype}_{group}.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools index -t {input} > {log} 2>&1 @@ -328,7 +328,7 @@ rule sort_aug_short_indels_m2: log: "logs/{sample}/transindel/{seqtype}_{group}_shortindel_sort.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1 @@ -344,7 +344,7 @@ rule combine_aug_short_indels_m2: log: "logs/{sample}/transindel/combine_longindels.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools concat --naive -O z {input} -o - | bcftools sort -O z -o {output} > {log} 2>&1 @@ -398,7 +398,7 @@ rule sort_somatic_SNVs_m2: log: "logs/{sample}/transindel/{seqtype}_{group}_SNVs_sort.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1 @@ -414,9 +414,9 @@ rule combine_somatic_SNVs_m2: log: "logs/{sample}/transindel/combine_somatic_SNVs.log" conda: - "../envs/samtools.yml" + "../envs/bcftools.yml" shell: """ - bcftools concat --naive -O z {input} -o - | bcftools sort -O z -o {output} > {log} 2>&1 + bcftools concat --naive-force -O z {input} -o - | bcftools sort -O z -o {output} > {log} 2>&1 """ diff --git a/workflow/rules/postproc.smk b/workflow/rules/postproc.smk deleted file mode 100644 index d1063be..0000000 --- a/workflow/rules/postproc.smk +++ /dev/null @@ -1,71 +0,0 @@ -rule postproc: - input: - "results/{sample}/rnaseq/align/{group}_aligned.bam" - output: - "results/{sample}/rnaseq/align/{group}_ready.bam" - conda: - "../envs/samtools.yml" - log: - "logs/{sample}/postproc/rnaseq_{group}.log" - threads: 6 - shell: - """ - samtools index {input} - samtools view -bh -F 4 --min-MQ {config[mapq]} {input} -o - \ - | samtools sort -n -@ {threads} -m1g -O bam - -o - \ - | samtools fixmate -pcmu -O bam -@ {threads} - - \ - | samtools sort -@ {threads} -m1g -O bam - -o - \ - | samtools markdup -r -@ {threads} - {output} > {log} 2>&1 - samtools index {output} - """ - -rule postproc_bam_index: - input: - "results/{sample}/rnaseq/align/{group}_ready.bam" - output: - "results/{sample}/rnaseq/align/{group}_ready.bam.bai" - conda: - "../envs/samtools.yml" - log: - "logs/{sample}/postproc/index/rnaseq_{group}.log" - shell: - """ - samtools index {input} > {log} 2>&1 - """ - - -## retrieve readgroups from bam file -rule get_readgroups: - input: - get_readgroups_input - output: - "results/{sample}/{seqtype}/reads/{group}_readgroups.txt" - conda: - "../envs/basic.yml" - log: - "logs/{sample}/get_readgroups/{seqtype}_{group}.log" - shell: - """ - python workflow/scripts/get_readgroups.py '{input}' \ - {output} > {log} 2>&1 - """ - -rule realign: - input: - 
bam="results/{sample}/rnaseq/align/{group}_ready.bam",
-        rg="results/{sample}/rnaseq/reads/{group}_readgroups.txt"
-    output:
-        "results/{sample}/rnaseq/align/{group}_realigned.bam"
-    conda:
-        "../envs/basic.yml"
-    log:
-        "logs/{sample}/realign/rnaseq_{group}.log"
-    threads: config['threads']
-    shell:
-        """
-        samtools collate -Oun128 {input.bam} \
-        | samtools fastq -OT RG -@ {threads} - \
-        | bwa mem -pt{threads} -CH <(cat {input.rg}) resources/refs/bwa/genome - \
-        | samtools sort -@6 -m1g - -o {output} > {log} 2>&1
-        samtools index {output}
-        """
diff --git a/workflow/scripts/genotyping/combine_all_alleles.py b/workflow/scripts/genotyping/combine_all_alleles.py
index 56b7ce8..7c6ba1b 100644
--- a/workflow/scripts/genotyping/combine_all_alleles.py
+++ b/workflow/scripts/genotyping/combine_all_alleles.py
@@ -49,4 +49,9 @@ def main():
        fh_out.write(f'{alleles[allele]}\t{allele}\n')
    fh_out.close()
+    if len(alleles) == 0:
+        print("No valid alleles found in the input files!")
+        print("Please check the input files (e.g., dnaseq, rnaseq, user-provided alleles) and try again")
+        sys.exit(1)
+
main()
diff --git a/workflow/scripts/combine_optitype_results.py b/workflow/scripts/genotyping/combine_optitype_results.py
similarity index 88%
rename from workflow/scripts/combine_optitype_results.py
rename to workflow/scripts/genotyping/combine_optitype_results.py
index b5ebe78..12617dc 100644
--- a/workflow/scripts/combine_optitype_results.py
+++ b/workflow/scripts/genotyping/combine_optitype_results.py
@@ -9,7 +9,10 @@ def main():
    for alfile in sys.argv[1].split(' '):
        with open(alfile, 'r') as f:
-            next(f)
+            try:
+                next(f)
+            except StopIteration:
+                continue # empty file
            for line in f:
                line_list = line.rstrip().split('\t')
diff --git a/workflow/scripts/genotyping/merge_predicted_mhcI.py b/workflow/scripts/genotyping/merge_predicted_mhcI.py
deleted file mode 100644
index ed94b4e..0000000
--- a/workflow/scripts/genotyping/merge_predicted_mhcI.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import os
-import sys
-import re
-from pathlib import Path
-
-def main():
-    files = sys.argv[1]
-    alleles = {}
-    for file in files.rstrip().split(' '):
-        filestem = Path(file).stem
-        se = re.search(r'^(.+)_(RNA|DNA)_(SE|PE)', filestem)
-        group = se.group(1)
-
-        fh = open(file, 'r')
-        for line in fh:
-            al = line.rstrip()
-            if al not in alleles:
-                alleles[al] = []
-
-            alleles[al].append(group)
-
-
-    out = open(sys.argv[2], 'w')
-    for al in dict(sorted(alleles.items())):
-        for i,v in enumerate(alleles[al]):
-            if i == 0:
-                out.write(v)
-            else:
-                out.write(f',{v}')
-        out.write(f'\t{al}\n')
-
-main()
diff --git a/workflow/scripts/genotyping/optitype_wrapper.py b/workflow/scripts/genotyping/optitype_wrapper.py
new file mode 100644
index 0000000..f50b1b9
--- /dev/null
+++ b/workflow/scripts/genotyping/optitype_wrapper.py
@@ -0,0 +1,42 @@
+import sys
+import os
+import subprocess
+
+"""
+    Usage:
+    python3 optitype_wrapper.py <BAM file(s)> <DNA|RNA> <prefix> <output directory>
+"""
+
+def main():
+    inbams = sys.argv[1]
+    nartype = "dna" if sys.argv[2] == "DNA" else "rna"
+    prefix = sys.argv[3]
+    outpath = sys.argv[4]
+
+    for filename in inbams.split(' '):
+        if not os.path.exists(filename + '.bai'):
+            print("Indexing BAM file: " + filename)
+            samtools_idx = subprocess.Popen("samtools index " +
+                                            filename, shell=True)
+            samtools_idx.communicate()  # wait until the index has been written
+
+    # check if input file is not empty (using samtools)
+    try:
+        samtools = subprocess.Popen("samtools view -c " + inbams.split(' ')[0] + " 2>/dev/null",
+                                    stdout=subprocess.PIPE, shell=True)
+
+        if int(samtools.communicate()[0]) < 10:
+            print("Input BAM file: " + inbams + " is empty")
+            # create the declared output files as empty placeholders
+            pdf = subprocess.run("touch " + outpath + prefix + "_coverage_plot.pdf", shell=True)
+            tsv = subprocess.run("touch " + outpath + prefix + "_result.tsv", shell=True)
+        else:
+            # call OptiType and wait for it so the declared outputs exist before the rule finishes
+            optitype = subprocess.run("OptiTypePipeline.py --input " + inbams +
+                                      " --outdir " + outpath + " --prefix " + prefix +
+                                      " --" + nartype + " -v", shell=True)
+
+    except (ValueError, subprocess.CalledProcessError):
+        print("samtools failed to count reads in " + inbams)
+        sys.exit(1)
+main()
diff --git a/workflow/scripts/run_spladder.sh b/workflow/scripts/run_spladder.sh
index 4fd8261..a870a85 100644
--- a/workflow/scripts/run_spladder.sh
+++ b/workflow/scripts/run_spladder.sh
@@ -28,4 +28,3 @@ if [ ! -d $output ]; then
    echo "Spladder did not generate output. Creating empty folder."
    mkdir -p $output
fi
-