Error catching routines (#30)

* separated samtools, bcftools and realign envs to avoid conflicts * changed scanexitron repo (temporarily) until outfile fixed * update spladder * enhanced UI
ylab-hi · Jun 25, 2024 · 7dcdff8 · 7dcdff8
1 parent 9df2e17
commit 7dcdff8
Show file tree

Hide file tree

Showing 20 changed files with 276 additions and 269 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -3,4 +3,4 @@
 	url = https://github.com/cauyrd/transIndel.git
 [submodule "workflow/scripts/scanexitron"]
 	path = workflow/scripts/scanexitron
-	url = https://github.com/ylab-hi/ScanExitron.git
+	url = https://github.com/riasc/ScanExitron.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Prioritization of neoantigens is now done separately for each variant type (speeds up the process)
 - NMD information (e.g., escape rule,...) is now also calculated for all variants
 
+## [0.2.7] - 2024-06-23 
+
+### Fix 
+
+- Separated samtools, bcftools and realign environments to avoid conflicts
+- Changed order of genotyping rules to catch errors when no alleles can be found
+    - Alleles are merged according to nartype (e.g., DNA, RNA) and then combined
+- Force concat of VCF files in genotyping to avoid errors when no variants are found
+- Added optitype wrapper to avoid errors when empty BAM files are provided / no HLA reads
+
 ## [0.2.6] - 2024-06-20
 
 ### Fix 

diff --git a/workflow/envs/bcftools.yml b/workflow/envs/bcftools.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - bcftools=1.20
diff --git a/workflow/envs/realign.yml b/workflow/envs/realign.yml
@@ -0,0 +1,7 @@
+channels:
+  - conda-forge
+  - bioconda
+  - nodefaults
+dependencies:
+  - bwa=0.7.18
+  - samtools=1.20
diff --git a/workflow/envs/samtools.yml b/workflow/envs/samtools.yml
@@ -1,6 +1,6 @@
 channels:
+  - conda-forge
   - bioconda
+  - nodefaults
 dependencies:
- - samtools=1.9
- - bcftools=1.9
-
+ - samtools=1.14
diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
@@ -201,7 +201,7 @@ rule realign:
   output:
     bam="results/{sample}/{seqtype}/align/{group}_final_BWA.bam",
   conda:
-    "../envs/basic.yml"
+    "../envs/realign.yml"
   log:
     "logs/{sample}/realign/{seqtype}_{group}.log"
   threads: config['threads']
@@ -225,7 +225,7 @@ if config['data']['dnaseq_filetype'] in ['.fq','.fastq']:
     log:
       "logs/{sample}/bwa_align/dnaseq_{group}.log"
     conda:
-      "../envs/basic.yml"
+      "../envs/realign.yml"
     params:
       extra=""
     threads: config['threads']

diff --git a/workflow/rules/all.smk b/workflow/rules/all.smk
diff --git a/workflow/rules/altsplicing.smk b/workflow/rules/altsplicing.smk
@@ -23,7 +23,7 @@ rule spladder:
             {params.confidence} \
             {params.iteration} \
             {params.edgelimit} \
-            {log} > 2>&1
+            {log} 
       """
 
 rule splicing_to_vcf:
@@ -54,7 +54,7 @@ rule sort_altsplicing:
   log:
     "logs/{sample}/spladder/{group}_sort.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
       bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1
@@ -70,8 +70,8 @@ rule combine_altsplicing:
   log:
     "logs/{sample}/exitrons/combine_exitrons.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
-      bcftools concat --naive -O z {input} -o  - | bcftools sort -O z -o {output} > {log} 2>&1
+      bcftools concat --naive-force -O z {input} -o  - | bcftools sort -O z -o {output} > {log} 2>&1
     """
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -196,84 +196,118 @@ def get_input_hlatyping_PE(wildcards):
     )
 
 
+def get_filtered_reads_hlatyping_SE(wildcards):
+  bam = []
+  idx = []
+
+  if wildcards.nartype == "DNA":
+    if len(config["data"]["dnaseq"]) != 0:
+      for key in config["data"]["dnaseq"].keys():
+        bam += expand("results/{sample}/hla/mhc-I/reads/{group}_DNA_flt_SE.bam",
+                      sample=wildcards.sample,
+                      group=key)
+        idx += expand("results/{sample}/hla/mhc-I/reads/{group}_DNA_flt_SE.bam.bai",
+                      sample=wildcards.sample,
+                      group=key)
+
+
+  if wildcards.nartype == "RNA":
+    if len(config["data"]["rnaseq"]) != 0:
+      for key in config["data"]["rnaseq"].keys():
+        bam += expand("results/{sample}/hla/mhc-I/reads/{group}_RNA_flt_SE.bam",
+                      sample=wildcards.sample,
+                      group=key)
+        idx += expand("results/{sample}/hla/mhc-I/reads/{group}_RNA_flt_SE.bam.bai",
+                      sample=wildcards.sample,
+                      group=key)
+
+  return dict(
+      zip(
+        ["bam", "idx"],
+        [bam, idx]
+    )
+  )
+
+
+def get_filtered_reads_hlatyping_PE(wildcards):
+  bam = []
+  idx = []
+
+  if wildcards.nartype == "DNA":
+    if len(config["data"]["dnaseq"]) != 0:
+      for key in config["data"]["dnaseq"].keys():
+        bam += expand("results/{sample}/hla/mhc-I/reads/{group}_DNA_flt_PE_{readpair}.bam",
+                      sample=wildcards.sample,
+                      group=key,
+                      readpair=wildcards.readpair)
+        idx += expand("results/{sample}/hla/mhc-I/reads/{group}_DNA_flt_PE_{readpair}.bam.bai",
+                      sample=wildcards.sample,
+                      group=key,
+                      readpair=wildcards.readpair)
+
+  if wildcards.nartype == "RNA":
+    if len(config["data"]["rnaseq"]) != 0:
+      for key in config["data"]["rnaseq"].keys():
+        bam += expand("results/{sample}/hla/mhc-I/reads/{group}_RNA_flt_PE_{readpair}.bam",
+                      sample=wildcards.sample,
+                      group=key,
+                      readpair=wildcards.readpair)
+
+        idx += expand("results/{sample}/hla/mhc-I/reads/{group}_RNA_flt_PE_{readpair}.bam.bai",
+                      sample=wildcards.sample,
+                      group=key,
+                      readpair=wildcards.readpair)
+
+  return dict(
+      zip(
+        ["bam", "idx"],
+        [bam, idx]
+    )
+  )
+
+
 def aggregate_mhcI_SE(wildcards):
   checkpoint_output = checkpoints.split_reads_mhcI_SE.get(**wildcards).output[0]
-  return expand("results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_SE/{no}_result.tsv",
+  return expand("results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_SE/{no}_result.tsv",
     sample=wildcards.sample,
-    group=wildcards.group,
     nartype=wildcards.nartype,
     no=glob_wildcards(os.path.join(checkpoint_output, "R_{no}.bam")).no)
 
 
 def aggregate_mhcI_PE(wildcards):
   checkpoint_output = checkpoints.split_reads_mhcI_PE.get(**wildcards).output[0]
-  return expand("results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_flt_PE/{no}_result.tsv",
+  return expand("results/{sample}/hla/mhc-I/genotyping/{nartype}_flt_merged_PE/{no}_result.tsv",
     sample=wildcards.sample,
-    group=wildcards.group,
     nartype=wildcards.nartype,
     no=glob_wildcards(os.path.join(checkpoint_output, "R1_{no}.bam")).no)
 
 
-def get_predicted_mhcI_alleles(wildcards):
-  values = []
-
-  # routines to genotype from DNA
-  if "DNA" in config['hlatyping']['MHC-I_mode']:
-    if config['data']['dnaseq'] is not None:
-      for key in config['data']['dnaseq'].keys():
-
-        # exclude normal samples (if specified)
-        if config['data']['normal'] is not None:
-          normal = config['data']['normal'].split(' ')
-          if key in normal:
-            continue
-
-        values += expand("results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_{readtype}.tsv",
-                         sample = wildcards.sample,
-                         group = key,
-                         nartype = "DNA",
-                         readtype = config['data']['dnaseq_readtype']) # add either SE or PE
-    else: # if no dnaseq data is specified, but mode is DNA or BOTH, then ignore
-      print('dnaseq data has not been specified in the config file, but specified mode for hla genotyping in config file is DNA or BOTH -- will be ignored')
-
-  # routines to genotype from RNA
-  if "RNA" in config['hlatyping']['MHC-I_mode']:
-    if config['data']['rnaseq'] is not None:
-      for key in config['data']['rnaseq'].keys():
-
-        # exclude normal samples (if specified)
-        if config['data']['normal'] is not None:
-          normal = config['data']['normal'].split(' ')
-          if key in normal:
-            continue
-
-        values += expand("results/{sample}/hla/mhc-I/genotyping/{group}_{nartype}_{readtype}.tsv",
-                           sample = wildcards.sample,
-                           group = key,
-                           nartype = "RNA",
-                           readtype = config['data']['rnaseq_readtype']) # add either SE or PE)
-    else: # if no rnaseq data is specified, but mode is RNA or BOTH, then ignore
-      print('rnaseq data has not been specified in the config file, but specified mode for hla genotyping in config file is RNA or BOTH -- will be ignored')
-
-  if len(values) == 0:
-    print('No data found. Check config file for correct specification of data and hla genotyping mode')
-    sys.exit(1)
-
-  return values
-
 def get_all_mhcI_alleles(wildcards):
   values = []
 
-  if ("DNA" in config['hlatyping']['MHC-I_mode'] or
-      "RNA" in config['hlatyping']['MHC-I_mode']):
-    values += expand("results/{sample}/hla/mhc-I/genotyping/mhc-I.tsv",
-                    sample = wildcards.sample)
+  if "DNA" in config["hlatyping"]["MHC-I_mode"]:
+    if len(config["data"]["dnaseq"]) != 0:
+      if config["data"]["dnaseq_readtype"] == "SE":
+        values += expand("results/{sample}/hla/mhc-I/genotyping/DNA_flt_merged_SE.tsv",
+                         sample=wildcards.sample)
+      elif config["data"]["dnaseq_readtype"] == "PE":
+        values += expand("results/{sample}/hla/mhc-I/genotyping/DNA_flt_merged_PE.tsv",
+                         sample=wildcards.sample)
+
+  if "RNA" in config["hlatyping"]["MHC-I_mode"]:
+    if len(config["data"]["rnaseq"]) != 0:
+      if config["data"]["rnaseq_readtype"] == "SE":
+        values += expand("results/{sample}/hla/mhc-I/genotyping/RNA_flt_merged_SE.tsv",
+                         sample=wildcards.sample)
+      elif config["data"]["rnaseq_readtype"] == "PE":
+        values += expand("results/{sample}/hla/mhc-I/genotyping/RNA_flt_merged_PE.tsv",
+                         sample=wildcards.sample)
 
   if "custom" in config["hlatyping"]["MHC-I_mode"]:
     values += [config["data"]["custom"]["hlatyping"]["MHC-I"]]
 
   if len(values) == 0:
-    print('No hla data found. Check config file for correct specification of data and hla genotyping mode')
+    print("No HLA data found. Please check the config file for correct specification of data and HLA genotyping mode")
     sys.exit(1)
 
   return values

diff --git a/workflow/rules/custom.smk b/workflow/rules/custom.smk
@@ -28,7 +28,7 @@ rule sort_custom_variants:
   log:
     "logs/{sample}/custom/sort_custom_variants.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
       bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1

diff --git a/workflow/rules/exitron.smk b/workflow/rules/exitron.smk
@@ -116,7 +116,7 @@ rule sort_exitron:
   log:
     "logs/exitron_sort_{sample}_{group}.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
       bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1
@@ -132,9 +132,9 @@ rule combine_exitrons:
   log:
     "logs/{sample}/exitrons/combine_exitrons.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
-      bcftools concat --naive -O z {input} -o  - | bcftools sort -O z -o {output} > {log} 2>&1
+      bcftools concat --naive-force -O z {input} -o  - | bcftools sort -O z -o {output} > {log} 2>&1
     """
 
diff --git a/workflow/rules/germline.smk b/workflow/rules/germline.smk
@@ -97,7 +97,7 @@ rule sort_variants_htc_first_round:
   log:
     "logs/{sample}/gatk/haplotypecaller/{seqtype}_{group}_1rd_{chr}_sort.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
       bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1
@@ -113,7 +113,7 @@ rule index_variants_htc_first_round:
   log:
      "logs/{sample}/gatk/haplotypecaller/{seqtype}_{group}_1rd_{chr}_index.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
       bcftools index -t {input} > {log} 2>&1
@@ -130,7 +130,7 @@ rule merge_variants_htc_first_round:
   log:
     "logs/{sample}/gatk/haplotypecaller/{seqtype}_{group}_1rd_merge.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
       bcftools concat -O z -a {input.vcf} -o {output} > {log} 2>&1
@@ -146,7 +146,7 @@ rule index_merged_variants_htc_first_round:
   log:
     "logs/{sample}/gatk/haplotypecaller/{seqtype}_{group}_1rd_index.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
       bcftools index -t {input} > {log} 2>&1
@@ -346,7 +346,7 @@ rule sort_variants_htc_final_round:
   log:
     "logs/{sample}/indel/gatk/haplotypecaller/sort_final_{seqtype}_{group}_{chr}.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
       bcftools sort {input} -o - | bcftools view -O z -o {output} > {log} 2>&1
@@ -362,7 +362,7 @@ rule index_variants_htc_final_round:
   log:
      "logs/{sample}/indel/gatk/haplotypecaller/index_final_{seqtype}_{group}_{chr}.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
       bcftools index -t {input} > {log} 2>&1
@@ -379,7 +379,7 @@ rule merge_variants_htc_final_round:
   log:
     "logs/{sample}/indel/gatk/haplotypecaller/merge_final_{seqtype}_{group}.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
       bcftools concat -O z -a {input.vcf} -o {output} > {log} 2>&1
@@ -395,7 +395,7 @@ rule index_merged_variants_htc_final_round:
   log:
     "logs/{sample}/indel/gatk/haplotypecaller/index_final_{seqtype}_{group}.log"
   conda:
-    "../envs/samtools.yml"
+    "../envs/bcftools.yml"
   shell:
     """
       bcftools index -t {input} > {log} 2>&1