Skip to content

Commit

Permalink
JQ-334: Adjusted Mutect translate to handle new style tumor/normal sa…
Browse files Browse the repository at this point in the history
…mple metaheaders
  • Loading branch information
cgates committed Oct 25, 2020
1 parent 9ead59f commit 7463e16
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 22 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Changelog

x.x.x (MM/DD/YYYY)
-----------------
-
- Adjusted Mutect translate to handle new style tumor/normal sample metaheaders


1.1.4 (9/29/2020)
Expand Down
68 changes: 51 additions & 17 deletions jacquard/variant_caller_transforms/mutect.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

JQ_MUTECT_TAG = "JQ_MT_"
MUTECT_ABBREVIATION = "MT"
VERSION = "v1.1 - v4.0"
VERSION = "v1.1 - v2.2"

class _GenotypeTag(common_tags.AbstractJacquardTag):
#pylint: disable=too-few-public-methods
Expand Down Expand Up @@ -176,8 +176,11 @@ class _Mutect1Parser(object):


@staticmethod
def is_mutect_metaheader(metaheader):
return _Mutect1Parser._MUTECT1_METAHEADER_REGEX.search(metaheader)
def is_mutect_metaheader(metaheaders):
for metaheader in metaheaders:
if _Mutect1Parser._MUTECT1_METAHEADER_REGEX.search(metaheader):
return True
return False

@staticmethod
def build_mutect_dict(metaheaders, normal_key, tumor_key):
Expand All @@ -203,19 +206,39 @@ def get_mutect_header(metaheaders):

class _Mutect2Parser(object):
_MUTECT2_METAHEADER_REGEX = re.compile('^##GATKCommandLine.*?=<.*ID=Mu[tT]ect2')
_MUTECT2_METAHEADER_SAMPLE_REGEX = re.compile('^##SAMPLE=<ID=(.*?),SampleName=(.*?),.*')
_MUTECT2_METAHEADER_COMMAND_REGEX = re.compile('^##GATKCommandLine.*?=<.*ID=Mu[tT]ect2.*CommandLine.*?="(.*?)"')
_MUTECT2_METAHEADER_OLD_SAMPLE_REGEX = re.compile('^##SAMPLE=<ID=(.*?),SampleName=(.*?),.*')
_MUTECT2_DICT_KEY_NORMAL = 'normal_sample'
_MUTECT2_DICT_KEY_TUMOR = 'tumor_sample'
_MUTECT2_METAHEADER_NEW_SAMPLE_REGEX = re.compile(
'^##(' + _MUTECT2_DICT_KEY_NORMAL + '|' + _MUTECT2_DICT_KEY_TUMOR + ')=(.*)')


@staticmethod
def is_mutect_metaheader(metaheader):
return _Mutect2Parser._MUTECT2_METAHEADER_REGEX.search(metaheader)
def is_mutect_metaheader(metaheaders):
for metaheader in metaheaders:
if _Mutect2Parser._MUTECT2_METAHEADER_REGEX.search(metaheader):
return True
return False

@staticmethod
def _mutect_dict_from_sample_metalines(metaheaders, normal_key, tumor_key):
def _mutect_dict_from_new_sample_metalines(metaheaders, normal_key, tumor_key):
mutect_keys = {_Mutect2Parser._MUTECT2_DICT_KEY_NORMAL: normal_key,
_Mutect2Parser._MUTECT2_DICT_KEY_TUMOR: tumor_key}
mutect_dict = {}
for metaheader in metaheaders:
match = _Mutect2Parser._MUTECT2_METAHEADER_NEW_SAMPLE_REGEX.search(metaheader)
if match:
key = mutect_keys[match.group(1)]
mutect_dict[key] = match.group(2)
return mutect_dict

@staticmethod
def _mutect_dict_from_old_sample_metalines(metaheaders, normal_key, tumor_key):
mutect_keys = {'NORMAL': normal_key, 'TUMOR': tumor_key}
mutect_dict = {}
for metaheader in metaheaders:
match = _Mutect2Parser._MUTECT2_METAHEADER_SAMPLE_REGEX.search(metaheader)
match = _Mutect2Parser._MUTECT2_METAHEADER_OLD_SAMPLE_REGEX.search(metaheader)
if match and match.group(1) in mutect_keys:
key = mutect_keys[match.group(1)]
mutect_dict[key] = match.group(2)
Expand Down Expand Up @@ -244,17 +267,28 @@ def get_mutect_header(metaheaders):

@staticmethod
def build_mutect_dict(metaheaders, normal_key, tumor_key):
mutect_dict = _Mutect2Parser._mutect_dict_from_sample_metalines(metaheaders, normal_key, tumor_key)
if not mutect_dict:
mutect_dict = _Mutect2Parser._mutect_dict_from_command_line(metaheaders, normal_key, tumor_key)
return mutect_dict
normal_tumor_strategies = [
_Mutect2Parser._mutect_dict_from_new_sample_metalines,
_Mutect2Parser._mutect_dict_from_old_sample_metalines,
_Mutect2Parser._mutect_dict_from_command_line,
]
for strategy in normal_tumor_strategies:
mutect_dict = strategy(metaheaders, normal_key, tumor_key)
if mutect_dict:
return mutect_dict
#
# mutect_dict = _Mutect2Parser._mutect_dict_from_new_sample_metalines(metaheaders, normal_key, tumor_key)
# if not mutect_dict:
# mutect_dict = _Mutect2Parser._mutect_dict_from_old_sample_metalines(metaheaders, normal_key, tumor_key)
# if not mutect_dict:
# mutect_dict = _Mutect2Parser._mutect_dict_from_command_line(metaheaders, normal_key, tumor_key)
# return mutect_dict

def _get_mutect_parser(metaheaders):
for metaheader in metaheaders:
if _Mutect1Parser.is_mutect_metaheader(metaheader):
return _Mutect1Parser
elif _Mutect2Parser.is_mutect_metaheader(metaheader):
return _Mutect2Parser
if _Mutect1Parser.is_mutect_metaheader(metaheaders):
return _Mutect1Parser
elif _Mutect2Parser.is_mutect_metaheader(metaheaders):
return _Mutect2Parser
return None

class Mutect(object):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
##fileformat=VCFv4.1
##MuTect="analysis_type=MuTect input_file=[11N_25714.sorted.bam, 11Ta_25715.sorted.bam] read_buffer_size=null phone_home=STANDARD gatk_key=null tag=NA read_filter=[BadCigar] intervals=[TargetRegion_buffered10bases.bed] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=ucsc.hg19.fasta nonDeterministicRandomSeed=false disableRandomization=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 enable_experimental_downsampling=false baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false noop=false enable_extended_output=false artifact_detection_mode=false tumor_sample_name=25715 bam_tumor_sample_name=null normal_sample_name=25714 force_output=false force_alleles=false only_passing_calls=false initial_tumor_lod=4.0 tumor_lod=6.3 fraction_contamination=0.02 minimum_mutation_cell_fraction=0.0 normal_lod=2.2 normal_artifact_lod=1.0 strand_artifact_lod=2.0 strand_artifact_power_threshold=0.9 dbsnp_normal_lod=5.5 somatic_classification_normal_power_threshold=0.95 minimum_normal_allele_fraction=0.0 tumor_f_pretest=0.0050 min_qscore=5 gap_events_threshold=3 heavily_clipped_read_fraction=0.3 clipping_bias_pvalue_threshold=0.05 fraction_mapq0_threshold=0.5 pir_median_threshold=10.0 pir_mad_threshold=3.0 required_maximum_alt_allele_mapping_quality_score=20 max_alt_alleles_in_normal_count=2 max_alt_alleles_in_normal_qscore_sum=20 max_alt_allele_in_normal_fraction=0.03 power_constant_qscore=30 absolute_copy_number_data=null power_constant_af=0.30000001192092896 vcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub dbsnp=[(RodBinding name=dbsnp source=dbsnp_138.hg19.vcf)] cosmic=[(RodBinding name=cosmic source=Cosmic.v68.hg19.vcf)] normal_panel=[] coverage_20_q20_file=null power_file=null tumor_depth_file=null normal_depth_file=null filter_mismatching_base_and_quals=false"
##GATKCommandLine=<ID=Mutect2,CommandLine="Mutect2 --normal-sample 2000001_Tumor_Normal --panel-of-normals /home/jose.fernandez.navarro/shared/hg19/Mutect2-exome-panel.vcf --germline-resource /home/jose.fernandez.navarro/shared/hg19/af-only-gnomad.raw.sites.vcf --output Mutect_unfiltered.vcf --input sample1_final.bam --input sample2_final.bam --reference /home/jose.fernandez.navarro/shared/hg19/Homo_sapiens_assembly19.fasta --f1r2-median-mq 50 --f1r2-min-bq 20 --f1r2-max-depth 200 --genotype-pon-sites false --genotype-germline-sites false --af-of-alleles-not-in-resource -1.0 --mitochondria-mode false --tumor-lod-to-emit 3.0 --initial-tumor-lod 2.0 --pcr-snv-qual 40 --pcr-indel-qual 40 --max-population-af 0.01 --downsampling-stride 1 --callable-depth 10 --max-suspicious-reads-per-alignment-start 0 --normal-lod 2.2 --ignore-itr-artifacts false --gvcf-lod-band -2.5 --gvcf-lod-band -2.0 --gvcf-lod-band -1.5 --gvcf-lod-band -1.0 --gvcf-lod-band -0.5 --gvcf-lod-band 0.0 --gvcf-lod-band 0.5 --gvcf-lod-band 1.0 --minimum-allele-fraction 0.0 --independent-mates false --disable-adaptive-pruning false --kmer-size 10 --kmer-size 25 --dont-increase-kmer-sizes-for-cycles false --allow-non-unique-kmers-in-ref false --num-pruning-samples 1 --min-dangling-branch-length 4 --recover-all-dangling-branches false --max-num-haplotypes-in-population 128 --min-pruning 2 --adaptive-pruning-initial-error-rate 0.001 --pruning-lod-threshold 2.302585092994046 --max-unpruned-variants 100 --linked-de-bruijn-graph false --disable-artificial-haplotype-recovery false --debug-assembly false --debug-graph-transformations false --capture-assembly-failure-bam false --error-correction-log-odds -Infinity --error-correct-reads false --kmer-length-for-read-error-correction 25 --min-observations-for-kmer-to-be-solid 20 --base-quality-score-threshold 18 --pair-hmm-gap-continuation-penalty 10 --pair-hmm-implementation FASTEST_AVAILABLE --pcr-indel-model CONSERVATIVE --phred-scaled-global-read-mismapping-rate 45 --native-pair-hmm-threads 4 --native-pair-hmm-use-double-precision false --bam-writer-type CALLED_HAPLOTYPES --dont-use-soft-clipped-bases false --min-base-quality-score 10 --smith-waterman JAVA --emit-ref-confidence NONE --max-mnp-distance 1 --force-call-filtered-alleles false --allele-informative-reads-overlap-margin 2 --min-assembly-region-size 50 --max-assembly-region-size 300 --active-probability-threshold 0.002 --max-prob-propagation-distance 50 --force-active false --assembly-region-padding 100 --padding-around-indels 75 --padding-around-snps 20 --padding-around-strs 75 --max-reads-per-alignment-start 50 --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --verbosity INFO --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays --disable-tool-default-read-filters false --max-read-length 2147483647 --min-read-length 30 --minimum-mapping-quality 20 --disable-tool-default-annotations false --enable-all-annotations false",Version="4.1.8.1",Date="October 1, 2020 1:16:45 PM CEST">
##jacquard.translate.caller=MuTect
##jacquard=<timestamp="2015-09-04 06:15:38",command="['translate', 'test/functional_tests/01_translate/input', 'test/functional_tests/01_translate/benchmark', '--force']",cwd="/Users/cgates/workspace",source="Jacquard",version="0.42">
##jacquard=<timestamp="2020-10-25 16:53:01",command="['translate', 'test/functional_tests/01_translate/input', '/tmp/jacquard/', '--force']",cwd="/Users/cgates/git",source="Jacquard",version="1.1.4x">
##normal_sample=25714
##reference=file:ucsc.hg19.fasta
##source=Mutect2
##tumor_sample=25715
##contig=<ID=chr1,length=249250621,assembly=hg19>
##contig=<ID=chr1_gl000191_random,length=106433,assembly=hg19>
##contig=<ID=chr1_gl000192_random,length=547496,assembly=hg19>
Expand Down
5 changes: 4 additions & 1 deletion test/functional_tests/01_translate/input/tiny.mutect.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic event">
##INFO=<ID=VT,Number=1,Type=String,Description="Variant type, can be SNP, INS or DEL">
##MuTect="analysis_type=MuTect input_file=[11N_25714.sorted.bam, 11Ta_25715.sorted.bam] read_buffer_size=null phone_home=STANDARD gatk_key=null tag=NA read_filter=[BadCigar] intervals=[TargetRegion_buffered10bases.bed] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=ucsc.hg19.fasta nonDeterministicRandomSeed=false disableRandomization=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 enable_experimental_downsampling=false baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false noop=false enable_extended_output=false artifact_detection_mode=false tumor_sample_name=25715 bam_tumor_sample_name=null normal_sample_name=25714 force_output=false force_alleles=false only_passing_calls=false initial_tumor_lod=4.0 tumor_lod=6.3 fraction_contamination=0.02 minimum_mutation_cell_fraction=0.0 normal_lod=2.2 normal_artifact_lod=1.0 strand_artifact_lod=2.0 strand_artifact_power_threshold=0.9 dbsnp_normal_lod=5.5 somatic_classification_normal_power_threshold=0.95 minimum_normal_allele_fraction=0.0 tumor_f_pretest=0.0050 min_qscore=5 gap_events_threshold=3 heavily_clipped_read_fraction=0.3 clipping_bias_pvalue_threshold=0.05 fraction_mapq0_threshold=0.5 pir_median_threshold=10.0 pir_mad_threshold=3.0 required_maximum_alt_allele_mapping_quality_score=20 max_alt_alleles_in_normal_count=2 max_alt_alleles_in_normal_qscore_sum=20 max_alt_allele_in_normal_fraction=0.03 power_constant_qscore=30 absolute_copy_number_data=null power_constant_af=0.30000001192092896 vcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub dbsnp=[(RodBinding name=dbsnp source=dbsnp_138.hg19.vcf)] cosmic=[(RodBinding name=cosmic source=Cosmic.v68.hg19.vcf)] normal_panel=[] coverage_20_q20_file=null power_file=null tumor_depth_file=null normal_depth_file=null filter_mismatching_base_and_quals=false"
##GATKCommandLine=<ID=Mutect2,CommandLine="Mutect2 --normal-sample 2000001_Tumor_Normal --panel-of-normals /home/jose.fernandez.navarro/shared/hg19/Mutect2-exome-panel.vcf --germline-resource /home/jose.fernandez.navarro/shared/hg19/af-only-gnomad.raw.sites.vcf --output Mutect_unfiltered.vcf --input sample1_final.bam --input sample2_final.bam --reference /home/jose.fernandez.navarro/shared/hg19/Homo_sapiens_assembly19.fasta --f1r2-median-mq 50 --f1r2-min-bq 20 --f1r2-max-depth 200 --genotype-pon-sites false --genotype-germline-sites false --af-of-alleles-not-in-resource -1.0 --mitochondria-mode false --tumor-lod-to-emit 3.0 --initial-tumor-lod 2.0 --pcr-snv-qual 40 --pcr-indel-qual 40 --max-population-af 0.01 --downsampling-stride 1 --callable-depth 10 --max-suspicious-reads-per-alignment-start 0 --normal-lod 2.2 --ignore-itr-artifacts false --gvcf-lod-band -2.5 --gvcf-lod-band -2.0 --gvcf-lod-band -1.5 --gvcf-lod-band -1.0 --gvcf-lod-band -0.5 --gvcf-lod-band 0.0 --gvcf-lod-band 0.5 --gvcf-lod-band 1.0 --minimum-allele-fraction 0.0 --independent-mates false --disable-adaptive-pruning false --kmer-size 10 --kmer-size 25 --dont-increase-kmer-sizes-for-cycles false --allow-non-unique-kmers-in-ref false --num-pruning-samples 1 --min-dangling-branch-length 4 --recover-all-dangling-branches false --max-num-haplotypes-in-population 128 --min-pruning 2 --adaptive-pruning-initial-error-rate 0.001 --pruning-lod-threshold 2.302585092994046 --max-unpruned-variants 100 --linked-de-bruijn-graph false --disable-artificial-haplotype-recovery false --debug-assembly false --debug-graph-transformations false --capture-assembly-failure-bam false --error-correction-log-odds -Infinity --error-correct-reads false --kmer-length-for-read-error-correction 25 --min-observations-for-kmer-to-be-solid 20 --base-quality-score-threshold 18 --pair-hmm-gap-continuation-penalty 10 --pair-hmm-implementation FASTEST_AVAILABLE --pcr-indel-model CONSERVATIVE --phred-scaled-global-read-mismapping-rate 45 --native-pair-hmm-threads 4 --native-pair-hmm-use-double-precision false --bam-writer-type CALLED_HAPLOTYPES --dont-use-soft-clipped-bases false --min-base-quality-score 10 --smith-waterman JAVA --emit-ref-confidence NONE --max-mnp-distance 1 --force-call-filtered-alleles false --allele-informative-reads-overlap-margin 2 --min-assembly-region-size 50 --max-assembly-region-size 300 --active-probability-threshold 0.002 --max-prob-propagation-distance 50 --force-active false --assembly-region-padding 100 --padding-around-indels 75 --padding-around-snps 20 --padding-around-strs 75 --max-reads-per-alignment-start 50 --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --verbosity INFO --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays --disable-tool-default-read-filters false --max-read-length 2147483647 --min-read-length 30 --minimum-mapping-quality 20 --disable-tool-default-annotations false --enable-all-annotations false",Version="4.1.8.1",Date="October 1, 2020 1:16:45 PM CEST">
##normal_sample=25714
##source=Mutect2
##tumor_sample=25715
##contig=<ID=chrM,length=16571,assembly=hg19>
##contig=<ID=chr1,length=249250621,assembly=hg19>
##contig=<ID=chr2,length=243199373,assembly=hg19>
Expand Down
Loading

0 comments on commit 7463e16

Please sign in to comment.