Skip to content

Commit

Permalink
Merge branch 'JQ-322_adjust_mutect_translate' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
cgates committed Oct 30, 2018
2 parents fd10c29 + 55525ba commit 7e9fe72
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 17 deletions.
12 changes: 8 additions & 4 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
Changelog
=========

x.x.x (mm/dd/yyyy)
------------------
-
1.1.1 (10/30/2018)
------------------
- Adjusted Mutect translators to:

- parse normal and tumor designations from SAMPLE metaheaders if available
- recognize more variations of Mutect metaheader formats

1.1.0 (6/18/2018)
-----------------
- Adjusted *translate* to correctly parse newer versions of Mutect
- Updated supported versions for Mutect, Strelka, Varscan
- Fixed error in JQ_SUMMARY_DP_AVERAGE tag description
- Fixed error in JQ_SUMMARY_DP_AVERAGE tag description

1.0.0 (6/5/2018)
-----------------
Expand Down Expand Up @@ -55,12 +57,14 @@ x.x.x (mm/dd/yyyy)
- More consistent behavior in *expand*
- Significantly improved *merge* performance
- Added new summary tags:

- CALLERS_REPORTED_COUNT
- CALLERS_REPORTED_LIST
- SAMPLES_REPORTED_COUNT
- CALLERS_PASSED_COUNT
- CALLERS_PASSED_LIST
- SAMPLES_PASSED_COUNT

- Fixed bug in how Strelka calculated AF on indels
- Improved command validation and error handling
- Added project/code documentation
Expand Down
28 changes: 23 additions & 5 deletions jacquard/variant_caller_transforms/mutect.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

JQ_MUTECT_TAG = "JQ_MT_"
MUTECT_ABBREVIATION = "MT"
VERSION = "v1.1-4.0"
VERSION = "v1.1 - v4.0"

class _GenotypeTag(common_tags.AbstractJacquardTag):
#pylint: disable=too-few-public-methods
Expand Down Expand Up @@ -143,18 +143,30 @@ def get_mutect_header(metaheaders):
return mutect_dict

class _Mutect2Parser(object):
_MUTECT2_METAHEADER_REGEX = re.compile('^##GATKCommandLine=<.*ID=Mutect2')
_MUTECT2_METAHEADER_DICT = re.compile('##GATKCommandLine=<ID=Mutect2.*CommandLine="(.*?)"')
_MUTECT2_METAHEADER_REGEX = re.compile('^##GATKCommandLine.*?=<.*ID=Mu[tT]ect2')
_MUTECT2_METAHEADER_SAMPLE_REGEX = re.compile('^##SAMPLE=<ID=(.*?),SampleName=(.*?),.*')
_MUTECT2_METAHEADER_COMMAND_REGEX = re.compile('^##GATKCommandLine.*?=<.*ID=Mu[tT]ect2.*CommandLine.*?="(.*?)"')

@staticmethod
def is_mutect_metaheader(metaheader):
return _Mutect2Parser._MUTECT2_METAHEADER_REGEX.search(metaheader)

@staticmethod
def build_mutect_dict(metaheaders, normal_key, tumor_key):
def _mutect_dict_from_sample_metalines(metaheaders, normal_key, tumor_key):
mutect_keys = {'NORMAL': normal_key, 'TUMOR': tumor_key}
mutect_dict = {}
for metaheader in metaheaders:
match = _Mutect2Parser._MUTECT2_METAHEADER_SAMPLE_REGEX.search(metaheader)
if match and match.group(1) in mutect_keys:
key = mutect_keys[match.group(1)]
mutect_dict[key] = match.group(2)
return mutect_dict

@staticmethod
def _mutect_dict_from_command_line(metaheaders, normal_key, tumor_key):
def get_mutect_header(metaheaders):
for metaheader in metaheaders:
match = _Mutect2Parser._MUTECT2_METAHEADER_DICT.search(metaheader)
match = _Mutect2Parser._MUTECT2_METAHEADER_COMMAND_REGEX.search(metaheader)
if match:
return match.group(1)
return None
Expand All @@ -171,6 +183,12 @@ def get_mutect_header(metaheaders):
mutect_dict[tumor_key] = args.tumor_sample
return mutect_dict

@staticmethod
def build_mutect_dict(metaheaders, normal_key, tumor_key):
mutect_dict = _Mutect2Parser._mutect_dict_from_sample_metalines(metaheaders, normal_key, tumor_key)
if not mutect_dict:
mutect_dict = _Mutect2Parser._mutect_dict_from_command_line(metaheaders, normal_key, tumor_key)
return mutect_dict

def _get_mutect_parser(metaheaders):
for metaheader in metaheaders:
Expand Down
2 changes: 1 addition & 1 deletion jacquard/variant_caller_transforms/strelka.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

JQ_STRELKA_TAG = "JQ_SK_"
STRELKA_ABBREVIATION = "SK"
VERSION = "v1.0-2.9"
VERSION = "v1.0 - v2.9"

class _GenotypeTag(common_tags.AbstractJacquardTag):
#pylint: disable=too-few-public-methods
Expand Down
2 changes: 1 addition & 1 deletion jacquard/variant_caller_transforms/varscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
"NORMAL|TUMOR").replace("|", "\t")
JQ_VARSCAN_TAG = "JQ_VS_"
VARSCAN_ABBREVIATION = "VS"
VERSION = "v2.3-2.4"
VERSION = "v2.3 - v2.4"

def _varscan_hc_fileheader(line):
return line.startswith("chrom\tposition")
Expand Down
52 changes: 46 additions & 6 deletions test/variant_caller_transforms/mutect_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,19 +171,41 @@ def test_claim_vcfExtensionCaseInsensitive(self):
self.assertEquals(0, len(unrecognized_readers))
self.assertEquals(1, len(vcf_readers))

def test_claim_metaheaderRecognizesOldAndNewVersionsOfMutect(self):
def test_claim_metaheaderRecognizesMutectV2x(self):
record1 = "chr1\t.\t.\t.\t.\t.\t.\t.\t."
content1 = ["##foo", "##MuTect=2.1", "#chrom", record1]
reader1 = MockFileReader("fileA.vcf", content1)
content2 = ["##foo", '##GATKCommandLine=<ID=Mutect2,CommandLine="Mutect2 ...">', "#chrom", record1]
reader2 = MockFileReader("fileB.vcf", content2)
file_readers = [reader1, reader2]
file_readers = [reader1]

caller = mutect.Mutect()
unrecognized_readers, vcf_readers = caller.claim(file_readers)

self.assertEquals(0, len(unrecognized_readers))
self.assertEquals(1, len(vcf_readers))

def test_claim_metaheaderRecognizesMutectV3x(self):
record1 = "chr1\t.\t.\t.\t.\t.\t.\t.\t."
content1 = ["##foo", '##GATKCommandLine.MuTect2=<ID=MuTect2,CommandLineOptions="MuTect2 ...">', "#chrom", record1]
reader1 = MockFileReader("fileB.vcf", content1)
file_readers = [reader1]

caller = mutect.Mutect()
unrecognized_readers, vcf_readers = caller.claim(file_readers)

self.assertEquals(0, len(unrecognized_readers))
self.assertEquals(2, len(vcf_readers))
self.assertEquals(1, len(vcf_readers))

def test_claim_metaheaderRecognizesMutectV4x(self):
record1 = "chr1\t.\t.\t.\t.\t.\t.\t.\t."
content1 = ["##foo", '##GATKCommandLine=<ID=Mutect2,CommandLine="Mutect2 ...">', "#chrom", record1]
reader1 = MockFileReader("fileB.vcf", content1)
file_readers = [reader1]

caller = mutect.Mutect()
unrecognized_readers, vcf_readers = caller.claim(file_readers)

self.assertEquals(0, len(unrecognized_readers))
self.assertEquals(1, len(vcf_readers))

class MutectVcfReaderTestCase(test_case.JacquardBaseTestCase):
def test_metaheaders(self):
Expand Down Expand Up @@ -251,7 +273,24 @@ def test_column_header_mangleSampleNameMutect1(self):

self.assertEquals(expected_column_header, mutect_vcf_reader.column_header)

def test_column_header_mangleSampleNameMutect2(self):
def test_column_header_mangleSampleNameMutect2UsesSampleMetalinesIfAvailable(self):
column_header = self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|25714|25715")
meta_header = '''
##GATKCommandLine=<ID=Mutect2,CommandLine="Mutect2 --tumor-sample A --normal-sample B",Date="recent">'
##foo=42
##SAMPLE=<ID=NORMAL,SampleName=25714,File=foo.bam>
##SAMPLE=<ID=TUMOR,SampleName=25715,File=bar.bam>
##baz=42
'''
vcf_reader = MockVcfReader(metaheaders=meta_header.strip().split('\n'),
column_header=column_header)
mutect_vcf_reader = mutect._MutectVcfReader(vcf_reader)

expected_column_header = self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|NORMAL|TUMOR")

self.assertEquals(expected_column_header, mutect_vcf_reader.column_header)

def test_column_header_mangleSampleNameMutect2UsesCommandLineIfNoSampleMetalines(self):
column_header = self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|25714|25715")
meta_header = '##GATKCommandLine=<ID=Mutect2,CommandLine="Mutect2 --tumor-sample 25715 --normal-sample 25714",Date="recent">'
vcf_reader = MockVcfReader(metaheaders=[meta_header],
Expand All @@ -262,6 +301,7 @@ def test_column_header_mangleSampleNameMutect2(self):

self.assertEquals(expected_column_header, mutect_vcf_reader.column_header)


def test_column_header_mangleSampleNameMutect2IgnoresHelpFlag(self):
column_header = self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|25714|25715")
meta_header = '##GATKCommandLine=<ID=Mutect2,CommandLine="Mutect2 --tumor-sample 25715 --normal-sample 25714 --help false",Date="recent">'
Expand Down

0 comments on commit 7e9fe72

Please sign in to comment.