Skip to content

Commit

Permalink
Merge branch 'JQ-324_pad_optional_trailing_format_fields' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
cgates committed Jul 11, 2019
2 parents 2f33bf8 + 94717c3 commit c139517
Show file tree
Hide file tree
Showing 11 changed files with 228 additions and 62 deletions.
5 changes: 2 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@ python:

install:
- pip install -r requirements.txt

before_script:
- pip install coverage
- pip install coveralls

script:
script:
- nosetests --with-coverage --verbose --cover-package=jacquard
- nosetests examples

Expand All @@ -23,4 +23,3 @@ after_success:
notifications:
email:
- cgates@umich.edu
- jebene@umich.edu
6 changes: 4 additions & 2 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
Changelog
=========

x.y.z (mm/dd/yyyy)
1.1.2 (5/29/2019)
-----------------
- tbd
- Adjusted VCF handling to correctly pad optional trailing FORMAT fields
- Adjusted Mutect translation to regard PASS as somatic when FilterMutectCalls
  is present in metaheaders.

1.1.1 (10/30/2018)
------------------
Expand Down
2 changes: 1 addition & 1 deletion jacquard/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.1.1x"
__version__ = "1.1.2"
54 changes: 38 additions & 16 deletions jacquard/utils/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,38 +332,59 @@ def get_empty_record(self):
ref=self.ref,
alt=self.alt)

def _format_field(self):
"""Returns string representation of format field."""
format_field = "."
def _format_tag_fields(self):
"""Returns list of format tag names."""
tag_names = []
if self.sample_tag_values:
first_sample = list(self.sample_tag_values.keys())[0]
tag_names = self.sample_tag_values[first_sample].keys()
if tag_names:
format_field = ":".join(tag_names)
return format_field
tag_names = list(self.sample_tag_values[first_sample].keys())
return tag_names

def _sample_field(self, sample):
"""Returns string representation of sample-format values.
def _sample_field(self, tag_names, sample):
"""Returns string representation of sample-format values ordered by tag_names.
Tag_names must be superset of sample_tag_values.
Missing sample_tag_values padded as '.'.
If tag_names empty, returns '.'.
Raises:
KeyError: if requested sample is not defined.
ValueError if sample_tag_values has more keys than tag_names.
"""
tag_values = self.sample_tag_values[sample].values()
sample_tag_values = self.sample_tag_values[sample]
missing_tag_names = set(sample_tag_values) - set(tag_names)
if missing_tag_names:
msg = ('{}:{}:{}:{}|sample format tags are not consistent: '
'requested tags [{}] but sample {} has has tags [{}] '
'leaving behind [{}]')\
.format(self.chrom,
self.pos,
self.ref,
self.alt,
', '.join(tag_names),
sample,
', '.join(['{}={}'.format(k,v) for k,v in sample_tag_values.items()]),
', '.join(missing_tag_names)
)
raise ValueError(msg)
tag_values = [sample_tag_values.get(t, '.') for t in tag_names]
if tag_values:
return ":".join(tag_values)
else:
return "."

def text(self):
"Returns tab-delimited, newline terminated string of VcfRecord."
stringifier = [self.chrom, self.pos, self.vcf_id, self.ref, self.alt,
self.qual, self.filter, self.info,
self._format_field()]
tag_names = self._format_tag_fields()
format_field = '.' if not tag_names else ':'.join(tag_names)

fields = [self.chrom, self.pos, self.vcf_id, self.ref, self.alt,
self.qual, self.filter, self.info,
format_field]

for sample in self.sample_tag_values:
stringifier.append(self._sample_field(sample))
fields.append(self._sample_field(tag_names, sample))

return "\t".join(stringifier) + "\n"
return "\t".join(fields) + "\n"

def _samples_match(self, new_sample_values):
return set(new_sample_values.keys()) == \
Expand Down Expand Up @@ -398,6 +419,8 @@ def add_or_replace_filter(self, new_filter):
self.filter = ";".join([self.filter,
new_filter])

#TODO: cgates: This is not a good equals method. Please adjust this to
# consider the full set of instance variables.
def __eq__(self, other):
    # Equality is decided by the record key alone; anything that is not a
    # VcfRecord is never equal.
    if not isinstance(other, VcfRecord):
        return False
    return self._key == other._key

Expand Down Expand Up @@ -469,4 +492,3 @@ def __hash__(self):
def __lt__(self, other):
    # Compare the two file names through a natsort-generated key function.
    natural_key = natsort.natsort_keygen()
    own_key = natural_key(self.file_name)
    other_key = natural_key(other.file_name)
    return own_key < other_key

2 changes: 1 addition & 1 deletion jacquard/variant_caller_transforms/common_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ class ReportedTag(AbstractJacquardTag):
it explicitly simplifies how summary tags are generated.
"""
#pylint: disable=too-few-public-methods

def __init__(self, caller_abbreviation):
super(self.__class__,
self).__init__(caller_abbreviation,
Expand Down
51 changes: 49 additions & 2 deletions jacquard/variant_caller_transforms/mutect.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def add_tag_values(self, vcf_record):
sample_values[samp] = vcf_record.sample_tag_values[samp]["DP"]
vcf_record.add_sample_tag_value(self.tag_id, sample_values)

class _SomaticTag(common_tags.AbstractJacquardTag):
class _SomaticTagSS(common_tags.AbstractJacquardTag):
#pylint: disable=too-few-public-methods
_DESCRIPTION = (\
'''Jacquard somatic status for MuTect: 0=non-somatic,1=somatic (based on SS
Expand Down Expand Up @@ -108,6 +108,53 @@ def _somatic_status(ss_value):
else:
return "0"

class _SomaticTagFilterMutectCalls(common_tags.AbstractJacquardTag):
    """Somatic-status tag driven by the record FILTER and the sample GT.

    A sample is reported somatic ("1") only when the record's FILTER is
    exactly "PASS" and the sample's GT is anything other than "0/0";
    every other case is reported as "0".
    """
    #pylint: disable=too-few-public-methods
    # Built by adjacent-literal concatenation so the words on either side of
    # the line break stay separated by a space.
    _DESCRIPTION = ('Jacquard somatic status for MuTect: 0=non-somatic,'
                    '1=somatic (based on FilterMutectCalls setting filter '
                    'to PASS)')

    def __init__(self):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(_SomaticTagFilterMutectCalls,
              self).__init__(MUTECT_ABBREVIATION,
                             common_tags.SOMATIC_TAG,
                             self._DESCRIPTION)

    def add_tag_values(self, vcf_record):
        """Adds this tag's somatic status for every sample of vcf_record.

        Raises:
            utils.JQException: if the record is PASS and a sample lacks GT.
        """
        # The FILTER is a property of the whole record, so test it once.
        passed = vcf_record.filter == "PASS"
        sample_values = {}
        for sample in vcf_record.sample_tag_values:
            if passed:
                sample_values[sample] = self._somatic_status(vcf_record,
                                                             sample)
            else:
                sample_values[sample] = "0"
        vcf_record.add_sample_tag_value(self.tag_id, sample_values)

    @staticmethod
    def _somatic_status(vcf_record, sample):
        """Returns "0" when the sample GT is "0/0", otherwise "1".

        Raises:
            KeyError: if sample is not defined on the record.
            utils.JQException: if the sample has no GT tag.
        """
        # Looked up outside the try so an unknown sample is not misreported
        # as a missing GT.
        tag_values = vcf_record.sample_tag_values[sample]
        try:
            gt = tag_values["GT"]
        except KeyError:
            msg_fmt = ('Cannot assign somatic status using FilterMutectCalls '
                       'when sample GT absent: '
                       '(CHROM:POS:REF:ALT={}:{}:{}:{})')
            msg = msg_fmt.format(vcf_record.chrom,
                                 vcf_record.pos,
                                 vcf_record.ref,
                                 vcf_record.alt)
            raise utils.JQException(msg)

        # NOTE(review): only the literal "0/0" counts as non-somatic; a
        # phased "0|0" would be reported as somatic -- confirm this is
        # intended.
        if gt == "0/0":
            return "0"
        return "1"

def _build_somatic_tag(metaheaders):
    """Returns the somatic tag suited to the given metaheaders.

    The FilterMutectCalls-based tag is chosen when the
    '##source=FilterMutectCalls' entry is present in metaheaders; the
    SS-based tag is chosen otherwise.
    """
    filter_mutect_calls_present = "##source=FilterMutectCalls" in metaheaders
    if filter_mutect_calls_present:
        tag_class = _SomaticTagFilterMutectCalls
    else:
        tag_class = _SomaticTagSS
    return tag_class()

class _Mutect1Parser(object):
_MUTECT1_METAHEADER_REGEX = re.compile('^##MuTect=')
Expand Down Expand Up @@ -294,7 +341,7 @@ def __init__(self, vcf_reader):
common_tags.PassedTag(MUTECT_ABBREVIATION),
_AlleleFreqTag(),
_DepthTag(),
_SomaticTag(),
_build_somatic_tag(vcf_reader.metaheaders),
_GenotypeTag()]

def _get_new_metaheaders(self):
Expand Down
2 changes: 1 addition & 1 deletion test/jacquard_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def test_error_raisesTransformedMessage(self):

class JacquardTestCase(test_case.JacquardBaseTestCase):
def test_version(self):
self.assertEquals("1.1.1x", jacquard.__version__)
self.assertEquals("1.1.2", jacquard.__version__)

def test_get_execution_context(self):
command = "foo input_dir output_dir"
Expand Down
19 changes: 0 additions & 19 deletions test/merge_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,25 +154,6 @@ def test_init_includeAllFlag(self):
self.assertEquals(merge._Filter._include_row_if_all,
record_filter._row_filter_strategy)

# def test_init_includeAllFlag_raisesError(self):
# args = Namespace(include_all=True, include_cells=True, include_rows=True)
# self.assertRaisesRegexp(utils.UsageError,
# "Unable to process command-line arguments. Neither --include_cells nor --include_rows can be specified if --include_all is specified.",
# merge._Filter,
# args)
#
# args = Namespace(include_all=True, include_cells=False, include_rows=True)
# self.assertRaisesRegexp(utils.UsageError,
# "Unable to process command-line arguments. Neither --include_cells nor --include_rows can be specified if --include_all is specified.",
# merge._Filter,
# args)
#
# args = Namespace(include_all=True, include_cells=True, include_rows=False)
# self.assertRaisesRegexp(utils.UsageError,
# "Unable to process command-line arguments. Neither --include_cells nor --include_rows can be specified if --include_all is specified.",
# merge._Filter,
# args)

def test_init_includeValidAnysomaticByDefault(self):
args = Namespace(include_all=False, include_cells="valid", include_rows="at_least_one_somatic")
record_filter = merge._Filter(args)
Expand Down
28 changes: 25 additions & 3 deletions test/utils/vcf_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,20 +367,20 @@ def test_format_field(self):
sample_names = ["SA", "SB"]
input_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F3:F1:F2|SA.1:SA.2:SA.3|SB.1:SB.2:SB.3\n")
record = VcfRecord.parse_record(input_line, sample_names)
self.assertEquals("F3:F1:F2", record._format_field())
self.assertEquals(["F3", "F1", "F2"], list(record._format_tag_fields()))

def test_format_field_emptyWhenNoSamples(self):
input_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO\n")
record = VcfRecord.parse_record(input_line, [])
self.assertEquals(".", record._format_field())
self.assertEquals([], record._format_tag_fields())

def test_format_field_preservesOrderWhenAddingNewTags(self):
sample_names = ["SA", "SB"]
input_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F3:F1:F2|SA.1:SA.2:SA.3|SB.1:SB.2:SB.3\n")
record = VcfRecord.parse_record(input_line, sample_names)
record.add_sample_tag_value("Z4", {"SA" : "SA.4", "SB" : "SB.4"})
record.add_sample_tag_value("A5", {"SA" :"SA.A5", "SB" : "SB.A5"})
self.assertEquals("F3:F1:F2:Z4:A5", record._format_field())
self.assertEquals(["F3", "F1", "F2", "Z4", "A5"], list(record._format_tag_fields()))

def test_parse_record_sample_dict(self):
sample_names = ["SampleA", "SampleB"]
Expand Down Expand Up @@ -524,6 +524,28 @@ def test_asTextWhenEmptyFormatField(self):
expected = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|.|.|.\n")
self.assertEquals(expected, record.text())

def test_asTextExpandsEmptyTrailingFormatField(self):
    # SampleB lacks the trailing 'b' tag that SampleA defines; text() is
    # expected to pad the missing value with '.'.
    sample_tag_values = OrderedDict([
        ("SampleA", OrderedDict([('a','1'), ('b','2')])),
        ("SampleB", OrderedDict([('a','10')])),
    ])
    record = VcfRecord("CHROM", "POS", "REF", "ALT", "ID", "QUAL", "FILTER", "INFO", sample_tag_values)
    expected_text = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|a:b|1:2|10:.\n")
    actual_text = record.text()
    self.assertEquals(expected_text, actual_text)

def test_sample_field_whenInconsistentTags(self):
    # FYI this should never happen in the wild, but I wanted to test the exception formatting.
    sampleA = OrderedDict([('a','1'), ('b','2')])
    sampleB = OrderedDict([('a','10')])
    sample_tag_values = OrderedDict([("SampleA", sampleA), ("SampleB", sampleB)])
    record = VcfRecord("CHROM", "POS", "REF", "ALT", "ID", "QUAL", "FILTER", "INFO", sample_tag_values)

    # The '|' separator is escaped: left bare it is regex alternation, and
    # the assertion would pass on the 'CHROM:POS:REF:ALT' prefix alone.
    # The sample name is included and the doubled "has has" mirrors the
    # message text currently raised by _sample_field.
    self.assertRaisesRegexp(ValueError,
                            r'CHROM:POS:REF:ALT\|sample format tags are not consistent: requested tags \[a\] but sample SampleA has has tags \[a=1, b=2\] leaving behind \[b\]',
                            record._sample_field,
                            ['a'],
                            'SampleA')


def test_equals(self):
sample_names = ["sampleA"]
base = VcfRecord.parse_record(self.entab("A|1|ID|C|D|QUAL|FILTER|INFO|F|S\n"), sample_names)
Expand Down
26 changes: 18 additions & 8 deletions test/variant_caller_transforms/common_tags_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,14 +159,14 @@ def test_reported_tag_metaheader(self):
reported_tag.metaheader)

def test_reported_tag_format(self):
reported_tag = common_tags.ReportedTag("foo_")
reported_tag = common_tags.ReportedTag("foo")
actual_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F1:F2|SA.1:SA.2|SB.1:SB.2\n")
actual_vcf_record = VcfRecord.parse_record(actual_line, ["SA", "SB"])
expected_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F1:F2:foo_CALLER_REPORTED"
"|SA.1:SA.2:1|SB.1:SB.2\n")
expected_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F1:F2:JQ_foo_CALLER_REPORTED"
"|SA.1:SA.2:1|SB.1:SB.2:1\n")
expected_vcf_record = VcfRecord.parse_record(expected_line, ["SA", "SB"])
reported_tag.add_tag_values(actual_vcf_record)
self.assertEquals(expected_vcf_record, actual_vcf_record)
self.assertEquals(expected_vcf_record.text(), actual_vcf_record.text())

def test_passed_tag_metaheader(self):
passed_tag = common_tags.PassedTag("foo")
Expand All @@ -176,12 +176,22 @@ def test_passed_tag_metaheader(self):
'original VCF">').format("JQ_foo_CALLER_PASSED"),
passed_tag.metaheader)

def test_passed_tag_format(self):
def test_passed_tag_format_noPass(self):
passed_tag = common_tags.PassedTag("foo")
actual_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F1:F2|SA.1:SA.2|SB.1:SB.2\n")
actual_vcf_record = VcfRecord.parse_record(actual_line, ["SA", "SB"])
expected_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F1:F2:foo_CALLER_REPORTED"
"|SA.1:SA.2:1|SB.1:SB.2\n")
expected_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F1:F2:JQ_foo_CALLER_PASSED"
"|SA.1:SA.2:0|SB.1:SB.2:0\n")
expected_vcf_record = VcfRecord.parse_record(expected_line, ["SA", "SB"])
passed_tag.add_tag_values(actual_vcf_record)
self.assertEquals(expected_vcf_record.text(), actual_vcf_record.text())

def test_passed_tag_format_pass(self):
    # Named distinctly from test_passed_tag_format_noPass: two methods with
    # the same name in one class body means the later one replaces the
    # earlier, so the non-PASS case would never run.
    # Here FILTER is PASS, so every sample is expected to be tagged 1.
    passed_tag = common_tags.PassedTag("foo")
    actual_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|PASS|INFO|F1:F2|SA.1:SA.2|SB.1:SB.2\n")
    actual_vcf_record = VcfRecord.parse_record(actual_line, ["SA", "SB"])
    expected_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|PASS|INFO|F1:F2:JQ_foo_CALLER_PASSED"
                               "|SA.1:SA.2:1|SB.1:SB.2:1\n")
    expected_vcf_record = VcfRecord.parse_record(expected_line, ["SA", "SB"])
    passed_tag.add_tag_values(actual_vcf_record)
    self.assertEquals(expected_vcf_record, actual_vcf_record)
    # Compare the rendered text as well, so the added tag name and its
    # per-sample values are verified.
    self.assertEquals(expected_vcf_record.text(), actual_vcf_record.text())
Loading

0 comments on commit c139517

Please sign in to comment.