JQ-324 (cgates): pad trailing format field values; improved unit tests of common_tags.py
umich-brcf-bioinf · May 28, 2019 · fb8b6ca · fb8b6ca
1 parent 83dd61d
commit fb8b6ca
Show file tree

Hide file tree

Showing 7 changed files with 64 additions and 23 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,6 +1,10 @@
 Changelog
 =========
 
+1.1.2 (5/25/2019)
+-----------------
+- Adjusted VCF handling to correctly pad optional trailing FORMAT fields
+
 1.1.1 (10/30/2018)
 -----------------
 - Adjusted Mutect translators to:

diff --git a/jacquard/__init__.py b/jacquard/__init__.py
@@ -1 +1 @@
-__version__ = "1.1.1"
+__version__ = "1.1.2"
diff --git a/jacquard/utils/vcf.py b/jacquard/utils/vcf.py
@@ -332,36 +332,53 @@ def get_empty_record(self):
                          ref=self.ref,
                          alt=self.alt)
 
-    def _format_field(self):
-        """Returns string representation of format field."""
+    def _format_tag_fields(self):
+        """Returns list of format tag names."""
+        tag_names = []
         format_field = "."
         if self.sample_tag_values:
             first_sample = list(self.sample_tag_values.keys())[0]
             tag_names = self.sample_tag_values[first_sample].keys()
             if tag_names:
                 format_field = ":".join(tag_names)
-        return format_field
+        return tag_names
 
-    def _sample_field(self, sample):
-        """Returns string representation of sample-format values.
+    def _sample_field(self, tag_names, sample):
+        """Returns string representation of sample-format values ordered by tag_names.
+        Tag_names must be superset of sample_tag_values.
+        Missing sample_tag_values padded as '.'.
+        If tag_names empty, returns '.'.
 
         Raises:
             KeyError: if requested sample is not defined.
+            ValueError: if sample_tag_values has keys that are not in tag_names.
         """
-        tag_values = self.sample_tag_values[sample].values()
+        sample_tag_values = self.sample_tag_values[sample]
+        missing_tag_names = set(sample_tag_values) - set(tag_names)
+        if missing_tag_names:
+            msg = ('sample format tags are not consistent: '
+                  '{}:{}:{}:{}').format(self.chrom,
+                                        self.pos,
+                                        self.ref,
+                                        self.alt)
+            raise ValueError(msg)
+        tag_values = [sample_tag_values.get(t, '.') for t in tag_names]
         if tag_values:
             return ":".join(tag_values)
         else:
             return "."
 
     def text(self):
         "Returns tab-delimited, newline terminated string of VcfRecord."
+        tag_names = self._format_tag_fields()
+        format_field = '.' if not tag_names else ':'.join(tag_names)
+
         stringifier = [self.chrom, self.pos, self.vcf_id, self.ref, self.alt,
                        self.qual, self.filter, self.info,
-                       self._format_field()]
+                       format_field]
 
         for sample in self.sample_tag_values:
-            stringifier.append(self._sample_field(sample))
+            stringifier.append(self._sample_field(tag_names, sample))
 
         return "\t".join(stringifier) + "\n"
 
@@ -398,6 +415,8 @@ def add_or_replace_filter(self, new_filter):
             self.filter = ";".join([self.filter,
                                     new_filter])
 
+    #TODO: cgates: This is not a good equals method. Please adjust this to
+    # consider the full set of instance variables.
     def __eq__(self, other):
         return isinstance(other, VcfRecord) and self._key == other._key
 
@@ -469,4 +488,3 @@ def __hash__(self):
     def __lt__(self, other):
         key = natsort.natsort_keygen()
         return key(self.file_name) < key(other.file_name)
-
diff --git a/jacquard/variant_caller_transforms/common_tags.py b/jacquard/variant_caller_transforms/common_tags.py
@@ -88,7 +88,7 @@ class ReportedTag(AbstractJacquardTag):
     it explicitly simplifies how summary tags are generated.
     """
     #pylint: disable=too-few-public-methods
-    
+
     def __init__(self, caller_abbreviation):
         super(self.__class__,
               self).__init__(caller_abbreviation,

diff --git a/test/jacquard_test.py b/test/jacquard_test.py
@@ -55,7 +55,7 @@ def test_error_raisesTransformedMessage(self):
 
 class JacquardTestCase(test_case.JacquardBaseTestCase):
     def test_version(self):
-        self.assertEquals("1.1.1", jacquard.__version__)
+        self.assertEquals("1.1.2", jacquard.__version__)
 
     def test_get_execution_context(self):
         command = "foo input_dir output_dir"

diff --git a/test/utils/vcf_test.py b/test/utils/vcf_test.py
@@ -367,20 +367,20 @@ def test_format_field(self):
         sample_names = ["SA", "SB"]
         input_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F3:F1:F2|SA.1:SA.2:SA.3|SB.1:SB.2:SB.3\n")
         record = VcfRecord.parse_record(input_line, sample_names)
-        self.assertEquals("F3:F1:F2", record._format_field())
+        self.assertEquals(["F3", "F1", "F2"], list(record._format_tag_fields()))
 
     def test_format_field_emptyWhenNoSamples(self):
         input_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO\n")
         record = VcfRecord.parse_record(input_line, [])
-        self.assertEquals(".", record._format_field())
+        self.assertEquals([], record._format_tag_fields())
 
     def test_format_field_preservesOrderWhenAddingNewTags(self):
         sample_names = ["SA", "SB"]
         input_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F3:F1:F2|SA.1:SA.2:SA.3|SB.1:SB.2:SB.3\n")
         record = VcfRecord.parse_record(input_line, sample_names)
         record.add_sample_tag_value("Z4", {"SA" : "SA.4", "SB" : "SB.4"})
         record.add_sample_tag_value("A5", {"SA"  :"SA.A5", "SB" : "SB.A5"})
-        self.assertEquals("F3:F1:F2:Z4:A5", record._format_field())
+        self.assertEquals(["F3", "F1", "F2", "Z4", "A5"], list(record._format_tag_fields()))
 
     def test_parse_record_sample_dict(self):
         sample_names = ["SampleA", "SampleB"]
@@ -524,6 +524,15 @@ def test_asTextWhenEmptyFormatField(self):
         expected = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|.|.|.\n")
         self.assertEquals(expected, record.text())
 
+    def test_asTextExpandsEmptyTrailingFormatField(self):
+        sampleA = OrderedDict({'a':'1', 'b':'2'})
+        sampleB = OrderedDict({'a':'10'})
+        sample_tag_values = OrderedDict({"SampleA":sampleA, "SampleB":sampleB})
+        record = VcfRecord("CHROM", "POS", "REF", "ALT", "ID", "QUAL", "FILTER", "INFO", sample_tag_values)
+        expected = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|a:b|1:2|10:.\n")
+        self.assertEquals(expected, record.text())
+
+
     def test_equals(self):
         sample_names = ["sampleA"]
         base = VcfRecord.parse_record(self.entab("A|1|ID|C|D|QUAL|FILTER|INFO|F|S\n"), sample_names)

diff --git a/test/variant_caller_transforms/common_tags_test.py b/test/variant_caller_transforms/common_tags_test.py
@@ -159,14 +159,14 @@ def test_reported_tag_metaheader(self):
                           reported_tag.metaheader)
 
     def test_reported_tag_format(self):
-        reported_tag = common_tags.ReportedTag("foo_")
+        reported_tag = common_tags.ReportedTag("foo")
         actual_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F1:F2|SA.1:SA.2|SB.1:SB.2\n")
         actual_vcf_record = VcfRecord.parse_record(actual_line, ["SA", "SB"])
-        expected_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F1:F2:foo_CALLER_REPORTED"
-                                   "|SA.1:SA.2:1|SB.1:SB.2\n")
+        expected_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F1:F2:JQ_foo_CALLER_REPORTED"
+                                   "|SA.1:SA.2:1|SB.1:SB.2:1\n")
         expected_vcf_record = VcfRecord.parse_record(expected_line, ["SA", "SB"])
         reported_tag.add_tag_values(actual_vcf_record)
-        self.assertEquals(expected_vcf_record, actual_vcf_record)
+        self.assertEquals(expected_vcf_record.text(), actual_vcf_record.text())
 
     def test_passed_tag_metaheader(self):
         passed_tag = common_tags.PassedTag("foo")
@@ -176,12 +176,22 @@ def test_passed_tag_metaheader(self):
                            'original VCF">').format("JQ_foo_CALLER_PASSED"),
                           passed_tag.metaheader)
 
-    def test_passed_tag_format(self):
+    def test_passed_tag_format_noPass(self):
         passed_tag = common_tags.PassedTag("foo")
         actual_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F1:F2|SA.1:SA.2|SB.1:SB.2\n")
         actual_vcf_record = VcfRecord.parse_record(actual_line, ["SA", "SB"])
-        expected_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F1:F2:foo_CALLER_REPORTED"
-                                   "|SA.1:SA.2:1|SB.1:SB.2\n")
+        expected_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|F1:F2:JQ_foo_CALLER_PASSED"
+                                   "|SA.1:SA.2:0|SB.1:SB.2:0\n")
+        expected_vcf_record = VcfRecord.parse_record(expected_line, ["SA", "SB"])
+        passed_tag.add_tag_values(actual_vcf_record)
+        self.assertEquals(expected_vcf_record.text(), actual_vcf_record.text())
+
+    def test_passed_tag_format_noPass(self):
+        passed_tag = common_tags.PassedTag("foo")
+        actual_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|PASS|INFO|F1:F2|SA.1:SA.2|SB.1:SB.2\n")
+        actual_vcf_record = VcfRecord.parse_record(actual_line, ["SA", "SB"])
+        expected_line = self.entab("CHROM|POS|ID|REF|ALT|QUAL|PASS|INFO|F1:F2:JQ_foo_CALLER_PASSED"
+                                   "|SA.1:SA.2:1|SB.1:SB.2:1\n")
         expected_vcf_record = VcfRecord.parse_record(expected_line, ["SA", "SB"])
         passed_tag.add_tag_values(actual_vcf_record)
-        self.assertEquals(expected_vcf_record, actual_vcf_record)
+        self.assertEquals(expected_vcf_record.text(), actual_vcf_record.text())