Merge branch 'jq-320_fix_expand_handling_of_name_collisions_between_fixed_and_info' into develop
umich-brcf-bioinf · Jun 6, 2018 · 5adfd5c · 5adfd5c
2 parents c9cbd39 + 7fc0020
commit 5adfd5c
Show file tree

Hide file tree

Showing 3 changed files with 41 additions and 13 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,9 +1,11 @@
 Changelog
 =========
 
-0.43 (XX/XX/XXXX)
+0.43 (6/5/2018)
 -----------------
 - Removed obsolete spikes directory
+- Fixed bug in *expand* which could overwrite fixed VCF fields (e.g. REF, ALT,
+  etc.) when INFO contained identically named fields.
 
 0.42 (9/22/2015)
 ----------------
@@ -22,15 +24,15 @@ Changelog
  - Improved checks for consistent VCF file sets
  - Fixed bug in *merge* that caused error if any VCFs were unsorted
  - Fixed bug in *summarize* that caused error if variant was called by subset
-   of callers 
+   of callers
 
 0.31 (3/17/2015)
 ----------------
  - Downgraded VCF format from 4.2 to 4.1
  - Fixed a bug that omitted CALLERS_REPORTED_LIST summary tag
  - Simplified summary tags; removed dependency on numpy
  - Adjusted VarScan translation to accept a file pattern to identify
-   high-confidence files 
+   high-confidence files
 
 
 0.3 (3/9/2015)
@@ -39,7 +41,7 @@ Changelog
    on incoming data.
  - Renamed *consensus* to *summarize*
  - More consistent behavior in *expand*
- - Significantly improved *merge* performance 
+ - Significantly improved *merge* performance
  - Added new summary tags:
    - CALLERS_REPORTED_COUNT
    - CALLERS_REPORTED_LIST
@@ -49,12 +51,10 @@ Changelog
    - SAMPLES_PASSED_COUNT
  - Fixed bug in how Strelka calculated AF on indels
  - Improved command validation and error handling
- - Added project/code documentation 
+ - Added project/code documentation
  - Removed dependencies on pandas
-  
-  
+
+
 0.21 (10/2014)
 --------------
  - Initial public release
-
-
diff --git a/jacquard/expand.py b/jacquard/expand.py
@@ -60,10 +60,12 @@ def _create_row_dict(column_list, vcf_record):
         for format_key, format_value in format_key_values.items():
             row_dict[format_key + "|" + sample_name] = format_value
 
-    new_dict = row_dict.copy()
-    new_dict.update(vcf_record.info_dict)
+    for (name, value) in vcf_record.info_dict.items():
+        if name in row_dict:
+            name = "INFO_" + name
+        row_dict[name] = value
 
-    return new_dict
+    return row_dict
 
 def _filter_column_list(column_spec_list,
                         potential_col_list,

diff --git a/test/expand_test.py b/test/expand_test.py
@@ -62,6 +62,33 @@ def test_create_row_dict(self):
                          "AF|SAMPLE_A|TUMOR": "0.3"}
         self.assertEquals(expected_dict, actual_dict)
 
+    def test_create_row_dict_fieldNamesMangledToAvoidCollision(self):
+        column_list = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
+                       "INFO" ] #, "FORMAT", "SAMPLE_A|NORMAL", "SAMPLE_A|TUMOR"]
+        # sample_tag_values = {"SAMPLE_A|NORMAL":{"DP":"50", "AF":"0.2"},
+        #                      "SAMPLE_A|TUMOR":{"DP":"87", "AF":"0.3"}}
+        vcf_record = vcf.VcfRecord("1", "42", "A", "AT",
+                                   vcf_id="rs32", qual="30", vcf_filter="PASS",
+                                   info="SNP;REF;ALT=Yep"
+                                   )#sample_tag_values=sample_tag_values)
+        actual_dict = expand._create_row_dict(column_list, vcf_record)
+
+        expected_dict = {"CHROM": "1",
+                         "POS": "42",
+                         "ID": "rs32",
+                         "REF": "A",
+                         "ALT": "AT",
+                         "QUAL": "30",
+                         "FILTER": "PASS",
+                         "SNP": "SNP",
+                         "INFO_REF": "REF",
+                         "INFO_ALT": "Yep"}
+                         # "DP|SAMPLE_A|NORMAL": "50",
+                         # "DP|SAMPLE_A|TUMOR": "87",
+                         # "AF|SAMPLE_A|NORMAL": "0.2",
+                         # "AF|SAMPLE_A|TUMOR": "0.3"}
+        self.assertEquals(expected_dict, actual_dict)
+
     def test_filter_column_list(self):
         potential_col_list = OrderedDict([("CHROM", None),
                                           ("POS", None),
@@ -397,4 +424,3 @@ def test_expand_colSpec(self):
                                         "expanded.txt")
 
             self.assertCommand(command, expected_file)
-