Skip to content

Commit

Permalink
ex-266 (jebene/dkriti) Made MergeVcfReader understand which subset of
format tags to keep; it modifies the format_tags in each vcf_record
Browse files Browse the repository at this point in the history
  • Loading branch information
dkriti committed May 27, 2015
1 parent 36c2f3e commit 36346f1
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 14 deletions.
25 changes: 22 additions & 3 deletions jacquard/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,14 @@ def _get_next(self):
return None

class MergeVcfReader(vcf.VcfReader):
def __init__(self, file_reader):
def __init__(self, file_reader, specified_regex=None):
    """Create a reader that keeps only a chosen subset of format tags.

    file_reader is handed unchanged to the base-class initializer.
    specified_regex is an optional comma-separated string of regular
    expressions; when it is None or empty, _DEFAULT_INCLUDED_FORMAT_TAGS
    is used instead.
    """
    # Name the class explicitly: super(self.__class__, self) recurses
    # forever as soon as this class is subclassed, because self.__class__
    # is then the subclass and the lookup lands back on this __init__.
    super(MergeVcfReader, self).__init__(file_reader)

    if specified_regex:
        self.format_tag_regexes = specified_regex.split(',')
    else:
        self.format_tag_regexes = _DEFAULT_INCLUDED_FORMAT_TAGS

    # Maps an original format tag to its replacement name.
    self.format_tags = {}

def modify_metaheader(self, original_metaheader, transformed_tag):
Expand All @@ -114,8 +120,20 @@ def modify_metaheader(self, original_metaheader, transformed_tag):
# Record new_tag as the replacement for original_tag in self.format_tags,
# overwriting any replacement previously stored for the same original tag.
def store_format_tags(self, original_tag, new_tag):
self.format_tags[original_tag] = new_tag

def _get_format_tag_subset(self, vcf_record):
    """Restrict each sample of vcf_record to the configured format tags.

    A tag is kept when at least one pattern in self.format_tag_regexes
    matches it via re.match (i.e. anchored at the start of the tag only).
    Samples keep their original order; a sample with no matching tags is
    retained with an empty dict.

    The record is modified in place (its sample_tag_values attribute is
    replaced) and the same record object is returned.
    """
    regexes = self.format_tag_regexes
    filtered = OrderedDict()
    for sample, tag_values in vcf_record.sample_tag_values.items():
        # any() stops at the first matching pattern, so a tag matched by
        # several patterns is not re-tested or re-assigned.
        filtered[sample] = {
            tag: value
            for tag, value in tag_values.items()
            if any(re.match(regex, tag) for regex in regexes)
        }
    vcf_record.sample_tag_values = filtered

    return vcf_record

@staticmethod
def modify_format_tag(vcf_record, format_tags):
def _modify_format_tag(vcf_record, format_tags):
for tags in list(vcf_record.sample_tag_values.values()):
for original_tag, new_tag in list(format_tags.items()):
if new_tag not in tags and original_tag in tags:
Expand Down Expand Up @@ -151,7 +169,8 @@ def vcf_records(self, format_tags=None, qualified=False):
continue
vcf_record = vcf.VcfRecord.parse_record(line, sample_names)
if format_tags:
vcf_record = self.modify_format_tag(vcf_record, format_tags)
vcf_record = self._get_format_tag_subset(vcf_record)
vcf_record = self._modify_format_tag(vcf_record, format_tags)
yield vcf_record


Expand Down
88 changes: 77 additions & 11 deletions test/merge_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,47 @@ def test_extends_vcf_readers(self):
self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|SampleNormal|SampleTumor\n"),
self.entab("chr2|1|.|A|C|.|.|INFO|DP|32|78")]
file_reader = MockFileReader("A.mutect.vcf", file_contents)
vcf_reader = vcf.VcfReader(file_reader)
merge_vcf_reader = merge.MergeVcfReader(vcf_reader._file_reader)
merge_vcf_reader = merge.MergeVcfReader(file_reader)

self.assertEquals(merge_vcf_reader._file_reader, file_reader)

def test_stores_format_tags_to_keep_default(self):
    """Without a user-supplied regex the reader uses the module default."""
    file_contents = ["##metaheader1\n",
                     '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">\n',
                     '##FORMAT=<ID=AF,Number=1,Type=Integer,Description="Allele Frequency">\n',
                     self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|SampleNormal|SampleTumor\n"),
                     self.entab("chr2|1|.|A|C|.|.|INFO|DP|32|78")]
    file_reader = MockFileReader("A.mutect.vcf", file_contents)
    merge_vcf_reader = merge.MergeVcfReader(file_reader)

    # assertEqual, not assertEquals: the alias is deprecated and was
    # removed from unittest in Python 3.12.
    self.assertEqual(merge._DEFAULT_INCLUDED_FORMAT_TAGS,
                     merge_vcf_reader.format_tag_regexes)

def test_stores_format_tags_to_keep_userSpecified(self):
    """A single user-supplied regex is stored as a one-element list."""
    file_contents = ["##metaheader1\n",
                     '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">\n',
                     '##FORMAT=<ID=AF,Number=1,Type=Integer,Description="Allele Frequency">\n',
                     self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|SampleNormal|SampleTumor\n"),
                     self.entab("chr2|1|.|A|C|.|.|INFO|DP|32|78")]
    file_reader = MockFileReader("A.mutect.vcf", file_contents)
    specified_regex = ".*"
    merge_vcf_reader = merge.MergeVcfReader(file_reader, specified_regex)

    # assertEqual, not assertEquals: the alias is deprecated and was
    # removed from unittest in Python 3.12.
    self.assertEqual([specified_regex], merge_vcf_reader.format_tag_regexes)

def test_stores_format_tags_to_keep_userSpecifiedSplitsIntoList(self):
    """A comma-separated regex string is split into one entry per regex."""
    file_contents = ["##metaheader1\n",
                     '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">\n',
                     '##FORMAT=<ID=AF,Number=1,Type=Integer,Description="Allele Frequency">\n',
                     '##FORMAT=<ID=GT,Number=1,Type=Integer,Description="Allele Frequency">\n',
                     self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|SampleNormal|SampleTumor\n"),
                     self.entab("chr2|1|.|A|C|.|.|INFO|DP|32|78")]
    file_reader = MockFileReader("A.mutect.vcf", file_contents)
    specified_regex = "^DP$,^AF$"
    merge_vcf_reader = merge.MergeVcfReader(file_reader, specified_regex)

    # assertEqual, not assertEquals: the alias is deprecated and was
    # removed from unittest in Python 3.12.
    self.assertEqual(["^DP$", "^AF$"], merge_vcf_reader.format_tag_regexes)

def test_modify_metaheader(self):
file_contents = ["##metaheader1\n",
'##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
Expand All @@ -59,6 +95,35 @@ def test_modify_metaheader(self):
self.assertIn('##FORMAT=<ID=JX1_DP,Number=1,Type=Integer,Description="Read Depth">', merge_vcf_reader.metaheaders)
self.assertNotIn('##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">', merge_vcf_reader.metaheaders)

def test_get_format_tag_subset_defaultTagRegex(self):
    """With the default regexes, both JQ_AF and JQ_DP are retained."""
    file_contents = ["##metaheader1\n",
                     '##FORMAT=<ID=JQ_AF,Number=A,Type=Float,Description="Allele Frequency">\n',
                     '##FORMAT=<ID=JQ_DP,Number=1,Type=Integer,Description="Read Depth">\n',
                     '##FORMAT=<ID=GT,Number=1,Type=Integer,Description="Genotype">\n',
                     self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|SampleNormal|SampleTumor\n"),
                     self.entab("chr2|1|.|A|C|.|.|INFO|FORMAT|NORMAL|TUMOR")]
    mock_file_reader = MockFileReader("my_dir/my_file.txt", file_contents)
    merge_vcf_reader = merge.MergeVcfReader(mock_file_reader)

    original_vcf_record = MockVcfRecord("chr1", "245", "A", "G", vcf_format="JQ_AF:JQ_DP", samples=["0.2:21", "0.34:56"])
    new_vcf_record = merge_vcf_reader._get_format_tag_subset(original_vcf_record)

    # assertEqual, not assertEquals: the alias is deprecated and was
    # removed from unittest in Python 3.12.
    self.assertEqual(set(["JQ_AF", "JQ_DP"]), new_vcf_record.format_tags)

def test_get_format_tag_subset_specifiedTagRegex(self):
    """With the regex "^AF", only the AF tag is retained and DP is dropped."""
    file_contents = ["##metaheader1\n",
                     '##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
                     '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">\n',
                     self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|SampleNormal|SampleTumor\n"),
                     self.entab("chr2|1|.|A|C|.|.|INFO|FORMAT|NORMAL|TUMOR")]
    mock_file_reader = MockFileReader("my_dir/my_file.txt", file_contents)
    merge_vcf_reader = merge.MergeVcfReader(mock_file_reader, "^AF")

    original_vcf_record = MockVcfRecord("chr1", "245", "A", "G", vcf_format="AF:DP", samples=["0.2:21", "0.34:56"])
    new_vcf_record = merge_vcf_reader._get_format_tag_subset(original_vcf_record)

    # assertEqual, not assertEquals: the alias is deprecated and was
    # removed from unittest in Python 3.12.
    self.assertEqual(set(["AF"]), new_vcf_record.format_tags)

def test_modify_format_tags(self):
file_contents = ["##metaheader1\n",
'##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
Expand All @@ -74,7 +139,7 @@ def test_modify_format_tags(self):
self.assertEquals(OrderedDict({0: {"AF": "0.2", "DP": "21"}, 1: {"AF": "0.34", "DP": "56"}}),
vcf_record.sample_tag_values)

merge_vcf_reader.modify_format_tag(vcf_record, format_tags)
merge_vcf_reader._modify_format_tag(vcf_record, format_tags)

self.assertEquals(OrderedDict({0: {"AF": "0.2", "JX1_DP": "21"}, 1: {"AF": "0.34", "JX1_DP": "56"}}),
vcf_record.sample_tag_values)
Expand All @@ -95,27 +160,28 @@ def test_modify_format_tags_inconsistentFormatTags(self):
self.assertEquals(OrderedDict({0: {"AF": "0.2", "DP": "21"}, 1: {"AF": "0.34", "DP": "56"}}),
vcf_record.sample_tag_values)

merge_vcf_reader.modify_format_tag(vcf_record, format_tags)
merge_vcf_reader._modify_format_tag(vcf_record, format_tags)

self.assertEquals(OrderedDict({0: {"AF": "0.2", "JX1_DP": "21"}, 1: {"AF": "0.34", "JX1_DP": "56"}}),
vcf_record.sample_tag_values)

def test_vcf_records_modifiesFormatTags(self):
def test_vcf_records_modifiesFormatTags_onlyKeepsSubset(self):
file_contents = ["##metaheader1\n",
'##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
'##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">\n',
self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|SampleNormal|SampleTumor\n"),
self.entab("chr2|1|.|A|C|.|.|INFO|AF:DP|0.2:21|0.34:56")]
mock_file_reader = MockFileReader("my_dir/my_file.txt", file_contents)
merge_vcf_reader = merge.MergeVcfReader(mock_file_reader)
merge_vcf_reader = merge.MergeVcfReader(mock_file_reader, "DP")
merge_vcf_reader.open()

format_tags = {"DP": "JX1_DP"}
format_tags = {"AF": "AF",
"DP": "JX1_DP"}

vcf_records = merge_vcf_reader.vcf_records(format_tags, qualified=False)
for vcf_record in vcf_records:
normal_dict = OrderedDict(sorted({"AF": "0.2", "JX1_DP": "21"}.items()))
tumor_dict = OrderedDict(sorted({"AF": "0.34", "JX1_DP": "56"}.items()))
normal_dict = OrderedDict(sorted({"JX1_DP": "21"}.items()))
tumor_dict = OrderedDict(sorted({"JX1_DP": "56"}.items()))
self.assertEquals(OrderedDict(sorted({"SampleNormal": normal_dict,
"SampleTumor": tumor_dict}.items())),
vcf_record.sample_tag_values)
Expand Down Expand Up @@ -1073,14 +1139,14 @@ def test_create_buffered_readers_modifiesRecords(self):
self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|SampleNormal|SampleTumor\n"),
self.entab("chr2|1|.|A|C|.|.|INFO|AF:DP|0.24:56|0.01:24")]
mock_reader1 = MockFileReader("fileA.txt", file_contents1)
vcf_reader1 = merge.MergeVcfReader(mock_reader1)
vcf_reader1 = merge.MergeVcfReader(mock_reader1, "AF,DP")

file_contents2 = ["##metaheader1\n",
'##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">\n',
self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|SampleNormal|SampleTumor\n"),
self.entab("chr2|1|.|A|C|.|.|INFO|DP|32|78")]
mock_reader2 = MockFileReader("fileB.txt", file_contents2)
vcf_reader2 = merge.MergeVcfReader(mock_reader2)
vcf_reader2 = merge.MergeVcfReader(mock_reader2, "AF,DP")

vcf_reader1.store_format_tags("DP", "JX1_DP")
vcf_reader1.store_format_tags("AF", "AF")
Expand Down

0 comments on commit 36346f1

Please sign in to comment.