Skip to content

Commit

Permalink
ex-266 (dkriti/jebene): continued refactor of merge
Browse files Browse the repository at this point in the history
  • Loading branch information
jebene committed May 28, 2015
1 parent 2e5d813 commit 057dd86
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 74 deletions.
71 changes: 51 additions & 20 deletions jacquard/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"""
from __future__ import print_function, absolute_import, division

from collections import defaultdict, OrderedDict
from collections import defaultdict, OrderedDict, Counter
import glob
import os
import re
Expand Down Expand Up @@ -393,9 +393,8 @@ def _alter_description(metaheader, caller_name):
r'\g<0>[%s]: ' % caller_name,
metaheader)

#TODO: remove
def _get_format_tag_distribution(vcf_readers, specified_regex):
#TODO: add logic from _get_format_tag_regex closer to this method (?)
#that will eliminate the need to do this check
if type(specified_regex) != list:
msg = ("Unable to process regular expression [{}]. It must be a list "
"of regular expressions")
Expand Down Expand Up @@ -450,25 +449,57 @@ def _get_format_tags(vcf_readers):

return format_tags

def _get_format_tags_per_reader(vcf_readers):
format_tags_dict = {}
for vcf_reader in vcf_readers:
sorted_tags = sorted(vcf_reader.format_metaheaders.keys())
format_tags_dict[vcf_reader.file_name] = sorted_tags

return format_tags_dict

#TODO: rename
def _disambiguate_format_tags_new(format_tag_distribution):
def _create_format_tag_mapping(vcf_readers):
format_tags_dict = _get_format_tags_per_reader(vcf_readers)

all_values = []
for tags in format_tags_dict.values():
all_values.extend(tags)

format_tag_counts = Counter(all_values)
ambiguous_tags = [i for i in format_tag_counts if format_tag_counts[i] > 1]

count = 0
unambiguous_distribution = defaultdict(list)
for tag, metaheaders in format_tag_distribution.items():
if len(metaheaders) > 1:
for metaheader in metaheaders:
reader_tag_mapping = defaultdict(dict)
for reader, format_tags in format_tags_dict.items():
for format_tag in format_tags:
if format_tag in ambiguous_tags:
count += 1
new_tag = "JX{}_{}".format(count, tag)
metaheader = re.sub(r'(^##FORMAT=.*?[<,]ID=)([^,>]*)',
r'\g<1>{}'.format(new_tag),
metaheader)
unambiguous_distribution[tag].append(metaheader)
else:
unambiguous_distribution[tag] = metaheaders
new_tag = 'JX{}_{}'.format(count, format_tag)
reader_tag_mapping[reader][format_tag] = new_tag
else:
reader_tag_mapping[reader][format_tag] = format_tag
return reader_tag_mapping

ordered_distribution = OrderedDict(sorted(unambiguous_distribution.items()))

return ordered_distribution
#TODO: remove
# def _disambiguate_format_tags_new(format_tag_distribution):
# count = 0
# unambiguous_distribution = defaultdict(list)
# for tag, metaheaders in format_tag_distribution.items():
# if len(metaheaders) > 1:
# for metaheader in metaheaders:
# count += 1
# new_tag = "JX{}_{}".format(count, tag)
# metaheader = re.sub(r'(^##FORMAT=.*?[<,]ID=)([^,>]*)',
# r'\g<1>{}'.format(new_tag),
# metaheader)
# unambiguous_distribution[tag].append(metaheader)
# else:
# unambiguous_distribution[tag] = metaheaders
#
# ordered_distribution = OrderedDict(sorted(unambiguous_distribution.items()))
#
# return ordered_distribution

#TODO: remove
def _disambiguate_format_tags(merge_vcf_readers, format_tags):
Expand Down Expand Up @@ -528,9 +559,9 @@ def _write_metaheaders(file_writer, all_headers):

def _create_merge_vcf_readers(file_readers, specified_regex):
vcf_readers = [vcf.VcfReader(i) for i in file_readers]
format_tag_distribution = _get_format_tag_distribution(vcf_readers,
specified_regex)
format_tag_mapping = _disambiguate_format_tags_new(format_tag_distribution)
# format_tag_distribution = _get_format_tag_distribution(vcf_readers,
# specified_regex)
# format_tag_mapping = _disambiguate_format_tags_new(format_tag_distribution)

merge_vcf_readers = []

Expand Down
73 changes: 19 additions & 54 deletions test/merge_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,30 +105,6 @@ def xtest_modify_metaheader_errorIfDifferentKeys(self):
merge.NewMergeVcfReader,
mock_file_reader,
format_tag_mapping)

def xtest_modify_format_tags(self):
    """Check that _modify_format_tag renames a record's ambiguous tags.

    NOTE(review): the ``x`` prefix presumably keeps the default unittest
    loader (which collects ``test*`` methods) from running this -- confirm
    whether it should be re-enabled or removed.
    """
    file_contents = [
        "##metaheader1\n",
        '##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">\n',
        self.entab("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|SampleNormal|SampleTumor\n"),
        self.entab("chr2|1|.|A|C|.|.|INFO|FORMAT|NORMAL|TUMOR"),
    ]
    mock_file_reader = MockFileReader("my_dir/my_file.txt", file_contents)
    format_tag_mapping = OrderedDict(sorted({
        "AF": ['##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">'],
        "DP": ['##FORMAT=<ID=JX1_DP,Number=1,Type=Integer,Description="Read Depth">',
               '##FORMAT=<ID=JX2_DP,Number=1,Type=Integer,Description="Read Depth">'],
    }.items()))

    merge_vcf_reader = merge.NewMergeVcfReader(mock_file_reader,
                                               format_tag_mapping)

    vcf_record = MockVcfRecord("chr1", "245", "A", "G",
                               vcf_format="AF:DP",
                               samples=["0.2:21", "0.34:56"])

    # Before modification the record carries the original tag names.
    self.assertEqual(OrderedDict({0: {"AF": "0.2", "DP": "21"},
                                  1: {"AF": "0.34", "DP": "56"}}),
                     vcf_record.sample_tag_values)

    merge_vcf_reader._modify_format_tag(vcf_record)

    # Afterwards DP is expected under its disambiguated JX1_ name.
    self.assertEqual(OrderedDict({0: {"AF": "0.2", "JX1_DP": "21"},
                                  1: {"AF": "0.34", "JX1_DP": "56"}}),
                     vcf_record.sample_tag_values)

#TODO: remove
class MergeVcfReaderTestCase_LEGACY(test_case.JacquardBaseTestCase):
def setUp(self):
Expand Down Expand Up @@ -837,6 +813,25 @@ def test_get_format_tag_regex_raisesError(self):
merge._get_format_tag_regex,
args)

def test_get_format_tags_per_reader(self):
    """Each reader's file name maps to its own sorted list of FORMAT tags."""
    metaheaders1 = [
        "##metaheader1",
        '##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">',
    ]
    vcf_reader1 = MockVcfReader(input_filepath="fileA",
                                metaheaders=metaheaders1)

    # The GT description previously lacked its closing double quote,
    # leaving a malformed FORMAT metaheader in the fixture.
    metaheaders2 = [
        "##metaheader1",
        '##FORMAT=<ID=AF,Number=A,Type=Float,Description="Approximate Allele Frequency">',
        '##FORMAT=<ID=GT,Number=1,Type=Integer,Description="Genotype">',
    ]
    vcf_reader2 = MockVcfReader(input_filepath="fileB",
                                metaheaders=metaheaders2)

    format_tags = merge._get_format_tags_per_reader([vcf_reader1,
                                                     vcf_reader2])

    # AF occurs in both readers; each reader still reports it separately.
    self.assertEqual(2, len(format_tags))
    self.assertEqual(["AF", "DP"], format_tags["fileA"])
    self.assertEqual(["AF", "GT"], format_tags["fileB"])

def test_get_format_tag_distribution(self):
file_contents1 = ["##metaheader1\n",
'##FORMAT=<ID=JQ_AF,Number=A,Type=Float,Description="Allele Frequency">\n',
Expand Down Expand Up @@ -1054,36 +1049,6 @@ def test_get_format_tags_sameMetaheaderOkay(self):

self.assertEquals(expected_format_tag_dict, format_tags)

def test_disambiguate_format_tags(self):
    """Tags with several metaheaders get JX<n>_ ids; unique tags are kept.

    NOTE(review): the visible part of merge.py marks
    _disambiguate_format_tags_new with "#TODO: remove" and shows a
    commented-out copy -- confirm this test is still wanted.
    """
    format_tag_distribution = {
        "DP": ['##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate Read Depth">',
               '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">'],
        "AF": ['##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">'],
    }
    actual_mapping = merge._disambiguate_format_tags_new(format_tag_distribution)

    expected_mapping = {
        "DP": ['##FORMAT=<ID=JX1_DP,Number=1,Type=Integer,Description="Approximate Read Depth">',
               '##FORMAT=<ID=JX2_DP,Number=1,Type=Integer,Description="Read Depth">'],
        "AF": ['##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">'],
    }

    self.assertEqual(expected_mapping, actual_mapping)

def test_disambiguate_format_tags_orderRetained(self):
    """The disambiguated mapping comes back ordered by tag name.

    NOTE(review): the visible part of merge.py marks
    _disambiguate_format_tags_new with "#TODO: remove" and shows a
    commented-out copy -- confirm this test is still wanted.
    """
    format_tag_distribution = OrderedDict(sorted({
        "GT": ['##FORMAT=<ID=GT,Number=A,Type=Float,Description="Genotype">',
               '##FORMAT=<ID=GT,Number=A,Type=Float,Description="Approximate Genotype">'],
        "DP": ['##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate Read Depth">',
               '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">'],
        "AF": ['##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">'],
    }.items()))
    actual_mapping = merge._disambiguate_format_tags_new(format_tag_distribution)

    expected_mapping = OrderedDict(sorted({
        "DP": ['##FORMAT=<ID=JX1_DP,Number=1,Type=Integer,Description="Approximate Read Depth">',
               '##FORMAT=<ID=JX2_DP,Number=1,Type=Integer,Description="Read Depth">'],
        "AF": ['##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">'],
        "GT": ['##FORMAT=<ID=JX3_GT,Number=A,Type=Float,Description="Genotype">',
               '##FORMAT=<ID=JX4_GT,Number=A,Type=Float,Description="Approximate Genotype">'],
    }.items()))

    self.assertEqual(expected_mapping, actual_mapping)
    # Materialize the keys before comparing: on Python 3 ``keys()`` is a
    # view and cannot be indexed, so ``keys()[0]`` raises TypeError there.
    self.assertEqual(["AF", "DP", "GT"], list(actual_mapping.keys()))

def test_build_coordinates(self):
fileArec1 = vcf.VcfRecord("chr1", "1", "A", "C")
Expand Down

0 comments on commit 057dd86

Please sign in to comment.