Skip to content

Commit

Permalink
ex-266 (jebene/dkriti) began changing the signature of MergeVcfReader()
Browse files Browse the repository at this point in the history
  • Loading branch information
jebene committed May 28, 2015
1 parent 836ad5e commit 2e5d813
Show file tree
Hide file tree
Showing 2 changed files with 392 additions and 100 deletions.
116 changes: 92 additions & 24 deletions jacquard/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,38 @@ def _get_next(self):
except StopIteration:
return None

class NewMergeVcfReader(vcf.VcfReader):
    """VcfReader whose FORMAT metaheaders are swapped for mapped replacements.

    After the base reader is initialised, every metaheader that equals one of
    the reader's FORMAT metaheaders is replaced by the entries that
    ``format_tag_mapping`` holds for the corresponding tag; all other
    metaheaders are kept as they are.
    """

    def __init__(self, file_reader, format_tag_mapping):
        """Build the reader and rewrite its metaheaders.

        Args:
            file_reader: passed straight through to ``vcf.VcfReader``.
            format_tag_mapping: mapping of format tag -> iterable of
                replacement metaheaders for that tag.

        Raises:
            utils.JQException: if the mapping's tags differ from the tags in
                the reader's ``format_metaheaders``.
        """
        # Name the class explicitly: super(self.__class__, self) resolves to
        # the subclass when this class is inherited from, which makes
        # __init__ call itself forever.
        super(NewMergeVcfReader, self).__init__(file_reader)

        self.format_tag_mapping = format_tag_mapping
        self.metaheaders = self._modify_metaheaders()

    def _modify_metaheaders(self):
        """Return the metaheader list with FORMAT metaheaders replaced.

        Raises:
            utils.JQException: if the set of tags in ``format_tag_mapping``
                is not exactly the set of tags in ``format_metaheaders``.
        """
        input_format_tags = set(self.format_metaheaders.keys())
        if input_format_tags != set(self.format_tag_mapping.keys()):
            raise utils.JQException('Unable to create MergeVcfReader. '
                                    'Transformed format tag metaheaders differ '
                                    'from input format tag metaheaders.')

        new_list = []
        for metaheader in self.metaheaders:
            is_format_metaheader = False
            for tag, format_metaheader in self.format_metaheaders.items():
                if metaheader == format_metaheader:
                    is_format_metaheader = True
                    # NOTE(review): extend() assumes each mapping value is a
                    # list of metaheader strings; a bare string would be
                    # split into characters -- confirm against the caller.
                    new_list.extend(self.format_tag_mapping[tag])
            if not is_format_metaheader:
                new_list.append(metaheader)

        return new_list

class MergeVcfReader(vcf.VcfReader):
def __init__(self, file_reader, specified_regex=None):
super(self.__class__,self).__init__(file_reader)

if specified_regex:
self.format_tag_regexes = specified_regex.split(',')
self.format_tag_regexes = specified_regex
else:
self.format_tag_regexes = _DEFAULT_INCLUDED_FORMAT_TAGS

Expand Down Expand Up @@ -367,6 +393,44 @@ def _alter_description(metaheader, caller_name):
r'\g<0>[%s]: ' % caller_name,
metaheader)

def _get_format_tag_distribution(vcf_readers, specified_regex):
#TODO: add logic from _get_format_tag_regex closer to this method (?)
#that will eliminate the need to do this check
if type(specified_regex) != list:
msg = ("Unable to process regular expression [{}]. It must be a list "
"of regular expressions")
raise utils.JQException(msg, specified_regex)

format_tag_distribution = defaultdict(list)
regexes_used = set()

for vcf_reader in vcf_readers:
for tag, metaheader in list(vcf_reader.format_metaheaders.items()):
for regex in specified_regex:
if re.match(regex, tag):
regexes_used.add(regex)

if metaheader not in format_tag_distribution[tag]:
format_tag_distribution[tag].append(metaheader)

ordered_distribution = OrderedDict(sorted(format_tag_distribution.items()))

unused_regexes = set(specified_regex).difference(regexes_used)
if unused_regexes:
for unused_regex in unused_regexes:
msg = ("In the specified list of regexes {}, the regex [{}] "
"does not match any format tags; this expression may be "
"irrelevant.").format(specified_regex, unused_regex)
logger.warning(msg)

if len(format_tag_distribution)==0:
msg = ("The specified format tag regex [{}] would exclude all "
"format tags. Review inputs/usage and try again")
raise utils.UsageError(msg, specified_regex)

return ordered_distribution

#TODO: remove
def _get_format_tags(vcf_readers):
format_tags = defaultdict(list)
caller_names = defaultdict(list)
Expand All @@ -387,23 +451,24 @@ def _get_format_tags(vcf_readers):
return format_tags

#TODO: rename
def _disambiguate_format_tags_new(vcf_readers, format_tags):
format_tag_mapping = {}
for i, vcf_reader in enumerate(vcf_readers):
format_tag_mapping[vcf_reader.file_name] = {}
for tag, metaheaders in list(format_tags.items()):
if tag in vcf_reader.format_metaheaders:
if len(metaheaders) > 1:
new_tag = "JX{}_{}".format(i+1, tag)
metaheader = re.sub(r'(^##FORMAT=.*?[<,]ID=)([^,>]*)',
r'\g<1>%s' % new_tag,
metaheaders[i])
format_tag_mapping[vcf_reader.file_name][tag] = metaheader
else:
metaheader = metaheaders[0]
format_tag_mapping[vcf_reader.file_name][tag] = metaheader

return format_tag_mapping
def _disambiguate_format_tags_new(format_tag_distribution):
count = 0
unambiguous_distribution = defaultdict(list)
for tag, metaheaders in format_tag_distribution.items():
if len(metaheaders) > 1:
for metaheader in metaheaders:
count += 1
new_tag = "JX{}_{}".format(count, tag)
metaheader = re.sub(r'(^##FORMAT=.*?[<,]ID=)([^,>]*)',
r'\g<1>{}'.format(new_tag),
metaheader)
unambiguous_distribution[tag].append(metaheader)
else:
unambiguous_distribution[tag] = metaheaders

ordered_distribution = OrderedDict(sorted(unambiguous_distribution.items()))

return ordered_distribution

#TODO: remove
def _disambiguate_format_tags(merge_vcf_readers, format_tags):
Expand Down Expand Up @@ -462,12 +527,13 @@ def _write_metaheaders(file_writer, all_headers):
file_writer.write("\n".join(all_headers) + "\n")

def _create_merge_vcf_readers(file_readers, specified_regex):
merge_vcf_readers = []
vcf_readers = [vcf.VcfReader(i) for i in file_readers]
format_tag_distribution = _get_format_tag_distribution(vcf_readers,
specified_regex)
format_tag_mapping = _disambiguate_format_tags_new(format_tag_distribution)

merge_vcf_readers = []

format_tags = _get_format_tags(vcf_readers)
# format_tag_mapping = _disambiguate_format_tags_new(vcf_readers,
# format_tags)
for file_reader in file_readers:
merge_vcf_reader = MergeVcfReader(file_reader, specified_regex)
merge_vcf_readers.append(merge_vcf_reader)
Expand Down Expand Up @@ -773,9 +839,11 @@ def _get_format_tag_regex(args):
raise utils.UsageError(msg)

if args.include_all:
format_tag_regex = '.*'
format_tag_regex = ['.*']
elif args.tags:
format_tag_regex = args.tags.split(",")
else:
format_tag_regex = args.tags
format_tag_regex = _DEFAULT_INCLUDED_FORMAT_TAGS

return format_tag_regex

Expand Down
Loading

0 comments on commit 2e5d813

Please sign in to comment.