Skip to content

Commit

Permalink
EX-258 (jebene/dkriti) - created a merged vcf reader to display
Browse files Browse the repository at this point in the history
disambiguated tags
  • Loading branch information
jebene committed May 14, 2015
1 parent a35521b commit 8e4aacf
Show file tree
Hide file tree
Showing 4 changed files with 386 additions and 155 deletions.
146 changes: 104 additions & 42 deletions jacquard/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,63 @@ def _get_next(self):
except StopIteration:
return None

class MergeVcfReader(vcf.VcfReader):
def __init__(self, file_reader):
super(self.__class__,self).__init__(file_reader)
self.format_tags = {}

def modify_metaheader(self, original_metaheader, transformed_tag):
updated_metaheader = re.sub(r'(^##FORMAT=.*?[<,]ID=)([^,>]*)',
r'\g<1>%s' % transformed_tag,
original_metaheader)

self.metaheaders.append(updated_metaheader)
if original_metaheader in self.metaheaders:
self.metaheaders.remove(original_metaheader)

def store_format_tags(self, original_tag, new_tag):
self.format_tags[original_tag] = new_tag

@staticmethod
def modify_format_tag(vcf_record, format_tags):
for tags in list(vcf_record.sample_tag_values.values()):
for original_tag, new_tag in list(format_tags.items()):
if new_tag not in tags and original_tag in tags:
tags[new_tag] = tags[original_tag]
del tags[original_tag]

return vcf_record

def vcf_records(self, format_tags=None, qualified=False):
"""Generates parsed VcfRecord objects.
Typically called in a for loop to process each vcf record in a
VcfReader. VcfReader must be opened in advanced and closed when
complete. Skips all headers.
Args:
qualified: When True, sample names are prefixed with file name
Returns:
Parsed VcfRecord
Raises:
StopIteration: when reader is exhausted.
TypeError: if reader is closed.
"""
if qualified:
sample_names = self.qualified_sample_names
else:
sample_names = self.sample_names

for line in self._file_reader.read_lines():
if line.startswith("#"):
continue
vcf_record = vcf.VcfRecord.parse_record(line, sample_names)
if format_tags:
vcf_record = self.modify_format_tag(vcf_record, format_tags)
yield vcf_record


class _Filter(object):
"""Interprets command-line args to initialize variant and locus filters
Expand Down Expand Up @@ -220,43 +277,42 @@ def _include_row_if_any_somatic(records):
return True
return False

#TODO: (jebene) hook this up
def _add_format_tags(vcf_reader, original_tag, new_tag):
vcf_reader.open()
for vcf_record in vcf_reader.vcf_records():
for tags in list(vcf_record.sample_tag_values.values()):
tags[new_tag] = tags[original_tag]
vcf_reader.close()
return vcf_reader

def _disambiguate_tags(retained_tags, vcf_readers):
format_tags = {}
for tag, metaheaders in list(retained_tags.items()):
if len(metaheaders) > 1:
for vcf_reader in vcf_readers:
for i, metahdr in enumerate(list(metaheaders)):
new_tag = "JX{}_{}".format(i+1, tag)
if metahdr in list(vcf_reader.format_metaheaders.values()):
format_tags[new_tag] = tag
vcf_reader.modify_metaheader(metahdr, new_tag)
else:
format_tags[tag] = tag
def _get_format_tags(merge_vcf_readers):
format_tags = defaultdict(list)
for merge_vcf_reader in merge_vcf_readers:
for tag, metahdr in list(merge_vcf_reader.format_metaheaders.items()):
if metahdr not in format_tags[tag]:
format_tags[tag].append(metahdr)
return format_tags

def _disambiguate_format_tags(merge_vcf_readers, format_tags):
for tag, metaheaders in list(format_tags.items()):
for merge_vcf_reader in merge_vcf_readers:
for i, metahdr in enumerate(list(metaheaders)):
vcf_metaheaders = merge_vcf_reader.format_metaheaders.values()
if metahdr in list(vcf_metaheaders):
if len(metaheaders) > 1:
new_tag = "JX{}_{}".format(i+1, tag)
format_tags[new_tag] = metaheaders

merge_vcf_reader.modify_metaheader(metahdr, new_tag)
merge_vcf_reader.store_format_tags(tag, new_tag)
else:
merge_vcf_reader.store_format_tags(tag, tag)

return format_tags

def _build_format_tags(format_tag_regex, vcf_readers):
retained_tags = defaultdict(set)
retained_tags = set()
regexes_used = set()

for vcf_reader in vcf_readers:
for tag_regex in format_tag_regex:
for tag, metaheader in list(vcf_reader.format_metaheaders.items()):
if re.match(tag_regex + "$", tag):
retained_tags[tag].add(metaheader)
for original_tag, new_tag in list(vcf_reader.format_tags.items()):
if re.match(tag_regex + "$", original_tag):
retained_tags.add(new_tag)
regexes_used.add(tag_regex)

retained_tags = _disambiguate_tags(retained_tags, vcf_readers)

if len(retained_tags) == 0:
msg = ("The specified format tag regex [{}] would exclude all format "
"tags. Review inputs/usage and try again")
Expand All @@ -269,8 +325,7 @@ def _build_format_tags(format_tag_regex, vcf_readers):
"not match any format tags; this expression may be "
"irrelevant.")
logger.warning(msg, format_tag_regex, unused_regex)

return retained_tags
return sorted(list(retained_tags))

def _compile_metaheaders(incoming_headers,
vcf_readers,
Expand Down Expand Up @@ -311,18 +366,19 @@ def _write_metaheaders(file_writer, all_headers):
file_writer.write("\n".join(all_headers) + "\n")

def _create_vcf_readers(file_readers):
vcf_readers = []
merge_vcf_readers = []
for file_reader in file_readers:
vcf_reader = vcf.VcfReader(file_reader)
vcf_readers.append(vcf_reader)
# vcf_reader = vcf.VcfReader(file_reader)
merge_vcf_reader = MergeVcfReader(file_reader)
merge_vcf_readers.append(merge_vcf_reader)

return vcf_readers
return merge_vcf_readers

def _create_buffered_readers(vcf_readers):
buffered_readers = []
for vcf_reader in vcf_readers:
vcf_reader.open()
records = vcf_reader.vcf_records(qualified=True)
records = vcf_reader.vcf_records(vcf_reader.format_tags, qualified=True)
buffered_readers.append(_BufferedReader(records))

return buffered_readers
Expand Down Expand Up @@ -576,6 +632,7 @@ def _merge_records(vcf_readers,
vcf_records = _pull_matching_records(filter_strategy,
coordinate,
buffered_readers)

if vcf_records:
merged_record = _build_merged_record(coordinate,
vcf_records,
Expand Down Expand Up @@ -664,30 +721,35 @@ def execute(args, execution_context):
#and _build_contigs behave differently. It seems like we could make the
#signatures of these methods more similar or even combine some methods to
#reduce excess iterations over the coordinates/vcf_readers
vcf_readers = _create_vcf_readers(file_readers)
format_tags_to_keep = _build_format_tags(format_tag_regex, vcf_readers)
vcf_readers = _sort_readers(vcf_readers, output_path)
all_sample_names, merge_metaheaders = _build_sample_list(vcf_readers)
coordinates = _build_coordinates(vcf_readers)
merge_vcf_readers = _create_vcf_readers(file_readers)
merge_vcf_readers = _sort_readers(merge_vcf_readers, output_path)
format_tags = _get_format_tags(merge_vcf_readers)
_disambiguate_format_tags(merge_vcf_readers, format_tags)
format_tags_to_keep = _build_format_tags(format_tag_regex,
merge_vcf_readers)
(all_sample_names,
merge_metaheaders) = _build_sample_list(merge_vcf_readers)

coordinates = _build_coordinates(merge_vcf_readers)
info_tags_to_keep = _build_info_tags(coordinates)
contigs_to_keep = _build_contigs(coordinates)
incoming_headers = _FILE_FORMAT + execution_context + merge_metaheaders
headers = _compile_metaheaders(incoming_headers,
vcf_readers,
merge_vcf_readers,
all_sample_names,
contigs_to_keep,
format_tags_to_keep,
info_tags_to_keep)

_write_metaheaders(file_writer, headers)

_merge_records(vcf_readers,
_merge_records(merge_vcf_readers,
coordinates,
filter_strategy,
all_sample_names,
format_tags_to_keep,
file_writer)
finally:
for vcf_reader in vcf_readers:
for vcf_reader in merge_vcf_readers:
vcf_reader.close()
file_writer.close()
19 changes: 10 additions & 9 deletions jacquard/utils/vcf.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Classes to parse, interpret, and manipulate VCF files and records."""
from __future__ import print_function, absolute_import, division

from collections import OrderedDict, defaultdict
from collections import OrderedDict
import os
import re
import sys
Expand Down Expand Up @@ -96,14 +96,14 @@ def _init_headers(self):
# return column_header, tuple(metaheaders)
return column_header, metaheaders

def modify_metaheader(self, original_metaheader, transformed_tag):
updated_metaheader = re.sub(r'(^##FORMAT=.*?[<,]ID=)([^,>]*)',
r'\g<1>%s' % transformed_tag,
original_metaheader)

self.metaheaders.append(updated_metaheader)
if original_metaheader in self.metaheaders:
self.metaheaders.remove(original_metaheader)
# def modify_metaheader(self, original_metaheader, transformed_tag):
# updated_metaheader = re.sub(r'(^##FORMAT=.*?[<,]ID=)([^,>]*)',
# r'\g<1>%s' % transformed_tag,
# original_metaheader)
#
# self.metaheaders.append(updated_metaheader)
# if original_metaheader in self.metaheaders:
# self.metaheaders.remove(original_metaheader)

#TODO (cgates): qualified is used by ONE invocation in merge. Can we
#somehow make merge do this instead of universally complicating the method?
Expand Down Expand Up @@ -134,6 +134,7 @@ def vcf_records(self, qualified=False):
continue
yield VcfRecord.parse_record(line, sample_names)


def open(self):
self._file_reader.open()

Expand Down
Loading

0 comments on commit 8e4aacf

Please sign in to comment.