Skip to content

Commit

Permalink
ex-164 (jebene) - added flag for varscan high confidence files
Browse files Browse the repository at this point in the history
  • Loading branch information
jebene committed Mar 13, 2015
1 parent fec27c0 commit 2c93839
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 87 deletions.
148 changes: 79 additions & 69 deletions jacquard/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,18 +84,6 @@ def add_tag_values(self, record):
if record.alt == self._MISSING_ALT:
record.add_or_replace_filter(self._TAG_ID)

def _mangle_output_filename(input_file):
basename, extension = os.path.splitext(os.path.basename(input_file))
return ".".join([basename, _FILE_OUTPUT_SUFFIX, extension.strip(".")])

def _write_headers(reader, new_tags, execution_context, file_writer):
headers = reader.metaheaders
headers.extend(execution_context)
for tag in new_tags:
headers.append(tag.metaheader)
headers.append(reader.column_header)

file_writer.write("\n".join(headers) + "\n")

def _build_file_readers(input_dir):
in_files = glob.glob(os.path.join(input_dir, "*"))
Expand All @@ -106,6 +94,49 @@ def _build_file_readers(input_dir):

return file_readers

def _mangle_output_filename(input_file):
basename, extension = os.path.splitext(os.path.basename(input_file))
return ".".join([basename, _FILE_OUTPUT_SUFFIX, extension.strip(".")])

def validate_args(args):
unclaimed_readers, trans_vcf_readers = _claim_readers(args)
if unclaimed_readers and not args.force:
raise utils.UsageError(_build_validation_message(unclaimed_readers))
elif not trans_vcf_readers:
raise utils.UsageError(("no vcfs in input dir "
"[{}] can be translated.").format(args.input))

def _build_validation_message(unclaimed_readers):
total_unclaimed = len(unclaimed_readers)
if total_unclaimed == 1:
unclaimed_details = "file [{}]".format(unclaimed_readers[0].file_name)
else:
cutoff = 5
file_names = [reader.file_name for reader in unclaimed_readers]
file_names = file_names[0:min(cutoff, total_unclaimed)]
unclaimed_list = ", ".join(file_names)
if total_unclaimed > cutoff:
omitted = total_unclaimed - cutoff
unclaimed_list += ", ...({} file(s) omitted)".format(omitted)
unclaimed_details = "files [{}]".format(unclaimed_list)

return ("{} input {} cannot be "
"translated; review input dir and try again, or "
"use '--force' to ignore these files.").format(total_unclaimed,
unclaimed_details)

def _claim_readers(args):
input_dir = os.path.abspath(args.input)
file_readers = _build_file_readers(input_dir)

return variant_caller_factory.claim(file_readers)

def _log_unclaimed_readers(unclaimed_readers):
unclaimed_log_messgae = "The input file [{}] will not be translated"
for reader in unclaimed_readers:
msg = unclaimed_log_messgae.format(reader.file_name)
logger.warning(msg)

def _translate_files(trans_vcf_reader,
new_tags,
execution_context,
Expand All @@ -128,21 +159,48 @@ def _translate_files(trans_vcf_reader,
trans_vcf_reader.close()
file_writer.close()

def _write_headers(reader, new_tags, execution_context, file_writer):
headers = reader.metaheaders
headers.extend(execution_context)
for tag in new_tags:
headers.append(tag.metaheader)
headers.append(reader.column_header)

file_writer.write("\n".join(headers) + "\n")

def _store_hc_file(args):
if args.varscan_hc_filter_filename:
for caller in variant_caller_factory._CALLERS:
try:
caller.hc_file_pattern
except AttributeError:
continue
caller.hc_file_pattern = args.varscan_hc_filter_filename

def get_required_input_output_types():
return ("directory", "directory")

def report_prediction(args):
input_dir = os.path.abspath(args.input)
file_readers = _build_file_readers(input_dir)
output_file_names = set()

for reader in file_readers:
mangled_fname = _mangle_output_filename(reader.file_name)
extension = os.path.splitext(os.path.basename(mangled_fname))[1]
if extension == ".vcf":
output_file_names.add(mangled_fname)

return output_file_names

def add_subparser(subparser):
#pylint: disable=line-too-long
parser = subparser.add_parser("translate", help="Accepts a directory of VCF results (and VarScan high confidence files). Creates a new directory of VCFs, adding Jacquard-specific FORMAT tags for each VCF record.")
parser.add_argument("input", help="Path to directory containing VCFs (and VarScan high confidence files). Other file types ignored")
parser.add_argument("output", help="Path to Jacquard-tagged VCFs. Will create if doesn't exist and will overwrite files in output directory as necessary")
parser.add_argument("-v", "--verbose", action='store_true')
parser.add_argument("--force", action='store_true', help="Overwrite contents of output directory")


def _log_unclaimed_readers(unclaimed_readers):
unclaimed_log_messgae = "The input file [{}] will not be translated"
for reader in unclaimed_readers:
msg = unclaimed_log_messgae.format(reader.file_name)
logger.warning(msg)

parser.add_argument("--varscan_hc_filter_filename", help="Regex pattern that identifies optional VarScan high-confidence filter files. The VCF, high-confidence file pairs should share the same prefix. For expample, given patientA.snp.vcf, patientA.indel.vcf, patientA.snp.fpfilter.pass, patientA.indel.fpfilter.pass, you could enable this option as varscan_hc_filter_filename='.fpfilter.pass$'")

#TODO (cgates): This module is both a command and also manipulates VcfRecords
# like a caller. This is the only body of code that does both these things.
Expand All @@ -152,6 +210,7 @@ def execute(args, execution_context):

output_dir = os.path.abspath(args.output)
unclaimed_readers, trans_vcf_readers = _claim_readers(args)
_store_hc_file(args)

_log_unclaimed_readers(unclaimed_readers)

Expand All @@ -173,52 +232,3 @@ def execute(args, execution_context):
file_writer,)

logger.info("Wrote [{}] VCF file(s)", len(trans_vcf_readers))

def report_prediction(args):
input_dir = os.path.abspath(args.input)
file_readers = _build_file_readers(input_dir)
output_file_names = set()

for reader in file_readers:
mangled_fname = _mangle_output_filename(reader.file_name)
extension = os.path.splitext(os.path.basename(mangled_fname))[1]
if extension == ".vcf":
output_file_names.add(mangled_fname)

return output_file_names

def get_required_input_output_types():
return ("directory", "directory")

def _claim_readers(args):
input_dir = os.path.abspath(args.input)
file_readers = _build_file_readers(input_dir)
return variant_caller_factory.claim(file_readers)

def validate_args(args):
unclaimed_readers, trans_vcf_readers = _claim_readers(args)
if unclaimed_readers and not args.force:
raise utils.UsageError(_build_validation_message(unclaimed_readers))
elif not trans_vcf_readers:
raise utils.UsageError(("no vcfs in input dir "
"[{}] can be translated.").format(args.input))

def _build_validation_message(unclaimed_readers):
total_unclaimed = len(unclaimed_readers)
if total_unclaimed == 1:
unclaimed_details = "file [{}]".format(unclaimed_readers[0].file_name)
else:
cutoff = 5
file_names = [reader.file_name for reader in unclaimed_readers]
file_names = file_names[0:min(cutoff, total_unclaimed)]
unclaimed_list = ", ".join(file_names)
if total_unclaimed > cutoff:
omitted = total_unclaimed - cutoff
unclaimed_list += ", ...({} file(s) omitted)".format(omitted)
unclaimed_details = "files [{}]".format(unclaimed_list)

return ("{} input {} cannot be "
"translated; review input dir and try again, or "
"use '--force' to ignore these files.").format(total_unclaimed,
unclaimed_details)

9 changes: 6 additions & 3 deletions jacquard/variant_callers/varscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,14 @@
See tag definitions for more info.
"""
from __future__ import print_function, absolute_import

from collections import defaultdict, OrderedDict
import os
import re

import jacquard.utils as utils
import jacquard.variant_callers.common_tags as common_tags
import jacquard.vcf as vcf
import os


_VARSCAN_SOMATIC_HEADER = ("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|"
Expand Down Expand Up @@ -158,12 +161,12 @@ def add_tag_values(self, vcf_record):

class Varscan(object):
"""Recognize and transform VarScan VCFs to standard Jacquard format."""
_HC_FILE_SUFFIX = "fpfilter.pass"

def __init__(self):
self.name = "VarScan"
self.abbr = "VS"
self.meta_header = "##jacquard.normalize_varscan.sources={0},{1}\n"
self.hc_file_pattern = re.compile("fpfilter.pass")

##TODO (cgates): deprecated; remove
@staticmethod
Expand All @@ -186,7 +189,7 @@ def _is_varscan_vcf(file_reader):

#TODO: (cgates): Add check of header line (extract constant from HCTag?)
def _is_varscan_hc_file(self, file_reader):
return file_reader.file_name.endswith(self._HC_FILE_SUFFIX)
return self.hc_file_pattern.search(file_reader.file_name)

@staticmethod
def _get_files_per_patient(file_readers):
Expand Down
39 changes: 30 additions & 9 deletions test/translate_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,33 @@
#pylint: disable=missing-docstring,protected-access,too-few-public-methods
#pylint: disable=too-many-arguments,too-many-instance-attributes
from argparse import Namespace
from jacquard import __version__, vcf
from test import vcf_test
from test.vcf_test import MockVcfReader, MockTag, MockWriter
import os

from testfixtures import TempDirectory

from jacquard import vcf
import jacquard.logger
import jacquard.translate as translate
import jacquard.utils as utils
import os
import jacquard.variant_callers.variant_caller_factory as variant_caller_factory
from test import vcf_test
import test.mock_logger
import test.test_case as test_case
from test.vcf_test import MockVcfReader, MockTag, MockWriter, MockCaller


class MockVariantCallerFactory(object):
def __init__(self, unclaimed, claimed):
self.unclaimed = unclaimed
self.claimed = claimed
_CALLERS = [MockCaller()]

def __init__(self, unclaimed=None, claimed=None):
if unclaimed:
self.unclaimed = unclaimed
else:
self.unclaimed = []
if claimed:
self.claimed = claimed
else:
self.claimed = []

def claim(self, dummy):
return self.unclaimed, self.claimed
Expand All @@ -41,7 +53,8 @@ def test_execute_forceWarnsUnclaimedFiles(self):
translate.validate_args = lambda x: x
args = Namespace(input=temp_dir.path,
output=temp_dir.path,
force=True)
force=True,
varscan_hc_filter_filename=None)
unclaimed1 = vcf_test.MockFileReader("unclaimed1.vcf")
unclaimed2 = vcf_test.MockFileReader("unclaimed2.vcf")
unclaimed3 = vcf_test.MockFileReader("unclaimed3.vcf")
Expand All @@ -65,7 +78,6 @@ def test_execute_forceWarnsUnclaimedFiles(self):
self.assertRegexpMatches(actual_log_warnings[5],
r"input file \[unclaimed6.vcf\] will not be translated")


def test_validate_args_ok(self):
with TempDirectory() as input_dir, TempDirectory() as output_dir:
args = Namespace(input=input_dir.path, output=output_dir.path)
Expand Down Expand Up @@ -218,6 +230,15 @@ def test_translate_write_metaheaders_addsExecutionMetaheaders(self):
"#CHROM\tPOS\tREF\tALT\tStuff"]
self.assertEquals(expected_headers, writer._content)

def test_store_hc_file(self):
with TempDirectory() as temp_dir:
args = Namespace(input=temp_dir.path,
output=temp_dir.path,
force=True,
varscan_hc_filter_filename="pass$")
translate._store_hc_file(args)

self.assertEquals("pass$", variant_caller_factory._CALLERS[0].hc_file_pattern)

class ExcludeMalformedRefTestCase(test_case.JacquardBaseTestCase):
def test_metaheader(self):
Expand Down
40 changes: 34 additions & 6 deletions test/variant_callers/varscan_test.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
# pylint: disable=line-too-long,too-many-public-methods,too-few-public-methods
# pylint: disable=invalid-name,global-statement
import re

from jacquard.variant_callers import varscan
from jacquard.variant_callers.varscan import _HCTag
from test.vcf_test import MockFileReader, MockVcfReader
import jacquard.vcf as vcf
import test.test_case as test_case
from test.vcf_test import MockFileReader, MockVcfReader


class HCTagTestCase(test_case.JacquardBaseTestCase):
Expand Down Expand Up @@ -144,7 +146,7 @@ def append_hc_files(readers, file1="snp.somatic.hc.fpfilter.pass", file2="indel.
def _get_tag_class_names(vcf_reader):
return [tag.__class__.__name__ for tag in vcf_reader.tags]

def test_claim_multiple_patients(self):
def test_claim_multiplePatients(self):
record1 = "chr1\t.\t.\t.\t.\t.\t.\t.\t."
content1 = ["##foo", "##source=VarScan2", "#chrom", record1]
content2 = ["chrom\tposition", "1\t23"]
Expand All @@ -168,7 +170,7 @@ def test_claim_multiple_patients(self):
self._get_tag_class_names(vcf_readers[2]))
self.assertEquals(reader1.file_name, vcf_readers[0]._vcf_reader.file_name)

def test_claim_varscan_vcf_only(self):
def test_claim_varscanVcfOnly(self):
record1 = "chr1\t.\t.\t.\t.\t.\t.\t.\t."
content1 = ["##foo", "##MuTect=123", "#chrom", record1]
content2 = ["##foo", "##source=VarScan2", "#chrom", record1]
Expand All @@ -186,7 +188,7 @@ def test_claim_varscan_vcf_only(self):

self.assertEquals(reader2.file_name, vcf_readers[0]._vcf_reader.file_name)

def test_claim_vcf_and_filter_file(self):
def test_claim_vcfAndFilterFile(self):
record1 = "chr1\t.\t.\t.\t.\t.\t.\t.\t."
content1 = [self.entab("chrom|position|ref|var"),
record1]
Expand All @@ -211,7 +213,33 @@ def test_claim_vcf_and_filter_file(self):
self.assertEquals(reader4.file_name, vcf_readers[1]._vcf_reader.file_name)
self.assertEquals(reader3.file_name, vcf_readers[1]._som_hc_file_reader.file_name)

def test_claim_heterogeneous_vcfs_and_filter_files(self):
def test_claim_vcfAndFilterFileNameGiven(self):
record1 = "chr1\t.\t.\t.\t.\t.\t.\t.\t."
content1 = [self.entab("chrom|position|ref|var"),
record1]
content2 = ["##foo", "##source=VarScan2", "#chrom", record1]
reader1 = MockFileReader("patientA.indel.Somatic.foo.bar", content1)
reader2 = MockFileReader("patientA.indel.vcf", content2)
reader3 = MockFileReader("patientA.snp.Somatic.foo.bar", content1)
reader4 = MockFileReader("patientA.snp.vcf", content2)
reader5 = MockFileReader("patientA.readme", ["foo"])
file_readers = [reader1, reader2, reader3, reader4, reader5]

caller = varscan.Varscan()
caller.hc_file_pattern = re.compile("foo.bar$")
unrecognized_readers, vcf_readers = caller.claim(file_readers)

self.assertEquals(1, len(unrecognized_readers))
self.assertEquals([reader5], unrecognized_readers)
self.assertEquals(2, len(vcf_readers))
self.assertIsInstance(vcf_readers[0], varscan._VarscanVcfReader)
self.assertEquals(reader2.file_name, vcf_readers[0]._vcf_reader.file_name)
self.assertEquals(reader1.file_name, vcf_readers[0]._som_hc_file_reader.file_name)
self.assertIsInstance(vcf_readers[1], varscan._VarscanVcfReader)
self.assertEquals(reader4.file_name, vcf_readers[1]._vcf_reader.file_name)
self.assertEquals(reader3.file_name, vcf_readers[1]._som_hc_file_reader.file_name)

def test_claim_heterogeneousVcfsAndFilterFiles(self):
record1 = "chr1\t.\t.\t.\t.\t.\t.\t.\t."
content1 = [self.entab("chrom|position|ref|var"),
record1]
Expand All @@ -236,7 +264,7 @@ def test_claim_heterogeneous_vcfs_and_filter_files(self):
self.assertEquals(reader4.file_name, vcf_readers[1]._vcf_reader.file_name)
self.assertEquals(None, vcf_readers[1]._som_hc_file_reader)

def test_claim_ignores_unpaired_non_vcf_files(self):
def test_claim_ignoresUnpairedNonVcfFiles(self):
record1 = "chr1\t.\t.\t.\t.\t.\t.\t.\t."
content1 = ["##foo", "##source=VarScan2", "#chrom", record1]
reader1 = MockFileReader("fileA.txt", content1)
Expand Down

0 comments on commit 2c93839

Please sign in to comment.