Skip to content

Commit

Permalink
ex-198 (cgates): Added more docstrings. Light refactoring.
Browse files Browse the repository at this point in the history
  • Loading branch information
cgates committed Mar 6, 2015
1 parent b58e8c4 commit aac1641
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 226 deletions.
4 changes: 2 additions & 2 deletions jacquard/variant_callers/strelka.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""Interprets Strelka VCF files adding Jacquard standard information.
* Strelka VCFs are assumed to have a ".vcf" extension and have a
"##source=strelka" metaheader.
* Strelka produces a separate file for SNVs and indels. Jacquard can process
either or both.
* Jacquard standard tags are based on tier 2 data and use different source tags
based on whether the file is indel or snp.
* Strelka VCFs are assumed to have a ".vcf" extension and have a
"##source=strelka" metaheader.
See tag definitions for more info.
"""
Expand Down
4 changes: 2 additions & 2 deletions jacquard/variant_callers/summarize_caller.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Classes to summarize data for a sample-variant or variant as a whole.
A collection of individual Tag classes hold the metaheader and logic to
transform incoming VcfRecords.
transform Jacquard-standardized VcfRecords.
"""
#pylint: disable=missing-docstring
from __future__ import print_function, absolute_import
Expand Down Expand Up @@ -420,7 +420,7 @@ def add_tag_values(self, vcf_record):
somatic_count)

class SummarizeCaller(object):
"""Provides metaheaders for VcfReader; adds summary tags to VcfRecord."""
"""Provides metaheaders for VcfReader; adds summary tags to VcfRecords."""
def __init__(self):
self.tags = [_CallersReportedTag(),
_CallersPassedTag(),
Expand Down
49 changes: 45 additions & 4 deletions jacquard/variant_callers/varscan.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,22 @@
"""Interprets VarScan2 VCF files adding Jacquard standard information.
* VarScan VCFs are assumed to have a ".vcf" extension and have a
"##source=VarScan2" metaheader.
* VarScan produces a separate file for SNPs and indels. Jacquard can process
either or both.
* Jacquard requires the VarScan VCF outputs. The VarScan workflow has optional
extra steps to:
a) partition the results into Germline, LOH, and Somatic files
b) filter to a subset of high-confidence variants
If these files are provided in the input, Jacquard will use them to
flag variants not found in the high-confidence files so that they can be
optionally filtered out.
* If provided, high-confidence files should follow the naming convention of
patientName.*.fpfilter.pass
The high-confidence file suffix can be supplied as a command line arg.
See tag definitions for more info.
"""
from __future__ import print_function, absolute_import
from collections import defaultdict, OrderedDict
from jacquard import __version__
Expand All @@ -7,7 +26,7 @@
import os


VARSCAN_SOMATIC_HEADER = ("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|"
_VARSCAN_SOMATIC_HEADER = ("#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT|"
"NORMAL|TUMOR").replace("|", "\t")
JQ_VARSCAN_TAG = "JQ_VS_"

Expand Down Expand Up @@ -147,20 +166,21 @@ def add_tag_values(self, vcf_record):


class Varscan(object):
"""Recognize and transform VarScan VCFs to standard Jacquard format."""
_HC_FILE_SUFFIX = "fpfilter.pass"

def __init__(self):
self.name = "VarScan"
self.abbr = "VS"
self.meta_header = "##jacquard.normalize_varscan.sources={0},{1}\n"

##TODO (cgates): deprecated
##TODO (cgates): deprecated; remove
@staticmethod
def validate_input_file(meta_headers, column_header):
if "##source=VarScan2" not in meta_headers:
return 0

if VARSCAN_SOMATIC_HEADER == column_header:
if _VARSCAN_SOMATIC_HEADER == column_header:
return 1
else:
raise utils.JQException("Unexpected VarScan VCF structure - "
Expand All @@ -173,7 +193,7 @@ def _is_varscan_vcf(file_reader):
return "##source=VarScan2" in vcf_reader.metaheaders
return False

#TODO: (cgates): Add check of header line (extract constant from HCTag)
#TODO: (cgates): Add check of header line (extract constant from HCTag?)
def _is_varscan_hc_file(self, file_reader):
return file_reader.file_name.endswith(self._HC_FILE_SUFFIX)

Expand All @@ -188,6 +208,19 @@ def _get_files_per_patient(file_readers):
return patient_to_files

def claim(self, file_readers):
"""Recognizes and claims VarScan VCFs from the set of all input VCFs.
Each defined caller has a chance to evaluate and claim all the incoming
files as something that it can process. Since VarScan can claim
high-confidence files as well, this process is significantly more
complex than for other callers.
Args:
file_readers: the collection of currently unclaimed files
Returns:
A tuple of unclaimed readers and VarscanVcfReaders.
"""
files_per_patient = self._get_files_per_patient(file_readers)

unclaimed_set = set()
Expand Down Expand Up @@ -222,6 +255,14 @@ def claim(self, file_readers):
#TODO: (cgates): If we can, I would rather inflate the high confidence set when
# we open and not on construction. There is a pretty safe/clean way to do this.
class _VarscanVcfReader(object):
"""Adapter that presents a VarScan VCF as a VcfReader.
This follows the VcfReader interface, delegating calls to the base
VcfReader, adjusting metaheaders and individual
variants as appropriate.
See VcfReader for more info.
"""
def __init__(self, vcf_reader, som_hc_file_reader=None):
self._vcf_reader = vcf_reader
self._som_hc_file_reader = som_hc_file_reader
Expand Down
23 changes: 17 additions & 6 deletions jacquard/variant_callers/zscore_caller.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
#pylint: disable=too-few-public-methods, unused-argument
"""Classes to summarize zscore data for a sample-variant.
A collection of individual Tag classes hold the metaheader and logic to
transform Jacquard-standardized VcfRecords.
These transforms combine info from a single sample-variant tag with aggregate
info about all sample-variants in the VCF. For example, the zscore of a
sample-variant depth is:
(my_depth - average of all depths) / stddev of all depths
For this reason, zscore values require the full VCF as one of the inputs.
"""
from __future__ import print_function, absolute_import
import math
from jacquard import __version__
Expand All @@ -11,11 +21,11 @@
class _AlleleFreqZScoreTag(object):
TAG_ID = "{0}AF_ZSCORE".format(_JQ_SUMMARY_TAG)
_RANGE_TAG = "{0}AF_RANGE".format(_JQ_SUMMARY_TAG)
_METAHEADER_DESCRIPTION = ('''
Concordance of reported allele frequencies across callers:
[(this AF range - mean AF range)/standard dev(all AF ranges)].
Values with null or missing AF range will be assigned zscore of \'.\';
for multi-valued ranges, zscore is of largest range.''').replace("\n", "")
_METAHEADER_DESCRIPTION = (\
'''Concordance of reported allele frequencies across callers:
[(this AF range - mean AF range)/standard dev(all AF ranges)].
Values with null or missing AF range will be assigned zscore of \'.\';
for multi-valued ranges, zscore is of largest range.''').replace("\n", "")

def __init__(self, vcf_reader):
self.tag_id = self.TAG_ID
Expand Down Expand Up @@ -175,6 +185,7 @@ def _init_population_stats(self, vcf_reader, dependent_tag_id):


class ZScoreCaller(object):
"""Provides metaheaders for VcfReader; adds summary tags to VcfRecords."""
def __init__(self, vcf_reader):
self._tags = [_AlleleFreqZScoreTag(vcf_reader),
_DepthZScoreTag(vcf_reader)]
Expand Down
60 changes: 9 additions & 51 deletions test/variant_callers/mutect_test.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,12 @@
# pylint: disable=line-too-long,too-many-public-methods,too-few-public-methods
# pylint: disable=invalid-name,global-statement
from jacquard import __version__
import jacquard.variant_callers.common_tags as common_tags
import jacquard.variant_callers.mutect as mutect
import jacquard.vcf as vcf
import test.test_case as test_case
from test.vcf_test import MockFileReader, MockVcfReader


ORIGINAL_REPORTED_TAG = None
ORIGINAL_PASSED_TAG = None

class MockCommonTag(object):
    """Minimal stand-in tag that only records the caller name it was built with."""

    def __init__(self, input_caller_name):
        # Kept as a public attribute so tests can inspect what was passed in.
        self.input_caller_name = input_caller_name

class MockWriter(object):
    """In-memory writer double that records open/close calls and written lines."""

    def __init__(self):
        self._content = []
        self.opened = False
        self.closed = False

    def open(self):
        """Flag the writer as opened; no real resource is acquired."""
        self.opened = True

    def write(self, content):
        """Append content to the buffer, split into lines without terminators."""
        self._content.extend(content.splitlines())

    def lines(self):
        """Return the list of lines written so far (the internal list itself)."""
        return self._content

    def close(self):
        """Flag the writer as closed."""
        self.closed = True

class MockReader(object):
    """In-memory reader double that serves preset lines and records open/close calls."""

    def __init__(self, lines=None):
        # A missing or empty `lines` yields an empty iterator.
        self._lines_iter = iter(lines or [])
        self.opened = False
        self.closed = False
        self.input_filepath = "foo"

    def open(self):
        """Flag the reader as opened; no real resource is acquired."""
        self.opened = True

    def read_lines(self):
        """Return the single shared iterator over the preset lines."""
        return self._lines_iter

    def close(self):
        """Flag the reader as closed."""
        self.closed = True


class AlleleFreqTagTestCase(test_case.JacquardBaseTestCase):
def test_metaheader(self):
self.assertEqual('##FORMAT=<ID={0}AF,Number=A,Type=Float,Description="Jacquard allele frequency for MuTect: Decimal allele frequency rounded to 2 digits (based on FA)",Source="Jacquard",Version={1}>'.format(mutect.JQ_MUTECT_TAG, __version__), mutect._AlleleFreqTag().metaheader)
Expand Down Expand Up @@ -199,24 +154,27 @@ def test_vcf_records_newTagsPresent(self):
pos="22",
ref="A",
alt="G",
sample_tag_values={"sampleA": {"DP": "46"},
"sampleB": {"DP": "68"}})
sample_tag_values={"sampleA": {"FA": "0.54"},
"sampleB": {"FA": "0.76"}})
vcf_reader = MockVcfReader(records=[record1, record2])

mutect_vcf_reader = mutect._MutectVcfReader(vcf_reader)
vcf_records = [record for record in mutect_vcf_reader.vcf_records()]

self.assertEquals(2, len(vcf_records))
self.assertIn(mutect.JQ_MUTECT_TAG + "DP",
vcf_records[0].sample_tag_values["sampleA"])
self.assertIn(mutect.JQ_MUTECT_TAG + "DP",
vcf_records[1].sample_tag_values["sampleA"])

self.assertIn("DP", vcf_records[0].format_tags)
self.assertIn(mutect.JQ_MUTECT_TAG + "DP", vcf_records[0].format_tags)
self.assertIn(mutect.JQ_MUTECT_TAG + "HC_SOM", vcf_records[0].format_tags)
self.assertIn(mutect.JQ_MUTECT_TAG + "CALLER_REPORTED", vcf_records[0].format_tags)
self.assertIn(mutect.JQ_MUTECT_TAG + "CALLER_PASSED", vcf_records[0].format_tags)

self.assertIn("FA", vcf_records[1].format_tags)
self.assertIn(mutect.JQ_MUTECT_TAG + "AF", vcf_records[1].format_tags)
self.assertIn(mutect.JQ_MUTECT_TAG + "HC_SOM", vcf_records[1].format_tags)
self.assertIn(mutect.JQ_MUTECT_TAG + "CALLER_REPORTED", vcf_records[1].format_tags)
self.assertIn(mutect.JQ_MUTECT_TAG + "CALLER_PASSED", vcf_records[1].format_tags)


def test_open_and_close(self):
vcf_reader = MockVcfReader(metaheaders=["##foo", "##MuTect=123"])
Expand Down
73 changes: 15 additions & 58 deletions test/variant_callers/strelka_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,62 +2,11 @@
# pylint: disable=invalid-name,global-statement,too-many-format-args
from jacquard import __version__
from test.vcf_test import MockFileReader, MockVcfReader
import jacquard.variant_callers.common_tags as common_tags
import jacquard.variant_callers.strelka as strelka
import jacquard.vcf as vcf
import test.test_case as test_case


#TODO: (cgates): Lots of PEP8 cleanup in this class
ORIGINAL_REPORTED_TAG = None
ORIGINAL_PASSED_TAG = None


class MockCommonTag(object):
    """Minimal stand-in tag that only records the caller name it was built with."""

    def __init__(self, input_caller_name):
        # Kept as a public attribute so tests can inspect what was passed in.
        self.input_caller_name = input_caller_name


class MockWriter(object):
    """In-memory writer double that records open/close calls and written lines."""

    def __init__(self):
        self._content = []
        self.opened = False
        self.closed = False

    def open(self):
        """Flag the writer as opened; no real resource is acquired."""
        self.opened = True

    def write(self, content):
        """Append content to the buffer, split into lines without terminators."""
        self._content.extend(content.splitlines())

    def lines(self):
        """Return the list of lines written so far (the internal list itself)."""
        return self._content

    def close(self):
        """Flag the writer as closed."""
        self.closed = True


class CommonTagTestCase(test_case.JacquardBaseTestCase):
    """Checks the caller prefix that _StrelkaVcfReader passes to its first two tags."""

    def setUp(self):
        # Swap the real common tag classes for MockCommonTag, remembering the
        # originals in module globals so tearDown can put them back.
        global ORIGINAL_REPORTED_TAG
        global ORIGINAL_PASSED_TAG
        ORIGINAL_REPORTED_TAG = common_tags.ReportedTag
        ORIGINAL_PASSED_TAG = common_tags.PassedTag
        common_tags.ReportedTag = MockCommonTag
        common_tags.PassedTag = MockCommonTag

    def tearDown(self):
        # Restore the real tag classes so other tests see unpatched modules.
        common_tags.ReportedTag = ORIGINAL_REPORTED_TAG
        common_tags.PassedTag = ORIGINAL_PASSED_TAG

    def test_reported_tag(self):
        strelka_instance = strelka._StrelkaVcfReader(MockVcfReader())
        reported_tag = strelka_instance.tags[0]
        passed_tag = strelka_instance.tags[1]
        # assertEqual rather than the deprecated assertEquals alias.
        self.assertEqual("JQ_SK_", reported_tag.input_caller_name)
        self.assertEqual("JQ_SK_", passed_tag.input_caller_name)


class AlleleFreqTagTestCase(test_case.JacquardBaseTestCase):

def test_metaheader(self):
Expand All @@ -71,7 +20,7 @@ def test_format_missingAFTag(self):
tag.add_tag_values(processedVcfRecord)
self.assertEquals(originalVcfRecord.text(), processedVcfRecord.text())

def test_format_AUTag(self):
def test_format_AUTagWhenMultAlt(self):
tag = strelka._AlleleFreqTag()
line = "CHROM|POS|ID|REF|A,C|QUAL|FILTER|INFO|AU:CU:GU:TU|1,2:3,4:5,6:7,8|9,10:11,12:13,14:15,16\n".replace('|', "\t")
expected = "CHROM|POS|ID|REF|A,C|QUAL|FILTER|INFO|AU:CU:GU:TU:{0}AF|1,2:3,4:5,6:7,8:0.1,0.2|9,10:11,12:13,14:15,16:0.19,0.23\n".format(strelka.JQ_STRELKA_TAG).replace('|', "\t")
Expand Down Expand Up @@ -207,18 +156,26 @@ def test_vcf_records_newTagsPresent(self):
pos="22",
ref="A",
alt="G",
sample_tag_values={"sampleA": {"DP2": "46"},
"sampleB": {"DP2": "68"}})
sample_tag_values={"sampleA": {"TIR": "10,20", "DP2":"100"},
"sampleB": {"TIR": "15,25", "DP2":"100"}})
vcf_reader = MockVcfReader(records=[record1, record2])

strelka_vcf_reader = strelka._StrelkaVcfReader(vcf_reader)
vcf_records = [record for record in strelka_vcf_reader.vcf_records()]

self.assertEquals(2, len(vcf_records))
self.assertIn(strelka.JQ_STRELKA_TAG + "DP",
vcf_records[0].sample_tag_values["sampleA"])
self.assertIn(strelka.JQ_STRELKA_TAG + "DP",
vcf_records[1].sample_tag_values["sampleA"])

self.assertIn("DP2", vcf_records[0].format_tags)
self.assertIn(strelka.JQ_STRELKA_TAG + "DP", vcf_records[0].format_tags)
self.assertIn(strelka.JQ_STRELKA_TAG + "HC_SOM", vcf_records[0].format_tags)
self.assertIn(strelka.JQ_STRELKA_TAG + "CALLER_REPORTED", vcf_records[0].format_tags)
self.assertIn(strelka.JQ_STRELKA_TAG + "CALLER_PASSED", vcf_records[0].format_tags)

self.assertIn("TIR", vcf_records[1].format_tags)
self.assertIn(strelka.JQ_STRELKA_TAG + "AF", vcf_records[1].format_tags)
self.assertIn(strelka.JQ_STRELKA_TAG + "HC_SOM", vcf_records[1].format_tags)
self.assertIn(strelka.JQ_STRELKA_TAG + "CALLER_REPORTED", vcf_records[1].format_tags)
self.assertIn(strelka.JQ_STRELKA_TAG + "CALLER_PASSED", vcf_records[1].format_tags)

def test_open_and_close(self):
vcf_reader = MockVcfReader(metaheaders=["##foo", "##source=strelka"])
Expand Down
Loading

0 comments on commit aac1641

Please sign in to comment.