ex-198 (cgates): Added docstrings. Light refactoring.

umich-brcf-bioinf · Mar 6, 2015 · b58e8c4 · b58e8c4
1 parent 43397e1
commit b58e8c4
Show file tree

Hide file tree

Showing 11 changed files with 234 additions and 83 deletions.
diff --git a/jacquard/jacquard.py b/jacquard/jacquard.py
@@ -1,11 +1,27 @@
 #!/usr/bin/env python
-"""Launcher for suite of VCF sub-commands.
+"""Launch-point for suite of sub-commands.
 
 The only executable module in the project; this module
  * validates command line args
  * manages use of temp directories (to keep output clean and atomic)
  * dipatches to sub-commands as appropriate
  * attempts to deal with usage and run-time errors
+
+Jacquard first writes results to temp dir and only copies results on successful
+    completion.
+
+Then architecture of Jacquard modules can be divided into:
+ * commands : These transform files or directories (e.g. translate.py) and are
+     indirectly executable through the jacquard module. Each command must
+     implement an execute method that does the heavy lifting along with some
+     simpler methods that expedite command validation
+ * callers : These transform VcfRecords (e.g. mutect.py). They typically have
+     a collection of tag classes; where each tag holds the metaheader and
+     code to transform a single VcfRecord. Note that a caller could
+     manipulate any aspect of a VcfRecord, but (by strong convention) typically
+     only adds information, for example add a sample-format tag, add an info
+     field, or add a filter field.
+ * helpers : Common functionality (e.g. command_validator, vcf, logger, etc.)
 """
 ##   Copyright 2014 Bioinformatics Core, University of Michigan
 ##

diff --git a/jacquard/merge.py b/jacquard/merge.py
@@ -1,12 +1,28 @@
 """Merges a set of VCF files into a single VCF file.
 
-Each incoming VCF record is joined with other VCF records that share the same
-"coordinate", where coordinate is the (chrom, pos, ref, and alt).
-The merged file will have as many records as the distinct set of
-(chrom, pos, ref, alt) across all input files.
-The FORMAT tags from all incoming VCFs are aggregated to a single merged list.
-Each variant record will be the merged set of incoming format tags.
-Incoming fixed fields (e.g. QUAL, FILTER, INFO) are ignored.
+* Merge assumes the incoming VCFs are aligned to the same set of contigs.
+* Merge assumes incoming VCFs names follow this pattern:
+    patientIdentifier.*.vcf
+    Specifically, the first element of the VCF file name should be the patient
+    name; for example you mihght have this:
+        patientA.mutect.vcf, patientA.strelka.vcf, patientA.varscan.vcf,
+        patientB.mutect.vcf, patientB.strelka.vcf, patientB.varscan.vcf,
+        etc.
+* Merge assumes the sample names (i.e. the VCF sample column headers, typically 
+    TUMOR and NORMAL) are consistent across the input VCFs. (The preceding
+    Jacquard command "translate" ensures this is true.)
+* Each incoming VCF record is joined with other VCF records that share the same
+    "coordinate", where coordinate is the (chrom, pos, ref, and alt).
+* Calls for the same patient/sample are joined into a single column. Each output
+    column is the patient name (prefix of the file name) plus the sample name
+    from the column header.
+* The merged file will have as many records as the distinct set of
+    (chrom, pos, ref, alt) across all input files.
+* Each variant record will have the minimal set of incoming format tags for
+    that variant (i.e. the list of format tags is specific to each record).
+* Incoming QUAL and INFO fields are ignored.
+* By default, merge only includes "Jacqaurd" FORMAT tags, but other tags can be
+    included through and optional a command line arg.
 """
 from __future__ import print_function, absolute_import
 from collections import defaultdict, OrderedDict

diff --git a/jacquard/translate.py b/jacquard/translate.py
@@ -1,8 +1,14 @@
 """Translates a set of VCF files by adding standardized tags.
 
-Reads incoming VCF files determining appropriate origin caller (e.g. MuTect);
-emits a new file with additional translated versions of incoming FORMAT tags
-(e.g. JQ_MT_AF).
+Reads incoming VCF files determining appropriate "origin caller" (e.g. MuTect);
+emits a new translated file. The translated file is similar to the input
+file, with these exceptions:
+    * translate will add a filter flag anomalous VCF records, i.e. records
+        that don't conform to the standard; for example both Strelka and
+        Varscan emit VCF records with invalid ALT values.
+    * translate will add new Jacquard-standard FORMAT tags that augment the
+        caller specific tags (e.g. a Varscan FREQ tag would generates a new
+        JQ_VS_AF tag).
 
 There will typically be a translated VCF file for each input VCF file.
 Unrecognized VCFs are not copied to output.
@@ -141,6 +147,10 @@ def _log_unclaimed_readers(unclaimed_readers):
         msg = unclaimed_log_messgae.format(reader.file_name)
         logger.warning(msg)
 
+
+#TODO (cgates): This module is both a command and also manipulates VcfRecords
+# like a caller. This is the only body of code that does both these things.
+# Does this bother anyone else?
 def execute(args, execution_context):
     validate_args(args)
 

diff --git a/jacquard/utils.py b/jacquard/utils.py
@@ -6,6 +6,7 @@
 from __future__ import absolute_import, print_function
 
 
+#TODO (cgates): Why does this need a string? Seems like it should take a number?
 def round_two_digits(val):
     if len(val.split(".")[1]) > 2:
         return "{0:.2f}".format(float(val))

diff --git a/jacquard/variant_callers/common_tags.py b/jacquard/variant_callers/common_tags.py
@@ -1,11 +1,17 @@
-#pylint: disable=too-few-public-methods, unused-argument
+"""Common tags used by several callers."""
 from __future__ import print_function, absolute_import
 from jacquard import __version__
 
 CALLER_REPORTED_TAG = "CALLER_REPORTED"
 CALLER_PASSED_TAG = "CALLER_PASSED"
 
 class ReportedTag(object):
+    """Tracks whether the caller reported this variant (i.e. it's in the VCF).
+
+    This tag could be inferred through the presence of other tags, but adding
+    it explicitly simplifies how summary tags are generated.
+    """
+    #pylint: disable=too-few-public-methods
     def __init__(self, tag_name):
         self.tag_name = tag_name
         self.metaheader = ('##FORMAT=<ID={}{},'
@@ -25,6 +31,14 @@ def add_tag_values(self, vcf_record):
                                         sample_values)
 
 class PassedTag(object):
+    """Tracks whether the caller passed this variant.
+
+    This enables a useful summary tag. Note that Jaquard may flag a variant
+    that originally passed as invalid (e.g. invalid Varscan ALT value). In
+    this case, this tag represents what the origin caller thought and not
+    Jacquard's opinion.
+    """
+    #pylint: disable=too-few-public-methods
     def __init__(self, tag_name):
         self.tag_name = tag_name
         self.metaheader = ('##FORMAT=<ID={}{},'

diff --git a/jacquard/variant_callers/mutect.py b/jacquard/variant_callers/mutect.py
@@ -1,13 +1,18 @@
-#pylint: disable=too-few-public-methods, unused-argument
+"""Interprets MuTect VCF files adding Jacquard standard information.
+
+MuTect VCFs are assumed to have a ".vcf" extension and have a valid 
+"##MuTect=..." metaheader.
+"""
 from __future__ import print_function, absolute_import
-import jacquard.variant_callers.common_tags as common_tags
+from jacquard import __version__
 import jacquard.utils as utils
+import jacquard.variant_callers.common_tags as common_tags
 import jacquard.vcf as vcf
-from jacquard import __version__
 
 JQ_MUTECT_TAG = "JQ_MT_"
 
 class _AlleleFreqTag(object):
+    #pylint: disable=too-few-public-methods
     def __init__(self):
         self.metaheader = ('##FORMAT=<ID={0}AF,'
                            'Number=A,'
@@ -34,6 +39,7 @@ def _standardize_af(value):
         return ",".join(new_values)
 
 class _DepthTag(object):
+    #pylint: disable=too-few-public-methods
     def __init__(self):
         self.metaheader = ('##FORMAT=<ID={0}DP,'
                            'Number=1,'
@@ -53,6 +59,7 @@ def add_tag_values(vcf_record):
             vcf_record.add_sample_tag_value(JQ_MUTECT_TAG + "DP", sample_values)
 
 class _SomaticTag(object):
+    #pylint: disable=too-few-public-methods
     def __init__(self):
         self.metaheader = ('##FORMAT=<ID={0}HC_SOM,'
                            'Number=1,'
@@ -83,37 +90,50 @@ def _somatic_status(ss_value):
             return "0"
 
 class Mutect(object):
+    """Recognize and transform MuTect VCFs to standard Jacquard format.
+    
+    MuTect VCFs are blessedly compliant and straightforward to translate, with
+    the following exception. The incoming header has the sample name values
+    (derived from the input alignments). To play well with other callers like
+    Strelka and VarScan, the sample headers are replaced with Normal and Tumor.
+    """
+    _MUTECT_METAHEADER_PREFIX = "##MuTect"
     _NORMAL_SAMPLE_KEY = "normal_sample_name"
     _TUMOR_SAMPLE_KEY = "tumor_sample_name"
 
     def __init__(self):
         self.name = "MuTect"
         self.abbr = "MT"
-        self.tags = [common_tags.ReportedTag(JQ_MUTECT_TAG),
-                     common_tags.PassedTag(JQ_MUTECT_TAG),
-                     _AlleleFreqTag(), _DepthTag(), _SomaticTag()]
-        self.file_name_search = ""
 
-    #TODO: (cgates): Why using ints instead of boolean for this method?
+    ##TODO (cgates): deprecate; remove
     @staticmethod
-    def validate_input_file(meta_headers, column_header):
-        valid = 0
+    def validate_input_file(meta_headers, dummy_column_header):
         for line in meta_headers:
-            if "##MuTect" in line:
-                valid = 1
-                break
-        return valid
+            if line.startswith(Mutect._MUTECT_METAHEADER_PREFIX):
+                return True
+        return False
 
     @staticmethod
     def _is_mutect_vcf(file_reader):
-        if file_reader.file_name.endswith(".vcf"):
+        if file_reader.file_name.lower().endswith(".vcf"):
             vcf_reader = vcf.VcfReader(file_reader)
             for metaheader in vcf_reader.metaheaders:
-                if metaheader.startswith("##MuTect"):
+                if metaheader.startswith(Mutect._MUTECT_METAHEADER_PREFIX):
                     return True
         return False
 
     def claim(self, file_readers):
+        """Recognizes and claims MuTect VCFs form the set of all input VCFs.
+
+        Each defined caller has a chance to evaluate and claim all the incoming
+        files as something that it can process.
+        
+        Args:
+            file_readers: the collection of currently unclaimed files
+        
+        Returns:
+            A tuple of unclaimed readers and MuTectVcfReaders.
+        """
         unclaimed_readers = []
         vcf_readers = []
         for file_reader in file_readers:
@@ -128,7 +148,7 @@ def claim(self, file_readers):
     def _build_mutect_dict(metaheaders):
         mutect_dict = {}
         for metaheader in metaheaders:
-            if metaheader.startswith("##MuTect="):
+            if metaheader.startswith(Mutect._MUTECT_METAHEADER_PREFIX):
                 split_line = metaheader.strip('"').split(" ")
                 for item in split_line:
                     split_item = item.split("=")
@@ -140,6 +160,12 @@ def _build_mutect_dict(metaheaders):
         return mutect_dict
 
     def _get_new_column_header(self, vcf_reader):
+        """Returns a standardized column header.
+
+        MuTect sample headers include the name of input alignment, which is
+        nice, but doesn't match up with the sample names reported in Strelka
+        or VarScan. To fix this, we replace with NORMAL and TUMOR using the
+        MuTect metadata command line to replace them correctly."""
         mutect_dict = self._build_mutect_dict(vcf_reader.metaheaders)
 
         new_header_list = []
@@ -162,6 +188,14 @@ def _get_new_column_header(self, vcf_reader):
 
 
 class _MutectVcfReader(object):
+    """Adapter that presents a MuTect VCF as a VcfReader.
+
+    This follows the VcfReader interface, delegating calls to the base
+    VcfReader, adjusting metaheaders, column header, and individual
+    variants as appropriate.
+
+    See VcfReader for more info.
+    """
     def __init__(self, vcf_reader):
         self._vcf_reader = vcf_reader
         self._caller = Mutect()

diff --git a/jacquard/variant_callers/strelka.py b/jacquard/variant_callers/strelka.py
@@ -1,4 +1,14 @@
-#pylint: disable=too-few-public-methods, unused-argument
+"""Interprets Strelka VCF files adding Jacquard standard information.
+
+* Strelka produces a separate file for SNVs and indels. Jacquard can process 
+    either or both.
+* Jacquard standard tags are based on tier 2 data and use different source tags
+    based on whether the file is indel or snp.
+* Strelka VCFs are assumed to have a ".vcf" extension and have a
+    "##source=strelka" metaheader.
+
+See tag definitions for more info.
+"""
 from __future__ import print_function, absolute_import
 import jacquard.vcf as vcf
 import jacquard.variant_callers.common_tags as common_tags
@@ -8,6 +18,7 @@
 JQ_STRELKA_TAG = "JQ_SK_"
 
 class _AlleleFreqTag(object):
+    #pylint: disable=too-few-public-methods
     def __init__(self):
         #pylint: disable=line-too-long
         self.metaheader = ('##FORMAT=<ID={0}AF,'
@@ -84,6 +95,7 @@ def _standardize_af(value):
         return utils.round_two_digits(value)
 
 class _DepthTag(object):
+    #pylint: disable=too-few-public-methods
     REQUIRED_TAGS = set(["DP2", "AU"])
     NUCLEOTIDE_DEPTH_TAGS = ["AU", "CU", "TU", "GU"]
 
@@ -117,7 +129,10 @@ def add_tag_values(vcf_record):
             sample_values[sample] = _DepthTag._get_tier2_base_depth(sample_tags)
         vcf_record.add_sample_tag_value(JQ_STRELKA_TAG + "DP", sample_values)
 
+
+##TODO (cgates): Make this robust to sample order changes
 class _SomaticTag(object):
+    #pylint: disable=too-few-public-methods
     def __init__(self):
         #pylint: disable=line-too-long
         self.metaheader = ('##FORMAT=<ID={0}HC_SOM,'
@@ -143,13 +158,22 @@ def _somatic_status(cls, sample_index, vcf_record):
             return "0"
 
 class Strelka(object):
+    """Recognize and transform Strelka VCFs to standard Jacquard format.
+    
+    Note that Strelka sometimes reports variants which fail the filter as
+    having an ALT of "."; this seems to be Strelka's shorthand for saying,
+    "I couldn't be sure there was an ALT". Unfortunately, that's not a valid
+    ALT value so these rows are flagged and (in a typical workflow) excluded.
+    """
+
     def __init__(self):
         self.name = "Strelka"
         self.abbr = "SK"
         self.meta_header = "##jacquard.normalize_strelka.sources={0},{1}\n"
 
+    ##TODO (cgates): deprecated; remove
     @staticmethod
-    def validate_input_file(meta_headers, column_header):
+    def validate_input_file(meta_headers, dummy_column_header):
         return "##source=strelka" in meta_headers
 
     @staticmethod
@@ -160,6 +184,17 @@ def _is_strelka_vcf(file_reader):
         return False
 
     def claim(self, file_readers):
+        """Recognizes and claims Strelka VCFs form the set of all input VCFs.
+
+        Each defined caller has a chance to evaluate and claim all the incoming
+        files as something that it can process.
+        
+        Args:
+            file_readers: the collection of currently unclaimed files
+        
+        Returns:
+            A tuple of unclaimed readers and StrelkaVcfReaders.
+        """
         unclaimed_readers = []
         vcf_readers = []
         for file_reader in file_readers:
@@ -171,6 +206,14 @@ def claim(self, file_readers):
         return (unclaimed_readers, vcf_readers)
 
 class _StrelkaVcfReader(object):
+    """Adapter that presents a Strelka VCF as a VcfReader.
+
+    This follows the VcfReader interface, delegating calls to the base
+    VcfReader, adjusting metaheaders, and individual
+    variants as appropriate.
+
+    See VcfReader for more info.
+    """
     def __init__(self, vcf_reader):
         self._vcf_reader = vcf_reader
         self._caller = Strelka()