diff --git a/jacquard/jacquard.py b/jacquard/jacquard.py index 8c88a4c..5c2c752 100644 --- a/jacquard/jacquard.py +++ b/jacquard/jacquard.py @@ -1,11 +1,27 @@ #!/usr/bin/env python -"""Launcher for suite of VCF sub-commands. +"""Launch-point for suite of sub-commands. The only executable module in the project; this module * validates command line args * manages use of temp directories (to keep output clean and atomic) * dipatches to sub-commands as appropriate * attempts to deal with usage and run-time errors + +Jacquard first writes results to temp dir and only copies results on successful + completion. + +The architecture of Jacquard modules can be divided into: + * commands : These transform files or directories (e.g. translate.py) and are + indirectly executable through the jacquard module. Each command must + implement an execute method that does the heavy lifting along with some + simpler methods that expedite command validation + * callers : These transform VcfRecords (e.g. mutect.py). They typically have + a collection of tag classes; where each tag holds the metaheader and + code to transform a single VcfRecord. Note that a caller could + manipulate any aspect of a VcfRecord, but (by strong convention) typically + only adds information, for example add a sample-format tag, add an info + field, or add a filter field. + * helpers : Common functionality (e.g. command_validator, vcf, logger, etc.) """ ## Copyright 2014 Bioinformatics Core, University of Michigan ## diff --git a/jacquard/merge.py b/jacquard/merge.py index 82af313..eb88217 100644 --- a/jacquard/merge.py +++ b/jacquard/merge.py @@ -1,12 +1,28 @@ """Merges a set of VCF files into a single VCF file. -Each incoming VCF record is joined with other VCF records that share the same -"coordinate", where coordinate is the (chrom, pos, ref, and alt). -The merged file will have as many records as the distinct set of -(chrom, pos, ref, alt) across all input files. 
-The FORMAT tags from all incoming VCFs are aggregated to a single merged list. -Each variant record will be the merged set of incoming format tags. -Incoming fixed fields (e.g. QUAL, FILTER, INFO) are ignored. +* Merge assumes the incoming VCFs are aligned to the same set of contigs. +* Merge assumes incoming VCF file names follow this pattern: + patientIdentifier.*.vcf + Specifically, the first element of the VCF file name should be the patient + name; for example you might have this: + patientA.mutect.vcf, patientA.strelka.vcf, patientA.varscan.vcf, + patientB.mutect.vcf, patientB.strelka.vcf, patientB.varscan.vcf, + etc. +* Merge assumes the sample names (i.e. the VCF sample column headers, typically + TUMOR and NORMAL) are consistent across the input VCFs. (The preceding + Jacquard command "translate" ensures this is true.) +* Each incoming VCF record is joined with other VCF records that share the same + "coordinate", where coordinate is the (chrom, pos, ref, and alt). +* Calls for the same patient/sample are joined into a single column. Each output + column is the patient name (prefix of the file name) plus the sample name + from the column header. +* The merged file will have as many records as the distinct set of + (chrom, pos, ref, alt) across all input files. +* Each variant record will have the minimal set of incoming format tags for + that variant (i.e. the list of format tags is specific to each record). +* Incoming QUAL and INFO fields are ignored. +* By default, merge only includes "Jacquard" FORMAT tags, but other tags can be + included through an optional command line arg. """ from __future__ import print_function, absolute_import from collections import defaultdict, OrderedDict diff --git a/jacquard/translate.py b/jacquard/translate.py index eb73da4..802c006 100644 --- a/jacquard/translate.py +++ b/jacquard/translate.py @@ -1,8 +1,14 @@ """Translates a set of VCF files by adding standardized tags. 
-Reads incoming VCF files determining appropriate origin caller (e.g. MuTect); -emits a new file with additional translated versions of incoming FORMAT tags -(e.g. JQ_MT_AF). +Reads incoming VCF files determining appropriate "origin caller" (e.g. MuTect); +emits a new translated file. The translated file is similar to the input +file, with these exceptions: + * translate will add a filter flag to anomalous VCF records, i.e. records + that don't conform to the standard; for example both Strelka and + Varscan emit VCF records with invalid ALT values. + * translate will add new Jacquard-standard FORMAT tags that augment the + caller specific tags (e.g. a Varscan FREQ tag would generate a new + JQ_VS_AF tag). There will typically be a translated VCF file for each input VCF file. Unrecognized VCFs are not copied to output. @@ -141,6 +147,10 @@ def _log_unclaimed_readers(unclaimed_readers): msg = unclaimed_log_messgae.format(reader.file_name) logger.warning(msg) + +#TODO (cgates): This module is both a command and also manipulates VcfRecords +# like a caller. This is the only body of code that does both these things. +# Does this bother anyone else? def execute(args, execution_context): validate_args(args) diff --git a/jacquard/utils.py b/jacquard/utils.py index cce03a5..553cc38 100644 --- a/jacquard/utils.py +++ b/jacquard/utils.py @@ -6,6 +6,7 @@ from __future__ import absolute_import, print_function +#TODO (cgates): Why does this need a string? Seems like it should take a number? 
def round_two_digits(val): if len(val.split(".")[1]) > 2: return "{0:.2f}".format(float(val)) diff --git a/jacquard/variant_callers/common_tags.py b/jacquard/variant_callers/common_tags.py index ad7aa7e..27c44db 100644 --- a/jacquard/variant_callers/common_tags.py +++ b/jacquard/variant_callers/common_tags.py @@ -1,4 +1,4 @@ -#pylint: disable=too-few-public-methods, unused-argument +"""Common tags used by several callers.""" from __future__ import print_function, absolute_import from jacquard import __version__ @@ -6,6 +6,12 @@ CALLER_PASSED_TAG = "CALLER_PASSED" class ReportedTag(object): + """Tracks whether the caller reported this variant (i.e. it's in the VCF). + + This tag could be inferred through the presence of other tags, but adding + it explicitly simplifies how summary tags are generated. + """ + #pylint: disable=too-few-public-methods def __init__(self, tag_name): self.tag_name = tag_name self.metaheader = ('##FORMAT=