Skip to content

Commit

Permalink
Bumped version; adjusted Mutect transform and summary to improve how …
Browse files Browse the repository at this point in the history
…phased variants are handled
  • Loading branch information
cgates committed Sep 26, 2020
1 parent 5d05dcf commit ef5e79b
Show file tree
Hide file tree
Showing 13 changed files with 56 additions and 18 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
Changelog
=========

x.y.z (MM/DD/YYYY)
-----------------
- Adjusted Mutect handling to correctly marked phased normal (0|0)
as non-somatic.
- Adjusted summarize to ignore phasing when creating consensus GT

1.1.3 (9/24/2020)
-----------------
- Adjusted Mutect handling to accommodate either older FA or newer AF
Expand Down
Binary file modified examples/examples.zip
Binary file not shown.
2 changes: 1 addition & 1 deletion jacquard/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.1.3"
__version__ = "1.1.3x"
22 changes: 15 additions & 7 deletions jacquard/utils/summarize_rollup_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,22 +223,30 @@ def __init__(self):
'from JQ_*_GT and JQ_*_CALLER_PASSED). Majority '
'rules; ties go to the least unusual variant '
'(0/1>0/2>1/1). Variants which failed their '
'filter are ignored.'))
'filter are ignored. Phasing is removed.'))

@staticmethod
def _prioritize_genotype(values):
def _prioritize_genotype(gts):

def _dephase(gt):
result = gt
if '|' in gt:
result = '/'.join(sorted(gt.split('|')))
return result

def _break_ties(value):
if value == "0/0":
return "99"
return value

dephased_gts = list(map(_dephase, gts))
count = defaultdict(int)
for value in values:
count[value] += 1
for gt in dephased_gts:
count[gt] += 1

sorted_values = sorted(values,
key=lambda x: (-count[x], _break_ties(x)))
return sorted_values[0]
sorted_gts = sorted(dephased_gts,
key=lambda x: (-count[x], _break_ties(x)))
return sorted_gts[0]

def add_tag_values(self, vcf_record):
new_sample_tag_values = {}
Expand Down
2 changes: 1 addition & 1 deletion jacquard/variant_caller_transforms/mutect.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def _somatic_status(vcf_record, sample):
vcf_record.alt)
raise utils.JQException(msg)

if gt == "0/0":
if gt == "0/0" or gt == '0|0':
return "0"
else:
return "1"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
##FORMAT=<ID=JQ_SUMMARY_DP_AVERAGE,Number=1,Type=Float,Description="Average depth across recognized variant callers that reported depth for this sample-locus; rounded to integer [round(average(JQ_*_DP))].">
##FORMAT=<ID=JQ_SUMMARY_DP_RANGE,Number=1,Type=Float,Description="Max(depth) - min (depth) across recognized callers.">
##FORMAT=<ID=JQ_SUMMARY_DP_ZSCORE,Number=1,Type=Float,Description="Concordance of reported depth across callers: [(this DP range - mean DP range)/standard dev(all DP ranges)]. Values with null or missing DP range will be assigned zscore of '.'.">
##FORMAT=<ID=JQ_SUMMARY_HC_GT,Number=1,Type=String,Description="High confidence consensus genotype (inferred from JQ_*_GT and JQ_*_CALLER_PASSED). Majority rules; ties go to the least unusual variant (0/1>0/2>1/1). Variants which failed their filter are ignored.">
##FORMAT=<ID=JQ_SUMMARY_HC_GT,Number=1,Type=String,Description="High confidence consensus genotype (inferred from JQ_*_GT and JQ_*_CALLER_PASSED). Majority rules; ties go to the least unusual variant (0/1>0/2>1/1). Variants which failed their filter are ignored. Phasing is removed.">
##FORMAT=<ID=JQ_SUMMARY_SOM_COUNT,Number=1,Type=Integer,Description="Count of recognized variant callers that reported confident somatic call for this sample-locus.">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT tiny_strelka|NORMAL tiny_strelka|TUMOR
chr1 1147545 . A G . . JQ_SUMMARY_SAMPLES_REPORTED_COUNT=2;JQ_SUMMARY_SAMPLES_PASSED_COUNT=0 JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM:JQ_SK_CALLER_REPORTED:JQ_SK_CALLER_PASSED:JQ_SUMMARY_CALLERS_REPORTED_COUNT:JQ_SUMMARY_CALLERS_PASSED_COUNT:JQ_SUMMARY_CALLERS_REPORTED_LIST:JQ_SUMMARY_CALLERS_PASSED_LIST:JQ_SUMMARY_AF_AVERAGE:JQ_SUMMARY_AF_RANGE:JQ_SUMMARY_DP_AVERAGE:JQ_SUMMARY_DP_RANGE:JQ_SUMMARY_SOM_COUNT:JQ_SUMMARY_HC_GT 0.0:27:1:1:0:1:0:SK:.:0.0:.:27:.:1:. 0.31:35:1:1:0:1:0:SK:.:0.31:.:35:.:1:.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
##FORMAT=<ID=JQ_SUMMARY_DP_AVERAGE,Number=1,Type=Float,Description="Average allele frequency across recognized variant callers that reported frequency for this position; rounded to integer [round(average(JQ_*_DP))].">
##FORMAT=<ID=JQ_SUMMARY_DP_RANGE,Number=1,Type=Float,Description="Max(depth) - min (depth) across recognized callers.">
##FORMAT=<ID=JQ_SUMMARY_DP_ZSCORE,Number=1,Type=Float,Description="Concordance of reported depth across callers: [(this DP range - mean DP range)/standard dev(all DP ranges)]. Values with null or missing DP range will be assigned zscore of '.'.">
##FORMAT=<ID=JQ_SUMMARY_HC_GT,Number=1,Type=String,Description="High confidence consensus genotype (inferred from JQ_*_GT and JQ_*_CALLER_PASSED). Majority rules; ties go to the least unusual variant (0/1>0/2>1/1). Variants which failed their filter are ignored.">
##FORMAT=<ID=JQ_SUMMARY_HC_GT,Number=1,Type=String,Description="High confidence consensus genotype (inferred from JQ_*_GT and JQ_*_CALLER_PASSED). Majority rules; ties go to the least unusual variant (0/1>0/2>1/1). Variants which failed their filter are ignored. Phasing is removed.">
##FORMAT=<ID=JQ_SUMMARY_SOM_COUNT,Number=1,Type=Integer,Description="Count of recognized variant callers that reported confident somatic call for this sample-position.">
##FORMAT=<ID=JQ_VS_AF,Number=A,Type=Float,Description="Jacquard allele frequency for VarScan: Decimal allele frequency rounded to 4 digits (based on FREQ)">
##FORMAT=<ID=JQ_VS_CALLER_PASSED,Number=1,Type=Integer,Description="1 = variant FILTER is PASS in original VCF">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ JQ_SUMMARY_CALLERS_REPORTED_LIST FORMAT Comma-separated list variant callers whi
JQ_SUMMARY_DP_AVERAGE FORMAT Average allele frequency across recognized variant callers that reported frequency for this position; rounded to integer [round(average(JQ_*_DP))].
JQ_SUMMARY_DP_RANGE FORMAT Max(depth) - min (depth) across recognized callers.
JQ_SUMMARY_DP_ZSCORE FORMAT Concordance of reported depth across callers: [(this DP range - mean DP range)/standard dev(all DP ranges)]. Values with null or missing DP range will be assigned zscore of '.'.
JQ_SUMMARY_HC_GT FORMAT High confidence consensus genotype (inferred from JQ_*_GT and JQ_*_CALLER_PASSED). Majority rules; ties go to the least unusual variant (0/1>0/2>1/1). Variants which failed their filter are ignored.
JQ_SUMMARY_HC_GT FORMAT High confidence consensus genotype (inferred from JQ_*_GT and JQ_*_CALLER_PASSED). Majority rules; ties go to the least unusual variant (0/1>0/2>1/1). Variants which failed their filter are ignored. Phasing is removed.
JQ_SUMMARY_SAMPLES_PASSED_COUNT INFO Count of samples where a variant caller passed the filter in any of the Jacquard tagged VCFs
JQ_SUMMARY_SAMPLES_REPORTED_COUNT INFO Count of samples where this variant appeared in any of the Jacquard tagged VCFs (regardless of quality/filtering)
JQ_SUMMARY_SOM_COUNT FORMAT Count of recognized variant callers that reported confident somatic call for this sample-position.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
##FORMAT=<ID=JQ_SUMMARY_DP_AVERAGE,Number=1,Type=Float,Description="Average allele frequency across recognized variant callers that reported frequency for this position; rounded to integer [round(average(JQ_*_DP))].">
##FORMAT=<ID=JQ_SUMMARY_DP_RANGE,Number=1,Type=Float,Description="Max(depth) - min (depth) across recognized callers.">
##FORMAT=<ID=JQ_SUMMARY_DP_ZSCORE,Number=1,Type=Float,Description="Concordance of reported depth across callers: [(this DP range - mean DP range)/standard dev(all DP ranges)]. Values with null or missing DP range will be assigned zscore of '.'.">
##FORMAT=<ID=JQ_SUMMARY_HC_GT,Number=1,Type=String,Description="High confidence consensus genotype (inferred from JQ_*_GT and JQ_*_CALLER_PASSED). Majority rules; ties go to the least unusual variant (0/1>0/2>1/1). Variants which failed their filter are ignored.">
##FORMAT=<ID=JQ_SUMMARY_HC_GT,Number=1,Type=String,Description="High confidence consensus genotype (inferred from JQ_*_GT and JQ_*_CALLER_PASSED). Majority rules; ties go to the least unusual variant (0/1>0/2>1/1). Variants which failed their filter are ignored. Phasing is removed.">
##FORMAT=<ID=JQ_SUMMARY_SOM_COUNT,Number=1,Type=Integer,Description="Count of recognized variant callers that reported confident somatic call for this sample-position.">
##FORMAT=<ID=JQ_VS_AF,Number=A,Type=Float,Description="Jacquard allele frequency for VarScan: Decimal allele frequency rounded to 4 digits (based on FREQ)">
##FORMAT=<ID=JQ_VS_CALLER_PASSED,Number=1,Type=Integer,Description="1 = variant FILTER is PASS in original VCF">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
##FORMAT=<ID=JQ_SUMMARY_DP_AVERAGE,Number=1,Type=Float,Description="Average depth across recognized variant callers that reported depth for this sample-locus; rounded to integer [round(average(JQ_*_DP))].">
##FORMAT=<ID=JQ_SUMMARY_DP_RANGE,Number=1,Type=Float,Description="Max(depth) - min (depth) across recognized callers.">
##FORMAT=<ID=JQ_SUMMARY_DP_ZSCORE,Number=1,Type=Float,Description="Concordance of reported depth across callers: [(this DP range - mean DP range)/standard dev(all DP ranges)]. Values with null or missing DP range will be assigned zscore of '.'.">
##FORMAT=<ID=JQ_SUMMARY_HC_GT,Number=1,Type=String,Description="High confidence consensus genotype (inferred from JQ_*_GT and JQ_*_CALLER_PASSED). Majority rules; ties go to the least unusual variant (0/1>0/2>1/1). Variants which failed their filter are ignored.">
##FORMAT=<ID=JQ_SUMMARY_HC_GT,Number=1,Type=String,Description="High confidence consensus genotype (inferred from JQ_*_GT and JQ_*_CALLER_PASSED). Majority rules; ties go to the least unusual variant (0/1>0/2>1/1). Variants which failed their filter are ignored. Phasing is removed.">
##FORMAT=<ID=JQ_SUMMARY_SOM_COUNT,Number=1,Type=Integer,Description="Count of recognized variant callers that reported confident somatic call for this sample-locus.">
##FORMAT=<ID=JQ_VS_AF,Number=A,Type=Float,Description="Jacquard allele frequency for VarScan: Decimal allele frequency rounded to 4 digits (based on FREQ)">
##FORMAT=<ID=JQ_VS_CALLER_PASSED,Number=1,Type=Integer,Description="1 = variant FILTER is PASS in original VCF">
Expand Down
2 changes: 1 addition & 1 deletion test/jacquard_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def test_error_raisesTransformedMessage(self):

class JacquardTestCase(test_case.JacquardBaseTestCase):
def test_version(self):
self.assertEquals("1.1.3", jacquard.__version__)
self.assertEquals("1.1.3x", jacquard.__version__)

def test_get_execution_context(self):
command = "foo input_dir output_dir"
Expand Down
19 changes: 16 additions & 3 deletions test/utils/summarize_rollup_transform_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ class HCGenotypeTagTestCase(test_case.JacquardBaseTestCase):
_TAG_ID = "{}HC_GT".format(summarize_caller.JQ_SUMMARY_TAG)

def test_metaheader(self):
self.assertEquals('##FORMAT=<ID={}HC_GT,Number=1,Type=String,Description="High confidence consensus genotype (inferred from JQ_*_GT and JQ_*_CALLER_PASSED). Majority rules; ties go to the least unusual variant (0/1>0/2>1/1). Variants which failed their filter are ignored.">'.format(summarize_caller.JQ_SUMMARY_TAG),
self.assertEquals('##FORMAT=<ID={}HC_GT,Number=1,Type=String,Description="High confidence consensus genotype (inferred from JQ_*_GT and JQ_*_CALLER_PASSED). Majority rules; ties go to the least unusual variant (0/1>0/2>1/1). Variants which failed their filter are ignored. Phasing is removed.">'.format(summarize_caller.JQ_SUMMARY_TAG),
summarize_caller._HCGenotypeTag().metaheader)

def test_add_tag_values(self):
Expand Down Expand Up @@ -225,10 +225,23 @@ def test_prioritize_genotype(self):
tag = summarize_caller._HCGenotypeTag()
self.assertEquals("0/1", tag._prioritize_genotype(["0/1"]))
self.assertEquals("0/1", tag._prioritize_genotype(["0/0", "0/1"]))
self.assertEquals("1/1", tag._prioritize_genotype(["0/0", "1/1"]))
self.assertEquals("0/0", tag._prioritize_genotype(["0/0", "0/0", "1/1"]))
self.assertEquals("1/1", tag._prioritize_genotype(["1/1", "1/1", "0/1"]))
self.assertEquals("0/1", tag._prioritize_genotype(["1/1", "0/1", "0/1"]))
self.assertEquals("0/1", tag._prioritize_genotype(["0/1", "0/2"]))
self.assertEquals("0/1", tag._prioritize_genotype(["0/1", "0/2", "1/1", "1/2", "2/2"]))

def test_prioritize_genotype_dephases(self):
tag = summarize_caller._HCGenotypeTag()
self.assertEquals("0/1", tag._prioritize_genotype(["0|1"]))
self.assertEquals("0/1", tag._prioritize_genotype(["1|0"]))
self.assertEquals("1/3", tag._prioritize_genotype(["3|1"]))
self.assertEquals("0/0", tag._prioritize_genotype(["0|0", "0/0"]))
self.assertEquals("0/1", tag._prioritize_genotype(["0|0", "0|1", "1|0"]))
self.assertEquals("1/1", tag._prioritize_genotype(["0|0", "1|1"]))
self.assertEquals("1/1", tag._prioritize_genotype(["1|1", "1|1", "0|1"]))
self.assertEquals("0/1", tag._prioritize_genotype(["1|1", "0|1", "1|0"]))
self.assertEquals("0/1", tag._prioritize_genotype(["0|1", "0|2"]))


class AlleleFreqRangeTagTestCase(test_case.JacquardBaseTestCase):
_TAG_ID = "{}AF_RANGE".format(summarize_caller.JQ_SUMMARY_TAG)
Expand Down
11 changes: 11 additions & 0 deletions test/variant_caller_transforms/mutect_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,17 @@ def test_filterPassSomeSamplesVariants(self):
tag.add_tag_values(processedVcfRecord)
self.assertEquals(expected, processedVcfRecord.text())

def test_filterPassSomeSamplesPhasedVariants(self):
tag = mutect._SomaticTagFilterMutectCalls()
line = 'CHROM^POS^ID^REF^ALT^QUAL^PASS^INFO^F1:F2:F3:GT^SA.1:SA.2:SA.3:0|0^SB.1:SB.2:SB.3:0|1\n'.replace('^', '\t')
expected = 'CHROM^POS^ID^REF^ALT^QUAL^PASS^INFO^F1:F2:F3:GT:{0}HC_SOM^SA.1:SA.2:SA.3:0|0:0^SB.1:SB.2:SB.3:0|1:1\n'\
.replace('^', '\t')\
.format(mutect.JQ_MUTECT_TAG)
processedVcfRecord = vcf.VcfRecord.parse_record(line, ["SA", "SB"])
tag.add_tag_values(processedVcfRecord)
self.assertEquals(expected, processedVcfRecord.text())



class MutectTestCase(test_case.JacquardBaseTestCase):
def setUp(self):
Expand Down

0 comments on commit ef5e79b

Please sign in to comment.