Skip to content

Commit

Permalink
ex-258 (jebene/dkriti) fixed bug in merge for unsorted vcfs. added
Browse files Browse the repository at this point in the history
functional test
  • Loading branch information
dkriti committed May 26, 2015
1 parent 7d55ff7 commit 9a39011
Show file tree
Hide file tree
Showing 6 changed files with 328 additions and 4 deletions.
5 changes: 3 additions & 2 deletions jacquard/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ def _sort_vcf(reader, sorted_dir):
writer.write(vcf_record.text())

writer.close()
reader = vcf.VcfReader(vcf.FileReader(writer.output_filepath))
reader = MergeVcfReader(vcf.FileReader(writer.output_filepath))
return reader

def _get_unsorted_readers(vcf_readers):
Expand Down Expand Up @@ -491,7 +491,8 @@ def _sort_readers(vcf_readers, output_path):
unsorted_count = 0
if unsorted_readers:
sorted_dir = os.path.join(os.path.dirname(output_path), "tmp")
os.makedirs(sorted_dir)
if not os.path.isdir(sorted_dir):
os.makedirs(sorted_dir)

for reader in vcf_readers:
if reader in unsorted_readers:
Expand Down
34 changes: 34 additions & 0 deletions test/functional_tests/02_merge_unsorted/benchmark/merged.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
##fileformat=VCFv4.1
##jacquard.merge.sample=<Column=1,Name=tiny|NORMAL,Source=tiny.mutect.normalized.jacquardTags.HCsomatic.vcf|tiny.strelka.normalized.jacquardTags.HCsomatic.vcf|tiny.varscan.normalized.jacquardTags.HCsomatic.vcf>
##jacquard.merge.sample=<Column=2,Name=tiny|TUMOR,Source=tiny.mutect.normalized.jacquardTags.HCsomatic.vcf|tiny.strelka.normalized.jacquardTags.HCsomatic.vcf|tiny.varscan.normalized.jacquardTags.HCsomatic.vcf>
##jacquard=<Timestamp="2015-03-30 13:37:01",Command="<module 'jacquard.merge' from 'C:\Users\jebene\git\Jacquard\jacquard\merge.pyc'>",Cwd="C:\Users\jebene\git">
##contig=<ID=chr1,length=249250621>
##contig=<ID=chr2,length=243199373>
##contig=<ID=chr3,length=198022430>
##contig=<ID=chr13,length=115169878>
##INFO=<ID=JQ_MULT_ALT_LOCUS,Number=0,Type=Flag,Description="More than one alt allele was seen at this locus.">
##FORMAT=<ID=JQ_MT_AF,Number=A,Type=Float,Description="Jacquard allele frequency for MuTect: Decimal allele frequency rounded to 2 digits (based on FA)">
##FORMAT=<ID=JQ_MT_DP,Number=1,Type=Integer,Description="Jacquard depth for MuTect (based on DP)">
##FORMAT=<ID=JQ_MT_HC_SOM,Number=1,Type=Integer,Description="Jacquard somatic status for MuTect: 0=non-somatic,1=somatic (based on SS FORMAT tag)">
##FORMAT=<ID=JQ_SK_AF,Number=A,Type=Float,Description="Jacquard allele frequency for Strelka: Decimal allele frequency rounded to 2 digits (based on alt_depth/total_depth. Uses (TIR tier 2)/DP2 if available, otherwise uses (ACGT tier2 depth) / DP2)">
##FORMAT=<ID=JQ_SK_DP,Number=1,Type=Integer,Description="Jacquard depth for Strelka (uses DP2 if available, otherwise uses ACGT tier2 depth)">
##FORMAT=<ID=JQ_SK_HC_SOM,Number=1,Type=Integer,Description="Jacquard somatic status for Strelka: 0=non-somatic,1=somatic (based on PASS in FILTER column)">
##FORMAT=<ID=JQ_VS_AF,Number=A,Type=Float,Description="Jacquard allele frequency for VarScan: Decimal allele frequency rounded to 2 digits (based on FREQ)">
##FORMAT=<ID=JQ_VS_DP,Number=1,Type=Integer,Description="Jacquard depth for VarScan (based on DP)">
##FORMAT=<ID=JQ_VS_HC_SOM,Number=1,Type=Integer,Description="Jacquard somatic status for VarScan: 0=non-somatic,1=somatic (based on SOMATIC info tag and if sample is TUMOR)">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT tiny|NORMAL tiny|TUMOR
chr1 14948 . G A . . . JQ_MT_AF:JQ_MT_DP:JQ_MT_HC_SOM:JQ_VS_AF:JQ_VS_DP:JQ_VS_HC_SOM 0.09:174:0:0.06:171:0 0.13:302:0:0.14:303:1
chr1 137622 . G A . . . JQ_MT_AF:JQ_MT_DP:JQ_MT_HC_SOM 0.24:35:0 0.29:42:1
chr1 1147545 . A G . . . JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM 0.0:27:0 0.31:35:1
chr1 1169795 . C T . . . JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM 0.0:37:0 0.22:46:1
chr1 1444553 . C T . . . JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM 0.0:29:0 0.35:31:1
chr1 1459635 . C A . . . JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM 0.0:35:0 0.17:30:1
chr1 1572893 . G A . . . JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM 0.02:65:0 0.09:81:1
chr1 1696633 . GA G . . . JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM 0.0:48:0 0.27:45:1
chr1 1910112 . G A . . . JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM 0.0:28:0 0.34:35:1
chr1 1912090 . G A . . JQ_MULT_ALT_LOCUS JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM 0.0:27:0 0.31:32:1
chr1 1912090 . G T . . JQ_MULT_ALT_LOCUS JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM 0.0:27:0 0.31:32:1
chr2 3412474 . G A . . . JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM 0.0:46:0 0.31:61:1
chr2 3545096 . G T . . . JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM 0.0:37:0 0.25:63:1
chr3 137624 . G A . . . JQ_MT_AF:JQ_MT_DP:JQ_MT_HC_SOM 0.26:34:0 0.23:41:1
chr13 3545099 . G C . . . JQ_SK_AF:JQ_SK_DP:JQ_SK_HC_SOM 0.0:35:0 0.22:63:1
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="Accept as a confident somatic mutation">
##FILTER=<ID=REJECT,Description="Rejected as a confident somatic mutation">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=BQ,Number=A,Type=Float,Description="Average base quality for reads supporting alleles">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=FA,Number=A,Type=Float,Description="Allele fraction of the alternate allele with regard to reference">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SS,Number=1,Type=Integer,Description="Variant status relative to non-adjacent Normal,0=wildtype,1=germline,2=somatic,3=LOH,4=post-transcriptional modification,5=unknown">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic event">
##INFO=<ID=VT,Number=1,Type=String,Description="Variant type, can be SNP, INS or DEL">
##MuTect="analysis_type=MuTect input_file=[11N_25714.sorted.bam, 11Ta_25715.sorted.bam] read_buffer_size=null phone_home=STANDARD gatk_key=null tag=NA read_filter=[BadCigar] intervals=[TargetRegion_buffered10bases.bed] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=ucsc.hg19.fasta nonDeterministicRandomSeed=false disableRandomization=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 enable_experimental_downsampling=false baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 defaultBaseQualities=-1 validation_strictness=SILENT remove_program_records=false keep_program_records=false unsafe=null num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false noop=false enable_extended_output=false artifact_detection_mode=false tumor_sample_name=25715 bam_tumor_sample_name=null normal_sample_name=25714 force_output=false force_alleles=false only_passing_calls=false initial_tumor_lod=4.0 tumor_lod=6.3 fraction_contamination=0.02 minimum_mutation_cell_fraction=0.0 normal_lod=2.2 normal_artifact_lod=1.0 strand_artifact_lod=2.0 strand_artifact_power_threshold=0.9 dbsnp_normal_lod=5.5 somatic_classification_normal_power_threshold=0.95 minimum_normal_allele_fraction=0.0 tumor_f_pretest=0.0050 min_qscore=5 gap_events_threshold=3 heavily_clipped_read_fraction=0.3 clipping_bias_pvalue_threshold=0.05 fraction_mapq0_threshold=0.5 pir_median_threshold=10.0 pir_mad_threshold=3.0 required_maximum_alt_allele_mapping_quality_score=20 max_alt_alleles_in_normal_count=2 
max_alt_alleles_in_normal_qscore_sum=20 max_alt_allele_in_normal_fraction=0.03 power_constant_qscore=30 absolute_copy_number_data=null power_constant_af=0.30000001192092896 vcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub dbsnp=[(RodBinding name=dbsnp source=dbsnp_138.hg19.vcf)] cosmic=[(RodBinding name=cosmic source=Cosmic.v68.hg19.vcf)] normal_panel=[] coverage_20_q20_file=null power_file=null tumor_depth_file=null normal_depth_file=null filter_mismatching_base_and_quals=false"
##contig=<ID=chrM,length=16571,assembly=hg19>
##contig=<ID=chr1,length=249250621,assembly=hg19>
##contig=<ID=chr2,length=243199373,assembly=hg19>
##contig=<ID=chr3,length=198022430,assembly=hg19>
##contig=<ID=chr4,length=191154276,assembly=hg19>
##contig=<ID=chr5,length=180915260,assembly=hg19>
##contig=<ID=chr6,length=171115067,assembly=hg19>
##contig=<ID=chr7,length=159138663,assembly=hg19>
##contig=<ID=chr8,length=146364022,assembly=hg19>
##contig=<ID=chr9,length=141213431,assembly=hg19>
##contig=<ID=chr10,length=135534747,assembly=hg19>
##contig=<ID=chr11,length=135006516,assembly=hg19>
##contig=<ID=chr12,length=133851895,assembly=hg19>
##contig=<ID=chr13,length=115169878,assembly=hg19>
##contig=<ID=chr14,length=107349540,assembly=hg19>
##contig=<ID=chr15,length=102531392,assembly=hg19>
##contig=<ID=chr16,length=90354753,assembly=hg19>
##contig=<ID=chr17,length=81195210,assembly=hg19>
##contig=<ID=chr18,length=78077248,assembly=hg19>
##contig=<ID=chr19,length=59128983,assembly=hg19>
##contig=<ID=chr20,length=63025520,assembly=hg19>
##contig=<ID=chr21,length=48129895,assembly=hg19>
##contig=<ID=chr22,length=51304566,assembly=hg19>
##contig=<ID=chrX,length=155270560,assembly=hg19>
##contig=<ID=chrY,length=59373566,assembly=hg19>
##contig=<ID=chr1_gl000191_random,length=106433,assembly=hg19>
##contig=<ID=chr1_gl000192_random,length=547496,assembly=hg19>
##contig=<ID=chr4_ctg9_hap1,length=590426,assembly=hg19>
##contig=<ID=chr4_gl000193_random,length=189789,assembly=hg19>
##contig=<ID=chr4_gl000194_random,length=191469,assembly=hg19>
##contig=<ID=chr6_apd_hap1,length=4622290,assembly=hg19>
##contig=<ID=chr6_cox_hap2,length=4795371,assembly=hg19>
##contig=<ID=chr6_dbb_hap3,length=4610396,assembly=hg19>
##contig=<ID=chr6_mann_hap4,length=4683263,assembly=hg19>
##contig=<ID=chr6_mcf_hap5,length=4833398,assembly=hg19>
##contig=<ID=chr6_qbl_hap6,length=4611984,assembly=hg19>
##contig=<ID=chr6_ssto_hap7,length=4928567,assembly=hg19>
##contig=<ID=chr7_gl000195_random,length=182896,assembly=hg19>
##contig=<ID=chr8_gl000196_random,length=38914,assembly=hg19>
##contig=<ID=chr8_gl000197_random,length=37175,assembly=hg19>
##contig=<ID=chr9_gl000198_random,length=90085,assembly=hg19>
##contig=<ID=chr9_gl000199_random,length=169874,assembly=hg19>
##contig=<ID=chr9_gl000200_random,length=187035,assembly=hg19>
##contig=<ID=chr9_gl000201_random,length=36148,assembly=hg19>
##contig=<ID=chr11_gl000202_random,length=40103,assembly=hg19>
##contig=<ID=chr17_ctg5_hap1,length=1680828,assembly=hg19>
##contig=<ID=chr17_gl000203_random,length=37498,assembly=hg19>
##contig=<ID=chr17_gl000204_random,length=81310,assembly=hg19>
##contig=<ID=chr17_gl000205_random,length=174588,assembly=hg19>
##contig=<ID=chr17_gl000206_random,length=41001,assembly=hg19>
##contig=<ID=chr18_gl000207_random,length=4262,assembly=hg19>
##contig=<ID=chr19_gl000208_random,length=92689,assembly=hg19>
##contig=<ID=chr19_gl000209_random,length=159169,assembly=hg19>
##contig=<ID=chr21_gl000210_random,length=27682,assembly=hg19>
##contig=<ID=chrUn_gl000211,length=166566,assembly=hg19>
##contig=<ID=chrUn_gl000212,length=186858,assembly=hg19>
##contig=<ID=chrUn_gl000213,length=164239,assembly=hg19>
##contig=<ID=chrUn_gl000214,length=137718,assembly=hg19>
##contig=<ID=chrUn_gl000215,length=172545,assembly=hg19>
##contig=<ID=chrUn_gl000216,length=172294,assembly=hg19>
##contig=<ID=chrUn_gl000217,length=172149,assembly=hg19>
##contig=<ID=chrUn_gl000218,length=161147,assembly=hg19>
##contig=<ID=chrUn_gl000219,length=179198,assembly=hg19>
##contig=<ID=chrUn_gl000220,length=161802,assembly=hg19>
##contig=<ID=chrUn_gl000221,length=155397,assembly=hg19>
##contig=<ID=chrUn_gl000222,length=186861,assembly=hg19>
##contig=<ID=chrUn_gl000223,length=180455,assembly=hg19>
##contig=<ID=chrUn_gl000224,length=179693,assembly=hg19>
##contig=<ID=chrUn_gl000225,length=211173,assembly=hg19>
##contig=<ID=chrUn_gl000226,length=15008,assembly=hg19>
##contig=<ID=chrUn_gl000227,length=128374,assembly=hg19>
##contig=<ID=chrUn_gl000228,length=129120,assembly=hg19>
##contig=<ID=chrUn_gl000229,length=19913,assembly=hg19>
##contig=<ID=chrUn_gl000230,length=43691,assembly=hg19>
##contig=<ID=chrUn_gl000231,length=27386,assembly=hg19>
##contig=<ID=chrUn_gl000232,length=40652,assembly=hg19>
##contig=<ID=chrUn_gl000233,length=45941,assembly=hg19>
##contig=<ID=chrUn_gl000234,length=40531,assembly=hg19>
##contig=<ID=chrUn_gl000235,length=34474,assembly=hg19>
##contig=<ID=chrUn_gl000236,length=41934,assembly=hg19>
##contig=<ID=chrUn_gl000237,length=45867,assembly=hg19>
##contig=<ID=chrUn_gl000238,length=39939,assembly=hg19>
##contig=<ID=chrUn_gl000239,length=33824,assembly=hg19>
##contig=<ID=chrUn_gl000240,length=41933,assembly=hg19>
##contig=<ID=chrUn_gl000241,length=42152,assembly=hg19>
##contig=<ID=chrUn_gl000242,length=43523,assembly=hg19>
##contig=<ID=chrUn_gl000243,length=43341,assembly=hg19>
##contig=<ID=chrUn_gl000244,length=39929,assembly=hg19>
##contig=<ID=chrUn_gl000245,length=36651,assembly=hg19>
##contig=<ID=chrUn_gl000246,length=38154,assembly=hg19>
##contig=<ID=chrUn_gl000247,length=36422,assembly=hg19>
##contig=<ID=chrUn_gl000248,length=39786,assembly=hg19>
##contig=<ID=chrUn_gl000249,length=38502,assembly=hg19>
##reference=file:ucsc.hg19.fasta
##jacquard.version=X
##jacquard.command=tag c:\\users\\jebene\\appdata\\local\\temp\\tmptivahf\\normalize c:\\users\\jebene\\appdata\\local\\temp\\tmptivahf\\tag --force
##jacquard.cwd=C:\\Users\\jebene\\git
##jacquard.tag.caller=MuTect
##FORMAT=<ID=JQ_MT_AF,Number=A,Type=Float,Description="Jacquard allele frequency for MuTect: Decimal allele frequency rounded to 2 digits (based on FA)">
##FORMAT=<ID=JQ_MT_DP,Number=1,Type=Integer,Description="Jacquard depth for MuTect (based on DP)">
##FORMAT=<ID=JQ_MT_HC_SOM,Number=1,Type=Integer,Description="Jacquard somatic status for MuTect: 0=non-somatic,1=somatic (based on SS FORMAT tag)">
##jacquard.filterHCSomatic.excluded_variants=37
##jacquard.version=X
##jacquard.command=filter_hc_somatic c:\\users\\jebene\\appdata\\local\\temp\\tmptivahf\\tag c:\\users\\jebene\\appdata\\local\\temp\\tmptivahf\\filter_hc_somatic --force
##jacquard.cwd=C:\\Users\\jebene\\git
##jacquard.filterHCSomatic.total_highConfidence_somatic_positions=12
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
chr1 14948 rs201855936 G A . REJECT DB GT:AD:BQ:DP:FA:JQ_MT_AF:JQ_MT_DP:JQ_MT_HC_SOM 0:179,17:.:174:0.087:0.09:174:0 0/1:264,38:33:302:0.126:0.13:302:0
chr3 137624 rs376555728 G A . PASS DB GT:AD:BQ:DP:FA:SS:JQ_MT_AF:JQ_MT_DP:JQ_MT_HC_SOM 0:64,21:.:36:0.253:1:0.26:34:0 0/1:32,15:33:46:0.252:2:0.23:41:1
chr1 137622 rs376555721 G A . PASS DB GT:AD:BQ:DP:FA:SS:JQ_MT_AF:JQ_MT_DP:JQ_MT_HC_SOM 0:63,20:.:35:0.241:1:0.24:35:0 0/1:30,12:32:42:0.286:2:0.29:42:1
Loading

0 comments on commit 9a39011

Please sign in to comment.