# Reformat 12/11/2019 strelka2 and loFreq results

Reformatting Strelka and loFreq outputs for use with hap.py.  
Brad Chapman at bcbio has Python code that produces more accurate VCF genotype annotations: https://github.com/bcbio/bcbio-nextgen/blob/master/bcbio/variation/strelka2.py

## Strelka2

In [202]:
%%bash
## Extra FORMAT header line for the annotation step.
##  The tag is called FOO as a workaround: using GT directly caused a
##  segfault in bcftools annotate.
printf '%s\n' '##FORMAT=<ID=FOO,Number=1,Type=String,Description="Genotype">' \
    > header_line.txt

## Sample rename map: old sample name, then new sample name
printf '%s\n' 'TUMOR HG002' > rename.txt


## Reformatting STRELKA VCFs
## Reformat a Strelka2 somatic VCF: keep only the TUMOR sample, give every
## record a placeholder "0/1" genotype, rename the sample using rename.txt,
## and write a bgzipped, tabix-indexed VCF.
##
## Arguments: $1 - input Strelka VCF (must contain a sample named TUMOR)
##            $2 - path of the bgzipped output VCF
## Reads:     header_line.txt (FORMAT/FOO definition) and rename.txt from
##            the current directory
## Writes:    scratch files annots.bed.gz (+ index) in the current directory,
##            plus $2 and its tabix index
## Returns:   0 on success; 2 on missing arguments; non-zero if any step fails
reformat_strelka_somatic () {
    if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
        echo "usage: reformat_strelka_somatic <in.vcf.gz> <out.vcf.gz>" >&2
        return 2
    fi
    local vcf=$1
    local outvcf=$2

    ## Drop scratch files left by a previous call.
    ## -f: an already-clean directory (first call) is not an error.
    rm -f -- annots.bed*

    ## Build the annotation table: one row per record carrying the
    ## placeholder genotype. pipefail (scoped to the subshell) makes a
    ## failing bcftools stage visible instead of being masked by awk.
    (
        set -o pipefail
        bcftools query -f'%CHROM\t%POS0\t%END\n' "$vcf" \
            | awk -F'\t' 'BEGIN {OFS = FS} { print $1, $2, $3, "0/1"}' \
            > annots.bed
    ) || return 1
    bgzip annots.bed || return 1
    tabix -s1 -b2 -e3 annots.bed.gz || return 1

    ## Annotate under the temporary tag FOO, then rename FOO -> GT.
    ##  * The first stage emits plain VCF (-Ov): its output is consumed
    ##    straight away, so compressing it was wasted work.
    ##  * The rename only touches the FORMAT header definition and the
    ##    FORMAT column (field 9); a global substitution would also rewrite
    ##    "FOO" inside IDs, INFO values and provenance header lines.
    ## NOTE(review): the VCF spec wants GT as the first FORMAT key; this
    ## rename keeps the key where FOO was placed - confirm downstream tools
    ## accept that ordering.
    (
        set -o pipefail
        bcftools view -Ov -s TUMOR "$vcf" \
            | bcftools annotate \
                -a annots.bed.gz \
                -h header_line.txt \
                -c CHROM,FROM,TO,FMT/FOO \
                -s TUMOR \
            | awk -F'\t' '
                BEGIN { OFS = FS }
                /^##FORMAT=<ID=FOO,/ { sub(/<ID=FOO,/, "<ID=GT,") }
                !/^#/ && NF >= 9 {
                    n = split($9, keys, ":")
                    for (i = 1; i <= n; i++) if (keys[i] == "FOO") keys[i] = "GT"
                    fmt = keys[1]
                    for (i = 2; i <= n; i++) fmt = fmt ":" keys[i]
                    $9 = fmt
                }
                { print }
            ' \
            | bcftools view -Oz \
            | bcftools reheader -s rename.txt \
            > "$outvcf"
    ) || return 1

    ## -f: overwrite a stale index from an earlier run instead of refusing
    tabix -f "$outvcf"
}


## Reformat the indel and SNV call sets one after the other
for variant_type in indels snvs; do
    reformat_strelka_somatic \
        "data/HG2r5_S2_L001_001.markdup.sorted.${variant_type}.vcf.gz" \
        "processed/HG2r5_20191211_strelka_reformatted.${variant_type}.vcf.gz"
done

## Merge the two reformatted call sets into one sorted, indexed VCF
## (the brace expansion yields the indels file first, then the snvs file)
bcftools concat -a \
    processed/HG2r5_20191211_strelka_reformatted.{indels,snvs}.vcf.gz \
    | bcftools sort -Oz > processed/HG2r5_20191211_strelka.vcf.gz
tabix processed/HG2r5_20191211_strelka.vcf.gz

## Remove intermediates: per-type VCFs with their indexes, then helper files
for variant_type in indels snvs; do
    rm "processed/HG2r5_20191211_strelka_reformatted.${variant_type}.vcf.gz"*
done
rm rename.txt
rm annots*
rm header_line.txt

Writing to /tmp/bcftools-sort.XB0LHU
Merging 1 temporary files
Cleaning
Done


Verifying format change

In [203]:
%%bash
## Show the FORMAT header definitions, the column header line and the first
## five records of the combined Strelka VCF.
strelka_vcf=processed/HG2r5_20191211_strelka.vcf.gz
bcftools view -h "$strelka_vcf" | zgrep '^##FORMAT='
bcftools view -h "$strelka_vcf" | zgrep '^#CHROM'
bcftools view -H "$strelka_vcf" | head -n 5

##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth for tier1">
##FORMAT=<ID=DP2,Number=1,Type=Integer,Description="Read depth for tier2">
##FORMAT=<ID=TAR,Number=2,Type=Integer,Description="Reads strongly supporting alternate allele for tiers 1,2">
##FORMAT=<ID=TIR,Number=2,Type=Integer,Description="Reads strongly supporting indel allele for tiers 1,2">
##FORMAT=<ID=TOR,Number=2,Type=Integer,Description="Other reads (weak support or insufficient indel breakpoint overlap) for tiers 1,2">
##FORMAT=<ID=DP50,Number=1,Type=Float,Description="Average tier1 read depth within 50 bases">
##FORMAT=<ID=FDP50,Number=1,Type=Float,Description="Average tier1 number of basecalls filtered from original read depth within 50 bases">
##FORMAT=<ID=SUBDP50,Number=1,Type=Float,Description="Average number of reads below tier1 mapping quality threshold aligned across sites within 50 bases">
##FORMAT=<ID=BCN50,Number=1,Type=Float,Description="Fraction of filtered reads within 50 bases of the indel.">

## loFreq

Checking loFreq VCF format

In [207]:
%%bash
## Print the full header, then the first five records, of the raw loFreq VCF
VCF=data/HG2r5_S2_L001_001.markdup.sorted.somatic-snvs.vcf.gz
bcftools view -h "$VCF"
bcftools view -H "$VCF" | head -n 5

##fileformat=VCFv4.0
##FILTER=<ID=PASS,Description="All filters passed">
##fileDate=20191211
##source=lofreq call -d 101000 -f ./genome/hs37d5.fa --verbose --no-default-filter -b 1 -a 0.010000 -C 7 -s -S prefixnormal_stringent.snvs.vcf.gz,prefixnormal_stringent.indels.vcf.gz -l /home/dnanexus/in/target_bed/No_chr_SortBed_on_removed_ex_trusight_inh_dis_manifest_a.bed --no-default-filter -r 1:1-124625310 -o /tmp/lofreq2_call_parallelhL9J0c/0.vcf.gz /home/dnanexus/in/tumor_bam/HG2r5_S2_L001_001.markdup.sorted.bam 
##reference=./genome/hs37d5.fa
##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw Depth">
##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">
##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias at this position">
##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
##INFO=<ID=C

[W::vcf_parse] INFO 'SOMATIC' is not defined in the header, assuming Type=String


In [248]:
%%bash 
## Add a FORMAT column and a single HG002 sample with a placeholder "0/1"
## genotype to a sample-less loFreq VCF, then bgzip and tabix-index it.
##
## Arguments: $1 - input loFreq VCF (gzip/bgzip compressed or plain text)
##            $2 - path of the uncompressed output VCF; the final file is
##                 "$2.gz" because bgzip replaces $2 with its compressed form
## Returns:   0 on success; 2 on missing arguments; non-zero if the input is
##            unreadable or any step fails
## NOTE(review): bcftools reports INFO 'SOMATIC' as undefined in the header
## of the loFreq file; no definition is added here - confirm whether one
## should be.
reformat_lofreq () {
    if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
        echo "usage: reformat_lofreq <in.vcf.gz> <out.vcf>" >&2
        return 2
    fi
    local vcf=$1
    local outvcf=$2

    ## Fail before touching the output when the input cannot be read
    if [ ! -r "$vcf" ]; then
        echo "reformat_lofreq: cannot read input VCF: $vcf" >&2
        return 1
    fi

    ## Single pass over the decompressed input:
    ##   "##" meta lines -> copied unchanged
    ##   "#CHROM" line   -> preceded by the FORMAT/GT definition and extended
    ##                      with the FORMAT and HG002 columns
    ##   records         -> first 8 columns plus "GT" and "0/1"
    ## pipefail (scoped to the subshell) surfaces decompression failures.
    (
        set -o pipefail
        gzip -cdf -- "$vcf" \
            | awk -F'\t' \
                -v gt_header='##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' '
                BEGIN { OFS = FS }
                /^##/ { print; next }
                /^#CHROM/ {
                    print gt_header
                    print $1, $2, $3, $4, $5, $6, $7, $8, "FORMAT", "HG002"
                    next
                }
                /^#/ { next }
                { print $1, $2, $3, $4, $5, $6, $7, $8, "GT", "0/1" }
            ' > "$outvcf"
    ) || return 1

    ## -f: overwrite outputs of an earlier run instead of refusing/prompting
    bgzip -f "$outvcf" || return 1
    tabix -f "$outvcf.gz"
}

## Reformat the loFreq somatic SNV calls
lofreq_in=data/HG2r5_S2_L001_001.markdup.sorted.somatic-snvs.vcf.gz
lofreq_out=processed/HG2r5_20191211_loFreq.vcf
reformat_lofreq "$lofreq_in" "$lofreq_out"

In [249]:
%%bash
## Print the full header, then the first five records, of the reformatted
## loFreq VCF
VCF=processed/HG2r5_20191211_loFreq.vcf.gz
bcftools view -h "$VCF"
bcftools view -H "$VCF" | head -n 5

##fileformat=VCFv4.0
##FILTER=<ID=PASS,Description="All filters passed">
##fileDate=20191211
##source=lofreq call -d 101000 -f ./genome/hs37d5.fa --verbose --no-default-filter -b 1 -a 0.010000 -C 7 -s -S prefixnormal_stringent.snvs.vcf.gz,prefixnormal_stringent.indels.vcf.gz -l /home/dnanexus/in/target_bed/No_chr_SortBed_on_removed_ex_trusight_inh_dis_manifest_a.bed --no-default-filter -r 1:1-124625310 -o /tmp/lofreq2_call_parallelhL9J0c/0.vcf.gz /home/dnanexus/in/tumor_bam/HG2r5_S2_L001_001.markdup.sorted.bam 
##reference=./genome/hs37d5.fa
##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw Depth">
##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">
##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias at this position">
##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
##INFO=<ID=C

[W::vcf_parse] INFO 'SOMATIC' is not defined in the header, assuming Type=String


In [250]:
!jupyter nbconvert --to script 20191211_reformat_vcfs.ipynb

[NbConvertApp] Converting notebook 20191211_reformat_vcfs.ipynb to script
[NbConvertApp] Writing 3822 bytes to 20191211_reformat_vcfs.py
