# Burden test:
    Date: 13 mar 2019
    Last Edit: 14 Apr 2019
    Install all dependencies
    Ref: Rvtest https://genome.sph.umich.edu/wiki/Rvtests 
         Exomiser https://www.sanger.ac.uk/science/tools/exomiser
    Run cells step by step
    Runtime: 30 minutes with input vcf 120MB
    

## Update
Not edit GT ./.

## Dependencies:

    1/ python packages in import cell (pandas, numpy, pysam)
    2/ bedtools
    3/ vcftools (vcf-subset, vcf-sort, vcf-merge)
    4/ htslib (bgzip, tabix)
    5/ exomiser 8.0.0 https://github.com/exomiser/Exomiser/releases/tag/8.0.0
    6/ rvtests https://github.com/zhanxw/rvtests

# Burden test Framework

## 1/ Review data
## 2/ Filter position with bed file
## 3/ Split sample from vcf
## 4/ Filter read depth (min_base_count = 10) and maybe ref, alt
## 5/ Annotation with exomiser
## 6/ Filter function_class (protein affecting) and maybe freq
## 7/ Merge results
## 8/ Preprocessing and Burden test
## 9/ Output report


In [1]:
# import cell
import subprocess
import os
import fileinput
import sys

import pandas as pd
import numpy as np
from pysam import VariantFile as psVa

## ============= Modify these parameters ====================

In [2]:
# folder containing all input: vcf, bedfile, correspondance_table, exome file
input_path = "/data/BurdenTest/Lithiasis_mar/"

# vcf and input file name. All input files must be in input folder
vcf_name = "Lithiasis_set_20082018-123-samples.vcf"
bedfile = "BED_Letav-sorted.bed"
ctable = "Correspondance_table.tsv"
exomefile = "exome.yml"
genefile = input_path+"refFlat.txt.gz"

# List of protein affection for filtering
af_pro = ["missense_variant", "splice_region_variant", "stop_gained", "frameshift_variant", "stop_lost", "inframe_deletion", "inframe_insertion"] 

# Tool path: 
exom_path = "/home/phung/Documents/work/exomiser/exomiser_cli_8/exomiser-cli-8.0.0.jar"
rvtest_pwd = "/data/rvtests/executable/rvtest"


## ============= Run cells and do not modify ====================

In [3]:
output = input_path+"output"
out_data = output + "/data"
out_exome = output + "/exome"
out_rvtest = output + "/rvtest"
out_report = output +"/report"
os.system("mkdir "+output)
os.system("mkdir "+out_data)
os.system("mkdir "+out_exome)
os.system("mkdir "+out_rvtest)
os.system("mkdir "+out_report)
print "***** All outputs are in the folder: ", output

***** All outputs are in the folder:  /data/BurdenTest/Lithiasis_mar/output


In [4]:
input_vcf = input_path + vcf_name
input_bed = input_path + bedfile
input_table = input_path + ctable
input_exome = input_path + exomefile

In [5]:
name = vcf_name[0:len(vcf_name)-4]

# 1/ Review input data

In [6]:
a = subprocess.check_output("grep -v '^#' "+input_vcf+" | cut -f 1 | sort", shell = True)
print "Number of variants (base pairs): \n", a.count('\n')

Number of variants (base pairs): 
13280


In [7]:
a = subprocess.check_output("grep -v '^#' "+input_vcf+" | cut -f 1 | sort | uniq -c", shell = True)
print "Number of chrome: \n", a.count('\n')
print "Number of variants in each chrome: \n", a

Number of chrome: 
18
Number of variants in each chrome: 
    878 1
    584 10
    694 11
    836 12
    766 13
   1057 16
    424 19
    363 2
    267 20
    324 21
    778 3
    598 4
    870 5
    796 6
    844 7
    694 8
   1846 9
    661 X



In [8]:
vcf_reader = psVa(input_vcf)
l_samples = list(vcf_reader.header.samples)
print "Number of samples:", len(l_samples)
print "Name of samples:\n", l_samples

Number of samples: 123
Name of samples:
['c36cb72c-36c7-4dfc-b627-e68d51659e00', '1745b724-f5cd-4d7c-b966-05867e5d8915', '6d7d6e6d-3acf-46ab-87fc-35fab69dbb61', 'c17be704-6dec-460f-8604-6ab18a884a13', '7a1f3b10-cbbe-4add-88e4-1153968b31a9', '72741619-3a88-4583-b630-fc8acb056865', 'effdc78a-4118-4434-9731-f6c4786b8e1e', 'fe4f46ba-4582-4c12-9f55-597aad067a0d', '06c3e551-31d9-4118-ba7a-a94f5b1d95e6', '064d6e92-11d9-44ea-8b6a-98708ebc80b1', 'c468f784-c044-4256-89ee-e19a117434a7', '2bc926dd-710c-4844-af85-3428bbf46fea', '895840c1-6479-4d2a-bf91-7b3b781da157', '95fc3f50-daa0-4268-ae0e-3e21851afb72', '48006d66-7118-4cdf-a58f-aed98c7d10c0', '0dfb23a0-2a65-4e50-bad5-11f036ae8ef5', '981436f2-8ccb-4f4e-b961-bb14d1197d80', 'f97978ac-e9de-4e13-973d-66063061bc9a', 'f98e84b6-bff5-4a1f-80e7-b8e34c597f7a', 'c6520554-d64f-419e-8d23-1e8f286ae648', '91b80aa1-3d66-49e8-92a6-38ef4931f498', '25e071f1-9503-4b6f-98b1-d96579d36ccf', '9666e41a-9cd8-4013-8fe4-8d1c5ddeb861', '129848a8-0a7d-4daf-ad4c-bb7b719d0e67',

# 2/ Filter with bed file

In [9]:
# compare chrome names in vcf and bed file. Make sure the same format (or replace("X", "GL000192.1", 1))
df_bed = pd.read_csv(input_bed, sep = "\t")
df_bed.head()

Unnamed: 0,1,21835858,21904905,1.1
0,1,165370159,165414433,-1
1,1,43198764,43205925,-1
2,1,156211753,156213112,1
3,10,97471536,97637023,1
4,10,75668935,75677255,1


In [10]:
vcf_bedfil = out_data + '/' + name + '_filterposition.vcf'
cmn = "bedtools intersect -a " + input_vcf + " -b " + input_bed + " -header > " + vcf_bedfil
print subprocess.check_output(cmn, shell = True)
print "***** Filter position with bedtools. Output is ", vcf_bedfil


***** Filter position with bedtools. Output is  /data/BurdenTest/Lithiasis_mar/output/data/Lithiasis_set_20082018-123-samples_filterposition.vcf


In [11]:
# check number of variants after filter position 
a = subprocess.check_output("grep -v '^#' "+ vcf_bedfil +" | cut -f 1 | sort", shell = True)
print "Number of variants after filtering positions: ", a.count('\n')

Number of variants after filtering positions:  13211


In [12]:
a = subprocess.check_output("grep -v '^#' "+ vcf_bedfil +" | cut -f 1 | sort | uniq -c", shell = True)
print "Number of chrome: \n", a.count('\n')
print "Number of variants(Snps) in each chrome: \n", a

Number of chrome: 
18
Number of variants(Snps) in each chrome: 
    871 1
    584 10
    694 11
    836 12
    766 13
   1057 16
    424 19
    363 2
    267 20
    324 21
    775 3
    596 4
    861 5
    748 6
    844 7
    694 8
   1846 9
    661 X



In [13]:
## delete ##contig lines created by position filter. If not, it's false when running exomiser

ffinput = fileinput.input(vcf_bedfil, inplace=1)

for i, line in enumerate(ffinput):
    if line[0:8] == "##contig":
        newline = "" 
    else: newline = line

    sys.stdout.write(newline)

ffinput.close()

# 3/ Split sample, Extract sample from multi-sample vcf
    To run exomiser, we have to split vcf file to be single-sample vcf

In [14]:
%%time
# use vcftools to extract sample so it must have bgzip and tabix files


print "***** Sort, zip, tabix file: ", vcf_bedfil
out_sort = vcf_bedfil[0:len(vcf_bedfil)-4] + "_sort.vcf"
bzip = out_sort + ".gz"
comm1 = "vcf-sort " + vcf_bedfil + " > " + out_sort
comm2 = "bgzip -c " + out_sort + " > " + bzip
comm3 = "tabix -p vcf " + bzip
subprocess.check_output(comm1, shell = True)
subprocess.check_output(comm2, shell = True)
subprocess.check_output(comm3, shell = True)



***** Sort, zip, tabix file:  /data/BurdenTest/Lithiasis_mar/output/data/Lithiasis_set_20082018-123-samples_filterposition.vcf
CPU times: user 7.35 ms, sys: 6.45 ms, total: 13.8 ms
Wall time: 3.52 s


In [15]:
%%time

print "***** Split samples ..."
for k in range(len(l_samples)):
    # monitor the process:
    sys.stdout.write('\r') 
    sys.stdout.write("[%-20s] %d/%d" % ('='*(1+k*20/len(l_samples)), k+1,len(l_samples)))
    sys.stdout.flush()
    
    comm = "vcf-subset -c " + l_samples[k] + " " + bzip + " > " + vcf_bedfil[0:len(vcf_bedfil)-4] + "_s" + str(k+1) + ".vcf"
    subprocess.check_output(comm, shell = True)
print "\n***** Done."

***** Split samples ...
***** Done.
CPU times: user 220 ms, sys: 501 ms, total: 721 ms
Wall time: 5min 19s


# 4/ Filter read depth (min_depth = 10, max_depth = 500)

In [16]:
def filter_DP(file_vcf):
    vcf_out = file_vcf[0:len(file_vcf)-4] + "_DP.vcf"
    f_in = open(file_vcf)
    F_OUT = open(vcf_out, "wb")

    read = f_in.readline()

    while read:
        newline = read
        if "#" not in read[0]:
            row = newline.split("\t")
            l_col = len(row)

            if l_col>7:

                tmp_samp = row[l_col-1].split(":")
                if len(tmp_samp[2]) == 1: # less than 10
                    newline = ""
                else:
                    if int(tmp_samp[2]) > 500:
                        newline = ""


        F_OUT.write(newline)       
        read = f_in.readline()


    f_in.close()
    F_OUT.close()

In [17]:
%%time
print "***** Running filter read depth ..."
for k in range(len(l_samples)):
    file_vcf = vcf_bedfil[0:len(vcf_bedfil)-4] + "_s" + str(k+1) + ".vcf"
    filter_DP(file_vcf)
print "***** Done."

***** Running filter read depth ...
***** Done.
CPU times: user 12.3 s, sys: 1.64 s, total: 14 s
Wall time: 14.7 s


In [18]:
print "***** File name after filter read depth: ", vcf_bedfil[0:len(vcf_bedfil)-4] + "_s" + str(1) + "_DP.vcf"

***** File name after filter read depth:  /data/BurdenTest/Lithiasis_mar/output/data/Lithiasis_set_20082018-123-samples_filterposition_s1_DP.vcf


# 5/ Annotation with Exomiser

In [19]:
print "***** Output for annotation in: ", out_exome
out_exome_conf = out_exome + "/conf"
out_exome_result = out_exome + "/result"
os.system("mkdir "+out_exome_conf)
os.system("mkdir "+out_exome_result)


***** Output for annotation in:  /data/BurdenTest/Lithiasis_mar/output/exome


0

In [20]:

print "***** Generate conf files for running EXOMISER in ", out_exome_conf
for k in range(len(l_samples)):
    #print "sample:", k+1
    file_yml = out_exome_conf + "/exome_s" + str(k+1) + ".yml"
    os.system("cp " + input_exome + " " + file_yml)

    finput = fileinput.input(file_yml, inplace=1)
    file_vcf = vcf_bedfil[0:len(vcf_bedfil)-4] + "_s" + str(k+1) + "_DP.vcf"
    file_result = out_exome_result + '/'+name + "_exome_s" + str(k+1)
    
    for i, line in enumerate(finput):
        newline = line.replace("dataname", file_vcf)
        newline = newline.replace("resultname", file_result)
        sys.stdout.write(newline)
        
    finput.close()
print "***** Output this part will be in ", out_exome_result

***** Generate conf files for running EXOMISER in  /data/BurdenTest/Lithiasis_mar/output/exome/conf
***** Output this part will be in  /data/BurdenTest/Lithiasis_mar/output/exome/result


In [21]:
%%time
print "***** Annotation by EXOMISER *****"
for k in range(len(l_samples)):
    # monitor the process:
    sys.stdout.write('\r') 
    sys.stdout.write("[%-20s] %d/%d" % ('='*(1+k*20/len(l_samples)), k+1,len(l_samples)))
    sys.stdout.flush()
    
    file_yml = out_exome_conf + "/exome_s" + str(k+1) + ".yml"
    comm = "time java -Xms2g -Xmx10g -jar " + exom_path + " --analysis " + file_yml
    
    subprocess.check_output(comm, shell = True)
print "\n"    

***** Annotation by EXOMISER *****

CPU times: user 321 ms, sys: 462 ms, total: 783 ms
Wall time: 18min 55s


In [22]:
# Parallel execute to speed up process of annotation EXOMISER
# EXOMISER also has paralle processing inside but if your computer has many threads, 
# you can try following code to speed up your process.
"""
%%time
from joblib import Parallel, delayed
print "***** Annotation by EXOMISER *****"
def exome_run(k):
    
    file_yml = out_exome_conf + "/exome_s" + str(k+1) + ".yml"
    comm = "time java -Xms2g -Xmx10g -jar " + exom_path + " --analysis " + file_yml

    subprocess.check_output(comm, shell = True)
    print "Finish annotation sample", k+1
    
element_information = Parallel(n_jobs=-1)(delayed(exome_run)(k) for k in range(len(l_samples)))
"""

'\n%%time\nfrom joblib import Parallel, delayed\nprint "***** Annotation by EXOMISER *****"\ndef exome_run(k):\n    \n    file_yml = out_exome_conf + "/exome_s" + str(k+1) + ".yml"\n    comm = "time java -Xms2g -Xmx10g -jar " + exom_path + " --analysis " + file_yml\n\n    subprocess.check_output(comm, shell = True)\n    print "Finish annotation sample", k+1\n    \nelement_information = Parallel(n_jobs=-1)(delayed(exome_run)(k) for k in range(len(l_samples)))\n'

# 6/ Filter function_class (protein affecting) and maybe freq

In [23]:
# Remind list of protein affecting 
#af_pro = ["missense_variant", "splice_region_variant", "stop_gained", "frameshift_variant", "stop_lost", "inframe_deletion", "inframe_insertion"] 
print "***** Filter genes with protein affecting: ", af_pro
def filter_fc(file_va, file_vcf):
    df_va = pd.read_csv(file_va, sep = "\t")
    df_tmp = df_va[df_va.FUNCTIONAL_CLASS.isin(af_pro)]
    pos = list(df_tmp.POS)

    os.system("cp " + file_vcf + " " + file_vcf[0:len(file_vcf)-4] + "_fc.vcf")
    tmp = file_vcf[0:len(file_vcf)-4] + "_fc.vcf"
    tmpinput = fileinput.input(tmp, inplace=1)
    for i, line in enumerate(tmpinput):
        a = line.split("\t")
        if len(a)>3:
            if a[1] == "POS": newline = line
            else: 
                if int(a[1]) in pos: newline = line
                else: newline = ""

        else: 
            newline = line
        sys.stdout.write(newline)


            #sys.stderr.write(line +'\n')
    tmpinput.close()

***** Filter genes with protein affecting:  ['missense_variant', 'splice_region_variant', 'stop_gained', 'frameshift_variant', 'stop_lost', 'inframe_deletion', 'inframe_insertion']


In [24]:
%%time
for k in range(len(l_samples)):
    file_va = out_exome_result + '/'+name + "_exome_s" + str(k+1) + ".variants.tsv"
    file_vcf = out_exome_result + '/'+name + "_exome_s" + str(k+1) + ".vcf"
    filter_fc(file_va, file_vcf)

print "***** Output file name: ", out_exome_result + '/'+name + "_exome_s" + str(1) + "_fc.vcf"

***** Output file name:  /data/BurdenTest/Lithiasis_mar/output/exome/result/Lithiasis_set_20082018-123-samples_exome_s1_fc.vcf
CPU times: user 1.16 s, sys: 417 ms, total: 1.58 s
Wall time: 2.34 s


# 7/ Merge results

In [22]:
%%time
# Merge vcf results
vcf_merge = out_exome + '/' +name + "_merge_exomiser_all.vcf"
# sort and bzip files
for k in range(len(l_samples)):
    result = out_exome_result + '/'+name + "_exome_s" + str(k+1) + ".vcf"
    out_sort = out_exome_result + '/'+name + "_exome_sort_s" + str(k+1)+".vcf"
    bzip = out_exome_result + '/'+name + "_exome_bz_s" + str(k+1)+ ".vcf.gz"
    
    comm1 = "vcf-sort " + result + " > " + out_sort
    comm2 = "bgzip -c " + out_sort + " > " + bzip
    comm3 =  "tabix -p vcf " + bzip
    subprocess.check_output(comm1, shell = True)
    subprocess.check_output(comm2, shell = True)
    subprocess.check_output(comm3, shell = True)
comm4 = "vcf-merge"

CPU times: user 42.7 ms, sys: 1.14 s, total: 1.19 s
Wall time: 8.66 s


In [23]:
%%time

for k in range(len(l_samples)):
    bzip = " " + out_exome_result + '/'+name + "_exome_bz_s" + str(k+1)+ ".vcf.gz"
    comm4 = comm4 + bzip
comm4 = comm4 + " > " + vcf_merge
print subprocess.check_output(comm4, shell = True)   

print "***** Merge results of exomiser. Output is ", vcf_merge


***** Merge results of exomiser. Output is  /data/BurdenTest/Lithiasis_mar/output/exome/Lithiasis_set_20082018-123-samples_merge_exomiser_all.vcf
CPU times: user 1.31 ms, sys: 4.46 ms, total: 5.77 ms
Wall time: 21.5 s


In [24]:
# check number of variants after merging

a = subprocess.check_output("grep -v '^#' "+ vcf_merge +" | cut -f 1 | sort", shell = True)
print "Number of variants after merging: ", a.count('\n')

Number of variants after merging:  5770


# 8/ Preprocessing and Burden test

    Input to run rvtest-burden test:
            - vcf file with genetype of all patients
            - pheno file: phenotype of all patients
            - genefile: mapping position with gene-names

## 8.1/ Prepare input

In [25]:
def findchar(s, ch):
    return [i for i, ltr in enumerate(s) if ltr == ch]

In [48]:
# edit values, keep GT 1/1, 0/1 or 0/0
mfile = vcf_merge
print "***** Edit values, keep Genetype 1/1, 0/1 or 0/0"

os.system("cp " + mfile + " " + mfile[0:len(mfile)-4] + "_1.vcf")
mfile = mfile[0:len(mfile)-4] + "_1.vcf"
minput = fileinput.input(mfile, inplace=1)

for i, line in enumerate(minput):
    #sys.stderr.write(line +'\n')
    newline = line
    at1 = findchar(newline,"\t")
    if len(at1)>5:
        line1 = newline[at1[8]+1:len(newline)]
        line1 = line1.replace("./", "0/")
        line1 = line1.replace("/.", "/1")
        #line1 = line1.replace(".", "0/0")
        ac1 = findchar(line1,":")
        while len(ac1)>1:
            at2 = findchar(line1[ac1[0]:len(line1)],"\t")
            if len(at2)>0:
                line1 = line1.replace(line1[ac1[0]:ac1[0]+at2[0]], "")
            else:
                line1 = line1.replace(line1[ac1[0]:len(line1)], "\n")

            ac1 = findchar(line1,":")

        newline = newline[0:at1[8]+1] + line1
    sys.stdout.write(newline)
minput.close()
print "***** Output is ", mfile

***** Edit values, keep Genetype 1/1, 0/1 or 0/0
***** Output is  /data/BurdenTest/Lithiasis_mar/output/exome/Lithiasis_set_20082018-123-samples_merge_exomiser_all_1.vcf


In [49]:
# bgzip and tabix files for burden test using rvtest
result = mfile
#mfile = vcf_merge
#result = vcf_merge
out_sort = mfile[0:len(mfile)-6] + "_sort.vcf"
bzip = out_sort[0:len(out_sort)-4] + "_bz.vcf.gz"
comm1 = "vcf-sort " + mfile + " > " + out_sort
comm2 = "bgzip -c " + out_sort + " > " + bzip
comm3 =  "tabix -p vcf " + bzip
subprocess.check_output(comm1, shell = True)
subprocess.check_output(comm2, shell = True)
subprocess.check_output(comm3, shell = True)
print "***** Sort, zip, tabix file vcf to make input for burden test: ", bzip

***** Sort, zip, tabix file vcf to make input for burden test:  /data/BurdenTest/Lithiasis_mar/output/exome/Lithiasis_set_20082018-123-samples_merge_exomiser_all_sort_bz.vcf.gz


## 8.2/ Prepare phenotype values for burden test

extract samples ID in df_tab, then compare with ID 1 in df_bio to get values

### ===== View table and Check the correspondence table =====

In [28]:
df_tab = pd.read_csv(input_table, sep = "\t")

In [29]:
df_tab.head()

Unnamed: 0,#HASH-ID,SAMPLE-Number,SAMPLES-NAMES
0,23117b5d-6580-4d23-9b4d-d33f05d51304,p1,LABDO-F-48-PLq0-HOX0-HCaLciU0-HCt3oL0-RCt3oLOH1
1,33d41660-b7f6-455b-a8fa-b71e138c5284,p2,BERT-M-62-PLq0-HOX0-HCaLciU0-HCt3oL0-RCt3oLOH0
2,53f34b70-a595-4851-b097-b09a3b9bfb3f,p3,JAM-M-24-PLq0-HOX0-HCaLciU1-HCt3oL0-RCt3oLOH0
3,48006d66-7118-4cdf-a58f-aed98c7d10c0,p4,FRIT-M-45-PLq0-HOX0-HCaLciU0-HCt3oL1-RCt3oLOH1
4,e2509c87-a435-4d80-976b-af5163283915,p5,CHAP-F-39-PLq1-HOX0-HCaLciU1-HCt3oL1-RCt3oLOH1


In [30]:
print "Verifying correspondence table ..."
flag = 0
for i in l_samples:
    if i not in list(df_tab["#HASH-ID"]):
        print i
        flag = flag + 1
if flag == 0:
    print "***** All samples are in the table."
else:
    print "***** Above samples are not in the table"
    

Verifying correspondence table ...
***** All samples are in the table.


In [31]:
def pheno_binary(idx1, idx2, phenoname):
    plq = []
    for i in range(len(df_tab)):
        tmp = df_tab["SAMPLES-NAMES"][i].split("-")
        if tmp[idx1][idx2] != 'n':
            plq.append(tmp[idx1][idx2])
        else: 
            plq.append('0')
    df_tab[phenoname] = plq

In [32]:
pheno_binary(3, 3, "PLq")
pheno_binary(4, 3, "HOX")
pheno_binary(5, 7, "HCaLciU")
pheno_binary(6, 6, "HCt3oL")
pheno_binary(7, 8, "RCt3oLOH")
print "***** Extract phenotype from sample names. Please check below table."

***** Extract phenotype from sample names. Please check below table.


In [33]:
def pheno_extract(idx1, phenoname):
    plq = []
    for i in range(len(df_tab)):
        tmp = df_tab["SAMPLES-NAMES"][i].split("-")
        plq.append(tmp[idx1])
    df_tab[phenoname] = plq

In [34]:
pheno_extract(1, 'Sex')
pheno_extract(2, 'Age')
df_tab['Sex'] = df_tab['Sex'].map({'F': '2', 'M': '1'})

In [35]:
df_tab.head()

Unnamed: 0,#HASH-ID,SAMPLE-Number,SAMPLES-NAMES,PLq,HOX,HCaLciU,HCt3oL,RCt3oLOH,Sex,Age
0,23117b5d-6580-4d23-9b4d-d33f05d51304,p1,LABDO-F-48-PLq0-HOX0-HCaLciU0-HCt3oL0-RCt3oLOH1,0,0,0,0,1,2,48
1,33d41660-b7f6-455b-a8fa-b71e138c5284,p2,BERT-M-62-PLq0-HOX0-HCaLciU0-HCt3oL0-RCt3oLOH0,0,0,0,0,0,1,62
2,53f34b70-a595-4851-b097-b09a3b9bfb3f,p3,JAM-M-24-PLq0-HOX0-HCaLciU1-HCt3oL0-RCt3oLOH0,0,0,1,0,0,1,24
3,48006d66-7118-4cdf-a58f-aed98c7d10c0,p4,FRIT-M-45-PLq0-HOX0-HCaLciU0-HCt3oL1-RCt3oLOH1,0,0,0,1,1,1,45
4,e2509c87-a435-4d80-976b-af5163283915,p5,CHAP-F-39-PLq1-HOX0-HCaLciU1-HCt3oL1-RCt3oLOH1,1,0,1,1,1,2,39


### ===== Make phenotype file using df_tab =====

In [36]:
#shell1 = "if [ ! -d '" +out_pheno+ "' ]; then\n\t" + "mkdir " + out_pheno + "\nfi" 
#print subprocess.check_output(shell1, shell = True)
out_pheno = out_rvtest + "/pheno"
out_burden = out_rvtest + "/burden"
out_burden_gene = out_rvtest + "/burden_gene"

os.system("mkdir "+ out_pheno)
os.system("mkdir "+ out_burden_gene)
os.system("mkdir "+ out_burden)

0

In [37]:
print "***** Writing pheno file for burden test: ", out_pheno
f = open(out_pheno + '/pheno_all.ped','wb')
f.write("fid iid fatid matid sex age PLq HOX HCaLciU HCt3oL RCt3oLOH\n")

for k in range(len(df_tab)):
    print k
    name = df_tab["#HASH-ID"][k]

    text = name + " " + name + " 0 0 " + df_tab['Sex'][k] + ' ' \
            + df_tab['Age'][k] + ' '+ df_tab['PLq'][k] + ' ' \
            + df_tab['HOX'][k]+ ' '+ df_tab['HCaLciU'][k]+ ' ' \
            + df_tab['HCt3oL'][k]+ ' '+ df_tab['RCt3oLOH'][k]+ "\n"
    f.write(text)
    

f.close()

***** Writing pheno file for burden test:  /data/BurdenTest/Lithiasis_mar/output/rvtest/pheno
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122


## 8.3/ Burden normal with genefile refFlat.txt.gz
https://genome.sph.umich.edu/wiki/Rvtests

### 8.3.1/ Burden normal with only a genename 

In [44]:
bzip

'/data/BurdenTest/Lithiasis_mar/output/exome/Lithiasis_set_20082018-123-samples_merge_exomiser_all_sort_bz.vcf.gz'

### =====================================================================

### 8.3.2/ Burden test with all available genes

In [50]:
def rvtest(phenoname):
    invcf = bzip
    pheno = out_pheno + '/pheno_all.ped'
    
    
    #model = "cmc,cmcWald,zeggini,zegginiWald"
    model = "zeggini,cmc"
    outrv = out_burden + "/output_"+model+phenoname
    
    comm = rvtest_pwd + " --inVcf " + invcf + " --pheno " + pheno + " --pheno-name " +phenoname \
    + " --covar " + pheno + " --covar-name age,sex --qtl --freqUpper 0.05"  + " --geneFile "+ genefile \
    + " --out " + outrv + " --burden "+model + " --kernel skat" # adjust covar
    #comm = rvtest_pwd + " --inVcf " + invcf + " --pheno " + pheno + " --pheno-name " \
    #+ phenoname + " --qtl --freqUpper 0.05 --geneFile "+ genefile    \
    #+ " --out " + outrv + " --burden "+model + " --kernel skat"
    print comm
    print subprocess.check_output(comm, shell = True)

In [51]:
bzip

'/data/BurdenTest/Lithiasis_mar/output/exome/Lithiasis_set_20082018-123-samples_merge_exomiser_all_sort_bz.vcf.gz'

In [52]:
rvtest("PLq")

/data/rvtests/executable/rvtest --inVcf /data/BurdenTest/Lithiasis_mar/output/exome/Lithiasis_set_20082018-123-samples_merge_exomiser_all_sort_bz.vcf.gz --pheno /data/BurdenTest/Lithiasis_mar/output/rvtest/pheno/pheno_all.ped --pheno-name PLq --covar /data/BurdenTest/Lithiasis_mar/output/rvtest/pheno/pheno_all.ped --covar-name age,sex --qtl --freqUpper 0.05 --geneFile /data/BurdenTest/Lithiasis_mar/refFlat.txt.gz --out /data/BurdenTest/Lithiasis_mar/output/rvtest/burden/output_zeggini,cmcPLq --burden zeggini,cmc --kernel skat
Thank you for using rvtests (version: 20190205, git: c86e589efef15382603300dc7f4c3394c82d69b8)
  For documentations, refer to http://zhanxw.github.io/rvtests/
  For questions and comments, plase send to Xiaowei Zhan <zhanxw@umich.edu>
  For bugs and feature requests, please submit at: https://github.com/zhanxw/rvtests/issues

RVTESTS finished successfully



In [52]:
%%time
print "***** Output : ", out_burden
print "***** Running burden test ...\n"
rvtest("PLq")
rvtest("HOX")
rvtest("HCaLciU")
rvtest("HCt3oL")
rvtest("RCt3oLOH")
print "***** Done."

***** Output :  /data/BurdenTest/Lithiasis_mar/output/rvtest/burden
***** Running burden test ...

Thank you for using rvtests (version: 20171009, git: 02a02ba9f5927aee3df3b8162efed8e23aaea886)
  For documentations, refer to http://zhanxw.github.io/rvtests/
  For questions and comments, send to Xiaowei Zhan <zhanxw@umich.edu>
  For bugs and feature requests, please submit at: https://github.com/zhanxw/rvtests/issues

RVTESTS finished successfully

Thank you for using rvtests (version: 20171009, git: 02a02ba9f5927aee3df3b8162efed8e23aaea886)
  For documentations, refer to http://zhanxw.github.io/rvtests/
  For questions and comments, send to Xiaowei Zhan <zhanxw@umich.edu>
  For bugs and feature requests, please submit at: https://github.com/zhanxw/rvtests/issues

RVTESTS finished successfully

Thank you for using rvtests (version: 20171009, git: 02a02ba9f5927aee3df3b8162efed8e23aaea886)
  For documentations, refer to http://zhanxw.github.io/rvtests/
  For questions and comments, send t

# 9/ Output report

In [53]:
file_list = []
for file in os.listdir(out_burden):
    if file.endswith(".assoc"):
        a = os.path.join(out_burden, file)
        file_list.append(a)

In [54]:
print "***** Output files of burden test"
file_list

***** Output files of burden test


['/data/BurdenTest/Lithiasis_mar/output/rvtest/burden/output_HCaLciU.Zeggini.assoc',
 '/data/BurdenTest/Lithiasis_mar/output/rvtest/burden/output_RCt3oLOH.Zeggini.assoc',
 '/data/BurdenTest/Lithiasis_mar/output/rvtest/burden/output_HCt3oL.Zeggini.assoc',
 '/data/BurdenTest/Lithiasis_mar/output/rvtest/burden/output_HOX.Zeggini.assoc',
 '/data/BurdenTest/Lithiasis_mar/output/rvtest/burden/output_PLq.Zeggini.assoc']

In [55]:
n = len(file_list)

for i in range(len(file_list)):
    a = file_list[i].split('/')
    b = a[len(a)-1].split('.')
    dftmp = pd.read_csv(file_list[i], sep='\t')
    col_name = b[0][7:len(b[0])] + '_' + b[1]
    if i == 0:
        df = dftmp
        df = df.rename(columns={"Pvalue": col_name})
    else:
        df[col_name] = dftmp["Pvalue"]


In [56]:
df = df.sort_values('Gene')

In [57]:
df.to_excel(out_report + '/burden_all_1.xlsx', index = False)

In [50]:
print "***** Please find path of output report here: ", out_report + '/burden_all.xlsx'

***** Please find path of output report here:  /data/BurdenTest/Lithiasis_mar/output/report/burden_all.xlsx
