## From vcf to data for SKAT in R

### 1/ Input:
- vcf file: variants with genotype
- gene file or bed file: contains gene names and their genomic ranges (chromosome and start-end positions)
- ped file: phenotype
- R script: runs SKAT

### 2/ Dependencies:
- R
- bedtools
- Python packages listed in the import cell

Runtime: 2 min for this example

In [1]:
# import cell
import subprocess
import os
import fileinput
import sys

import pandas as pd

from pysam import VariantFile as psVa

In [2]:
# Input locations -- edit these to point at your own data.
input_folder = '/data/Rstudio/data'
vcf_file = '/data/Rstudio/data/annoexom.vcf'
gene_file = '/data/Rstudio/data/genefile.txt'
pheno_file = '/data/Rstudio/data/pheno_all.ped'
# File receiving the p-values; it lives directly inside input_folder.
out_file = '/'.join([input_folder, 'out_pvalue.txt'])

In [3]:
# Scratch directory for the intermediate files written while processing genes.
tmp_folder = input_folder + '/tmp'
# Create it from Python instead of shelling out to `mkdir`: no quoting issues
# with unusual paths, a real exception if creation fails, and no error when
# the notebook is re-run and the directory already exists.
if not os.path.isdir(tmp_folder):
    os.makedirs(tmp_folder)

0

In [4]:
# Sort, bgzip and tabix-index the vcf file so that bedtools can be run on it.
# Each tool is invoked with an argument list (no shell), so paths containing
# spaces or shell metacharacters are passed through untouched. The `>`
# redirections are replaced by writing the tool's stdout straight to a file.
vcf_sort = os.path.splitext(vcf_file)[0] + '_sort.vcf'
vcf_sort_gz = vcf_sort + '.gz'

with open(vcf_sort, 'wb') as sorted_out:
    subprocess.check_call(['vcf-sort', vcf_file], stdout=sorted_out)

with open(vcf_sort_gz, 'wb') as gz_out:
    subprocess.check_call(['bgzip', '-c', vcf_sort], stdout=gz_out)

# check_call raises CalledProcessError when a tool exits with a non-zero status.
subprocess.check_call(['tabix', '-p', 'vcf', vcf_sort_gz])


''

In [5]:
# Load the gene table; change the delimiter to whatever your file uses.
df_gene = pd.read_csv(gene_file, delimiter='\t')
df_gene.head(n=5)

Unnamed: 0,Gene,RANGE
0,IDUA,"4:980784-998345,4:980784-998345"
1,AQP1,"7:30951308-30965132,7:30951308-30965132"
2,HOGA1,"10:99344101-99372555,10:99344101-99372555"
3,TRPV6,7:142568955-142583490
4,DGKH,"13:42712177-42830716,13:42712177-42817033,13:4..."


In [6]:
# Load the phenotype table; change the delimiter to whatever your file uses.
df_pheno = pd.read_csv(pheno_file, delimiter=' ')
df_pheno.head(n=5)

Unnamed: 0,fid,iid,fatid,matid,sex,age,PLq,HOX,HCaLciU,HCt3oL,RCt3oLOH
0,23117b5d-6580-4d23-9b4d-d33f05d51304,23117b5d-6580-4d23-9b4d-d33f05d51304,0,0,2,48,0,0,0,0,1
1,33d41660-b7f6-455b-a8fa-b71e138c5284,33d41660-b7f6-455b-a8fa-b71e138c5284,0,0,1,62,0,0,0,0,0
2,53f34b70-a595-4851-b097-b09a3b9bfb3f,53f34b70-a595-4851-b097-b09a3b9bfb3f,0,0,1,24,0,0,1,0,0
3,48006d66-7118-4cdf-a58f-aed98c7d10c0,48006d66-7118-4cdf-a58f-aed98c7d10c0,0,0,1,45,0,0,0,1,1
4,e2509c87-a435-4d80-976b-af5163283915,e2509c87-a435-4d80-976b-af5163283915,0,0,2,39,1,0,1,1,1


In [7]:
# function remove header to transform to dataframe
def remove_header(mfile):
    """Write a copy of a vcf file without its '##' meta-information lines.

    Every line that does not start with '##' (including the single-'#'
    column-header line) is copied unchanged, so the result can be loaded as
    a tab-separated table. The input file itself is left untouched.

    Parameters
    ----------
    mfile : str
        Path of the vcf file to read.

    Returns
    -------
    str
        Path of the file that was written: the input path with its extension
        replaced by '_1.vcf' (e.g. 'a/b.vcf' -> 'a/b_1.vcf').

    Raises
    ------
    IOError / OSError
        If the input cannot be read or the output cannot be written.
    """
    stripped_path = os.path.splitext(mfile)[0] + "_1.vcf"
    # Filter in a single pass from source to destination. This avoids
    # shelling out to `cp` (which breaks on paths with spaces) and avoids
    # temporarily redirecting sys.stdout into the file.
    with open(mfile) as src, open(stripped_path, "w") as dst:
        for line in src:
            if not line.startswith("##"):
                dst.write(line)
    return stripped_path

In [8]:
%%time
# Generate one genotype matrix (rows = samples, columns = variants) per gene
# and write it to <tmp_folder>/<gene>.csv.

# Genotype string -> integer code stored in the matrix. Defined once, outside
# the loop, because it does not depend on the gene being processed.
GENOTYPE_CODES = {
    '0/0': 0,
    '0/1': 1, '1/0': 1, '0/2': 1, '2/0': 1, '3/0': 1, '0/3': 1,
    '1/1': 2, '2/2': 2, '3/3': 2, '4/4': 2,
}
# Code given to every genotype string that is not a key of GENOTYPE_CODES.
MISSING_CODE = 9

# Sample ids in phenotype-file order; matrix rows are emitted in this order.
sample_ids = list(df_pheno['fid'])

# Scratch files, overwritten for every gene.
input_bed = tmp_folder + '/mybed.bed'
vcf_bedfil = tmp_folder + '/vcf_filbed.vcf'

for gene_name, rang in zip(df_gene['Gene'], df_gene['RANGE']):

    # Write the gene's ranges ("chr:start-end,chr:start-end,...") as a bed
    # file: one "chr<TAB>start<TAB>end" line per range. Text mode plus a
    # context manager: the handle is always closed and str writes are valid.
    with open(input_bed, 'w') as bed:
        for region in rang.split(','):
            chrom, span = region.split(':')[:2]
            start, end = span.split('-')[:2]
            bed.write(chrom + '\t' + start + '\t' + end + '\n')

    # Keep only the variants overlapping the bed ranges (vcf header retained).
    # Argument list + direct stdout redirection, so no shell quoting issues.
    with open(vcf_bedfil, 'wb') as filtered:
        subprocess.check_call(
            ['bedtools', 'intersect', '-a', vcf_sort, '-b', input_bed, '-header'],
            stdout=filtered)

    # Drop the '##' meta lines so the file can be read as a table.
    mfile = remove_header(vcf_bedfil)

    df = pd.read_csv(mfile, sep='\t')
    # A variant is reported once per overlapping bed range, and the gene file
    # can repeat ranges, so identical rows are collapsed here.
    df = df.drop_duplicates()
    cols = df.columns

    # Columns from position 9 onwards are the per-sample genotype columns;
    # translate their strings into integer codes.
    for col in cols[9:]:
        df[col] = df[col].map(GENOTYPE_CODES).fillna(MISSING_CODE).astype(int)

    # Select the sample columns in phenotype order and transpose in one step
    # (replaces a row-by-row DataFrame.append, which is quadratic and no
    # longer exists in pandas >= 2.0). Cast to float so the csv holds
    # 0.0/1.0/2.0/9.0, the number format this step has always produced.
    df_sort = df[sample_ids].T.astype(float)

    # Output matrix to csv file (tab-separated, sample ids not written).
    matrix_path = tmp_folder + '/' + gene_name + '.csv'
    df_sort.to_csv(matrix_path, sep='\t', index=False)
    print('Output file: %s with number of variants  %d' % (matrix_path, len(df_sort.columns)))


Output file: /data/Rstudio/data/tmp/IDUA.csv with number of variants  237
Output file: /data/Rstudio/data/tmp/AQP1.csv with number of variants  72
Output file: /data/Rstudio/data/tmp/HOGA1.csv with number of variants  87
Output file: /data/Rstudio/data/tmp/TRPV6.csv with number of variants  104
Output file: /data/Rstudio/data/tmp/DGKH.csv with number of variants  203
Output file: /data/Rstudio/data/tmp/ENTPD1.csv with number of variants  123
Output file: /data/Rstudio/data/tmp/FGFR1.csv with number of variants  196
Output file: /data/Rstudio/data/tmp/PMF1-BGLAP.csv with number of variants  35
Output file: /data/Rstudio/data/tmp/SLC26A7.csv with number of variants  71
Output file: /data/Rstudio/data/tmp/AP2S1.csv with number of variants  60
Output file: /data/Rstudio/data/tmp/NT5E.csv with number of variants  74
Output file: /data/Rstudio/data/tmp/LDHA.csv with number of variants  64
Output file: /data/Rstudio/data/tmp/C10orf55.csv with number of variants  59
Output file: /data/Rstudio/

In [9]:
# Peek at the first rows of the most recently built genotype matrix.
df_sort.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,172,173,174,175,176,177,178,179,180,181
23117b5d-6580-4d23-9b4d-d33f05d51304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33d41660-b7f6-455b-a8fa-b71e138c5284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
53f34b70-a595-4851-b097-b09a3b9bfb3f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48006d66-7118-4cdf-a58f-aed98c7d10c0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
e2509c87-a435-4d80-976b-af5163283915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
%%time
# Run the R script once per gene with 4 arguments:
#   phenotype file, genotype matrix, gene name, path of the output p-value file
# e.g. Rscript runskat.R /data/Rstudio/data/pheno_all.ped /data/Rstudio/data/tmp/AGXT.csv AGXT /data/Rstudio/data/out_pvalue.txt
# The command is passed as an argument list (no shell), so gene names or paths
# containing spaces or shell metacharacters reach Rscript unmodified.
for gene_name in df_gene['Gene']:
    matrix_csv = tmp_folder + '/' + gene_name + '.csv'
    # check_output raises CalledProcessError if Rscript exits non-zero.
    subprocess.check_output(
        ['Rscript', 'runskat.R', pheno_file, matrix_csv, gene_name, out_file])

CPU times: user 35.4 ms, sys: 161 ms, total: 197 ms
Wall time: 55.7 s
