In [1]:
import vcf
import numpy as np
import pandas as pd
from pyVEP import VEP

In [2]:
# Display labels for the allele type codes read from record.INFO['TYPE'].
info_type = {
    'del': "deletion",
    'ins': "insertion",
    'complex': "complex",
    'snp': "Single Nucleotide Polymorphism",
    'mnp': "Multi Nucleotide Polymorphism",
}

# Display labels keyed by record.var_type.
variation_type = {
    'indel': "INDEL",
    'snp': "SNP",
}

# Display labels keyed by record.var_subtype.
variation_subtype = {
    'del': "deletion",
    'ins': "insertion",
    'ts': "transition",
    'tv': "transversion",
    'unknown': "unknown",
}


In [3]:
def parse_vcf_record(record):
    """Flatten one VCF record into a tuple of display-ready fields.

    Reads the 'RO', 'AO', 'DP' and 'TYPE' entries of ``record.INFO`` together
    with ``record.CHROM``, ``record.POS``, ``record.REF``, ``record.ALT``,
    ``record.var_type`` and ``record.var_subtype``.

    Parameters
    ----------
    record : object
        A record exposing the attributes listed above. ``INFO['AO']`` and
        ``INFO['TYPE']`` are iterated, so they must be sequences (one entry
        per alternate allele -- presumably; verify against the VCF header).

    Returns
    -------
    tuple
        ``(chrom, pos, ref, alts, var_type label, info-type labels,
        var_subtype label, variant read counts, total reads,
        "<variant %>%|<reference %>%")``. List-valued fields are rendered
        with ``str(list)[1:-1]``, i.e. the list repr without its brackets;
        the variant percentage list keeps its brackets, as before.

    Raises
    ------
    KeyError
        If an INFO field is missing, or a type code is absent from the
        ``info_type`` / ``variation_type`` / ``variation_subtype`` tables.
    """
    ref_reads = record.INFO['RO']
    variant_reads = record.INFO['AO']
    total_reads = record.INFO['DP']

    def _percent(count):
        # A zero read depth has no meaningful percentage; report NaN instead
        # of letting ZeroDivisionError abort the whole run.
        if not total_reads:
            return float('nan')
        # float() forces true division: under Python 2 the int/int form
        # truncated every percentage to 0.
        return round((float(count) / total_reads) * 100, 2)

    var_per = [_percent(var_val) for var_val in variant_reads]
    ref_per = _percent(ref_reads)

    info_types = [info_type[infotype] for infotype in record.INFO['TYPE']]

    tmp = (record.CHROM, record.POS, record.REF, str(record.ALT)[1:-1],
           variation_type[record.var_type], str(info_types)[1:-1],
           variation_subtype[record.var_subtype],
           str(variant_reads)[1:-1], total_reads,
           str(var_per) + "%|" + str(ref_per) + "%",)

    return tmp

In [4]:
def variant_effect(record):
    """Collect one effect label per alternate allele of ``record``.

    An allele is labelled "complex" without any lookup when both the
    reference and that allele are longer than one base. Otherwise a
    whitespace-separated query line (chrom, pos, '.', ref, alt, then
    three '.' placeholders) is passed to ``VEP(query, 'grch37')`` and the
    'most_severe_consequence' entry of the first result is used.
    NOTE(review): VEP comes from pyVEP and presumably performs a remote
    lookup per call -- confirm before running on large inputs.

    Returns a 1-tuple holding the labels rendered as ``str(list)[1:-1]``
    (the list repr without its brackets).
    """
    consequences = []

    for alt_allele in record.ALT:
        # Multi-base REF replaced by multi-base ALT: label locally, skip VEP.
        if len(record.REF) > 1 and len(alt_allele) > 1:
            consequences.append("complex")
            continue

        query = "%s %s . %s %s .  .  ." % (record.CHROM, record.POS,
                                           record.REF, alt_allele)
        vep_result = VEP(query, 'grch37')
        consequences.append(vep_result[0]['most_severe_consequence'])

    joined = str(consequences)[1:-1]
    return (joined,)

In [9]:
def main(snvs):
    """Tabulate the indel records of unknown subtype found in a VCF file.

    Every record whose ``var_type`` is 'indel' and whose ``var_subtype`` is
    'unknown' is flattened with ``parse_vcf_record`` and annotated with
    ``variant_effect``; the two tuples are concatenated into one row.

    Parameters
    ----------
    snvs : str
        Path of the VCF file to read.

    Returns
    -------
    pandas.DataFrame
        One row per selected record, with the columns chromosome, position,
        reference, variant, var_type, var_infotype, var_subtype, var_count,
        read_depth, var%|ref% and var_effect. ``position`` and ``read_depth``
        are integer columns; the rest hold the strings untruncated (the
        earlier fixed-width 'S50' storage silently cut values at 50 bytes).

    Notes
    -----
    A record that raises ``KeyError`` while being processed is skipped and
    a warning naming its chromosome and position is printed.
    """
    columns = ['chromosome', 'position', 'reference', 'variant',
               'var_type', 'var_infotype', 'var_subtype',
               'var_count', 'read_depth', 'var%|ref%',
               'var_effect']

    # Rows are gathered in a list and converted once; growing an array with
    # np.append copied the whole table for every record.
    rows = []

    # The context manager closes the VCF handle even if parsing fails midway.
    with open(snvs, 'r') as handle:
        vcf_reader = vcf.Reader(handle)

        for record in vcf_reader:
            try:
                if record.var_type == 'indel' and record.var_subtype == 'unknown':
                    vcf_out = parse_vcf_record(record)
                    eff_out = variant_effect(record)
                    rows.append(vcf_out + eff_out)

            except KeyError:
                print('WARNING: missing count field(s) in record %s:%d' % (record.CHROM, record.POS))

    snv_df = pd.DataFrame(rows, columns=columns)
    # Keep the integer typing of the numeric columns, including when no
    # record matched and the frame is empty.
    for int_column in ('position', 'read_depth'):
        snv_df[int_column] = snv_df[int_column].astype(int)

    return snv_df

In [8]:
# Run the pipeline on the challenge VCF and print the first ten rows.
output = main('./Challenge_data.vcf')
print(output.head(10))

  chromosome  position       reference                         variant  \
0          1   1650797           ATTTT                           GTTTC   
1          1   1654129       TAAAAAAAT                        TAAAAAAT   
2          1   3350212    CTGTGTGTGTGT                      CTGTGTGTGT   
3          1   6184728   TGGGGGGGGGGGA                  TGGGGGGGGGGGGA   
4          1   6209058  GTCCTCCTCCTCCT                     GTCCTCCTCCT   
5          1   6219477  TTTCTTCTTCTTCT                     TTTCTTCTTCT   
6          1   6228297         GAAAAAG                          GAAAAG   
7          1   6475586              TC                              GA   
8          1   7811328      CAAAAAAAAT                       CAAAAAAAT   
9          1  10292359  CATATATATATATA  CATATATATATA, CATATATATATATATA   

  var_type                     var_infotype var_subtype var_count  read_depth  \
0    INDEL                        'complex'   insertion        74        2636   
1    INDEL             