# Handling VCF files

In [1]:
import biu

## Loading VCF files
You can load VCF files, with or without a tabix index.
If you do not provide a tabix index, then the whole VCF file is loaded into memory and its records are indexed in an interval tree for efficient queries.

In [2]:
# Plain VCF opened without a tabix index: the whole file is loaded into memory.
vcf = biu.formats.VCF('example_files/example.vcf')

# Same data opened through a tabix index (tabix=True).
# NOTE(review): the .bgz extension suggests a bgzip-compressed file with an accompanying index — confirm.
vcfTabix = biu.formats.VCF('example_files/example.vcf.bgz', tabix=True)

D: VCF Input source is unindexed file.
D: VCF Input source is tabixed file.


## Inspecting the VCF structure

You can inspect the VCF structure to get some basic information.
In a tabixed file, you cannot determine the number of entries.

In [3]:
# Print the description of each VCF object, with a '---' divider between them.
for description in (vcf, '---\n', vcfTabix):
    print(description)

VCF object
 Where: example_files/example.vcf
 Template: example_files/example.vcf
 Entries: 5
 Number of Samples: 3
 Tabix: No

---

VCF object
 Where: example_files/example.vcf.bgz
 Template: example_files/example.vcf.bgz
 Number of Samples: 3
 Tabix: Yes



D: Building VCF Index. May take a while.


## Get the list of samples

In [4]:
# Print every sample identifier stored in the VCF, one per line.
sampleNames = vcf.samples
for sample in sampleNames:
    print(sample)

NA00001
NA00002
NA00003


## Querying VCF files
Regardless of how you loaded the file, you can query it in the same way. There are several filtering options available.


 * filters : Remove variants that match a list of filters. e.g.:  [ "AMBIGUOUS","VQLOW","NVLOC","CALLRATE","MULTI","RECMULTI"]
 * types : Remove variants that do not match a list of variant types (e.g. snp, indel, sv)
 * subTypes : Remove variants that do not match a list of variant subTypes (see pyVCF documentation)
 * sampleFilters : Restrict the genotype columns to the listed samples; all other samples are removed (We will get to this later)

In [5]:
# Unfiltered query: arguments are chromosome, start and end.
regionRecords = vcf.query(20, 14369, 1234567)
for record in regionRecords:
    print(record)

Record(CHROM=20, POS=14370, REF=G, ALT=[A])
Record(CHROM=20, POS=17330, REF=T, ALT=[A])
Record(CHROM=20, POS=1234567, REF=GTCT, ALT=[G, GTACT])
Record(CHROM=20, POS=1230237, REF=T, ALT=[None])
Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])


D: VCF Input source is list of Records.


In [6]:
# Query the region, keeping only variants whose type is 'snp'.
snpRecords = vcf.query(20, 14369, 1234567, types=['snp'])
for record in snpRecords:
    print(record)

Record(CHROM=20, POS=14370, REF=G, ALT=[A])
Record(CHROM=20, POS=17330, REF=T, ALT=[A])
Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])


D: VCF Input source is list of Records.


In [7]:
# Query the region, keeping only SNPs and removing variants that match the 'q10' filter.
unflaggedSnps = vcf.query(20, 14369, 1234567, types=['snp'], filters=['q10'])
for record in unflaggedSnps:
    print(record)

Record(CHROM=20, POS=14370, REF=G, ALT=[A])
Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])


D: VCF Input source is list of Records.


### Query multiple regions simultaneously

In [19]:
# Query several (chromosome, start, end) regions in one call.
# The same region is listed twice here, so its records are printed twice.
regions = [ (20, 14369, 1234567), (20, 14369, 1234567) ]
multiRegionRecords = vcf.queryRegions(regions, types=['snp'], filters=['q10'])
for record in multiRegionRecords:
    print(record)

Record(CHROM=20, POS=14370, REF=G, ALT=[A])
Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])
Record(CHROM=20, POS=14370, REF=G, ALT=[A])
Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])


D: VCF Input source is list of Records.


## Extract other information

### Extract a summary
You can also get a summary of genotypes from the query.

This is given by the extract='summary' option.

In [8]:
# extract='summary' yields a per-variant genotype summary table instead of the records.
genotypeSummary = vcf.query(20, 14369, 1234567, types=['snp'], filters=['q10'], extract='summary')
genotypeSummary


Unnamed: 0,id,RR,R,RA,A,AA,O
0,20-14370-G-A,1,0,1,0,1,0
1,20-1110696-A-G,0,0,0,0,0,3


### Extract only the records
You can extract the raw fields.
This is done with the extract='raw' option.

This removes the VCF structure around the individual records, and with it the functionality associated with that structure. It can nevertheless be useful for reducing overhead (for example, if you want to apply custom filtering steps and then insert the remaining records into a VCF structure again).

In [9]:
# extract='raw' returns the bare records; show the container type that comes back.
rawRecords = vcf.query(20, 14369, 1234567, types=['snp'], filters=['q10'], extract='raw')
type(rawRecords)

list

## Subquerying VCF files
You can additionally create substructures from the results of your queries.
These are loaded as if you had read them from a non-tabixed VCF file, and can be queried further just as if they had been loaded from a file.

The location of this structure is now given as a location in memory, to indicate that it is not a file.

In [10]:
# First query: build a sub-VCF that contains only the SNPs in the region.
subvcf = vcf.query(20, 14369, 1234567, types=['snp'])

# Second query, on the sub-VCF: remove variants that match the 'q10' filter.
unflaggedSubset = subvcf.query(20, 14369, 1234567, filters=['q10'])
for record in unflaggedSubset:
    print(record)

Record(CHROM=20, POS=14370, REF=G, ALT=[A])
Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])


D: VCF Input source is list of Records.
D: Building VCF Index. May take a while.
D: VCF Input source is list of Records.


In [11]:
# Query the existing sub-VCF again with the 'q10' filter applied.
repeatedSubset = subvcf.query(20, 14369, 1234567, filters=['q10'])
for record in repeatedSubset:
    print(record)

Record(CHROM=20, POS=14370, REF=G, ALT=[A])
Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])


D: VCF Input source is list of Records.


### Filtering samples
If we select a subset of samples to work with, we will see the other samples removed from the structure in later queries.

In [12]:
# Restrict the query result to two samples, then show which samples the sub-VCF holds.
selectedSamples = ['NA00001', 'NA00002']
subvcf = vcf.query(20, 14369, 1234567, sampleFilters=selectedSamples)
print(subvcf.samples)

['NA00001', 'NA00002']


D: VCF Input source is list of Records.
D: Building VCF Index. May take a while.


## Post-query filtering
You can also filter the results after the query

(Or even if you have your own list of VCF records)

In [13]:
# Run the query without filters, then apply the 'q10' filter to its result.
queryResult = vcf.query(20, 14369, 1234567)
filteredResult = queryResult.filter(filters=['q10'])
for record in filteredResult:
    print(record)

Record(CHROM=20, POS=14370, REF=G, ALT=[A])
Record(CHROM=20, POS=1234567, REF=GTCT, ALT=[G, GTACT])
Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])


D: VCF Input source is list of Records.
D: VCF Input source is list of Records.


## Static methods
If you have previously made a query, you can also access the internal parts of the query function with static methods of the VCF class.

In [14]:
# Run a query, then build the genotype summary from its result via the static method.
queryResult = vcf.query(20, 14369, 1234567)
summaryTable = biu.formats.VCF.summary(queryResult)
summaryTable

D: VCF Input source is list of Records.


Unnamed: 0,id,RR,R,RA,A,AA,O
0,20-14370-G-A,1,0,1,0,0,0
1,20-17330-T-A,1,0,1,0,0,0
2,20-1234567-GTCT-G,0,0,1,0,0,1
3,20-1230237-T--,2,0,0,0,0,0
4,20-1110696-A-G,0,0,0,0,0,2


In [15]:
# Apply the 'q10' filter to an existing query result through the static method.
staticFiltered = biu.formats.VCF.filter(queryResult, ['q10'])
for record in staticFiltered:
    print(record)

Record(CHROM=20, POS=14370, REF=G, ALT=[A])
Record(CHROM=20, POS=1234567, REF=GTCT, ALT=[G, GTACT])
Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])


D: VCF Input source is list of Records.


In [16]:
# Keep only the variants of type 'snp' from an existing query result.
snpOnly = biu.formats.VCF.filterType(queryResult, ['snp'])
for record in snpOnly:
    print(record)

Record(CHROM=20, POS=14370, REF=G, ALT=[A])
Record(CHROM=20, POS=17330, REF=T, ALT=[A])
Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])


In [17]:
# Filter on the sample IDs: restrict the query result to two samples, wrap the
# outcome in a new VCF object, and print the samples that object contains.
keptRecords = biu.formats.VCF.filterSamples(queryResult, ['NA00001', 'NA00002'])
sampleSubset = biu.formats.VCF(keptRecords)
for sampleid in sampleSubset.samples:
    print(sampleid)

NA00001
NA00002


D: VCF Input source is list of Records.
D: Building VCF Index. May take a while.


## Summary function
The summary function by default provides a genotype summary for a biallelic variant site (0/1).

We can tell the summary function to use a different allele if we want.

In [18]:
# Take the first SNP returned for the interval that ends at position 1110696.
snpHits = vcf.query(20, 1110695, 1110696, types=['snp'])
var = snpHits[0]
print(var)

# Summarise this site with explicit allele indices instead of the default 0/1.
# NOTE(review): altPos=[2] appears to select the second ALT allele (T), judging
# by the id shown in the output — confirm against the biu documentation.
biu.formats.VCF.summary([var], altPos=[2], refPos=[1])

Record(CHROM=20, POS=1110696, REF=A, ALT=[G, T])


D: VCF Input source is list of Records.


Unnamed: 0,id,RR,R,RA,A,AA,O
0,20-1110696-A-T,0,0,2,0,0,0
