### 1. How to read gnomAD VCF files from Google Cloud Storage or AWS S3

#### Google Cloud Storage

In [1]:
import polars_bio as pb
import polars as pl

INFO:polars_bio:Creating BioSessionContext


In [2]:
# gnomAD v4.1 structural-variant sites VCF in the gnomAD bucket on Google Cloud Storage.
gcs_vcf_path = (
    "gs://gcp-public-data--gnomad"
    "/release/4.1/genome_sv"
    "/gnomad.v4.1.sv.sites.vcf.gz"
)

In [3]:
# Read the VCF straight from its gs:// URL and materialize only the first three records.
(
    pb.read_vcf(gcs_vcf_path)
    .limit(3)
    .collect()
)

INFO:polars_bio:Table: gnomad_v4_1_sv_sites_gz registered for path: gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz


chrom,start,end,id,ref,alt,qual,filter
str,u32,u32,str,str,str,f64,str
"""chr1""",10000,295666,"""gnomAD-SV_v3_DUP_chr1_01c2781c""","""N""","""<DUP>""",134.0,"""HIGH_NCR"""
"""chr1""",10434,10434,"""gnomAD-SV_v3_BND_chr1_1a45f73a""","""N""","""<BND>""",260.0,"""HIGH_NCR;UNRESOLVED"""
"""chr1""",10440,10440,"""gnomAD-SV_v3_BND_chr1_3fa36917""","""N""","""<BND>""",198.0,"""HIGH_NCR;UNRESOLVED"""


#### AWS S3

In [4]:
# gnomAD v4.1 exomes sites VCF (chr21 only) in the gnomAD bucket on AWS S3.
aws_s3_vcf_path = (
    "s3://gnomad-public-us-east-1"
    "/release/4.1/vcf/exomes"
    "/gnomad.exomes.v4.1.sites.chr21.vcf.bgz"
)

In [5]:
# Same call as for GCS, but with an s3:// URL; show the first three records.
(
    pb.read_vcf(aws_s3_vcf_path)
    .limit(3)
    .collect()
)

INFO:polars_bio:Table: gnomad_exomes_v4_1_sites_chr21_bgz registered for path: s3://gnomad-public-us-east-1/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz


chrom,start,end,id,ref,alt,qual,filter
str,u32,u32,str,str,str,f64,str
"""chr21""",5031905,5031905,"""""","""C""","""A""",0.0,"""AC0;AS_VQSR"""
"""chr21""",5031905,5031905,"""""","""C""","""T""",0.0,"""AC0;AS_VQSR"""
"""chr21""",5031909,5031909,"""""","""T""","""C""",0.0,"""AC0;AS_VQSR"""


### 2. How to specify additional VCF INFO fields to be parsed

In [6]:
# INFO keys to parse in addition to the columns returned by a plain read_vcf call.
vcf_info_fields = ["SVTYPE", "SVLEN"]
(
    pb.read_vcf(gcs_vcf_path, info_fields=vcf_info_fields)
    .limit(3)
    .collect()
)

INFO:polars_bio:Table: gnomad_v4_1_sv_sites_gz registered for path: gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz


chrom,start,end,id,ref,alt,qual,filter,svtype,svlen
str,u32,u32,str,str,str,f64,str,str,i32
"""chr1""",10000,295666,"""gnomAD-SV_v3_DUP_chr1_01c2781c""","""N""","""<DUP>""",134.0,"""HIGH_NCR""","""DUP""",285666
"""chr1""",10434,10434,"""gnomAD-SV_v3_BND_chr1_1a45f73a""","""N""","""<BND>""",260.0,"""HIGH_NCR;UNRESOLVED""","""BND""",-1
"""chr1""",10440,10440,"""gnomAD-SV_v3_BND_chr1_3fa36917""","""N""","""<BND>""",198.0,"""HIGH_NCR;UNRESOLVED""","""BND""",-1


### 3. How to speed up reading local VCF files with multiple threads

In [7]:
# Copy the VCF at `gcs_vcf_path` (interpolated from the Python variable) to /tmp
# so the cells below can read a local file. stderr is discarded via `2> /dev/null`,
# so a failed copy prints nothing here; requires the `gsutil` CLI to be on PATH.
! gsutil cp $gcs_vcf_path /tmp/gnomad.v4.1.sv.sites.vcf.gz 2> /dev/null

In addition, using fork() with Python in general is a recipe for mysterious
deadlocks and crashes.

The most likely reason you are seeing this error is because you are using the
multiprocessing module on Linux, which uses fork() by default. This will be
fixed in Python 3.14. Until then, you want to use the "spawn" context instead.

See https://docs.pola.rs/user-guide/misc/multiprocessing/ for details.

or by setting POLARS_ALLOW_FORKING_THREAD=1.

  pid, fd = os.forkpty()


In [8]:
%%time
# Count the records of the local copy using a single reader thread (thread_num=1).
(
    pb.read_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", thread_num=1)
    .count()
    .collect()
)

INFO:polars_bio:Table: gnomad_v4_1_sv_sites_gz registered for path: /tmp/gnomad.v4.1.sv.sites.vcf.gz


0rows [00:00, ?rows/s]

CPU times: user 13.1 s, sys: 1.86 s, total: 14.9 s
Wall time: 11.1 s


chrom,start,end,id,ref,alt,qual,filter
u32,u32,u32,u32,u32,u32,u32,u32
2154486,2154486,2154486,2154486,2154486,2154486,2154486,2154486


In [9]:
%%time
# Count the records of the local copy using four reader threads (thread_num=4).
(
    pb.read_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", thread_num=4)
    .count()
    .collect()
)

INFO:polars_bio:Table: gnomad_v4_1_sv_sites_gz registered for path: /tmp/gnomad.v4.1.sv.sites.vcf.gz


0rows [00:00, ?rows/s]

CPU times: user 12.6 s, sys: 1.77 s, total: 14.4 s
Wall time: 3.35 s


chrom,start,end,id,ref,alt,qual,filter
u32,u32,u32,u32,u32,u32,u32,u32
2154486,2154486,2154486,2154486,2154486,2154486,2154486,2154486


### 4. How to perform an overlap operation on two remote VCF files in streaming mode

In [10]:
# Two remote inputs for the overlap: the SV sites VCF and the chr21 exomes sites VCF.
vcf_1 = (
    "gs://gcp-public-data--gnomad"
    "/release/4.1/genome_sv"
    "/gnomad.v4.1.sv.sites.vcf.gz"
)
vcf_2 = (
    "gs://gcp-public-data--gnomad"
    "/release/4.1/vcf/exomes"
    "/gnomad.exomes.v4.1.sites.chr21.vcf.bgz"
)

In [11]:
# VCF-specific reader settings: parse the SVTYPE and SVLEN INFO fields, one thread.
vcf_read_options_1 = pb.VcfReadOptions(
    info_fields=["SVTYPE", "SVLEN"],
    thread_num=1,
)
# Wrap them in the generic ReadOptions container that is passed to pb.overlap.
read_options_1 = pb.ReadOptions(
    vcf_read_options=vcf_read_options_1,
)

In [12]:
# Overlap the two remote VCFs with streaming=True and sink the result to a CSV file;
# read_options_1 is applied to the first input only.
(
    pb.overlap(
        vcf_1,
        vcf_2,
        streaming=True,
        read_options1=read_options_1,
    )
    .sink_csv("/tmp/streaming_run.csv")
)

INFO:polars_bio.operation:Running in streaming mode...
INFO:polars_bio.operation:Running Overlap operation with algorithm Coitrees and 1 thread(s)...


In [13]:
# Peek at the first three rows of the CSV written by the streaming overlap.
(
    pl.read_csv("/tmp/streaming_run.csv")
    .limit(3)
)

chrom_1,start_1,end_1,chrom_2,start_2,end_2,id_1,ref_1,alt_1,qual_1,filter_1,svtype_1,svlen_1,id_2,ref_2,alt_2,qual_2,filter_2
str,i64,i64,str,i64,i64,str,str,str,f64,str,str,i64,str,str,str,f64,str
"""chr21""",5019150,5047500,"""chr21""",5031905,5031905,"""gnomAD-SV_v3_DUP_chr21_029eb66…","""N""","""<DUP>""",34.0,"""PASS""","""DUP""",28350,"""""","""C""","""A""",0.0,"""AC0;AS_VQSR"""
"""chr21""",5019150,5047500,"""chr21""",5031905,5031905,"""gnomAD-SV_v3_DUP_chr21_029eb66…","""N""","""<DUP>""",34.0,"""PASS""","""DUP""",28350,"""""","""C""","""T""",0.0,"""AC0;AS_VQSR"""
"""chr21""",5019150,5047500,"""chr21""",5031909,5031909,"""gnomAD-SV_v3_DUP_chr21_029eb66…","""N""","""<DUP>""",34.0,"""PASS""","""DUP""",28350,"""""","""T""","""C""",0.0,"""AC0;AS_VQSR"""
