### Test code for writing VCF variants to a BED file

In [2]:
import pandas as pd 
import pysam 
import argparse

In [5]:
# Length threshold (in bases) for indels; intended as the upper bound on the
# ref/alt length difference of variants kept in the output.
max_indel_length = 50
# Location of the input VCF (judging by the file name: phased genotypes for
# chr21, gzip-compressed).
vcf_path = "/lustre/scratch126/humgen/projects/interval_wgs/final_release_freeze/gt_phased/interval_wgs.chr21.gt_phased.vcf.gz"

In [None]:
# Stream every record of the VCF at `vcf_path` and print one BED-style line per
# variant: chromosome (without a leading "chr"), start, end, variant ID, AF.
# Start/end follow the vcf2bed conventions noted below. Any record that fails
# validation raises ValueError and aborts the run; indels longer than
# `max_indel_length` are silently skipped.
# The `with` block guarantees the file handle is closed, including when a
# validation error is raised part-way through the file.
with pysam.VariantFile(vcf_path, mode="r") as vcf:
    records = vcf.fetch()
    for rec in records:
        # NOTE(review): depending on how the VCF header declares INFO/AF, pysam
        # may return a tuple here rather than a scalar - confirm the printed
        # form is what downstream consumers expect.
        af = rec.info["AF"]
        # get chromosome, position, reference and alternative alleles
        chrom, pos, ref, alt_alleles = rec.chrom, rec.pos, rec.ref, rec.alts
        # Strip only a leading "chr"; contigs without the prefix pass through
        # unchanged instead of failing with an IndexError.
        chrom_num = chrom[3:] if chrom.startswith("chr") else chrom
        # a record without any alt allele has `alts` set to None
        if not alt_alleles:
            raise ValueError(f"No alt allele in vcf at: {chrom}, {pos}")
        # check for multiallelic alleles
        if len(alt_alleles) > 1:
            raise ValueError(f"Multiallelic entry in vcf at: {chrom}, {pos}")
        alt = alt_alleles[0]
        vcf_vrnt_id = f"{chrom}:{pos}:{ref}:{alt}"
        # check ref and alt alleles do not contain N or .
        if any(b not in "ATGC" for b in ref + alt):
            raise ValueError(f"Variant {vcf_vrnt_id} contains unassigned nucleotides.")
        # convert to bed
        if len(ref) > len(alt):
            # deletions (vcf2bed includes base preceding deletion)
            if len(alt) > 1:
                raise ValueError(f"Variant {vcf_vrnt_id} alt allele length is greater than one: {len(alt)}")
            start = pos - 1
            end = pos + len(ref) - 1
            length = len(ref) - len(alt)
        else:
            # snvs and insertions (vcf2bed yields a one-base bed element)
            start = pos - 1
            end = pos
            length = len(alt) - len(ref)
        # `length` is non-negative by construction in both branches above, so
        # only the upper bound needs checking: remove indels longer than max length
        if length > max_indel_length:
            continue
        print(chrom_num, start, end, vcf_vrnt_id, af)