In [10]:
import pandas as pd
import cyvcf2
import pyfaidx

In [None]:
# Input annotation table and the VCF file this notebook produces.
mydata = "../ValidationData/mydata/original.snpeff.state.disease.identifiedgene.filtered.splai.tsv"
out_vcf = "./mydata_simple.vcf"

# Reference sequence; contig lengths for the VCF header are looked up in it.
fasta = pyfaidx.Fasta("./human_g1k_v37_fix.fasta")

# Load the tab-separated table with every column kept as a string.
df = pd.read_table(mydata, sep="\t", dtype=str)

# Bare-minimum header: the file-format line plus the eight fixed VCF columns.
# The "\t" escapes in this (non-raw) literal are real tab characters at runtime.
minimum_header = """##fileformat=VCFv4.2
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
"""

# Open the output in write mode, seeded with the minimal header above.
writer = cyvcf2.Writer.from_string(out_vcf, minimum_header, mode="w")

def natural_key(x) -> tuple:
    """Return a sort key that orders all-digit labels numerically, before any other label.

    Args:
        x: A string label (for example a chromosome name).

    Returns:
        ``(0, int(x))`` when ``x.isdigit()`` is true, otherwise ``(1, x)``.
        The leading 0/1 places every numeric label ahead of every non-numeric
        one; within each group the second element decides the order.
    """
    return (0, int(x)) if x.isdigit() else (1, x)

def add_contigs_from_series(
        df: pd.DataFrame, writer: cyvcf2.Writer, fasta: pyfaidx.Fasta
        ) -> cyvcf2.Writer:
    """Add one ``##contig`` header line per distinct value of ``df["CHROM"]``.

    Args:
        df: Table with a ``CHROM`` column.
        writer: VCF writer whose header receives the contig lines.
        fasta: Reference indexed by contig name; ``len(fasta[contig])`` is
            written as the contig length.

    Returns:
        The same ``writer`` object that was passed in.
    """
    contigs = list(df["CHROM"].unique())
    # Order the contigs with natural_key before emitting their header lines.
    contigs.sort(key=natural_key)
    for contig in contigs:
        header_line = f"##contig=<ID={contig},length={len(fasta[contig])}>"
        writer.add_to_header(header_line)
    return writer

In [12]:
# Positional columns read, in order, as chrom, pos, ref, alt.
# NOTE(review): assumes column 34 is the same column as df["CHROM"], which is
# what the contig header lines are built from — confirm against the table.
variant_columns = [34, 35, 36, 37]

# try/finally guarantees the writer is closed (and the file flushed) even if a
# contig lookup or a record fails part-way through.
try:
    # Contig lines must be added before the header is written out.
    writer = add_contigs_from_series(df, writer, fasta)
    writer.write_header()

    # Select the four columns once and iterate plain tuples, instead of
    # building a Series per row with iterrows().
    records = df.iloc[:, variant_columns].itertuples(index=False, name=None)
    for chrom, pos, ref, alt in records:
        # ID, QUAL and INFO are left as "."; FILTER is fixed to "PASS".
        var_str = "\t".join([
            chrom, str(pos), ".", str(ref), str(alt), ".", "PASS", "."
            ])
        variant = writer.variant_from_string(var_str)
        writer.write_record(variant)
finally:
    writer.close()
