In [1]:
import numpy as np
import pandas as pd
import cyvcf2

In [43]:
class Variant:
	"""A class to represent a variant
	Properties:
		chrom: chromosome
		pos: position
		ref: reference allele
		alt: alternate allele
		gt: genotype
		dp: depth
		gq: genotype quality
		ad: allele depth
		gt_fa: Father's genotype
		gt_mo: Mother's genotype
		gt_sib: Sibling's genotype

	"""
	def __init__(self, chrom, pos, ref, alt, gt, dp, gq, ad):
		self.chrom = chrom
		self.pos = pos
		self.ref = ref
		self.alt = alt
		self.gt = gt
		self.dp = dp
		self.gq = gq
		self.ad = ad

	def maf_filter(self, target: int, threshold: float) -> bool:
		if target < threshold:
			return True
		else:
			return False

	
class Snv(Variant):
	def __init__(self, chrom, pos, ref, alt, gt, dp, gq, ad):
		super().__init__(chrom, pos, ref, alt, gt, dp, gq, ad)

class Cnv(Variant):
	def __init__(self, chrom, pos, ref, alt, gt, dp, gq, ad):
		super().__init__(chrom, pos, ref, alt, gt, dp, gq, ad)

class Missense(Snv):
	def __init__(self, chrom, pos, ref, alt, gt, dp, gq, ad, gene, aa_change):
		super().__init__(chrom, pos, ref, alt, gt, dp, gq, ad)
		self.gene = gene
		self.aa_change = aa_change
	
class Synonymous(Snv):
	def __init__(self, chrom, pos, ref, alt, gt, dp, gq, ad, gene, aa_change):
		super().__init__(chrom, pos, ref, alt, gt, dp, gq, ad)
		self.gene = gene
		self.aa_change = aa_change

class Nonsense(Snv):
	def __init__(self, chrom, pos, ref, alt, gt, dp, gq, ad, gene, aa_change):
		super().__init__(chrom, pos, ref, alt, gt, dp, gq, ad)
		self.gene = gene
		self.aa_change = aa_change



In [2]:
input_vcf = "test_input_vcfs/gnomad.chr21.splai.vep.maxent.loftee.pangolin.squirls.vcf"
vcf = cyvcf2.VCF(input_vcf)
header = vcf.header_iter()

for h in header:
	try:
		h['ID']
	except KeyError:
		continue
	else:
		if h['ID'] == 'CSQ':
			vep_cols_list = h['Description'].split('Format: ')[1].rstrip('"').split('|')
		elif h['ID'] == 'SpliceAI':
			splai_cols_list = h['Description'].split('Format: ')[1].rstrip('"').split('|')
		elif h['ID'] == 'Pangolin':
			pang_cols_list = h['Description'].split('Format: ')[1].rstrip('"').split('|')
		else:
			pass

vepidx: dict = {col: i for i, col in enumerate(vep_cols_list)}
splaidx: dict = {col: i for i, col in enumerate(splai_cols_list)}

In [25]:
vepidx

{'Allele': 0,
 'Consequence': 1,
 'IMPACT': 2,
 'SYMBOL': 3,
 'Gene': 4,
 'Feature_type': 5,
 'Feature': 6,
 'BIOTYPE': 7,
 'EXON': 8,
 'INTRON': 9,
 'HGVSc': 10,
 'HGVSp': 11,
 'cDNA_position': 12,
 'CDS_position': 13,
 'Protein_position': 14,
 'Amino_acids': 15,
 'Codons': 16,
 'Existing_variation': 17,
 'DISTANCE': 18,
 'STRAND': 19,
 'FLAGS': 20,
 'VARIANT_CLASS': 21,
 'SYMBOL_SOURCE': 22,
 'HGNC_ID': 23,
 'CANONICAL': 24,
 'REFSEQ_MATCH': 25,
 'SOURCE': 26,
 'REFSEQ_OFFSET': 27,
 'HGVS_OFFSET': 28,
 'MaxEntScan_alt': 29,
 'MaxEntScan_diff': 30,
 'MaxEntScan_ref': 31,
 'LoF': 32,
 'LoF_filter': 33,
 'LoF_flags': 34,
 'LoF_info': 35}

In [12]:
i = 0

class Var(cyvcf2.cyvcf2.Variant):
	def __init__(self, v):
		super().__init__(v)
		self.vep = v.INFO.get('CSQ').split(',')
		self.splai = v.INFO.get('SpliceAI').split(',')
		self.pang = v.INFO.get('Pangolin').split(',')
	def filter_maf(self, target: int, threshold: float) -> bool:
		if target < threshold:
			return True
		else:
			return False

input_vcf = "test_input_vcfs/gnomad.chr21.splai.vep.maxent.loftee.pangolin.squirls.vcf"
for v in cyvcf2.VCF(input_vcf):
	i += 1
	if i > 10:
		break

	var = Var(v)
	# print(var)
	print(type(var))
	# print(variant.filter_maf(100, 0.01))


TypeError: Variant object cannot be instantiated directly.

In [None]:
i = 0
for v in vcf:
	if i > 3:
		break
	print(type(v))

In [11]:
pang_cols_list



In [None]:
vep: list = v.INFO.get('CSQ').split('|')

# Get SpliceAI scores
if v.INFO.get('SpliceAI'):
	splai: list = v.INFO.get('SpliceAI').split(',')[0].split('|')
	print(pangolin)
else:
	splai = ['NA'] * len(splai_cols_list)