##### To generate the True Negative variants list from gnomAD v2.1.1

In [1]:
import re
import pandas as pd
from cyvcf2 import VCF

In [2]:
# !gunzip -c /resources/gnomAD/gnomad.exomes.r2.1.1.sites.vcf.bgz | sed '/^#/d' | wc -l
# 17,209,972 variants

#### Load gnomAD v2.1.1 data

In [3]:
gnomad_variants = '/resources/gnomAD/gnomad.exomes.r2.1.1.sites.vcf.bgz'
canons_list = '/workspace/Github/MyProjects/SplicingScreening/data/CanonicalTranscripts_COMP/CanonicalTranscripts.exoncount.tsv'
df_canons = pd.read_csv(canons_list, sep='\t', header=0, index_col=None)
canons_set = set(df_canons['ENST'])
variant_types: set = {
    'inframe_deletion', 'inframe_insertion', 'synonymous_variant', 
    'inframe_insertion&splice_region_variant', 'intron_variant', 
    'missense_variant', 'splice_acceptor_variant',
    'splice_acceptor_variant&5_prime_UTR_variant&intron_variant',
    'splice_acceptor_variant&intron_variant', 
    'splice_region_variant&intron_variant',
    'splice_region_variant&synonymous_variant'
    }

##### Functions

In [4]:
def is_high_quality(info) -> bool:
    if ((info.get('variant_type') == 'snv')
        & (info.get('n_alt_alleles') == 1)
        & (info.get('MQ') >= 40)
        & (info.get('FS') <= 60)
        & (info.get('QD') >= 2.0)
        & (info.get('MQRankSum') >= -12.5)
        & (info.get('ReadPosRankSum') >= -8.0)
        & (info.get('decoy') is None)
        & (info.get('lcr') is None)
        & (info.get('segdup') is None)):

        return True


def is_canonical_snv(splitvep: list) -> bool:
    if ((splitvep[6] in canons_set) 
        &(splitvep[1] in variant_types)
        &(splitvep[26] == 'YES')):

        return True


def format_str(x):
    '''
    Change numbers to a string, and change list to a string.
    '''
    if x is None:
        y = '.'
    else:
        if isinstance(x, list):
            if not x:
                y = ';'.join(x)
            else:
                y = '.'
        else:
            y = str(x)
    return y


def generate_new_record(variant, splitvep) -> list:
    try:
        hgvs_c = re.search(r'(?<=:)(.*)', splitvep[10]).group(0)
    except AttributeError:
        hgvs_c = '.'

    try:
        hgvs_p = re.search(r'p\.[0-9#%a-zA-Z>]*', splitvep[11]).group(0)
    except AttributeError:
        hgvs_p = '.'

    if splitvep[8]:
        rank = splitvep[8]  # Exon No.
    else:
        rank = splitvep[9]  # Intron No.

    new_record = [
        format_str(variant.CHROM), 
        format_str(variant.POS), 
        format_str(variant.REF), 
        format_str(variant.ALT[0]),
        splitvep[3],                           # Gene Symbol
        format_str(variant.INFO.get('AF')),
        splitvep[1],                           # Variant Type
        splitvep[4],                           # ENSG
        splitvep[6],                           # ENST
        hgvs_c,
        hgvs_p, 
        rank, 
        ]
    
    return new_record

##### Generate the True Negative variants list in this directory

In [5]:
o = './true_negative_variants.tsv'
columns = [
    'CHROM', 'POS', 'REF', 'ALT', 'GeneSymbol', 'AF', 
    'csq', 'ENSG', 'ENST', 'HGVS.c', 'HGVS.p', 'Rank'
]
utrs = ['5_prime_UTR_variant', '3_prime_UTR_variant']

with open(o, 'w') as out_file:
    out_file.write('\t'.join(columns) + '\n')
    vcf = VCF(gnomad_variants)
    for variant in vcf:
        if variant.FILTER is None:
            info = variant.INFO
            if ((info.get('MQRankSum') is None) 
                | (info.get('ReadPosRankSum') is None)):
                pass
            else:
                if (info.get('AF') >= 0.01):
                    if is_high_quality(info):
                        veps: list = info.get('vep').split(',')
                        i = 0 
                        for vep in veps:
                            splitvep: list = vep.split('|')
                            if is_canonical_snv(splitvep):
                                if splitvep[2] in utrs:
                                    pass
                                else:   # In ORF (Not in UTRs)
                                    i += 1
                            else:
                                pass

                        if i == 1:  # Pick up variants with unique canonical Tx.
                            for vep in veps:
                                splitvep: list = vep.split('|')
                                if is_canonical_snv(splitvep):
                                    new_record = generate_new_record(
                                        variant, splitvep)
                                    out_file.write('\t'.join(new_record) + '\n')
                                    continue
                                else:
                                    pass
                        else:   # There are some canonical variants
                            pass
                    else:   # Low quality variants
                        pass
                else:   # Rare variants (AF < 0.01)
                    pass  
        else:   # FILTER is Not 'PASS'
            pass

##### CHECK the output file

In [16]:
o = './benign.tsv'
cols = [
    'CHROM', 'POS', 'REF', 'ALT', 'gene', 'AlleleFrequency', 'csq', 
    'ENSG', 'ENST', 'c.HGVS', 'p.HGVS', 'Rank', 'snv_flag'
    ]
df = pd.read_table(o, sep='\t', header=None, dtype=str, names=cols)

print(len(df))

143921
