##### Import package

In [10]:
import os
import re
import pandas as pd
from Bio.Seq import Seq
from liftover import get_lifter
from pathlib2 import Path
from pandarallel import pandarallel
from tqdm import tqdm
import gffutils
import pysam
from cyvcf2 import VCF

### Logging setup
from logging import getLogger, config
import yaml
parent_directory = os.path.dirname(os.path.dirname('__file__'))
config_path: str = os.path.join(parent_directory, '../../../config/logging.yaml')
with open(config_path, 'r') as f:
    config.dictConfig(yaml.safe_load(f))
logger = getLogger(__name__)

########   Initialize and setup pandas methods   ########
os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp' 
pandarallel.initialize(nb_workers=3, progress_bar=False, verbose=0, use_memory_fs=False) 
tqdm.pandas()

In [12]:
## Parse VCF to simple input table

raw_vcf = 'splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.vcf'

vcf = VCF(raw_vcf)
header = vcf.header_iter()
for h in header:
    try:
        h['ID']
    except KeyError:
        continue
    else:
        if h['ID'] == 'CSQ':
            vep_cols_list = h['Description'].split('Format: ')[1].rstrip('"').split('|')
        elif h['ID'] == 'SpliceAI':
            splai_cols_list = h['Description'].split('Format: ')[1].rstrip('"').split('|')
        else:
            pass

vepidx: dict = {col: i for i, col in enumerate(vep_cols_list)}
splaidx: dict = {col: i for i, col in enumerate(splai_cols_list)}

cols = [
    'CHROM', 'POS', 'REF', 'ALT', 'GeneSymbol', 'SymbolSource', 'HGNC_ID', 
    'ENST', 'HGVSc', 'Consequence', 'EXON', 'INTRON', 'Strand',
    'DS_AG', 'DS_AL', 'DS_DG', 'DS_DL', 'DP_AG', 'DP_AL', 'DP_DG', 'DP_DL', 'MaxSpliceAI'
]

print(vepidx)

{'Allele': 0, 'Consequence': 1, 'IMPACT': 2, 'SYMBOL': 3, 'Gene': 4, 'Feature_type': 5, 'Feature': 6, 'BIOTYPE': 7, 'EXON': 8, 'INTRON': 9, 'HGVSc': 10, 'HGVSp': 11, 'cDNA_position': 12, 'CDS_position': 13, 'Protein_position': 14, 'Amino_acids': 15, 'Codons': 16, 'Existing_variation': 17, 'DISTANCE': 18, 'STRAND': 19, 'FLAGS': 20, 'VARIANT_CLASS': 21, 'SYMBOL_SOURCE': 22, 'HGNC_ID': 23, 'CANONICAL': 24, 'REFSEQ_MATCH': 25, 'SOURCE': 26, 'REFSEQ_OFFSET': 27, 'HGVS_OFFSET': 28}


In [13]:

df: pd.DataFrame = pd.DataFrame(columns=cols)
for v in VCF(raw_vcf):
    vep: list = v.INFO.get('CSQ').split('|')

    # Get SpliceAI scores
    if v.INFO.get('SpliceAI'):
        splai: list = v.INFO.get('SpliceAI').split(',')[0].split('|')
    else:
        splai = ['NA'] * len(splai_cols_list)

    # Get HGVSc from VEP
    try:
        hgvsc = re.search('(?<=:).*',vep[vepidx['HGVSc']])[0]
    except TypeError:
        hgvsc = "NA"

    # Convert strand to +/- 
    strand = lambda s: '+' if s == 1 else '-'

    # Get max SpliceAI scores
    ds_ag: float = splai[splaidx['DS_AG']]
    ds_al: float = splai[splaidx['DS_AL']]
    ds_dg: float = splai[splaidx['DS_DG']]
    ds_dl: float = splai[splaidx['DS_DL']]
    if splai[splaidx['DP_AG']] == 'NA':
        maxsplai: float = "NA"
    maxsplai: float = max(ds_ag, ds_al, ds_dg, ds_dl)

    # Add df row
    df = pd.concat([df, pd.DataFrame([[
        v.CHROM, v.POS, v.REF, v.ALT[0], 
        vep[vepidx['SYMBOL']], vep[vepidx['SYMBOL_SOURCE']], vep[vepidx['HGNC_ID']], 
        vep[vepidx['Feature']], hgvsc, vep[vepidx['Consequence']], 
        vep[vepidx['EXON']], vep[vepidx['INTRON']],
        strand(vep[vepidx['STRAND']]), 
        ds_ag, ds_al, ds_dg, ds_dl,
        splai[splaidx['DP_AG']], splai[splaidx['DP_AL']], 
        splai[splaidx['DP_DG']], splai[splaidx['DP_DL']],
        maxsplai
    ]], columns=cols)], ignore_index=True)

    if hgvsc == "NA":
        logger.warning(f"[{v.CHROM}:{v.POS}] HGVSc were not found")
    if maxsplai == "NA":
        logger.warning(f"[{v.CHROM}:{v.POS}] SpliceAI scores were not found")

# ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL
# CHROM, POS, REF, ALT, GeneSymbol, NCBI_ID, ENST, ExonIntronNumbers, FLAGS, SYMBOL_SOURCE|



In [14]:
df.to_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.pkl')



In [15]:
df = pd.read_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.pkl')
print(len(df))
df.drop_duplicates(inplace=True)
print(len(df))

import sys
try: 
    __file__
    sys.path.append(os.path.join(os.path.dirname('__file__')))
except NameError:
    Path().resolve()
    sys.path.append(os.path.join(Path().resolve(), '../../../'))

from libs import utils, preprocess, variantfilter, posparser, splaiparser
# from libs import predeffect, scoring
from libs import anno_spliceai, anno_clinvar
from libs.deco import print_filtering_count


20701
20687


In [5]:
#1.   Classify 'Canonical' splice site or 'Non-canonical' splice site
logger.info('Classify "Canonical" splice site or "Non-canonical" splice site...')
df = posparser.classifying_canonical(df, cdot='HGVSc')

#2.   Calculate exonic positions
logger.info('Calculating exonic positions...')
#2-1. Generate 'exonic upstream distance and exonic downstream distance
gencode_gff = '../../../Resources/05_GENCODE_v43lift37/gencode.v43lift37.annotation.sort.gff3.gz'
tbx_anno = pysam.TabixFile(gencode_gff)
df['exon_loc'] = df.progress_apply(
    posparser.calc_exon_loc, tabixfile=tbx_anno, enstcolname='ENST', axis=1)
df = pd.concat([df, df['exon_loc'].str.split(':', expand=True)], axis=1)
df.rename(columns={0: 'ex_up_dist', 1: 'ex_down_dist'}, inplace=True)
df.drop(columns=['exon_loc'], inplace=True)
#2-2. Select minimum distance from upstream distance and downstream distance
df['exon_pos'] = df.parallel_apply(posparser.select_exon_pos, axis=1)
#2-3. Decision exonic splice sites (1 nt in acceptor site or 3 nts on Donor site)
df['exon_splice_site'] = df.parallel_apply(posparser.extract_splicing_region, axis=1)

#3.   Additional Splicing information
logger.info('Annotating splicing information...')
#3-1. Annotate splicing type ('Exonic Acceptor' etc.)
df['SpliceType'] = df.parallel_apply(posparser.select_donor_acceptor, axis=1)


2024/10/16 08:35:47 [INFO   ] (__main__) - Classify "Canonical" splice site or "Non-canonical" splice site...
-2: 321
-1: 358
+1: 648
+2: 224
Total variants      : 20687
Canonical variants  : 1551
non-Canon variants  : 19136

2024/10/16 08:35:47 [INFO   ] (__main__) - Calculating exonic positions...


100%|██████████| 20687/20687 [00:21<00:00, 973.78it/s] 


2024/10/16 08:36:09 [INFO   ] (__main__) - Annotating splicing information...


In [6]:
#3-2. Annotate rank of exon or intron
try:
    db_anno_gencode = '../../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '../../../Resources/06_gffutilsdb/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)
except ValueError:
    db_anno_gencode = '/resources/DBs/gencode.v43lift37.annotation.gtf.db'
    db_anno_intron = '/resources/DBs/gencode.v43lift37.annotation.intron.gtf.db'
    db = gffutils.FeatureDB(db_anno_gencode)
    db_intron = gffutils.FeatureDB(db_anno_intron)

df['ENST_Full'] = df.progress_apply(posparser.fetch_enst_full, db=db, axis=1)
df.to_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.enst.pkl')

#5.   Annotate ClinVar varaints interpretations
logger.info('Annotating ClinVar varaints interpretations...')
clinvar_file = '../../../Resources/03_ClinVar/variant_summary.snv.grch37.germline.criteria.sort.bed.gz'
tbx_clinvar = pysam.TabixFile(clinvar_file)
df['clinvar_same_pos'] = df.progress_apply(
    anno_clinvar.anno_same_pos_vars, tabixfile=tbx_clinvar, axis=1)
df['clinvar_same_motif'] = df.progress_apply(
    anno_clinvar.anno_same_motif_vars, tabixfile=tbx_clinvar, axis=1)

100%|██████████| 20687/20687 [09:11<00:00, 37.51it/s]

2024/10/16 08:45:21 [INFO   ] (__main__) - Annotating ClinVar varaints interpretations...



100%|██████████| 20687/20687 [00:02<00:00, 7592.77it/s] 
100%|██████████| 20687/20687 [00:01<00:00, 10444.36it/s]


In [9]:
df.iloc[149]

CHROM                                 1
POS                             1464710
REF                    AGGTGAGTCAGGCTCC
ALT                                   A
GeneSymbol                       ATAD3A
SymbolSource                       HGNC
HGNC_ID                           25567
ENST                    ENST00000378755
HGVSc               c.1758+2_1758+16del
EXON                              15/16
INTRON                            15/15
Strand                                -
DS_AG                              0.00
DS_AL                              0.64
DS_DG                              0.03
DS_DL                              0.99
DP_AG                                17
DP_AL                              -107
DP_DG                                16
DP_DL                                 1
MaxSpliceAI                        0.99
is_Canonical                       True
Int_loc                              +2
ex_up_dist                            1
ex_down_dist                        107


In [7]:
df = pd.read_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.enst.pkl')

#6.   Parising SpliceAI results (For PVS1 decision tree)
logger.info('Parsing SpliceAI results...')
## Thresholds configuration
TH_min_sALDL, TH_max_sALDL = 0.02, 0.2
TH_min_sAGDG, TH_max_sAGDG = 0.01, 0.05
TH_min_GExon, TH_max_GExon = 25, 500
TH_sAG, TH_sDG = 0.2, 0.2

#6-1. Annotate Exon/Intron position information
df['ExInt_INFO'] = df.progress_apply(
    splaiparser.calc_exint_info, db=db, db_intron=db_intron, axis=1)

2024/10/16 08:45:26 [INFO   ] (__main__) - Parsing SpliceAI results...


  1%|          | 150/20687 [00:03<06:54, 49.55it/s]


StopIteration: 

In [None]:
#6-2. Relative exon location
df['prc_exon_loc'] = df.parallel_apply(posparser.calc_prc_exon_loc, axis=1)

In [6]:
df.to_pickle('splai_vep_vcfs/hgmd_dm/all_DM_chr1.splai.vep.enst.pkl')

In [8]:
df[['EXON', 'INTRON', 'Int_loc']]

Unnamed: 0,EXON,INTRON,Int_loc
0,14/14,,Exonic
1,,1/1,-1
2,2/2,,Exonic
3,2/2,,Exonic
4,2/2,,Exonic
...,...,...,...
20696,5/9,,Exonic
20697,6/9,,Exonic
20698,6/9,,Exonic
20699,7/9,,Exonic


In [9]:
#5.   Annotate ClinVar varaints interpretations
logger.info('Annotating ClinVar varaints interpretations...')
clinvar_file = '../../../Resources/03_ClinVar/variant_summary.snv.grch37.germline.criteria.sort.bed.gz'
tbx_clinvar = pysam.TabixFile(clinvar_file)
df['clinvar_same_pos'] = df.progress_apply(anno_clinvar.anno_same_pos_vars, 
                                           tabixfile=tbx_clinvar, axis=1)
df['clinvar_same_motif'] = df.progress_apply(anno_clinvar.anno_same_motif_vars, 
                                             tabixfile=tbx_clinvar, axis=1)


2024/10/16 08:23:10 [INFO   ] (__main__) - Annotating ClinVar varaints interpretations...


100%|██████████| 20687/20687 [00:02<00:00, 9106.89it/s] 
100%|██████████| 20687/20687 [00:02<00:00, 10212.42it/s]


In [10]:
df

Unnamed: 0,CHROM,POS,REF,ALT,GeneSymbol,SymbolSource,HGNC_ID,ENST,HGVSc,EXON,...,is_Canonical,Int_loc,ex_up_dist,ex_down_dist,exon_pos,exon_splice_site,SpliceType,ENST_Full,clinvar_same_pos,clinvar_same_motif
0,1,879375,C,T,SAMD11,HGNC,28706,ENST00000342066,c.1888C>T,14/14,...,False,Exonic,579,87,579,non_SplExon,Acceptor_ex,ENST00000342066.8_7,[Pathogenic_1:879375],[]
1,1,949363,G,A,ISG15,HGNC,4053,ENST00000379389,c.4-1G>A,,...,True,-1,ENST_not_match,ENST_not_match,ENST_not_match,,ENST_not_match,ENST_with_Ver_not_available,[],unk_SpliceType
2,1,949443,T,A,ISG15,HGNC,4053,ENST00000379389,c.83T>A,2/2,...,False,Exonic,ENST_not_match,ENST_not_match,ENST_not_match,,ENST_not_match,ENST_with_Ver_not_available,[],unk_SpliceType
3,1,949523,C,T,ISG15,HGNC,4053,ENST00000379389,c.163C>T,2/2,...,False,Exonic,ENST_not_match,ENST_not_match,ENST_not_match,,ENST_not_match,ENST_with_Ver_not_available,[],unk_SpliceType
4,1,949643,AC,A,ISG15,HGNC,4053,ENST00000379389,c.285del,2/2,...,False,Exonic,ENST_not_match,ENST_not_match,ENST_not_match,,ENST_not_match,ENST_with_Ver_not_available,[],unk_SpliceType
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20696,1,247597502,G,A,NLRP3,HGNC,16400,ENST00000336119,c.2425G>A,5/9,...,False,Exonic,73,97,73,non_SplExon,Donor_ex,ENST00000336119.8_4,[],[]
20697,1,247599354,T,C,NLRP3,HGNC,16400,ENST00000336119,c.2581T>C,6/9,...,False,Exonic,88,82,88,non_SplExon,Acceptor_ex,ENST00000336119.8_4,[Likely pathogenic_1:247599354],[]
20698,1,247599355,A,G,NLRP3,HGNC,16400,ENST00000336119,c.2582A>G,6/9,...,False,Exonic,87,83,87,non_SplExon,Acceptor_ex,ENST00000336119.8_4,[Pathogenic_1:247599355],[]
20699,1,247607362,C,T,NLRP3,HGNC,16400,ENST00000336119,c.2758C>T,7/9,...,False,Exonic,82,88,82,non_SplExon,Donor_ex,ENST00000336119.8_4,[],[]


In [41]:
query_chr = 'chr1'
query_pos = 247599354
query_enst = 'ENST00000336119'
query_region = (query_chr, query_pos-1, query_pos)

# def fetch_enst_full(enst: str) -> str:

for transcript in db.region(region=query_region, featuretype='transcript'):
    if transcript.id.startswith(query_enst):
        print(transcript)
        print(transcript.id)
    # print(transcript)
    # print(transcript.id)
    # break

chr1	HAVANA	transcript	247579475	247612119	.	+	.	gene_id "ENSG00000162711.18_12"; transcript_id "ENST00000336119.8_4"; gene_type "protein_coding"; gene_name "NLRP3"; transcript_type "protein_coding"; transcript_name "NLRP3-201"; level "2"; protein_id "ENSP00000337383.4"; transcript_support_level "1"; hgnc_id "HGNC:16400"; tag "CAGE_supported_TSS"; tag "upstream_ATG"; tag "basic"; tag "Ensembl_canonical"; tag "MANE_Select"; tag "appris_principal_1"; havana_gene "OTTHUMG00000040647.7_12"; havana_transcript "OTTHUMT00000097740.1_4"; remap_num_mappings "1"; remap_status "full_contig"; remap_target_status "overlap";
ENST00000336119.8_4


In [23]:
i = 0
ts = db.all_features()
while i < 100:
    t = next(ts)
    # t = next(db.featuretypes())
    print(t)

    i += 1



chr1	HAVANA	gene	12010	13670	.	+	.	gene_id "ENSG00000223972.6_6"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level "2"; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2_6"; remap_status "full_contig"; remap_num_mappings "1"; remap_target_status "overlap";
chr1	HAVANA	transcript	12010	13670	.	+	.	gene_id "ENSG00000223972.6_6"; transcript_id "ENST00000450305.2_2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-201"; level "2"; transcript_support_level "NA"; hgnc_id "HGNC:37102"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; tag "Ensembl_canonical"; havana_gene "OTTHUMG00000000961.2_6"; havana_transcript "OTTHUMT00000002844.2_2"; remap_num_mappings "1"; remap_status "full_contig"; remap_target_status "overlap";
chr1	HAVANA	exon	12010	12057	.	+	.	gene_id "ENSG00000223972.6_6"; transcript_id "ENST00000450305.2_2"; gene_type "transcribed_unprocessed_pseud

In [10]:
introns = list(introns)
print(introns)

[]


In [None]:
i = 0
while i < 5:
    t = next(db_intron.features_of_type('transcript'))
    print(t.id)
    i += 1

In [5]:
df

Unnamed: 0,CHROM,POS,REF,ALT,GeneSymbol,SymbolSource,HGNC_ID,ENST,HGVSc,EXON,...,DP_DG,DP_DL,MaxSpliceAI,is_Canonical,Int_loc,ex_up_dist,ex_down_dist,exon_pos,exon_splice_site,SpliceType
0,1,879375,C,T,SAMD11,HGNC,28706,ENST00000342066,c.1888C>T,14/14,...,-2,-1307,0.03,False,Exonic,579,87,579,non_SplExon,Acceptor_ex
1,1,949363,G,A,ISG15,HGNC,4053,ENST00000379389,c.4-1G>A,,...,288,-88,0.99,True,-1,ENST_not_match,ENST_not_match,ENST_not_match,,ENST_not_match
2,1,949443,T,A,ISG15,HGNC,4053,ENST00000379389,c.83T>A,2/2,...,-294,208,0.03,False,Exonic,ENST_not_match,ENST_not_match,ENST_not_match,,ENST_not_match
3,1,949523,C,T,ISG15,HGNC,4053,ENST00000379389,c.163C>T,2/2,...,167,128,0.01,False,Exonic,ENST_not_match,ENST_not_match,ENST_not_match,,ENST_not_match
4,1,949643,AC,A,ISG15,HGNC,4053,ENST00000379389,c.285del,2/2,...,8,43,0.02,False,Exonic,ENST_not_match,ENST_not_match,ENST_not_match,,ENST_not_match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20696,1,247597502,G,A,NLRP3,HGNC,16400,ENST00000336119,c.2425G>A,5/9,...,1940,-180,0.01,False,Exonic,73,97,73,non_SplExon,Donor_ex
20697,1,247599354,T,C,NLRP3,HGNC,16400,ENST00000336119,c.2581T>C,6/9,...,88,1608,0.02,False,Exonic,88,82,88,non_SplExon,Acceptor_ex
20698,1,247599355,A,G,NLRP3,HGNC,16400,ENST00000336119,c.2582A>G,6/9,...,-2201,87,0.14,False,Exonic,87,83,87,non_SplExon,Acceptor_ex
20699,1,247607362,C,T,NLRP3,HGNC,16400,ENST00000336119,c.2758C>T,7/9,...,-89,82,0.18,False,Exonic,82,88,82,non_SplExon,Donor_ex


In [None]:


df['Num_ExInt'] = df.progress_apply(
    posparser.calc_ex_int_num, db=db, db_intron=db_intron, axis=1)

In [5]:
df['SpliceType'].value_counts()

SpliceType
Acceptor_ex       9264
Donor_ex          8347
ENST_not_match    2938
Donor_int           49
Acceptor_int        45
Center_of_Exon      44
Name: count, dtype: int64

ValueError: invalid literal for int() with base 10: 'ENST_not_match'

In [73]:
tbx = tbx_anno.fetch("chr1", 949442, 949443, parser=pysam.asGFF3())

In [75]:
for t in tbx:
    print(t)
    enst = re.match(r'ENST\d+', t.transcript_id).group()
    print(enst)

chr1	HAVANA	transcript	936525	949815	.	+	.	ID=ENST00000624652.1;Parent=ENSG00000187608.10;gene_id=ENSG00000187608.10_10;transcript_id=ENST00000624652.1_4;gene_type=protein_coding;gene_name=ISG15;transcript_type=protein_coding;transcript_name=ISG15-201;level=2;protein_id=ENSP00000485313.1;transcript_support_level=3;hgnc_id=HGNC:4053;tag=mRNA_end_NF,cds_end_NF;havana_gene=OTTHUMG00000040777.4_10;havana_transcript=OTTHUMT00000479385.2_4;remap_num_mappings=1;remap_status=full_contig;remap_target_status=new
ENST00000624652
chr1	HAVANA	transcript	948877	949920	.	+	.	ID=ENST00000649529.1;Parent=ENSG00000187608.10;gene_id=ENSG00000187608.10_10;transcript_id=ENST00000649529.1_4;gene_type=protein_coding;gene_name=ISG15;transcript_type=protein_coding;transcript_name=ISG15-203;level=2;protein_id=ENSP00000496832.1;hgnc_id=HGNC:4053;tag=CAGE_supported_TSS,basic,Ensembl_canonical,MANE_Select,appris_principal_1,CCDS;ccdsid=CCDS6.1;havana_gene=OTTHUMG00000040777.4_10;havana_transcript=OTTHUMT0000050148

In [4]:
# df.loc[(df['Int_loc'] == 'Exonic') & (df['exon_loc'] is None)]
df.loc[(df['Int_loc'] == 'Exonic')]


Unnamed: 0,CHROM,POS,REF,ALT,GeneSymbol,SymbolSource,HGNC_ID,ENST,HGVSc,EXON,...,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,MaxSpliceAI,is_Canonical,Int_loc,exon_loc
0,1,879375,C,T,SAMD11,HGNC,28706,ENST00000342066,c.1888C>T,14/14,...,0.00,0.01,-102,-79,-2,-1307,0.03,False,Exonic,579:87
2,1,949443,T,A,ISG15,HGNC,4053,ENST00000379389,c.83T>A,2/2,...,0.00,0.00,2,-34,-294,208,0.03,False,Exonic,enst_not_match
3,1,949523,C,T,ISG15,HGNC,4053,ENST00000379389,c.163C>T,2/2,...,0.00,0.00,-33,-114,167,128,0.01,False,Exonic,enst_not_match
4,1,949643,AC,A,ISG15,HGNC,4053,ENST00000379389,c.285del,2/2,...,0.02,0.00,-203,-305,8,43,0.02,False,Exonic,enst_not_match
5,1,949648,C,G,ISG15,HGNC,4053,ENST00000379389,c.288C>G,2/2,...,0.00,0.01,-239,-310,-33,3,0.01,False,Exonic,enst_not_match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20696,1,247597502,G,A,NLRP3,HGNC,16400,ENST00000336119,c.2425G>A,5/9,...,0.00,0.00,1770,-40,1940,-180,0.01,False,Exonic,73:97
20697,1,247599354,T,C,NLRP3,HGNC,16400,ENST00000336119,c.2581T>C,6/9,...,0.02,0.00,-82,-2786,88,1608,0.02,False,Exonic,88:82
20698,1,247599355,A,G,NLRP3,HGNC,16400,ENST00000336119,c.2582A>G,6/9,...,0.00,0.10,-5,-83,-2201,87,0.14,False,Exonic,87:83
20699,1,247607362,C,T,NLRP3,HGNC,16400,ENST00000336119,c.2758C>T,7/9,...,0.18,0.12,-257,-88,-89,82,0.18,False,Exonic,82:88


In [20]:
all_mut_default_colnames: list = [
    "disase", "gene", "chrom", "genename", "gdbid", "omimid", "amino", 
    "deletion", "insertion", "codon", "codonAff", "descr", "refseq", "hgvs", 
    "hgvsAll", "dbsnp", "chromosome", "startCoord", "endCoord", 
    "expected_inheritance", "gnomad_AC", "gnomad_AF", "gnomad_AN", "tag", 
    "dmsupport", "rankscore", "mutype", "author", "title", "fullname", 
    "allname", "vol", "page", "year", "pmid", "pmidAll", "reftag", "comments", 
    "acc_num", "new_date", "base", "clinvarID", "clinvar_clnsig"
]
allmut: pd.DataFrame = pd.read_csv(
    'allmut.csv', sep=';', encoding='cp1252', names=all_mut_default_colnames, 
    skiprows=1,low_memory=False)

allmut = allmut[
    ["gene", "genename", "mutype", "clinvar_clnsig", "tag",
     "refseq", "hgvs", "hgvsAll", "chromosome", "startCoord", "endCoord", 
     "amino", "deletion", "insertion", "expected_inheritance", "gnomad_AF"]]

# Drop non-numeric values in 'startCoord'
allmut = allmut.dropna(subset=['startCoord'])

# Drop duplicates in 'chrom', 'startCoord', and 'endCoord'
allmut = allmut.drop_duplicates(subset=['chromosome', 'startCoord', 'endCoord'])

# Extract tag == "DM" from allmut
allmut_dm = allmut[allmut.tag == "DM"].copy()
print(f"A total of {len(allmut_dm)} DM mutations are found in allmut.")

A total of 253018 DM mutations are found in allmut.


In [22]:
# Fillna with empty string in "gnomad_AF" colmun in allmut_dm
# Extratct MAF 0 from allmut_dm
allmut_dm['gnomad_AF'].fillna(0, inplace=True)
allmut_dm_maf0 = allmut_dm[allmut_dm['gnomad_AF'] == 0].copy()
print(f"A total of {len(allmut_dm_maf0)} DM mutations are found in allmut with MAF 0.")

# Extract non-deletion or non-insertion from allmut_dm
allmut_dm_maf0_snv = allmut_dm_maf0[(allmut_dm_maf0['deletion'].isnull()) & (allmut_dm_maf0['insertion'].isnull())]
print(f"A total of {len(allmut_dm_maf0_snv)} DM mutations are found in allmut with MAF 0 and non-deletion or non-insertion.")

# Extract the mutation type from the mutype column
splice_mutations = allmut_dm[allmut_dm["mutype"].str.contains("splice")].copy()
non_splice_mutations = allmut_dm[~allmut_dm["mutype"].str.contains("splice")]
print(f"Splicing_DM: {len(splice_mutations)}, Non-splicing_DM: {len(non_splice_mutations)}")

A total of 231981 DM mutations are found in allmut with MAF 0.
A total of 154107 DM mutations are found in allmut with MAF 0 and non-deletion or non-insertion.
Splicing_DM: 23777, Non-splicing_DM: 229241


In [23]:
print(allmut_dm_maf0_snv['mutype'].unique())
print(allmut_dm_maf0_snv['gnomad_AF'].unique())

['missense' 'initiation' 'nonsense' 'synonymous' 'gross'
 'canonical-splice' 'exonic-splice' 'nonstop' 'regulatory' 'splice']
[0.]


In [24]:
splice_mutations

Unnamed: 0,gene,genename,mutype,clinvar_clnsig,tag,refseq,hgvs,hgvsAll,chromosome,startCoord,endCoord,amino,deletion,insertion,expected_inheritance,gnomad_AF
96651,AP3B1,Adaptor related protein complex 3 subunit beta 1,canonical-splice,,DM,NM_003664.5,1168-1G>C,1168minus1GtoC,5,78165673.0,78165673.0,,,,AR,0.0
126648,NF1,Neurofibromin 1,exonic-splice,Pathogenic/Likely_pathogenic,DM,NM_000267.3,1748A>G,1748AtoG | K583R,17,31223470.0,31223470.0,,,,AD,0.0
129700,TSC2,TSC complex subunit 2,exonic-splice,Pathogenic,DM,NM_000548.5,1255C>T,1255CtoT | P419S,16,2062006.0,2062006.0,,,,AD,0.0
130620,COL4A5,Collagen type IV alpha 5 chain,exonic-splice,Likely_pathogenic,DM,NM_000495.5,1856C>T,1856CtoT | P619L,X,108598778.0,108598778.0,,,,XLD,0.0
134411,DMD,Dystrophin,exonic-splice,Pathogenic,DM,NM_004006.3,10279C>T,10279CtoT | Q3427*,X,31173588.0,31173588.0,,,,XLR,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510785,PPT1,Palmitoyl-protein thioesterase 1,canonical-splice,Likely_pathogenic,DM,NM_000310.4,536+1G>A,536plus1GtoA,1,40089409.0,40089409.0,,,,AR,0.0
510786,APC,APC regulator of WNT signaling pathway,splice,Likely_pathogenic,DM,NM_000038.6,1409-6A>G,1409minus6AtoG,5,112827102.0,112827102.0,,,,AD,0.0
510789,UROD,Uroporphyrinogen decarboxylase,canonical-splice,,DM,NM_000374.5,876-1G>C,876minus1GtoC,1,45014939.0,45014939.0,,,,ADAR,0.0
510791,PTS,6-pyruvoyltetrahydropterin synthase,canonical-splice,Pathogenic,DM,NM_000317.3,186+1G>A,186plus1GtoA,11,112230231.0,112230231.0,,,,AR,0.0


In [45]:
def remove_dot_ver(x):
    if x == '.':
        pass
    else:
        return re.match(r'[a-zA-Z_]+\d+', x).group()

Unnamed: 0,gene,chrom,genename,mutype,clinvarID,clinvar_clnsig,tag,refseq,hgvs,hgvsAll,chromosome,startCoord,endCoord,amino,deletion,insertion,expected_inheritance,gnomad_AF


In [34]:
len(allmut)

510804

In [2]:
def _liftover_to_hg19(chrom, pos):
    converter = get_lifter('hg38', 'hg19')
    result = converter.query(chrom, pos)
    if result:
        return result[0]
    else:
        return None
    
def anno_vars_id(row):
    variant_id = f'{row["CHROM"]}:{row["POS_hg19"]}-{row["REF"]}-{row["ALT"]}'
    return variant_id

def anno_hg19_pos(row):
    converted = _liftover_to_hg19(row['CHROM'], row['POS_hg38'])
    return converted[1]

def remove_dot_ver(x):
    if x == '.':
        pass
    else:
        return re.match(r'[a-zA-Z_]+\d+', x).group()
    
def remove_non_canon(x):
    if x in enst_set:
        return True
    else:
        pass

def create_refalt(row, nt):
    if row['Strand'] == '+':
        return row[nt]
    elif row['Strand'] == '-':
        return str(Seq(row[nt]).complement())
    else:
        pass

In [3]:
canonlist = '/Github/Projects/DeNovo/data/CanonicalTranscripts_COMP/CanonicalTranscripts.exoncount.tsv'
refseq = '/work/CanonicalTrasncripts/gencode.v43lift37.metadata.RefSeq.gz'

df_canon = pd.read_table(canonlist, header=0)
df_canon = df_canon[(df_canon['ENST'] != 'ENST00000649912')
                    & (df_canon['ENST'] != 'ENST00000609375')]
df_enst = df_canon.drop_duplicates(subset='ENST')
enst_set = set(df_enst['ENST'])
df_refseq = pd.read_table(refseq, header=None, 
                          names=['ENST_refseq', 'RefSeq_RNA', 'RefSeq_Pro'])
df_refseq.fillna(value='.', inplace=True)

df_refseq['ENST_refseq'] = df_refseq['ENST_refseq'].apply(remove_dot_ver)
df_refseq['RefSeq_RNA'] = df_refseq['RefSeq_RNA'].apply(remove_dot_ver)
df_refseq['RefSeq_Pro'] = df_refseq['RefSeq_Pro'].apply(remove_dot_ver)

df_refseq['is_Canonical'] =  df_refseq['ENST_refseq'].apply(remove_non_canon)
df_refseq_canon = df_refseq[df_refseq['is_Canonical'] == True].copy()

##### Generate VCF

In [4]:
# Loading HGMD splicing variants list (tsv)
hgmd_file = '/Github/MyProjects/DeNovo/data/ValidationData/Positive/allmut.trim.colfixed.maf0.tsv.txt'
df = pd.read_table(hgmd_file, header=0)

# Pre-processing
df = df.dropna(subset=['startCoord'])
df['startCoord'] = df['startCoord'].astype(int)
df['refseq'] = df['refseq'].apply(remove_dot_ver)

# Extract REF and ALT from HGVS descriptions
sr_alt_nt = df['hgvs'].str[-1:].rename('alt_nt')
sr_ref_nt = df['hgvs'].str[-3:-2].rename('ref_nt')
df = pd.concat([df, sr_ref_nt, sr_alt_nt], axis=1)

# Insert cols as VCF
df.loc[:,'ID'] = '.'
df.loc[:,'QUAL'] = '.'
df.loc[:,'FILTER'] = '.'
df.loc[:,'INFO'] = '.'

# Annotate ENST IDs
df = pd.merge(df, df_enst, how='left', 
              left_on='gene', right_on='GeneSymbol')

# Rename cols for downstream processing
df = df.rename(columns={'CHROM': 'Chr',
                        'chromosome': 'CHROM', 
                        'startCoord': 'POS_hg38'})

# Create REF and ALT columns
df['REF'] = df.apply(create_refalt, nt='ref_nt', axis=1)
df['ALT'] = df.apply(create_refalt, nt='alt_nt', axis=1)

In [5]:
# Liftover to hg19
df['POS_hg19'] = df.apply(anno_hg19_pos, axis=1)

In [None]:
df2 = df.copy()
df2 = df2.astype({'POS_hg19': str})

# Annotate variant IDs
df2['variant_id'] = df2.apply(anno_vars_id, axis=1)

# Extract columns for VCF
df_19 = df2[['CHROM', 'POS_hg19', 
             'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']]
df_38 = df2[['CHROM', 'POS_hg38', 
             'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']]

In [265]:
df_19 = df_19.dropna(subset=['REF'])
df_38 = df_38.dropna(subset=['REF'])

In [266]:
# Output as VCF
df_19.to_csv('./patho.hg19.vcf', sep='\t', index=False, header=False)
df_38.to_csv('./patho.hg38.vcf', sep='\t', index=False, header=False)

##### Concatenate header and variant list

In [267]:
!cat ../header_for_VCF.tsv ./patho.hg19.vcf > patho.hg19.header.vcf
!cat ../header_for_VCF.tsv ./patho.hg38.vcf > patho.hg38.header.vcf

In [263]:
df[['Strand', 'ref_nt', 'REF', 'alt_nt', 'ALT']]

Unnamed: 0,Strand,ref_nt,REF,alt_nt,ALT
0,+,A,A,G,G
1,-,T,A,G,C
2,-,G,C,A,T
3,+,A,A,G,G
4,-,A,T,G,C
...,...,...,...,...,...
10553,,G,,A,
10554,+,G,G,A,A
10555,-,G,C,A,T
10556,-,G,C,A,T


In [277]:
df2 =df2[['gene', 'gdbid', 'omimid', 'amino', 'codon', 'codonAff', 'descr',
       'refseq', 'hgvs', 'CHROM', 'POS_hg38', 'endCoord', 
       'expected_inheritance', 'dmsupport', 'mutype', 'acc_num', 'new_date', 
       'clinvarID', 'clinvar_clnsig', 'Chr', 'Start', 'End', 'Strand', 
       'GeneSymbol', 'HGNC_ID', 'ENSG', 'ENST', 'GeneType', 'Tag', 
       'REF', 'ALT', 'POS_hg19', 'variant_id']]

In [278]:
df2.to_csv('./patho2.tsv', sep='\t', index=False)