In [10]:
import pandas as pd
import numpy as np
import pysam
from pysam import VariantFile


In [11]:
# Load the annotated variant table. Every column is read as text (dtype=str),
# so numeric fields must be cast explicitly before any arithmetic.
mydata = "../ValidationData/mydata/original.snpeff.state.disease.identifiedgene.filtered.splai.tsv"
df = pd.read_csv(mydata, sep="\t", dtype=str)

In [12]:
# Path to the ClinVar BCF opened with VariantFile in the cells below.
# NOTE(review): the file name contains "nocoflicted" (presumably meant
# "noconflicted"); it has to match the name on disk, so it is left untouched.
clinvar_bcf = "../clinvar/Filtered_BCF_GRCh37_20241211-044124/clinvar_GRCh37.germline.nocoflicted.bcf.gz"

In [12]:
# Exploratory lookup: build a query window around one row of the input table.
i = 2

# Columns 34..37 are read by position; presumably CHROM, POS, REF, ALT
# (TODO confirm against the table header).
query_chr, query_pos, query_ref, query_alt = (
    df.iat[i, column] for column in range(34, 38)
)

# The window is asymmetric: 10 kb before the position, 1 Mb after it.
query_start = int(query_pos) - 10000
query_end = int(query_pos) + 1000000
query_variant = "-".join(
    f"{part}"
    for part in (query_chr, query_start, query_end, query_ref, query_alt)
)

query_variant

'1-100523565-101533565-C-T'

In [20]:
# Open the ClinVar BCF and request the records in the exploratory window.
bcf_in = VariantFile(clinvar_bcf)
recs = bcf_in.fetch(contig=query_chr, start=query_start, stop=query_end)

In [47]:

# Advance the iterator by one record and print its contig-pos-ref-alt key.
rec = next(recs)
print("-".join(str(part) for part in (rec.contig, rec.pos, rec.ref, rec.alts[0])))
clnsig = rec.info["CLNSIG"]
# The value read above is immediately replaced by a hard-coded example tuple.
clnsig: tuple = ('Pathogenic', 'Pathogenic/LP', 'Benign')

# Materialise the record's CLNSIG values as a plain list for display.
clnsigs = list(rec.info["CLNSIG"])
clnsigs

1-100573865-T-A


['Benign']

In [13]:
# ClinVar handle that is passed to the annotation functions applied to `df`.
cln_bcf = VariantFile(clinvar_bcf)

def remove_square_brackets(s: str) -> str:
    """Return ``s`` with every "[" and "]" character stripped out."""
    return s.translate(str.maketrans("", "", "[]"))

def anno_same_pos_vars(row, cln_bcf: VariantFile) -> str:
    """Return the ClinVar clinical significance of the identical variant.

    Args:
        row: Mapping / pandas Series providing 'CHROM', 'POS', 'REF' and
            'ALT'. 'POS' is converted with ``int()``.
        cln_bcf: An open, indexed ClinVar ``VariantFile``.

    Returns:
        str: The CLNSIG values of every fetched record whose
            ``contig-pos-ref-alt`` key equals the query variant, rendered as
            the ``str()`` of a list of lists with the square brackets removed
            (e.g. ``"'Likely_benign'"``), or ``"No_ClinVar_info_found"`` when
            no matching record carries a CLNSIG entry.
    """
    query_chr: str = f"{row['CHROM']}"
    query_pos: int = int(row['POS'])
    query_variant: str = f"{query_chr}-{query_pos}-{row['REF']}-{row['ALT']}"

    samepos = []
    # The window (POS - 1, POS) is used to retrieve the records at POS.
    for rec in cln_bcf.fetch(query_chr, query_pos - 1, query_pos):
        # Only the first ALT allele of a record takes part in the comparison.
        rec_alt: str = "." if rec.alts is None else rec.alts[0]
        if f"{rec.contig}-{rec.pos}-{rec.ref}-{rec_alt}" != query_variant:
            continue

        # A matching record without CLNSIG is skipped instead of aborting
        # the whole DataFrame.apply with a KeyError.
        clnsig = rec.info.get("CLNSIG")
        if clnsig is None:
            continue
        samepos.append(list(clnsig))

    if not samepos:
        return "No_ClinVar_info_found"
    return remove_square_brackets(str(samepos))

# Annotate every row with the ClinVar significance of the identical variant.
df['same_pos'] = df.apply(lambda row: anno_same_pos_vars(row, cln_bcf), axis=1)

In [None]:
def _generate_query_pos(row) -> tuple:
    if row['SpliceType'] == 'Donor_int':
        if row['Strand'] == '+':
            query_start: int = query_pos - int(row['IntronDist']) - 2 - 1
            query_end: int = query_pos - int(row['IntronDist']) + 6
        elif row['Strand'] == '-':
            query_start: int = query_pos + int(row['IntronDist']) - 6 - 1
            query_end: int = query_pos + int(row['IntronDist']) + 2
        else:
            return 'unk_Strand', 'unk_Strand', 'unk_Strand'

    elif row['SpliceType'] == 'Donor_ex':
        if row['Strand'] == '+':
            query_start: int = query_pos + int(row['exon_pos']) - 3 - 1
            query_end: int = query_pos + int(row['exon_pos']) + 5
        elif row['Strand'] == '-':
            query_start: int = query_pos - int(row['exon_pos']) - 5 - 1
            query_end: int = query_pos - int(row['exon_pos']) + 3
        else:
            return 'unk_Strand', 'unk_Strand', 'unk_Strand'
    
    elif row['SpliceType'] == 'Acceptor_int':
        if row['Strand'] == '+':
            query_start: int = query_pos + (- int(row['IntronDist'])) - 20 - 1
            query_end: int = query_pos + (- int(row['IntronDist'])) + 0
        elif row['Strand'] == '-':
            query_start: int = query_pos - (- int(row['IntronDist'])) - 0 - 1
            query_end: int = query_pos - (- int(row['IntronDist'])) + 20
        else:
            return 'unk_Strand', 'unk_Strand', 'unk_Strand'
    
    elif row['SpliceType'] == 'Acceptor_ex':
        if row['Strand'] == '+':
            query_start: int = query_pos - int(row['exon_pos']) - 19 - 1
            query_end: int = query_pos - int(row['exon_pos']) + 1
        elif row['Strand'] == '-':
            query_start: int = query_pos + int(row['exon_pos']) - 1 - 1
            query_end: int = query_pos + int(row['exon_pos']) + 19
        else:
            return 'unk_Strand'
        
    else:
        return 'unk_SpliceType', 'unk_SpliceType', 'unk_SpliceType'
    
    return str(row['CHROM']), query_start, query_end

def anno_same_motif_vars(row, cln_bcf: VariantFile) -> str:
    """List the ClinVar records that fall in the row's splice-motif window.

    Args:
        row: Mapping / pandas Series forwarded to ``_generate_query_pos``.
        cln_bcf: An open, indexed ClinVar ``VariantFile``.

    Returns:
        str: ``'unk_Strand'`` or ``'unk_SpliceType'`` when no window can be
            derived; ``"No_ClinVar_info_found"`` when the window holds no
            record; otherwise the ``str()`` of a list of
            ``"<CLNSIG>_<contig>:<pos>"`` labels with the square brackets
            removed.
    """
    region = _generate_query_pos(row)

    # Accept both a bare sentinel string and a tuple filled with sentinels.
    status = region if isinstance(region, str) else region[0]
    if status in ('unk_Strand', 'unk_SpliceType'):
        return status

    samemotifs = []
    for rec in cln_bcf.fetch(*region):
        # VariantRecord objects are read through attributes, not by column
        # index. The label uses CLNSIG as the per-record classification
        # (NOTE(review): confirm this is the field wanted in the label).
        clnsig = rec.info.get("CLNSIG")
        if clnsig is None:
            label = "."
        elif isinstance(clnsig, str):
            label = clnsig
        else:
            label = ",".join(str(value) for value in clnsig)
        samemotifs.append(f'{label}_{rec.contig}:{rec.pos}')

    if not samemotifs:
        return "No_ClinVar_info_found"
    return remove_square_brackets(str(samemotifs))

In [18]:
# Earlier tabix-based form of this query, kept for reference:
# clinvars = cln_bcf.fetch(
#          query_chr, query_start, query_end, parser=pysam.asBed())
i = 3
regions = ("2", 100, 2000000)

# Scratch query over a fixed window of contig "2".
recs = cln_bcf.fetch(regions[0], regions[1], regions[2])


In [None]:

def anno_same_motif_vars(row, tabixfile: pysam.TabixFile):
    """Fetch the variants within the same splicing motif from ClinVar data.

    NOTE(review): this definition reuses the name of the VariantFile-based
    ``anno_same_motif_vars``; whichever cell runs last wins.

    Args:
        row: Mapping / pandas Series providing 'CHROM', 'POS', 'SpliceType',
            'Strand' and the distance column used by the splice type
            ('IntronDist' for ``*_int``, 'exon_pos' for ``*_ex``).
        tabixfile: An open tabix-indexed file; fetched rows are parsed with
            ``pysam.asBed()``.

    Returns:
        list: One ``"<field 7>_<field 0>:<field 1>"`` string per fetched row
            (described by the original author as Prediction_CHROM:POS).
        tuple: A 3-tuple filled with ``'unk_SpliceType'`` or ``'unk_Strand'``
            when no window can be derived. The tuple shape is kept as it was
            for backward compatibility.
    """
    # splice type -> (distance column, {strand: (sign, first, last)})
    # anchor = POS + sign * distance; start = anchor + first - 1;
    # end = anchor + last.
    motif_windows = {
        'Donor_int': ('IntronDist', {'+': (-1, -2, 6), '-': (1, -6, 2)}),
        'Donor_ex': ('exon_pos', {'+': (1, -3, 5), '-': (-1, -5, 3)}),
        'Acceptor_int': ('IntronDist', {'+': (-1, -20, 0), '-': (1, 0, 20)}),
        'Acceptor_ex': ('exon_pos', {'+': (-1, -19, 1), '-': (1, -1, 19)}),
    }

    query_chr: str = f'{row["CHROM"]}'
    query_pos: int = int(row['POS'])

    # The window is derived here directly, so the function does not depend
    # on the separate helper cell having been run.
    splice_type = row['SpliceType']
    if splice_type not in motif_windows:
        return 'unk_SpliceType', 'unk_SpliceType', 'unk_SpliceType'

    dist_col, per_strand = motif_windows[splice_type]
    strand = row['Strand']
    if strand not in per_strand:
        return 'unk_Strand', 'unk_Strand', 'unk_Strand'

    sign, first, last = per_strand[strand]
    anchor = query_pos + sign * int(row[dist_col])
    query_start: int = anchor + first - 1
    query_end: int = anchor + last

    clinvars = tabixfile.fetch(
        query_chr, query_start, query_end, parser=pysam.asBed())

    return [f'{clinvar[7]}_{clinvar[0]}:{clinvar[1]}' for clinvar in clinvars]

In [67]:
# Tally how many rows received each 'same_pos' annotation value.
df['same_pos'].value_counts()

same_pos
No_ClinVar_info_found             15716
'Likely_benign'                     539
'Pathogenic'                         35
'Likely_pathogenic'                  22
'Pathogenic/Likely_pathogenic'       20
'Benign'                             14
'Benign/Likely_benign'                7
Name: count, dtype: int64

In [58]:
# Show only the rows whose 'same_pos' is not the "nothing found" sentinel.
df[df['same_pos'].ne("No_ClinVar_info_found")]

Unnamed: 0,sample,fa,mo,type,ID_x,vqslod,triodenovo,dnmfilter,denovogear,denovofilter,...,DS_AG,DS_AL,DS_DG,DS_DL,DP_AG,DP_AL,DP_DG,DP_DL,maxsplai,same_pos
6,Sample_10564,Sample_10562,Sample_10563,snv,1:100661991-A-T,-1.171,2.19,0.0235918193666265,,False,...,0.0,0.04,0.0,0.02,-2,-13,-13,-221,0.04,[[Likely_benign]]
7,Sample_10564,Sample_10562,Sample_10563,snv,1:100661994-A-T,-0.6796,2.19,0.00131575948972965,,False,...,0.0,0.04,0.0,0.01,-37,-16,-16,-224,0.04,[[Likely_benign]]
8,Sample_10564,Sample_10562,Sample_10563,snv,1:100661997-A-G,-1.255,10.78,0.000628006845710701,0.998948,False,...,0.06,0.0,0.02,0.0,-19,92,-227,4,0.06,[[Likely_benign]]
35,Sample_7118,Sample_7119,Sample_7120,snv,1:109477385-G-A,3.68,13.23,0.998983274802696,0.999996,True,...,0.0,0.0,0.0,0.0,1638,152,-143,-196,0.0,[[Likely_benign]]
36,Sample_7118,Sample_7119,Sample_7120,snv,1:109477385-G-A,3.68,13.23,0.998983274802696,0.999996,True,...,0.0,0.0,0.0,0.0,1638,152,-143,-196,0.0,[[Likely_benign]]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16313,Sample_17213,Sample_17214,Sample_17215,snv,X:70612508-C-T,-1.029,,,,True,...,0.0,0.0,0.0,0.0,703,189,111,-38,0.0,[[Likely_benign]]
16322,Sample_12995,Sample_12996,Sample_12997,snv,X:71825181-G-A,0.395,,,,False,...,0.0,0.38,0.0,0.49,3049,69,-2935,-60,0.49,[[Pathogenic]]
16340,Sample_12362,Sample_12363,Sample_12364,snv,X:85218829-A-G,1.17,15.13,0.189823675562113,0.999997,True,...,0.0,0.0,0.0,0.0,148,-1853,-159,-1989,0.0,[[Benign]]
16341,Sample_5766,Sample_5784,Sample_5785,snv,X:85282539-T-G,-0.7864,14.82,0.506066077390227,1.0,True,...,0.0,0.0,0.0,0.0,22,-2,-125,2260,0.0,[[Likely_benign]]
