# Analysis of cases where benign mutations do not match AlphaMissense data

In [1]:
import os

import pandas as pd

In [2]:
import helpers, fix_positions

### Load positions

TAZ exon and intron positions from ensembl.org, in hg38 coordinates.

In [3]:
# Load the TAZ positions through the project-local fix_positions module
# (presumably the Ensembl hg38 exon/intron table — confirm in fix_positions).
taz_positions = fix_positions.load_taz_positions()

In [4]:
# Display the loaded positions for a visual sanity check.
taz_positions

Unnamed: 0,No.,Exon / Intron,Start,End,Start Phase,End Phase,Length,Sequence
0,,5' upstream sequence,0,0,,,,"{""provisional"":{..."
1,1.0,ENSE00003022330,154411539,154411952,-,1,414.0,"{""url"":""/Homo_sa..."
2,,Intron 1-2,154411953,154412085,,,133.0,"{""provisional"":{..."
3,2.0,ENSE00003127263,154412086,154412214,1,1,129.0,"{""provisional"":{..."
4,,Intron 2-3,154412215,154413206,,,992.0,"{""provisional"":{..."
5,3.0,ENSE00003736420,154413207,154413252,1,2,46.0,"{""url"":""/Homo_sa..."
6,,Intron 3-4,154413253,154413481,,,229.0,"{""provisional"":{..."
7,4.0,ENSE00003017673,154413482,154413567,2,1,86.0,"{""provisional"":{..."
8,,Intron 4-5,154413568,154414100,,,533.0,"{""url"":""/Homo_sa..."
9,5.0,ENSE00003050674,154414101,154414190,1,1,90.0,"{""provisional"":{..."


# Check benign variants that were not matched

Load the data for which we did not find a match between the Iris (TAZ) database and AlphaMissense.

Can we understand the reason why no match was found?

In [5]:
# CSV of benign Iris (TAZ) variants for which no AlphaMissense match was found;
# created by notebooks/exploratory_analysis/iris_alpha_missense.ipynb in https://github.com/zdenkas/bths-explor/
# The location can be overridden with the BENIGN_NOT_MATCH_CSV environment variable,
# so the notebook also runs on machines where the author's absolute path does not exist.
# When the variable is unset, the original path is used unchanged.
benign_not_match_path = os.environ.get(
    'BENIGN_NOT_MATCH_CSV',
    '/Users/zdenka/bths/bths-explor/alpha_missense/alpha_iris/benign_not_match_2024-01-25_18_15_57_237928.csv',
)

In [6]:
# Read the unmatched benign variants from the CSV into a DataFrame.
benign_not_match_df = pd.read_csv(filepath_or_buffer=benign_not_match_path)

In [7]:
# Number of unmatched benign rows.
benign_not_match_df.shape[0]

61

In [8]:
# Name of the column holding the hg38 genomic location (values look like 'X:154411844').
loc_38_col = 'Location in Genome release 38 (hg38)'


In [9]:
# Scan every unmatched benign row and collect any "weird split" reported by
# fix_positions.find_unexpected_letters_on_38_position for the row's hg38 position.
# Loop variables keep their original names because they remain in the notebook namespace.
weird_splits_benign_not_match = []
for key, row in benign_not_match_df.iterrows():
    try:
        # Rows without a location do not hold a str (the recorded output shows them as nan).
        if not isinstance(row[loc_38_col], str):
            # The row index is printed once, separated from the offending value.
            print(f'! loc 38 is None for {key}, skipping the row... '
                  f'{row[loc_38_col]} {row["DNA Modifications"]}')
            continue
        # Location strings look like 'X:154411844'; keep the numeric part after the colon.
        taz_database_38_loc = int(row[loc_38_col].split(':')[1])
        dna_modifications = row['DNA Modifications']
        unexpected, weird_split = fix_positions.find_unexpected_letters_on_38_position(
            taz_database_38_loc, dna_modifications, taz_positions)

        if weird_split is not None:
            weird_splits_benign_not_match.append(weird_split)

    except Exception as ex:
        # Fail fast: report the first failing row and stop, so it can be inspected.
        # row.get avoids raising a second error from inside the handler when the
        # location column itself is what is missing.
        print('Exception', ex, key, row.get(loc_38_col))  # , row)
        print('===============')
        break
        

! loc 38 is None for 18, skipping the row... 18nan c.250C>T
! loc 38 is None for 19, skipping the row... 19nan c.255C>T
! loc 38 is None for 28, skipping the row... 28nan c.497T>C
! loc 38 is None for 33, skipping the row... 33nan c.557G>A


^^ We have 4 cases where the hg38 location (loc 38) is missing. TODO: fix these cases.

In [10]:
# How many rows produced a "weird split" in the scan above.
len(weird_splits_benign_not_match)

0

In [11]:
# Print each collected weird split on its own line (prints nothing when the list is empty).
for weird_split in weird_splits_benign_not_match:
    print(weird_split)

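^^ It seems that these 61 cases, where benign Iris data can't be matched with AlphaMissense, are not caused by an unexpected letter in the "DNA Modifications" column (note: the 4 rows without an hg38 location were skipped, so only 57 rows were actually checked) --> there must be some other reason?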



In [12]:
# Row count of the unmatched set, shown again before inspecting the rows.
benign_not_match_df.shape[0]

61

In [13]:
# Preview the first 20 unmatched rows.
benign_not_match_df.iloc[:20]

Unnamed: 0.1,Unnamed: 0,Location,Location in Genome release 37 (hg19),Location in Genome release 38 (hg38),DNA Modifications,Protein or mRNA Variants,References & population frequency,Source,Amino acid conservation & comments,SIFT prediction,PolyPhen2 prediction,Splicing prediction,Additional variants in other genes,Notes,loc_clean,chromosome_clean,REF,ALT,leftover,POS_REF_ALT
0,18,Exon 1,X:153640181,X:154411844,c.1A>T,p.Met1Leu,,sequencing error?,,,,,,Not in ExAC,154411844.0,X,A,T,,154411844AT
1,19,Exon 1,X:153640182,X:154411845,c.2T>G,p.Met1Arg,,sequencing error?,,,,,,Not in ExAC,154411845.0,X,T,G,,154411845TG
2,29,Exon 1,X:153640213,X:154411876,c.33G>C,p.Ala11=,gnomad exomes; Eur Finnish female 2/104912; ma...,,,,,,,,154411876.0,X,G,C,,154411876GC
3,30,Exon 1,X-153640225,X:154411888,c.45C>T,p.Leu15=,gnomad exomes; Afr female 1/107408,,,,,,,,154411888.0,X,C,T,,154411888CT
4,31,Exon 1,X:153640228,X:154411891,c.48C>G,p.Thr16= [+ MYBPC3 VUS],gnomad exomes & genomes; African male 2/64971...,ClinVar Jul 2010,,,,,MYBPC3 NM_000256.3 c.3677G>; p.Arg1226Leu Cli...,Likely benign,154411891.0,X,C,G,,154411891CG
5,32,Exon 1,X:153640234,X:154411897,c.54C>T,p.Thr18=,SNP MAF n/a,,,,,,,,154411897.0,X,C,T,,154411897CT
6,34,Exon 1,X:153640249,X:154411912,c.69C>T,p.Val23=,gnomad exomes; Latino female 2/109440,ClinVar Apr 2017,,,,No Splicing effect predicted,,,154411912.0,X,C,T,,154411912CT
7,35,Exon 1,X:153640259,X:154411922,c.79T>C,p.Leu27=,"gnomad exomes; Ashk Jew 4, other 1; Ashk Jew ...",,,,,No Splicing effect predicted,,,154411922.0,X,T,C,,154411922TC
8,36,Exon 1,X-153640270,X:154411933,c.90C>G,p.Thr30=,gnomad exomes; Latino female 1/109562,,,,,,,,154411933.0,X,C,G,,154411933CG
9,38,Exon 1,X:153640288,X:154411951,c.108C>T,p.Thr36=,,ClinVar Sept 2017,,,,,,Germline; Report included,154411951.0,X,C,T,,154411951CT
