<a href="https://colab.research.google.com/github/unmtransinfo/TICTAC/blob/master/python/TICTAC_MedlineGenomics_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TICTAC-MedlineGenomics Validation
Compare TICTAC disease-gene associations with Medline Genomics (MedlinePlus Genetics, formerly Genetics Home Reference) associations for the same conditions.
Map diseases via DOID and common UMLS CUIs.

In [None]:
import sys, os, re
import pandas as pd

### Read DO2UMLS mapping file.
DO2UMLS mapping file from Disease Ontology.

In [None]:
# Load the Disease Ontology -> UMLS cross-reference table published in the TICTAC repository.
do2cui = pd.read_csv(
    "https://raw.githubusercontent.com/unmtransinfo/TICTAC/master/input/do_umls_mapping.csv"
)
# The raw file names its identifier column "id"; report the distinct count before relabelling.
print(f"""Unique DOIDs: {do2cui["id"].nunique(dropna=False)}""")
# Relabel the columns positionally with the names used by the rest of the notebook.
do2cui = do2cui.set_axis(["doid", "do_label", "umls_cui"], axis=1)

Unique DOIDs: 6300


Split UMLS CUIs delimited by vertical bars.

In [None]:
# Expand the "|"-delimited `umls_cui` column so that each row carries at most one CUI.
# `explode` replaces the former `.apply(pd.Series, 1).stack()` + index join, which built one
# Series per row and passed a stray positional `1` into `Series.apply` (where it landed on
# the `convert_dtype` parameter, deprecated in recent pandas releases).
# Rows whose CUI is missing are kept with NaN, and the original row index is repeated for
# every CUI of a DOID. Values that are already single CUIs pass through unchanged, so
# re-running this cell does not multiply rows.
do2cui = do2cui.assign(umls_cui=do2cui["umls_cui"].str.split("|")).explode("umls_cui")
print(f"""Unique DOIDs: {do2cui["doid"].unique().size}; CUIs: {do2cui["umls_cui"].unique().size}""")
do2cui.sample(8)

Unique DOIDs: 6300; CUIs: 6856


Unnamed: 0,doid,do_label,umls_cui
3800,DOID:4140,biliary dyskinesia,UMLS_CUI:C0005416
5708,DOID:8409,microinvasive cervical squamous cell carcinoma,UMLS_CUI:C1333370
82,DOID:0050440,familial partial lipodystrophy,UMLS_CUI:C1720861
4599,DOID:5577,gastrinoma,UMLS_CUI:C0017150
1206,DOID:10976,membranous glomerulonephritis,UMLS_CUI:C0017665
2924,DOID:2457,giant papillary conjunctivitis,UMLS_CUI:C0009769
1138,DOID:10816,duodenum adenocarcinoma,UMLS_CUI:C0278804
6164,DOID:9682,yellow fever,UMLS_CUI:C0043395


### Read Medline file.
Split CUIs delimited by commas.
Remove rows without CUIs.

In [None]:
# Load the Medline conditions table. Columns are renamed by name rather than by position:
# `usecols` ignores the order of the list it is given, so positional relabelling would
# silently depend on the column order inside the file.
medgen = pd.read_csv(
    "https://raw.githubusercontent.com/unmtransinfo/TICTAC/master/output/medline_genetics_conditions_xrefs.tsv",
    sep="\t",
    usecols=["id", "name", "GTR"],
).rename(columns={"id": "medline_id", "name": "medline_name", "GTR": "umls_cui"})
print(f"""Unique MedlineIDs: {medgen["medline_id"].unique().size}""")
# One row per (condition, CUI): split the comma-delimited CUI list and explode it.
# The exploded Series keeps the original row index, so the index join repeats each
# condition once per CUI and leaves `umls_cui` as the last column.
cui_per_row = medgen["umls_cui"].str.split(",").explode()
medgen = medgen.drop(columns="umls_cui").join(cui_per_row)
# Drop conditions without a CUI, then prefix the bare CUIs (e.g. "C0026764") so they match
# the "UMLS_CUI:C0026764" form used in the DO mapping table.
medgen = medgen.loc[medgen["umls_cui"].notna()].assign(
    umls_cui=lambda df: "UMLS_CUI:" + df["umls_cui"]
)
print(f"""Unique MedlineIDs (with CUIs): {medgen["medline_id"].unique().size}; CUIs: {medgen["umls_cui"].unique().size}""")
medgen.sample(8)

Unique MedlineIDs: 1273
Unique MedlineIDs (with CUIs): 1216; CUIs: 2142


Unnamed: 0,medline_id,medline_name,umls_cui
833,multiple-myeloma,Multiple myeloma,UMLS_CUI:C0026764
692,l1-syndrome,L1 syndrome,UMLS_CUI:C1839909
1202,usher-syndrome,Usher syndrome,UMLS_CUI:C1865885
335,crigler-najjar-syndrome,Crigler-Najjar syndrome,UMLS_CUI:C2931132
1149,swyer-syndrome,Swyer syndrome,UMLS_CUI:C2936694
1108,spastic-paraplegia-type-4,Spastic paraplegia type 4,UMLS_CUI:C1866855
1048,rothmund-thomson-syndrome,Rothmund-Thomson syndrome,UMLS_CUI:C0032339
661,jervell-and-lange-nielsen-syndrome,Jervell and Lange-Nielsen syndrome,UMLS_CUI:C2676723


### Join Medline with DOID2CUI mappings.

In [None]:
# Left join keeps every Medline (condition, CUI) row; CUIs absent from the DO mapping
# get NaN in the `doid` / `do_label` columns.
medgen2 = medgen.merge(do2cui, on="umls_cui", how="left")
medgen2.sample(n=8)

Unnamed: 0,medline_id,medline_name,umls_cui,doid,do_label
2164,x-linked-dilated-cardiomyopathy,X-linked dilated cardiomyopathy,UMLS_CUI:C3668940,,
1809,rabson-mendenhall-syndrome,Rabson-Mendenhall syndrome,UMLS_CUI:C0271695,DOID:0050470,Donohue syndrome
142,amelogenesis-imperfecta,Amelogenesis imperfecta,UMLS_CUI:C0399376,DOID:0111721,amelogenesis imperfecta type 3
396,chylomicron-retention-disease,Chylomicron retention disease,UMLS_CUI:C0795956,DOID:0060357,chylomicron retention disease
249,autosomal-recessive-primary-microcephaly,Autosomal recessive primary microcephaly,UMLS_CUI:C1842109,,
713,familial-atrial-fibrillation,Familial atrial fibrillation,UMLS_CUI:C4014269,,
464,cone-rod-dystrophy,Cone-rod dystrophy,UMLS_CUI:C1863634,,
1680,pitt-hopkins-syndrome,Pitt-Hopkins syndrome,UMLS_CUI:C1970431,DOID:0060488,Pitt-Hopkins syndrome


### Read TICTAC disease-gene associations.

In [None]:
# Load the TICTAC disease-gene associations, keeping only the columns used below.
tictac = pd.read_csv(
    "https://github.com/unmtransinfo/TICTAC/raw/master/python/tictac_genes_disease_associations.csv",
    usecols=["disease_name", "doid", "gene_symbol", "tcrdTargetName", "mean_rank_score"],
)
# Normalise a leading lowercase "doid:" prefix to the "DOID:" form used by the DO mapping table.
tictac = tictac.assign(doid=tictac["doid"].str.replace(r"^doid:", "DOID:", regex=True))
# Select the columns by name to fix their display order.
tictac = tictac.loc[:, ["disease_name", "doid", "tcrdTargetName", "gene_symbol", "mean_rank_score"]]
print(f"""Unique TICTAC DOIDs: {tictac["doid"].nunique(dropna=False)}; gene symbols: {tictac["gene_symbol"].nunique(dropna=False)}""")
tictac.sample(n=8)

Unique TICTAC DOIDs: 585; gene symbols: 790


Unnamed: 0,disease_name,doid,tcrdTargetName,gene_symbol,mean_rank_score
33097,neuroendocrine tumor,DOID:169,Tyrosine-protein kinase JAK2,JAK2,6.984673
53710,anovulation,DOID:3781,Delta-type opioid receptor,OPRD1,39.532235
50775,coronary artery disease,DOID:3393,Regulator of G-protein signaling 4,RGS4,64.954649
11307,congenital vertical talus,DOID:0111568,Amine oxidase [flavin-containing] A,MAOA,46.236213
29461,Parkinson's disease,DOID:14330,Sodium-dependent noradrenaline transporter,SLC6A2,25.448978
30865,head and neck carcinoma,DOID:1542,Cytochrome P450 3A4,CYP3A4,68.846329
53264,skin disease,DOID:37,Alpha-synuclein,SNCA,6.984698
24357,blastomycosis,DOID:12663,Prelamin-A/C,LMNA,44.82964


Join with DOID2CUI file.

In [None]:
# Attach DO labels and UMLS CUIs to every TICTAC association (left join on DOID), then
# drop TICTAC's own disease name column.
tictac2 = tictac.merge(do2cui, on="doid", how="left").drop(columns="disease_name")
tictac2.sample(n=8)

Unnamed: 0,doid,tcrdTargetName,gene_symbol,mean_rank_score,do_label,umls_cui
68992,DOID:3369,Histone-lysine N-methyltransferase 2A,KMT2A,44.82964,Ewing sarcoma,UMLS_CUI:C0796547
5309,DOID:0050908,Mitogen-activated protein kinase kinase kinase 11,MAP3K11,61.35526,myelodysplastic syndrome,UMLS_CUI:C2713368
75256,DOID:3571,Epidermal growth factor receptor,EGFR,34.4637,liver cancer,UMLS_CUI:C0023903
39322,DOID:1682,Sodium-dependent serotonin transporter,SLC6A4,33.33755,congenital heart disease,UMLS_CUI:C0018798
95634,DOID:6713,UDP-glucuronosyltransferase 1-3,UGT1A3,78.53368,cerebrovascular disease,UMLS_CUI:C0007820
60020,DOID:2913,C-C chemokine receptor type 4,CCR4,71.08574,acute pancreatitis,UMLS_CUI:C0001339
69688,DOID:3393,Canalicular multispecific organic anion transp...,ABCC2,61.57471,coronary artery disease,UMLS_CUI:C0010054
79767,DOID:4090,Beta-3 adrenergic receptor,ADRB3,6.440786e-07,agnosia,UMLS_CUI:C0001816


### Read Medline associations file.

In [None]:
# Load the Medline condition-gene associations. Columns are renamed by name rather than by
# position: `usecols` ignores the order of the list it is given, so positional relabelling
# could swap the CUI and gene-symbol labels if the file's column order differed.
medassn = pd.read_csv(
    "https://raw.githubusercontent.com/unmtransinfo/TICTAC/master/output/medline_genetics_condition-gene_associations.tsv",
    sep="\t",
    usecols=["id", "GTR", "gene-symbol"],
).rename(columns={"id": "medline_id", "GTR": "umls_cui", "gene-symbol": "gene_symbol"})
# One row per (condition, gene, CUI): split the comma-delimited CUI list and explode it.
# The exploded Series keeps the original row index, so the index join repeats each
# association once per CUI and leaves `umls_cui` as the last column.
cui_per_row = medassn["umls_cui"].str.split(",").explode()
medassn = medassn.drop(columns="umls_cui").join(cui_per_row)
# Drop associations without a CUI, then add the "UMLS_CUI:" prefix. `assign` builds a new
# frame instead of writing a column into the filtered selection, which avoids pandas'
# SettingWithCopyWarning.
medassn = medassn.loc[medassn["umls_cui"].notna()].assign(
    umls_cui=lambda df: "UMLS_CUI:" + df["umls_cui"]
)
medassn.sample(8)

Unnamed: 0,medline_id,gene_symbol,umls_cui
3180,osteopetrosis,CA2,UMLS_CUI:C1968603
3427,progressive-external-ophthalmoplegia,TWNK,UMLS_CUI:C0162674
1427,ehlers-danlos-syndrome,TNXB,UMLS_CUI:C3508773
4253,zellweger-spectrum-disorder,PEX10,UMLS_CUI:C4721541
2319,leber-congenital-amaurosis,NMNAT1,UMLS_CUI:C1840284
2924,nephronophthisis,TMEM67,UMLS_CUI:C3539071
4211,x-linked-lymphoproliferative-disease,XIAP,UMLS_CUI:C0549463
1081,cone-rod-dystrophy,POC1B,UMLS_CUI:C1845407


### TICTAC CUIs in Medline?

These are the conditions that we examine for agreement, as some measure of validation.

In [None]:
# NOTE(review): the first count treats a missing CUI (NaN) as one distinct value, if any
# TICTAC row has no CUI after the DO join -- confirm whether that is intended.
print(f"""Unique TICTAC CUIs: {tictac2["umls_cui"].nunique(dropna=False)}""")
# CUIs present on both sides define the conditions that can be compared.
tictac_cuis = set(tictac2["umls_cui"])
medassn_cuis = set(medassn["umls_cui"])
common_cuis = tictac_cuis.intersection(medassn_cuis)
print(f"""Unique TICTAC CUIs in Medline associations: {len(common_cuis)}""")

Unique TICTAC CUIs: 687
Unique TICTAC CUIs in Medline associations: 41


In [None]:
# Distinct (DOID, label, CUI) triples for the conditions shared by TICTAC and Medline.
tictac2.loc[
    tictac2["umls_cui"].isin(common_cuis),
    ["doid", "do_label", "umls_cui"],
].drop_duplicates()

Unnamed: 0,doid,do_label,umls_cui
1377,DOID:0050156,idiopathic pulmonary fibrosis,UMLS_CUI:C1800706
7736,DOID:0060180,colitis,UMLS_CUI:C0009319
16054,DOID:10584,retinitis pigmentosa,UMLS_CUI:C0035334
16829,DOID:10608,celiac disease,UMLS_CUI:C0007570
16947,DOID:10652,Alzheimer's disease,UMLS_CUI:C0002395
18666,DOID:10825,essential hypertension,UMLS_CUI:C0085580
18852,DOID:10871,age related macular degeneration,UMLS_CUI:C0242383
19638,DOID:10923,sickle cell anemia,UMLS_CUI:C0002895
19710,DOID:10933,obsessive-compulsive disorder,UMLS_CUI:C0028768
23521,DOID:11612,polycystic ovary syndrome,UMLS_CUI:C0032460


### TICTAC associations for conditions in Medline Genetics

In [None]:
# Distinct (CUI, gene symbol) pairs from TICTAC, restricted to the shared conditions.
tictac_validation = (
    tictac2.loc[tictac2["umls_cui"].isin(common_cuis), ["umls_cui", "gene_symbol"]]
    .drop_duplicates()
)
print(f"""TICTAC validation set nrows: {len(tictac_validation)}""")
# Each row is one distinct disease-gene pair, so the association count equals the row count.
print(f"""TICTAC validation set CUIs: {tictac_validation["umls_cui"].nunique(dropna=False)}; gene symbols: {tictac_validation["gene_symbol"].nunique(dropna=False)}; disease-gene associations: {len(tictac_validation)}""")

TICTAC validation set nrows: 7341
TICTAC validation set CUIs: 41; gene symbols: 756; disease-gene associations: 7341


In [None]:
# Show a random sample of eight TICTAC validation pairs (unseeded, so it varies per run).
tictac_validation.sample(n=8)

Unnamed: 0,umls_cui,gene_symbol
38793,UMLS_CUI:C0006142,UGT2B7
96288,UMLS_CUI:C0003873,GMNN
111403,UMLS_CUI:C0011854,CYP1A2
110672,UMLS_CUI:C0026764,PRKD1
110594,UMLS_CUI:C0026764,NLK
106338,UMLS_CUI:C0023467,MAP4K2
35374,UMLS_CUI:C0013080,TACR1
7791,UMLS_CUI:C0009319,CYP3A4


In [None]:
# Distinct (CUI, gene symbol) pairs from Medline, restricted to the shared conditions.
medline_validation = (
    medassn.loc[medassn["umls_cui"].isin(common_cuis), ["umls_cui", "gene_symbol"]]
    .drop_duplicates()
)
print(f"""Medline validation set nrows: {len(medline_validation)}""")
# Each row is one distinct disease-gene pair, so the association count equals the row count.
print(f"""Medline validation set CUIs: {medline_validation["umls_cui"].nunique(dropna=False)}; gene symbols: {medline_validation["gene_symbol"].nunique(dropna=False)}; disease-gene associations: {len(medline_validation)}""")

Medline validation set nrows: 522
Medline validation set CUIs: 41; gene symbols: 420; disease-gene associations: 522


### How many and what fraction of Medline associations were found by TICTAC?

In [None]:
# Stack the TICTAC pairs on top of the Medline pairs. Both inputs were built with
# drop_duplicates(), and duplicated() flags only later repeats, so every flagged row is a
# Medline (CUI, gene) pair that TICTAC also reports.
combo_validation = pd.concat([tictac_validation, medline_validation])
dups = combo_validation.duplicated()
dups_count = int(dups.sum())
# The percentage is relative to the number of Medline associations.
print(f"""Duplicates: {dups_count}; percentage: {100 * dups_count / len(medline_validation):.2f}%""")

Duplicates: 24; percentage: 4.60%


In [None]:
# Rows flagged by the duplicate mask, i.e. the pairs counted as duplicates above.
combo_validation_dups = combo_validation.loc[dups]
combo_validation_dups

Unnamed: 0,umls_cui,gene_symbol
599,UMLS_CUI:C0699885,ERBB2
901,UMLS_CUI:C0023473,ABL1
902,UMLS_CUI:C0023473,BCR
1184,UMLS_CUI:C0023467,KIT
1189,UMLS_CUI:C0023467,FLT3
1309,UMLS_CUI:C0023467,KMT2A
1758,UMLS_CUI:C0017205,GBA
2004,UMLS_CUI:C0085580,AGTR1
2006,UMLS_CUI:C0085580,EDNRA
2632,UMLS_CUI:C0025202,BRAF
