<a href="https://colab.research.google.com/github/unmtransinfo/TICTAC/blob/master/python/TICTAC_MedlineGenomics_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TICTAC-MedlineGenomics Validation
Compare TICTAC disease-gene associations with Medline Genomics (formerly Genetics Home Reference) associations for the same conditions.
Map diseases via DOID and common UMLS CUIs.

In [1]:
import sys, os, re
import pandas as pd

### Read DO2UMLS mapping file.
DO2UMLS mapping file from Disease Ontology.

In [2]:
do2cui = pd.read_csv("https://raw.githubusercontent.com/unmtransinfo/TICTAC/master/input/do_umls_mapping.csv")
print(f"""Unique DOIDs: {do2cui["id"].unique().size}""")
do2cui.columns = ["doid", "do_label", "umls_cui"]

Unique DOIDs: 6300


Split UMLS CUIs delimited by vertical bars.

In [3]:
s = do2cui['umls_cui'].str.split('|').apply(pd.Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'umls_cui'
del do2cui['umls_cui']
do2cui = do2cui.join(s)
print(f"""Unique DOIDs: {do2cui["doid"].unique().size}; CUIs: {do2cui["umls_cui"].unique().size}""")
do2cui.sample(8)

  s = do2cui['umls_cui'].str.split('|').apply(pd.Series, 1).stack()


Unique DOIDs: 6300; CUIs: 6856


Unnamed: 0,doid,do_label,umls_cui
3180,DOID:2986,IgA glomerulonephritis,UMLS_CUI:C0017661
3790,DOID:4114,uterine body mixed cancer,UMLS_CUI:C1334628
3157,DOID:2938,Epstein-Barr virus infectious disease,UMLS_CUI:C0149678
5771,DOID:8533,hypopharynx cancer,UMLS_CUI:C0496770
2242,DOID:14000,rubeosis iridis,UMLS_CUI:C0154916
4863,DOID:6098,thalamic neoplasm,UMLS_CUI:C0346902
2222,DOID:13948,bladder neck obstruction,UMLS_CUI:C0005694
1795,DOID:12668,abnormal retinal correspondence,UMLS_CUI:C0155010


### Read Medline file.
Split CUIs delimited by commas.
Remove rows without CUIs.

In [4]:
medgen = pd.read_csv("https://raw.githubusercontent.com/unmtransinfo/TICTAC/master/output/medline_genetics_conditions_xrefs.tsv", sep="\t", usecols = ["id", "name", "GTR"])
medgen.columns = ["medline_id", "medline_name", "umls_cui"]
print(f"""Unique MedlineIDs: {medgen["medline_id"].unique().size}""")
s = medgen['umls_cui'].str.split(',').apply(pd.Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'umls_cui'
del medgen['umls_cui']
medgen = medgen.join(s)
medgen["umls_cui"] = medgen["umls_cui"].str.replace(re.compile(r"^"), "UMLS_CUI:", regex=True)
medgen = medgen.loc[medgen.umls_cui.notna()]
print(f"""Unique MedlineIDs (with CUIs): {medgen["medline_id"].unique().size}; CUIs: {medgen["umls_cui"].unique().size}""")
medgen.sample(8)

Unique MedlineIDs: 1273


  s = medgen['umls_cui'].str.split(',').apply(pd.Series, 1).stack()


Unique MedlineIDs (with CUIs): 1216; CUIs: 2142


Unnamed: 0,medline_id,medline_name,umls_cui
1,15q11-q13-duplication-syndrome,15q11-q13 duplication syndrome,UMLS_CUI:C2675336
186,blepharocheilodontic-syndrome,Blepharocheilodontic syndrome,UMLS_CUI:C4551988
233,cerebral-folate-transport-deficiency,Cerebral folate transport deficiency,UMLS_CUI:C2751584
163,bare-lymphocyte-syndrome-type-ii,Bare lymphocyte syndrome type II,UMLS_CUI:C1859534
922,parkinsons-disease,Parkinson's disease,UMLS_CUI:C2751842
851,naegeli-franceschetti-jadassohn-syndrome-derma...,Naegeli-Franceschetti-Jadassohn syndrome/derma...,UMLS_CUI:C0343111
117,aromatic-l-amino-acid-decarboxylase-deficiency,Aromatic l-amino acid decarboxylase deficiency,UMLS_CUI:C1291564
168,beckwith-wiedemann-syndrome,Beckwith-Wiedemann syndrome,UMLS_CUI:C0004903


### Join Medline with DOID2CUI mappings.

In [5]:
medgen2 = pd.merge(medgen, do2cui, how="left", on="umls_cui")
medgen2.sample(8)

Unnamed: 0,medline_id,medline_name,umls_cui,doid,do_label
961,hereditary-paraganglioma-pheochromocytoma,Hereditary paraganglioma-pheochromocytoma,UMLS_CUI:C1854336,,
2071,usher-syndrome,Usher syndrome,UMLS_CUI:C3281066,,
270,bartter-syndrome,Bartter syndrome,UMLS_CUI:CN239220,,
1075,isolated-growth-hormone-deficiency,Isolated growth hormone deficiency,UMLS_CUI:C0342573,,
1233,lynch-syndrome,Lynch syndrome,UMLS_CUI:C1838333,,
1856,schindler-disease,Schindler disease,UMLS_CUI:C1836522,,
537,costello-syndrome,Costello syndrome,UMLS_CUI:C0587248,DOID:0050469,Costello syndrome
238,autosomal-recessive-congenital-stationary-nigh...,Autosomal recessive congenital stationary nigh...,UMLS_CUI:C1850362,,


In [7]:
##Read TICTAC disease-gene associations

import pandas as pd
import re

# Path to the new .parquet file
parquet_path = "disease_target_association_with_doid.parquet"

# Read the Parquet file
tictac = pd.read_parquet(parquet_path)

# Rename columns if necessary (ensure compatibility with the old script structure)
tictac.rename(columns={
    "meanRankScore": "mean_rank_score",
    "disease_term": "disease_name"  # Replace with the actual column name for disease in the Parquet file if different
}, inplace=True)

# Format the DOID column to match the required format
tictac["doid"] = tictac["doid"].str.replace(re.compile(r"^doid:"), "DOID:", regex=True)

# Select the necessary columns (ensure `tcrdTargetName` exists or adjust if not present)
tictac = tictac[["doid", "gene_symbol", "mean_rank_score"]]

# Summary information
unique_doids = tictac["doid"].nunique()
unique_genes = tictac["gene_symbol"].nunique()
print(f"Unique TICTAC DOIDs: {unique_doids}; gene symbols: {unique_genes}")

# Display a sample of the dataset
print(tictac.sample(8))

Unique TICTAC DOIDs: 2243; gene symbols: 2022
                doid gene_symbol  mean_rank_score
278808    DOID:13868       NPY2R        65.091255
84843      DOID:3817     ADORA2A        89.376640
753648    DOID:10021        NEK3        17.253820
48258     DOID:10762       EHMT2        93.956734
520747  DOID:0080746        ESR1        34.804306
393065     DOID:4202         SRC        50.777668
68964      DOID:6543         GBA        91.363762
753310  DOID:0080547        HMBS        17.253820


### Read TICTAC disease-gene associations.

Join with DOID2CUI file.

In [9]:
tictac2 = pd.merge(tictac, do2cui, how="left", on="doid")
#del tictac2["disease_name"]
tictac2.sample(8)

Unnamed: 0,doid,gene_symbol,mean_rank_score,do_label,umls_cui
403801,DOID:8567,BRDT,61.679913,Hodgkin's lymphoma,UMLS_CUI:C0019829
105083,DOID:11265,CYP2E1,90.40564,trachoma,UMLS_CUI:C0153107
691779,DOID:4239,ALOX12,32.529533,alveolar soft part sarcoma,UMLS_CUI:C0206657
747543,DOID:11476,ALOX15B,27.90135,osteoporosis,UMLS_CUI:C0029456
556569,DOID:0050827,UGT2B4,46.69648,,
925666,DOID:853,GTPBP4,17.25382,polymyalgia rheumatica,UMLS_CUI:C0032533
376725,DOID:768,RPS6KA5,64.471375,retinoblastoma,UMLS_CUI:C0035335
289013,DOID:8545,FAAH,72.797424,malignant hyperthermia,UMLS_CUI:C0024591


### Read Medline associations file.

In [10]:
medassn = pd.read_csv("https://raw.githubusercontent.com/unmtransinfo/TICTAC/master/output/medline_genetics_condition-gene_associations.tsv", sep="\t", usecols=["id", "GTR", "gene-symbol"])
medassn.columns = ["medline_id", "umls_cui", "gene_symbol"]
s = medassn['umls_cui'].str.split(',').apply(pd.Series, 1).stack()
s.index = s.index.droplevel(-1)
s.name = 'umls_cui'
del medassn['umls_cui']
medassn = medassn.join(s)
medassn = medassn.loc[medassn.umls_cui.notna()]
medassn["umls_cui"] = medassn["umls_cui"].str.replace(re.compile(r"^"), "UMLS_CUI:", regex=True)
medassn.sample(8)

  s = medassn['umls_cui'].str.split(',').apply(pd.Series, 1).stack()


Unnamed: 0,medline_id,gene_symbol,umls_cui
1053,cone-rod-dystrophy,CNGB3,UMLS_CUI:C4014856
2874,myopathy-with-deficiency-of-iron-sulfur-cluste...,ISCU,UMLS_CUI:C1850718
879,cholangiocarcinoma,CDK6,UMLS_CUI:C3810156
1945,heterotaxy-syndrome,GDF1,UMLS_CUI:C3178805
1681,fanconi-anemia,FANCF,UMLS_CUI:C3150653
4007,trichothiodystrophy,MARS1,UMLS_CUI:C4225420
1042,common-variable-immune-deficiency,ICOS,UMLS_CUI:C0009447
1081,cone-rod-dystrophy,POC1B,UMLS_CUI:C3809299


### TICTAC CUIs in Medline?

These are the conditions for which we examine for agreement and some measure of validation.

In [11]:
print(f"""Unique TICTAC CUIs: {tictac2["umls_cui"].unique().size}""")
tictac_cuis = set(tictac2["umls_cui"].to_list())
medassn_cuis = set(medassn["umls_cui"].to_list())
common_cuis = tictac_cuis & medassn_cuis
print(f"""Unique TICTAC CUIs in Medline associations: {len(common_cuis)}""")

Unique TICTAC CUIs: 2235
Unique TICTAC CUIs in Medline associations: 193


In [12]:
tictac2.loc[tictac2["umls_cui"].isin(common_cuis)][["doid", "do_label", "umls_cui"]].drop_duplicates()

Unnamed: 0,doid,do_label,umls_cui
6,DOID:1612,breast cancer,UMLS_CUI:C0006142
506,DOID:9119,acute myeloid leukemia,UMLS_CUI:C0023467
556,DOID:1909,melanoma,UMLS_CUI:C0025202
1168,DOID:11612,polycystic ovary syndrome,UMLS_CUI:C0032460
1237,DOID:2394,ovarian cancer,UMLS_CUI:C1140680
...,...,...,...
838785,DOID:649,prion disease,UMLS_CUI:C0162534
845403,DOID:9271,ornithine carbamoyltransferase deficiency,UMLS_CUI:C0268542
847878,DOID:14755,argininosuccinic aciduria,UMLS_CUI:C0268547
865308,DOID:1919,Lesch-Nyhan syndrome,UMLS_CUI:C0023374


### TICTAC associations for conditions in Medline Genetics

In [13]:
tictac_validation = tictac2.loc[tictac2["umls_cui"].isin(common_cuis)][["umls_cui", "gene_symbol"]].drop_duplicates()
print(f"""TICTAC validation set nrows: {tictac_validation.shape[0]}""")
print(f"""TICTAC validation set CUIs: {tictac_validation["umls_cui"].unique().size}; gene symbols: {tictac_validation["gene_symbol"].unique().size}; disease-gene associations: {tictac_validation[["umls_cui", "gene_symbol"]].shape[0]}""")

TICTAC validation set nrows: 63569
TICTAC validation set CUIs: 193; gene symbols: 1804; disease-gene associations: 63569


In [14]:
tictac_validation.sample(8)

Unnamed: 0,umls_cui,gene_symbol
717700,UMLS_CUI:C0033847,CYP2C9
866137,UMLS_CUI:C0028326,HCK
27504,UMLS_CUI:C0003873,DHFR
851202,UMLS_CUI:C0016952,ABCB11
623847,UMLS_CUI:C0684249,ABCC10
887617,UMLS_CUI:C0004943,SRPK3
855880,UMLS_CUI:C0027121,PDE8A
120757,UMLS_CUI:C0242383,CYP1A2


In [15]:
medline_validation = medassn.loc[medassn["umls_cui"].isin(common_cuis)][["umls_cui", "gene_symbol"]].drop_duplicates()
print(f"""Medline validation set nrows: {medline_validation.shape[0]}""")
print(f"""Medline validation set CUIs: {medline_validation["umls_cui"].unique().size}; gene symbols: {medline_validation["gene_symbol"].unique().size}; disease-gene associations: {medline_validation[["umls_cui", "gene_symbol"]].shape[0]}""")

Medline validation set nrows: 1247
Medline validation set CUIs: 193; gene symbols: 967; disease-gene associations: 1247


### How many and what fraction of Medline associations were found by TICTAC?

In [16]:
combo_validation = pd.concat([tictac_validation, medline_validation])
dups = combo_validation.duplicated()
dups_count = sum([1 if x else 0 for x in dups])
print(f"""Duplicates: {dups_count}; percentage: {100 * dups_count / medline_validation[["umls_cui", "gene_symbol"]].shape[0]:.2f}%""")

Duplicates: 136; percentage: 10.91%


In [17]:
combo_validation_dups = combo_validation[dups]
combo_validation_dups

Unnamed: 0,umls_cui,gene_symbol
117,UMLS_CUI:C3714976,PIK3CD
175,UMLS_CUI:C0242383,CFB
180,UMLS_CUI:C0242383,CX3CR1
181,UMLS_CUI:C0242383,FRK
287,UMLS_CUI:C0002395,APP
...,...,...
3758,UMLS_CUI:C0036341,DRD2
3835,UMLS_CUI:C0002895,HBB
3951,UMLS_CUI:C0221013,KIT
4031,UMLS_CUI:C0011854,CCR5
