# Harmonize rare disease information for network integration

Prepare the final edges of the rare disease network to be integrated into Rephetio.

In [1]:
import pandas as pd
import numpy as np

## Read rare disease-gene links

In [2]:
# Load the gene-disease link table (tab-separated).
dg_links = pd.read_csv(
    "dg_links/orpha_gene_dise_links.tsv",
    delimiter="\t",
)

In [3]:
dg_links.head(2)

Unnamed: 0,dise_orpha_id,dise_name,link_status,link_type,gene_name,hgnc_id,symbol,entrez_id
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Assessed,Disease-causing germline mutation(s) in,kinesin family member 7,30497,KIF7,374654
1,36,Acrocallosal syndrome,Assessed,Disease-causing germline mutation(s) in,kinesin family member 7,30497,KIF7,374654


## Filter gene-disease links to those with existing genes

In [4]:
# Build an (entrez_id, gene_name) table from the network node file:
# drop the unused identifier column, give the Neo4j-style headers plain
# names, keep only rows labelled 'Gene', and cast the ids to 64-bit integers.
genes = (
    pd.read_csv("data/hetnet_nodes.csv")
    .drop(columns="identifier:string")
    .rename(
        columns={
            ":ID": "entrez_id",
            "name:string": "gene_name",
            ":LABEL": "node_type",
        }
    )
    .loc[lambda nodes: nodes["node_type"] == "Gene"]
    .drop(columns="node_type")
    .astype({"entrez_id": "int64"})
)

In [5]:
genes.shape

(20945, 2)

In [6]:
genes.head()

Unnamed: 0,entrez_id,gene_name
6,400830,DEFB132
12,84662,GLIS2
14,4212,MEIS2
15,283870,BRICD5
17,56882,CDC42SE1


In [7]:
# Inner join on entrez_id: a gene-disease link survives only if its gene is
# already a node in the network. The network's gene_name is dropped first so
# it does not collide with the gene_name column already present in dg_links.
fin_links = pd.merge(
    dg_links,
    genes.drop(columns="gene_name"),
    on="entrez_id",
    how="inner",
)

In [8]:
fin_links.shape

(6766, 8)

In [9]:
fin_links.head()

Unnamed: 0,dise_orpha_id,dise_name,link_status,link_type,gene_name,hgnc_id,symbol,entrez_id
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Assessed,Disease-causing germline mutation(s) in,kinesin family member 7,30497,KIF7,374654
1,36,Acrocallosal syndrome,Assessed,Disease-causing germline mutation(s) in,kinesin family member 7,30497,KIF7,374654
2,2189,Hydrolethalus,Assessed,Disease-causing germline mutation(s) in,kinesin family member 7,30497,KIF7,374654
3,2754,Joubert syndrome with orofaciodigital defect,Assessed,Disease-causing germline mutation(s) in,kinesin family member 7,30497,KIF7,374654
4,93,Aspartylglucosaminuria,Assessed,Disease-causing germline mutation(s) in,aspartylglucosaminidase,318,AGA,175


In [10]:
dg_links["entrez_id"].nunique()

3669

In [11]:
fin_links["entrez_id"].nunique()

3599

In [12]:
# Count distinct (gene, disease) pairs; several rows can share the same pair.
fin_links.loc[:, ["entrez_id", "dise_orpha_id"]].drop_duplicates().shape

(6739, 2)

There are a total of 6739 unique gene-disease relations that we will be adding to the network.

## Read rare disease drug indications

Contraindications are dropped; every other relationship type (e.g. indication, off-label use) is kept.

In [13]:
# Load the drug-disease relationships and discard rows whose relationship
# is a contraindication; all other relationship types are retained.
inds = (
    pd.read_csv("cd_links/rare_disease_indications.tsv", delimiter="\t")
    .loc[lambda rels: rels["relationship_name"] != "contraindication"]
)

In [14]:
inds.head(2)

Unnamed: 0,struct_id,relationship_name,orphanet_id,dise_name,dise_type,drug_name
0,1253,indication,3389,Tuberculosis,not_genetic,ftivazide
1,5205,indication,58017,Hairy cell leukemia,not_genetic,interferon alfa-2b


In [15]:
inds["struct_id"].nunique()

976

---

## Filter drugs down to those already in the network

In [16]:
# Build a (drugbank_id, drug_name) table from the network node file:
# same reshaping as for genes, but keeping the rows labelled 'Compound'.
drugs = (
    pd.read_csv("data/hetnet_nodes.csv")
    .drop(columns="identifier:string")
    .rename(
        columns={
            ":ID": "drugbank_id",
            "name:string": "drug_name",
            ":LABEL": "node_type",
        }
    )
    .loc[lambda nodes: nodes["node_type"] == "Compound"]
    .drop(columns="node_type")
)

In [17]:
drugs.head()

Unnamed: 0,drugbank_id,drug_name
34,DB01169,Arsenic trioxide
93,DB01106,Levocabastine
130,DB06288,Amisulpride
132,DB00513,Aminocaproic Acid
156,DB00118,S-Adenosylmethionine


In [18]:
drugs["drugbank_id"].nunique()

1552

### Convert indication drug ids to drugbank ids

In [19]:
# Load the drug identifier cross-reference table, leaving out the
# "id" and "parent_match" columns, which are not used below.
chem_ids = pd.read_csv("data/drug_ids.csv").drop(columns=["id", "parent_match"])

In [20]:
chem_ids.head()

Unnamed: 0,identifier,id_type,struct_id
0,DB00001,DRUGBANK_ID,2995
1,DB00002,DRUGBANK_ID,4954
2,DB00003,DRUGBANK_ID,5135
3,DB00004,DRUGBANK_ID,5087
4,DB00005,DRUGBANK_ID,4978


In [21]:
chem_ids["id_type"].value_counts()

ChEMBL_ID                      5895
SNOMEDCT_US                    4902
UNII                           4347
PUBCHEM_CID                    4271
NDFRT                          4216
CHEBI                          3819
MMSL                           3684
INN_ID                         3547
KEGG_DRUG                      3542
NDDF                           3335
RXNORM                         2938
UMLSCUI                        2838
MESH_SUPPLEMENTAL_RECORD_UI    2533
DRUGBANK_ID                    2465
VANDF                          2136
NUI                            2044
VUID                           1790
SECONDARY_CAS_RN               1653
MESH_DESCRIPTOR_UI             1587
IUPHAR_LIGAND_ID               1370
PDB_CHEM_ID                     649
Name: id_type, dtype: int64

DrugBank ids are relatively sparse in DrugCentral: only 2465 of the identifier rows are DrugBank ids, compared with 5895 ChEMBL ids.

In [22]:
# Reduce the cross-reference table to a drugbank_id -> struct_id mapping:
# keep only the DrugBank rows, then drop the now-constant id_type column.
chem_map = (
    chem_ids
    .loc[lambda ids: ids["id_type"] == "DRUGBANK_ID"]
    .drop(columns="id_type")
    .rename(columns={"identifier": "drugbank_id"})
)

In [23]:
# Attach struct_ids to the network compounds. The inner join drops compounds
# that have no DrugBank mapping, and a compound mapped to several struct_ids
# yields one row per struct_id.
#
# This cell overwrites `drugs` with a result derived from itself, so a plain
# merge would not be idempotent: on a second run the existing struct_id column
# would clash with chem_map's (producing struct_id_x / struct_id_y) and the
# later merge on "struct_id" would fail. Removing any struct_id left by an
# earlier run, and collapsing the rows it had fanned out, makes the cell safe
# to re-run. On a first run both steps are no-ops.
drugs = (drugs
    .drop(columns="struct_id", errors="ignore")
    .drop_duplicates()
    .merge(chem_map, how="inner", on="drugbank_id")
)

In [24]:
drugs.shape

(1617, 3)

In [25]:
drugs.head()

Unnamed: 0,drugbank_id,drug_name,struct_id
0,DB01169,Arsenic trioxide,244
1,DB01106,Levocabastine,1564
2,DB06288,Amisulpride,179
3,DB00513,Aminocaproic Acid,163
4,DB00118,S-Adenosylmethionine,2414


In [26]:
drugs["drugbank_id"].nunique()

1521

In [27]:
drugs["struct_id"].nunique()

1617

The DrugBank id to struct_id mapping is one-to-many: the 1521 matched DrugBank ids map to 1617 distinct struct_ids.

## Filter indications to those with existing drugs

In [28]:
# Translate indications from struct_id to the network's DrugBank compounds.
# The indication table's own drug_name is dropped so the network's drug_name
# is the one kept. Once struct_id is removed, several struct_ids that map to
# the same DrugBank id leave identical rows, which drop_duplicates collapses.
fin_inds = (
    pd.merge(
        inds.drop(columns="drug_name"),
        drugs,
        on="struct_id",
        how="inner",
    )
    .drop(columns="struct_id")
    .drop_duplicates()
)

### Keep only indications whose disease has a gene-disease link

In [29]:
# Diseases that still have at least one gene link after the gene filter.
common_dises = set(fin_links["dise_orpha_id"])

# Keep only the indications whose disease is in that set.
fin_inds = fin_inds[fin_inds["orphanet_id"].isin(common_dises)]

In [30]:
fin_inds.shape

(2044, 6)

In [31]:
fin_inds.head()

Unnamed: 0,relationship_name,orphanet_id,dise_name,dise_type,drugbank_id,drug_name
0,indication,58017,Hairy cell leukemia,not_genetic,DB00552,Pentostatin
1,off-label use,67038,B-cell chronic lymphocytic leukemia,not_genetic,DB00552,Pentostatin
2,indication,58017,Hairy cell leukemia,not_genetic,DB00242,Cladribine
3,off-label use,33226,Waldenström macroglobulinemia,not_genetic,DB00242,Cladribine
6,indication,545,Follicular lymphoma,not_genetic,DB00620,Triamcinolone


In [32]:
fin_inds["orphanet_id"].nunique()

599

In [33]:
fin_inds["drugbank_id"].nunique()

319

In [34]:
# Count distinct (drug, disease) pairs among the retained indications.
fin_inds.loc[:, ["drugbank_id", "orphanet_id"]].drop_duplicates().shape

(2015, 2)

---

## Gene-disease link stats

In [35]:
fin_links["dise_orpha_id"].nunique()

3555

In [36]:
fin_links["entrez_id"].nunique()

3599

In [37]:
fin_links[["entrez_id", "dise_orpha_id"]].drop_duplicates().shape

(6739, 2)

In [38]:
fin_links.head()

Unnamed: 0,dise_orpha_id,dise_name,link_status,link_type,gene_name,hgnc_id,symbol,entrez_id
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Assessed,Disease-causing germline mutation(s) in,kinesin family member 7,30497,KIF7,374654
1,36,Acrocallosal syndrome,Assessed,Disease-causing germline mutation(s) in,kinesin family member 7,30497,KIF7,374654
2,2189,Hydrolethalus,Assessed,Disease-causing germline mutation(s) in,kinesin family member 7,30497,KIF7,374654
3,2754,Joubert syndrome with orofaciodigital defect,Assessed,Disease-causing germline mutation(s) in,kinesin family member 7,30497,KIF7,374654
4,93,Aspartylglucosaminuria,Assessed,Disease-causing germline mutation(s) in,aspartylglucosaminidase,318,AGA,175


In [39]:
# Number of diseases present in both the gene links and the drug indications.
len(set(fin_links["dise_orpha_id"]).intersection(set(fin_inds["orphanet_id"])))

599

---

## Read disease-phenotype links

In [40]:
# Load the disease-phenotype link table (tab-separated).
phenos = pd.read_csv(
    "dise_pheno/dise_pheno_links.tsv",
    delimiter="\t",
)

In [41]:
phenos.head()

Unnamed: 0,dise_id,dise_name,freq,hpo_id,hpo_name,mesh_id
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Very frequent (99-80%),HP:0000256,Macrocephaly,
1,58,Alexander disease,Very frequent (99-80%),HP:0000256,Macrocephaly,
2,61,Alpha-mannosidosis,Occasional (29-5%),HP:0000256,Macrocephaly,
3,585,Multiple sulfatase deficiency,Frequent (79-30%),HP:0000256,Macrocephaly,
4,141,Canavan disease,Frequent (79-30%),HP:0000256,Macrocephaly,


### Read existing phenotypes in network

In [42]:
# Load the table of symptoms, keyed by mesh_id (tab-separated).
symptoms = pd.read_csv(
    "data/symptoms.tsv",
    delimiter="\t",
)

In [43]:
symptoms.head()

Unnamed: 0,mesh_id,mesh_name,in_hsdn
0,D000006,"Abdomen, Acute",1
1,D000270,Adie Syndrome,0
2,D000326,Adrenoleukodystrophy,0
3,D000334,Aerophagy,1
4,D000370,Ageusia,1


## Combine existing phenotypes with new rare disease ones

In [44]:
# Final disease-phenotype links are the union of two selections from `phenos`:
#   1. links whose disease has a gene-disease link (dise_id in common_dises);
#   2. links whose mesh_id matches a row in `symptoms`.
# A link can satisfy both, so duplicates are removed before sorting.
#
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; the rows and their order are the same.
fin_phenos = (pd
    .concat([
        phenos.query("dise_id in @common_dises"),
        phenos.merge(symptoms[["mesh_id"]], how="inner", on="mesh_id"),
    ])
    .drop_duplicates()
    .sort_values(["dise_id", "hpo_id"])
    .reset_index(drop=True)
)

In [45]:
fin_phenos.shape

(35088, 6)

In [46]:
fin_phenos.head()

Unnamed: 0,dise_id,dise_name,freq,hpo_id,hpo_name,mesh_id
0,6,3-methylcrotonyl-CoA carboxylase deficiency,Very frequent (99-80%),HP:0001252,Muscular hypotonia,D009123
1,6,3-methylcrotonyl-CoA carboxylase deficiency,Occasional (29-5%),HP:0001257,Spasticity,D009128
2,6,3-methylcrotonyl-CoA carboxylase deficiency,Frequent (79-30%),HP:0001531,Failure to thrive in infancy,
3,6,3-methylcrotonyl-CoA carboxylase deficiency,Very frequent (99-80%),HP:0001943,Hypoglycemia,D007003
4,6,3-methylcrotonyl-CoA carboxylase deficiency,Frequent (79-30%),HP:0001987,Hyperammonemia,D022124


## Output to file

In [47]:
# Write the three harmonized edge tables as tab-separated files, without the
# DataFrame index. The write order matches the listing below.
output_tables = [
    ("results/rare_dise_indications.tsv", fin_inds),
    ("results/rare_dise_gene_links.tsv", fin_links),
    ("results/rare_dise_phenotype_links.tsv", fin_phenos),
]
for out_path, table in output_tables:
    table.to_csv(out_path, sep="\t", index=False)