# Harmonize rare disease information for network integration

In [1]:
import pandas as pd
import numpy as np

## Read rare disease-gene links

In [2]:
# Load the tab-separated rare disease-gene link table.
# NOTE(review): `dg_links` is only previewed after this cell; the later
# filtering and output cells reference `fin_links`, which is not defined
# anywhere in this notebook — confirm how `fin_links` is derived from this.
dg_links = pd.read_csv("dg_links/orpha_gene_dise_links.tsv", sep='\t')

In [3]:
# Preview the first two disease-gene link rows to check the column layout.
dg_links.head(2)

Unnamed: 0,dise_orpha_id,dise_name,link_status,link_type,gene_name,hgnc_id,symbol,entrez_id
0,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",Assessed,Disease-causing germline mutation(s) in,kinesin family member 7,30497,KIF7,374654
1,36,Acrocallosal syndrome,Assessed,Disease-causing germline mutation(s) in,kinesin family member 7,30497,KIF7,374654


## Read rare disease drug indications

Contraindications are dropped; every other relationship type (e.g. indication, off-label use) is kept.

In [4]:
# Load the rare disease drug relationship table and keep every row whose
# relationship is anything other than a contraindication.
inds = (
    pd.read_csv("cd_links/rare_disease_indications.tsv", sep='\t')
    .loc[lambda rels: rels["relationship_name"] != "contraindication"]
)

In [5]:
# Preview the first two remaining drug-disease relationship rows.
inds.head(2)

Unnamed: 0,struct_id,relationship_name,orphanet_id,dise_name,dise_type,drug_name
0,76,indication,181425,Major hypertriglyceridemia,rare_genetic,acipimox
1,76,indication,181422,Rare hyperlipidemia,rare_genetic,acipimox


In [6]:
# Number of distinct drugs (by struct_id) with at least one remaining relationship.
inds["struct_id"].nunique()

871

---

## Filter drugs down to those already in the network

In [7]:
# Build the table of compounds already present in the network: read the node
# file, give the columns readable names, keep only the 'Compound' nodes, and
# discard the columns that are no longer needed.
drugs = (
    pd.read_csv("data/hetnet_nodes.csv")
    .rename(
        columns={
            ":ID": "drugbank_id",
            "name:string": "drug_name",
            ":LABEL": "node_type",
        }
    )
    .loc[lambda nodes: nodes["node_type"] == "Compound"]
    .drop(columns=["identifier:string", "node_type"])
)

In [8]:
# Preview the compound table (original node-file row index is preserved).
drugs.head()

Unnamed: 0,drugbank_id,drug_name
34,DB01169,Arsenic trioxide
93,DB01106,Levocabastine
130,DB06288,Amisulpride
132,DB00513,Aminocaproic Acid
156,DB00118,S-Adenosylmethionine


In [9]:
# Number of distinct DrugBank ids among the network's compound nodes.
drugs["drugbank_id"].nunique()

1552

### Convert indication drug ids to drugbank ids

In [10]:
# Load the drug identifier cross-reference table; the "id" and "parent_match"
# columns are not used in this notebook, so they are removed up front.
chem_ids = (
    pd.read_csv("data/drug_ids.csv")
    .drop(columns=["id", "parent_match"])
)

In [11]:
# Preview the cross-reference rows: external identifier, its type, and struct_id.
chem_ids.head()

Unnamed: 0,identifier,id_type,struct_id
0,DB00001,DRUGBANK_ID,2995
1,DB00002,DRUGBANK_ID,4954
2,DB00003,DRUGBANK_ID,5135
3,DB00004,DRUGBANK_ID,5087
4,DB00005,DRUGBANK_ID,4978


In [12]:
# Count cross-reference rows per identifier scheme to see how DRUGBANK_ID
# coverage compares with the other schemes.
chem_ids["id_type"].value_counts()

ChEMBL_ID                      5895
SNOMEDCT_US                    4902
UNII                           4347
PUBCHEM_CID                    4271
NDFRT                          4216
CHEBI                          3819
MMSL                           3684
INN_ID                         3547
KEGG_DRUG                      3542
NDDF                           3335
RXNORM                         2938
UMLSCUI                        2838
MESH_SUPPLEMENTAL_RECORD_UI    2533
DRUGBANK_ID                    2465
VANDF                          2136
NUI                            2044
VUID                           1790
SECONDARY_CAS_RN               1653
MESH_DESCRIPTOR_UI             1587
IUPHAR_LIGAND_ID               1370
PDB_CHEM_ID                     649
Name: id_type, dtype: int64

In [13]:
# Re-display the relationship table as a reminder that drugs are keyed by
# struct_id here, not by DrugBank id.
inds.head()

Unnamed: 0,struct_id,relationship_name,orphanet_id,dise_name,dise_type,drug_name
0,76,indication,181425,Major hypertriglyceridemia,rare_genetic,acipimox
1,76,indication,181422,Rare hyperlipidemia,rare_genetic,acipimox
2,76,indication,412,Hyperlipoproteinemia type 3,rare_genetic,acipimox
3,76,indication,309015,Familial lipoprotein lipase deficiency,rare_genetic,acipimox
4,76,indication,309020,Familial apolipoprotein C-II deficiency,rare_genetic,acipimox


In [14]:
# Distinct struct_ids in the relationship table (baseline for the coverage check below).
inds["struct_id"].nunique()

871

In [15]:
# `a`: every struct_id that has at least one DrugBank identifier.
a = set(chem_ids.loc[chem_ids["id_type"] == "DRUGBANK_ID", "struct_id"])

In [16]:
# `b`: every struct_id that appears in the drug-disease relationship table.
b = set(inds["struct_id"])

In [17]:
# How many struct_ids have a DrugBank identifier.
len(a)

2465

In [18]:
# How many struct_ids have a drug-disease relationship.
len(b)

871

In [19]:
# Relationship drugs that can be translated to a DrugBank id.
len(a.intersection(b))

758

In [20]:
# `ja`: relationship struct_ids that have no DrugBank identifier at all.
ja = b.difference(a)

In [21]:
# Number of relationship drugs without any DrugBank identifier.
len(ja)

113

In [22]:
# `lol`: the relationship rows whose drug has no DrugBank identifier.
lol = inds[inds["struct_id"].isin(ja)]

In [23]:
# Number of relationship rows that cannot be mapped to DrugBank.
lol.shape

(647, 6)

In [24]:
# Eyeball a random handful of the unmappable rows. The seed is fixed so that
# a Restart & Run All shows the same ten rows every time.
lol.sample(10, random_state=0)

Unnamed: 0,struct_id,relationship_name,orphanet_id,dise_name,dise_type,drug_name
33082,5219,indication,98352,Autosomal dominant disease with diffuse palmop...,rare_genetic,trafermin
1055,3871,indication,70578,Adult acute respiratory distress syndrome,not_genetic,artemisinin
32630,2555,indication,251646,Anaplastic ependymoma,not_genetic,talaporfin
26972,3667,indication,35858,Gräsbeck-Imerslund disease,rare_genetic,amanozine
32199,4343,off-label use,37,Acrodermatitis enteropathica,rare_genetic,zinc gluconate
2534,4341,indication,99981,Apnea of prematurity,not_genetic,almasilate
32974,5219,indication,1366,Autosomal recessive palmoplantar keratoderma a...,rare_genetic,trafermin
19816,1680,indication,328,Congenital factor X deficiency,rare_genetic,menadiol sulfate
9636,1379,indication,279922,Infectious anterior uveitis,not_genetic,homatropine
22601,4520,off-label use,98292,Mastocytosis,not_genetic,potassium hydrogencarbonate


In [25]:
# Unmappable relationship rows broken down by disease type.
lol["dise_type"].value_counts()

rare_genetic    451
not_genetic     196
Name: dise_type, dtype: int64

In [48]:
# Distinct diseases (orphanet_id) per disease type that are touched by a drug
# without a DrugBank identifier.
(
    inds[inds["struct_id"].isin(ja)]
    .groupby("dise_type")["orphanet_id"]
    .nunique()
)

dise_type
not_genetic     133
rare_genetic    325
Name: orphanet_id, dtype: int64

In [46]:
# Total relationship rows, for comparison with the unmappable subset above.
inds.shape

(7195, 6)

In [50]:
# Which identifier schemes do cover the drugs that lack a DrugBank id?
chem_ids.loc[chem_ids["struct_id"].isin(ja), "id_type"].value_counts()

ChEMBL_ID                      126
UNII                           110
PUBCHEM_CID                    104
SNOMEDCT_US                     96
CHEBI                           87
KEGG_DRUG                       84
MESH_SUPPLEMENTAL_RECORD_UI     81
INN_ID                          77
NDFRT                           76
NDDF                            69
RXNORM                          58
UMLSCUI                         56
MMSL                            55
VANDF                           42
NUI                             39
SECONDARY_CAS_RN                34
VUID                            34
MESH_DESCRIPTOR_UI              27
IUPHAR_LIGAND_ID                16
PDB_CHEM_ID                      8
Name: id_type, dtype: int64

In [55]:
# For the drugs lacking a DrugBank id, count ChEMBL ids per struct_id and show
# the largest counts: a struct_id can carry several ChEMBL ids.
(
    chem_ids[chem_ids["struct_id"].isin(ja) & (chem_ids["id_type"] == "ChEMBL_ID")]
    .groupby("struct_id")
    .size()
    .sort_values(ascending=False)
    .head()
)

struct_id
4193    3
3017    3
2999    3
941     3
4341    2
dtype: int64

In [56]:
# Inspect every identifier recorded for one example struct_id with several ChEMBL ids.
chem_ids[chem_ids["struct_id"] == 4193]

Unnamed: 0,identifier,id_type,struct_id
19305,CHEMBL303933,ChEMBL_ID,4193
39379,014245,NDDF,4193
49035,CHEMBL539666,ChEMBL_ID,4193
50557,C034759,MESH_SUPPLEMENTAL_RECORD_UI,4193
52529,854D7K8LXB,UNII,4193
52530,915967-82-7,SECONDARY_CAS_RN,4193
55629,122262,PUBCHEM_CID,4193
56359,CHEMBL1652442,ChEMBL_ID,4193
62598,CHEBI:91231,CHEBI,4193


In [26]:
# Relationship rows for the same example struct_id, to see which drug it is.
inds[inds["struct_id"] == 4193]

Unnamed: 0,struct_id,relationship_name,orphanet_id,dise_name,dise_type,drug_name
30964,4193,indication,673,Malaria,not_genetic,piperaquine


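The chemicals in DrugCentral are not fully indexed by DrugBank ids: 113 of the 871 drugs (struct_ids) with indications have no DrugBank id, which accounts for 647 of the 7195 indication rows. These drugs cannot be matched to network compounds below.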

In [22]:
# DrugBank id -> struct_id lookup: keep only the DRUGBANK_ID cross-references,
# drop the now-constant id_type column, and name the identifier column after
# what it contains.
chem_map = (
    chem_ids
    .loc[lambda ids: ids["id_type"] == "DRUGBANK_ID"]
    .drop(columns="id_type")
    .rename(columns={"identifier": "drugbank_id"})
)

In [23]:
# Attach struct_id to every network compound that has a DrugBank mapping;
# compounds without one are dropped by the inner join.
# NOTE: this rebinds `drugs`, so run the cell once per kernel session. A second
# run would merge against a frame that already has `struct_id`.
drugs = pd.merge(drugs, chem_map, on="drugbank_id", how="inner")

In [24]:
# Row count after the merge (one row per drugbank_id/struct_id pairing).
drugs.shape

(1617, 3)

In [25]:
# Preview the compound table now that it carries struct_id.
drugs.head()

Unnamed: 0,drugbank_id,drug_name,struct_id
0,DB01169,Arsenic trioxide,244
1,DB01106,Levocabastine,1564
2,DB06288,Amisulpride,179
3,DB00513,Aminocaproic Acid,163
4,DB00118,S-Adenosylmethionine,2414


In [26]:
# Distinct DrugBank ids that survived the mapping.
drugs["drugbank_id"].nunique()

1521

In [27]:
# Distinct struct_ids after the mapping; compare with the DrugBank id count above.
drugs["struct_id"].nunique()

1617

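DrugBank id to struct_id is one-to-many: the 1521 DrugBank ids that could be mapped correspond to 1617 distinct struct_ids. (31 of the 1552 network compounds have no struct_id and were dropped by the merge.)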

## Filter indications to those with existing drugs

In [28]:
# Keep only relationships whose drug is a network compound. The relationship
# table's own drug_name is replaced by the network's drug_name, struct_id is
# used purely as the join key, and duplicates left once struct_id is removed
# (several struct_ids can share one DrugBank id) are collapsed.
fin_inds = (
    pd.merge(inds.drop(columns="drug_name"), drugs, on="struct_id", how="inner")
    .drop(columns="struct_id")
    .drop_duplicates()
)

### Filter indications to diseases that have gene-disease links

In [29]:
# Restrict the indications to diseases that also appear in the gene-disease links.
# NOTE(review): `fin_links` is not defined anywhere in this notebook (only
# `dg_links` is loaded), so this cell raises NameError on a fresh kernel.
# Presumably `fin_links` is a filtered version of `dg_links` from a deleted
# cell — TODO restore or re-derive it before this point.
common_dises = set(fin_links["dise_orpha_id"])
fin_inds = fin_inds.query("orphanet_id in @common_dises")

In [30]:
# Size of the final indication table.
fin_inds.shape

(1864, 6)

In [31]:
# Preview the final indication table (drugs are now keyed by drugbank_id).
fin_inds.head()

Unnamed: 0,relationship_name,orphanet_id,dise_name,dise_type,drugbank_id,drug_name
0,indication,209981,IRIDA syndrome,rare_genetic,DB01592,Iron
2,indication,83642,Microcytic anemia with liver iron overload,rare_genetic,DB01592,Iron
4,indication,209981,IRIDA syndrome,rare_genetic,DB00158,Folic Acid
6,indication,83642,Microcytic anemia with liver iron overload,rare_genetic,DB00158,Folic Acid
10,indication,79241,Biotinidase deficiency,rare_genetic,DB00158,Folic Acid


In [32]:
# Distinct diseases in the final indication table.
fin_inds["orphanet_id"].nunique()

565

In [33]:
# Distinct drugs in the final indication table.
fin_inds["drugbank_id"].nunique()

273

In [34]:
# Distinct (drug, disease) pairs. This can be smaller than the row count because
# the same pair may appear under more than one relationship_name.
fin_inds[["drugbank_id", "orphanet_id"]].drop_duplicates().shape

(1850, 2)

---

## Output to file

In [35]:
# Write the harmonized tables as tab-separated files without the pandas index.
# NOTE(review): assumes the "results/" directory already exists — confirm.
# NOTE(review): `fin_links` is not defined anywhere in this notebook, so the
# second write raises NameError on a fresh kernel — TODO define it upstream.
fin_inds.to_csv("results/rare_dise_indications.tsv", sep='\t', index=False)
fin_links.to_csv("results/rare_dise_gene_links.tsv", sep='\t', index=False)