# Format DrugCentral indications for rare diseases

In [1]:
import pandas as pd
import numpy as np

## Read rare disease data

In [3]:
# Load the tab-separated rare disease table.
# The preview below shows the same disease repeated once per external
# reference id (ref_id / ref_name), all sharing one orphanet_id.
rare_dises = pd.read_csv("data/rare_disease_info.tsv", sep='\t')

In [4]:
rare_dises.head(2)

Unnamed: 0,dise_name,orphanet_id,ref_id,ref_name,dise_type
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,ICD-10,rare_genetic
1,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,607131,OMIM,rare_genetic


## Read indications

In [5]:
# Load the simplified indications table, then discard contraindication rows.
# NOTE(review): `inds` is read again from "simple_inds.tsv" in the
# "Match to indications" section, which replaces this filtered frame.
inds = pd.read_csv("data/simple_indications.tsv", sep='\t')
inds = inds.query("relationship_name != 'contraindication'")

print(len(inds))

10254


In [6]:
inds.head(2)

Unnamed: 0,struct_id,relationship_name,disease_name,umls_cui,snomed_conceptid
0,1253,indication,Tuberculosis,C0041296,56717001
1,5203,indication,Malignant tumor of ovary,C1140680,363443007


---

## Rare disease identifier conversion

Rare diseases are best identified by ICD-10 ids. However, the diseases in the DrugCentral indications are in the UMLS and SNOMED namespaces.

We will use ICD-10 (converted to SNOMED), UMLS, and OMIM ids to map the rare diseases to the diseases in the indications file.

In [7]:
rare_dises["orphanet_id"].nunique()

7762

In [8]:
rare_dises.groupby("ref_name")["orphanet_id"].nunique().sort_values(ascending=False)

ref_name
ICD-10    7076
OMIM      4381
UMLS      2879
MeSH      1760
MedDRA    1166
Name: orphanet_id, dtype: int64

## Convert rare diseases to SNOMED

Rare OrphaNet diseases are best referenced by ICD-10. We will use UMLS if ICD-10 ids don't exist.

In [8]:
# Keep only the reference rows whose id type is OMIM, UMLS or ICD-10;
# MeSH and MedDRA rows (seen in the counts above) are dropped.
# Rebinds `rare_dises`, so every later cell sees the filtered frame.
rare_dises = rare_dises.query("ref_name in('OMIM', 'UMLS', 'ICD-10')")

In [9]:
rare_dises["orphanet_id"].nunique()

7755

Using a combination of the three id types allows us almost perfect coverage of the rare diseases.

### Orphanet to UMLS map

In [10]:
# Orphanet id -> UMLS CUI lookup, built from the UMLS reference rows.
# A single UMLS CUI can be attached to more than one Orphanet id.

umls_map = (
    rare_dises
    .query("ref_name == 'UMLS'")
    .loc[:, ["orphanet_id", "ref_id"]]
    .rename(columns={"ref_id": "umls_cui"})
)

In [11]:
umls_map.head(2)

Unnamed: 0,orphanet_id,umls_cui
6,58,C0270726
12,61,C0024748


## Orphanet to OMIM map

In [12]:
# Orphanet id -> OMIM id lookup, built from the OMIM reference rows.
# The ids are cast from the generic ref_id column to 64-bit integers.
omim_map = (
    rare_dises
    .query("ref_name == 'OMIM'")
    .loc[:, ["orphanet_id", "ref_id"]]
    .rename(columns={"ref_id": "omim_id"})
    .astype({"omim_id": np.int64})
)

In [13]:
omim_map.head(2)

Unnamed: 0,orphanet_id,omim_id
1,166024,607131
3,166032,609325


## Orphanet to ICD-10 to SNOMED map

In [14]:
# Disease name, Orphanet id and ICD-10 code for every ICD-10 reference row.
rare_icd = (
    rare_dises
    .query("ref_name == 'ICD-10'")
    .loc[:, ["dise_name", "orphanet_id", "ref_id"]]
    .rename(columns={"ref_id": "icd_id"})
)

In [15]:
rare_icd.head(2)

Unnamed: 0,dise_name,orphanet_id,icd_id
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3
2,"Multiple epiphyseal dysplasia, with miniepiphyses",166032,Q77.3


### Convert ICD-10 to SNOMED

In [16]:
# Load the SNOMED <-> ICD-10 cross-reference table (tab-separated).
# NOTE(review): this path has no "data/" prefix, unlike the rare disease
# table loaded earlier — confirm the file location.
mapping = pd.read_csv("snomed_icd_map.tsv", sep='\t')

In [17]:
mapping.head(2)

Unnamed: 0,snomed_id,snomed_name,icd_id,icd_name
0,109006,Anxiety disorder of childhood OR adolescence (...,F93.0,Separation anxiety disorder of childhood
1,109006,Anxiety disorder of childhood OR adolescence (...,F40.8,Other phobic anxiety disorders


In [18]:
# Attach every SNOMED id that shares an ICD-10 code with a rare disease.
# The left join keeps diseases whose ICD-10 code has no SNOMED match; those
# rows get a missing snomed_id, which is why the column is shown as float.
rare_sno = pd.merge(
    rare_icd,
    mapping[["snomed_id", "icd_id"]],
    on="icd_id",
    how="left",
)

In [19]:
rare_sno.head(2)

Unnamed: 0,dise_name,orphanet_id,icd_id,snomed_id
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,76556008.0
1,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,205465004.0


In [39]:
mapping.query("snomed_id == 237075006")

Unnamed: 0,snomed_id,snomed_name,icd_id,icd_name
74476,237075006,Incarceration of uterus (disorder),O34.539,Maternal care for retroversion of gravid uteru...
74477,237075006,Incarceration of uterus (disorder),N85.8,Other specified noninflammatory disorders of u...
74478,237075006,Incarceration of uterus (disorder),O34.511,Maternal care for incarceration of gravid uter...
74479,237075006,Incarceration of uterus (disorder),O34.519,Maternal care for incarceration of gravid uter...
74480,237075006,Incarceration of uterus (disorder),,
74481,237075006,Incarceration of uterus (disorder),Z37.9,"Outcome of delivery, unspecified"
74482,237075006,Incarceration of uterus (disorder),O90.9,"Complication of the puerperium, unspecified"


In [37]:
# Spot-check ten random SNOMED/ICD-10 pairs. A fixed random_state keeps the
# displayed rows identical across Restart & Run All.
mapping[["snomed_id", "icd_id"]].sample(10, random_state=0)

Unnamed: 0,snomed_id,icd_id
133008,430349003,K92.2
26518,73602001,T79.9XX?
148211,104951000119106,H43.10
130123,422352008,
147861,90121000119102,K08.434
96063,283641007,W27.3XX?
133001,430337004,Z82.0
74477,237075006,N85.8
14833,41174002,L04.2
14684,40873003,E72.19


In [38]:
mapping.query("snomed_id == 40873003")

Unnamed: 0,snomed_id,snomed_name,icd_id,icd_name
14684,40873003,Sulfite oxidase deficiency syndrome (disorder),E72.19,Other disorders of sulfur-bearing amino-acid m...


---

### String match diseases without UMLS or SNOMED to SNOMED directly

Map these diseases to SNOMED by crude string matching on the disease names.

In [20]:
# Orphanet ids with at least one ICD-10 row that found no SNOMED id.
missing_sno = set(rare_sno.loc[rare_sno["snomed_id"].isnull(), "orphanet_id"])

# Orphanet ids that have an ICD-10 reference / a UMLS reference.
icd = set(rare_dises.query("ref_name == 'ICD-10'")["orphanet_id"])
umls = set(rare_dises.query("ref_name == 'UMLS'")["orphanet_id"])

# Diseases with ICD-10 but no UMLS reference that also lack a SNOMED match.
icd_only_no_sno = icd.difference(umls).intersection(missing_sno)

In [21]:
len(icd_only_no_sno)

1499

In [22]:
# Ids and names of the diseases that still need a SNOMED id via name matching.
need_conv = (
    rare_sno
    .query("orphanet_id in @icd_only_no_sno")
    .loc[:, ["orphanet_id", "dise_name"]]
)

In [23]:
need_conv.head(2)

Unnamed: 0,orphanet_id,dise_name
538,166108,"Intellectual disability, Birk-Barel type"
1075,166295,Benign non-familial infantile seizures


The names were mapped in another notebook.

In [24]:
# Load the name-based Orphanet -> SNOMED matches and attach disease names.
# rare_dises repeats a disease once per reference id, so the join produces
# repeated rows; drop_duplicates collapses them.
mapped = pd.merge(
    pd.read_csv("orpha_snomed_map.tsv", sep='\t'),
    rare_dises[["orphanet_id", "dise_name"]],
    on="orphanet_id",
    how="inner",
).drop_duplicates()

In [25]:
mapped.head(2)

Unnamed: 0,orphanet_id,snomed_id,dise_name
0,104,58610003,Leber hereditary optic neuropathy
3,169090,717811007,Combined immunodeficiency due to CRAC channel ...


In [26]:
mapped["orphanet_id"].nunique()

276

## Final Orphanet to SNOMED mapping

In [27]:
rare_sno.shape

(152606, 4)

In [28]:
rare_sno.head()

Unnamed: 0,dise_name,orphanet_id,icd_id,snomed_id
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,76556008.0
1,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,205465004.0
2,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,254082007.0
3,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,254083002.0
4,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,278715001.0


In [29]:
rare_sno.isnull().sum()

dise_name         0
orphanet_id       0
icd_id            0
snomed_id      2640
dtype: int64

In [30]:
# Unique (orphanet_id, snomed_id) pairs obtained through ICD-10, with
# unmatched rows removed and snomed_id cast back to integer.
# NOTE(review): `kek` is reassigned to a different frame in the
# "Match to indications" section.
kek = (
    rare_sno
    .loc[:, ["orphanet_id", "snomed_id"]]
    .dropna()
    .astype({"snomed_id": np.int64})
    .drop_duplicates()
)

In [31]:
len(set(kek["orphanet_id"]) & set(mapped["orphanet_id"]))

14

In [32]:
set(mapped["orphanet_id"]) - set(kek["orphanet_id"])

{17,
 28,
 59,
 104,
 134,
 227,
 229,
 318,
 334,
 405,
 427,
 469,
 492,
 513,
 518,
 530,
 622,
 641,
 723,
 747,
 793,
 811,
 833,
 911,
 956,
 959,
 991,
 1302,
 1305,
 1308,
 1310,
 1467,
 1768,
 1797,
 1930,
 1972,
 2029,
 2032,
 2038,
 2056,
 2062,
 2090,
 2115,
 2119,
 2176,
 2306,
 2369,
 2498,
 2729,
 2741,
 2744,
 2903,
 3086,
 3097,
 3337,
 3434,
 3437,
 3439,
 3473,
 3474,
 26792,
 26793,
 28378,
 31824,
 31825,
 31828,
 33069,
 33208,
 35107,
 35689,
 35710,
 35807,
 39041,
 39812,
 45358,
 45452,
 46487,
 47159,
 49804,
 52047,
 52416,
 52427,
 52994,
 57782,
 60030,
 63261,
 64280,
 69665,
 70573,
 71212,
 71269,
 71493,
 75377,
 77295,
 79094,
 79238,
 79430,
 79435,
 79473,
 79500,
 83465,
 83468,
 83597,
 84085,
 85128,
 85201,
 85279,
 85284,
 85335,
 85451,
 85458,
 86788,
 86814,
 86820,
 86850,
 86854,
 86867,
 86884,
 86911,
 88629,
 90058,
 90068,
 90797,
 91359,
 91483,
 93320,
 93321,
 93322,
 93402,
 93558,
 93623,
 93929,
 95157,
 97353,
 97355,
 98627,
 9

In [33]:
rare_dises.query("orphanet_id == 530")

Unnamed: 0,dise_name,orphanet_id,ref_id,ref_name,dise_type
14992,Lipoid proteinosis,530,247100,OMIM,rare_genetic
14993,Lipoid proteinosis,530,E78.8,ICD-10,rare_genetic


In [35]:
mapped.query("orphanet_id == 530")

Unnamed: 0,orphanet_id,snomed_id,dise_name
488,530,238950006,Lipoid proteinosis
490,530,38692000,Lipoid proteinosis


In [34]:
rare_sno.query("orphanet_id == 530")

Unnamed: 0,dise_name,orphanet_id,icd_id,snomed_id
105628,Lipoid proteinosis,530,E78.8,


----

### Remove diseases with missing SNOMED ids

In [26]:
# ICD-10-derived disease -> SNOMED table: drop rows with any missing value,
# restore integer SNOMED ids, discard the ICD-10 code, and de-duplicate.
dise_res = (
    rare_sno
    .dropna()
    .astype({"snomed_id": np.int64})
    .drop(columns="icd_id")
    .drop_duplicates()
)

In [27]:
dise_res.shape

(148960, 3)

In [28]:
dise_res.head()

Unnamed: 0,dise_name,orphanet_id,snomed_id
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,76556008
1,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,205465004
2,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,254082007
3,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,254083002
4,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,278715001


In [29]:
dise_res["orphanet_id"].nunique()

4814

In [30]:
dise_res["snomed_id"].nunique()

15223

### Add on direct SNOMED matches

In [31]:
# Stack the ICD-10-derived pairs and the name-matched pairs into one
# Orphanet -> SNOMED table. pd.concat aligns the two frames by column name
# (they hold the same three columns in a different order) and replaces
# DataFrame.append, which was removed in pandas 2.0.
dise_map = pd.concat([dise_res, mapped]).drop_duplicates()

In [32]:
dise_map.shape

(149304, 3)

In [33]:
dise_map["orphanet_id"].nunique()

5076

In [34]:
dise_map.head()

Unnamed: 0,dise_name,orphanet_id,snomed_id
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,76556008
1,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,205465004
2,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,254082007
3,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,254083002
4,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,278715001


---

## Match to indications

The DrugCentral indications identify diseases by SNOMED id, so we join on SNOMED ids to find which indications concern rare diseases.

In [53]:
# Re-read the indications from a different file ("simple_inds.tsv").
# This replaces the `inds` frame loaded near the top of the notebook, so the
# earlier contraindication filter no longer applies from here on.
# NOTE(review): confirm that keeping contraindication rows is intended.
inds = pd.read_csv("simple_inds.tsv", sep='\t')

In [54]:
inds.head(2)

Unnamed: 0,struct_id,relationship_name,concept_name,umls_cui,snomed_full_name,cui_semantic_type,snomed_conceptid
0,1253,indication,Tuberculosis,C0041296,Tuberculosis,T047,56717001
1,5203,indication,Malignant tumor of ovary,C1140680,Malignant tumor of ovary,T191,363443007


In [55]:
inds.shape

(36949, 7)

In [56]:
# Slim the indications down to the id columns and use the same SNOMED
# column name as the disease map.
stuff = (
    inds
    .rename(columns={"snomed_conceptid": "snomed_id"})
    .drop(columns=["concept_name", "snomed_full_name", "cui_semantic_type"])
)

In [57]:
stuff.head()

Unnamed: 0,struct_id,relationship_name,umls_cui,snomed_id
0,1253,indication,C0041296,56717001
1,5203,indication,C1140680,363443007
2,5202,indication,C0003873,69896004
3,67,indication,C0004238,49436004
4,67,indication,C0018802,42343007


In [58]:
# Look up Orphanet ids by SNOMED id; the left join keeps indications that
# have no rare-disease match (orphanet_id missing).
ja = pd.merge(
    stuff,
    dise_map[["snomed_id", "orphanet_id"]],
    on="snomed_id",
    how="left",
)

In [59]:
ja.head()

Unnamed: 0,struct_id,relationship_name,umls_cui,snomed_id,orphanet_id
0,1253,indication,C0041296,56717001,
1,5203,indication,C1140680,363443007,
2,5202,indication,C0003873,69896004,
3,67,indication,C0004238,49436004,
4,67,indication,C0018802,42343007,


In [60]:
umls_map.head()

Unnamed: 0,orphanet_id,umls_cui
6,58,C0270726
12,61,C0024748
18,93,C0268225
19,93,C2931840
25,585,C0268263


In [61]:
# Add a second Orphanet lookup, this time by UMLS CUI. Both frames carry an
# orphanet_id column, so pandas suffixes them: _x = via SNOMED, _y = via UMLS.
kek = pd.merge(ja, umls_map, on="umls_cui", how="left")

In [62]:
kek.shape

(64361, 6)

In [63]:
kek.head()

Unnamed: 0,struct_id,relationship_name,umls_cui,snomed_id,orphanet_id_x,orphanet_id_y
0,1253,indication,C0041296,56717001,,3389.0
1,5203,indication,C1140680,363443007,,
2,5202,indication,C0003873,69896004,,
3,67,indication,C0004238,49436004,,
4,67,indication,C0018802,42343007,,


In [64]:
kek.shape

(64361, 6)

In [65]:
# Keep only rows with no missing value in any column, i.e. indications that
# matched an Orphanet id through both the SNOMED and the UMLS route.
lel = kek.dropna()

In [66]:
lel.shape

(7736, 6)

In [67]:
lel.head()

Unnamed: 0,struct_id,relationship_name,umls_cui,snomed_id,orphanet_id_x,orphanet_id_y
54,5189,indication,C0008533,41788008,169799.0,98879.0
55,5189,indication,C0008533,41788008,169796.0,98879.0
56,5189,indication,C0008533,41788008,169793.0,98879.0
57,5189,indication,C0008533,41788008,177929.0,98879.0
58,5189,indication,C0008533,41788008,98879.0,98879.0


In [68]:
lel.shape

(7736, 6)

In [69]:
# With the missing values gone, store both Orphanet id columns as integers.
lel = lel.astype(dict.fromkeys(("orphanet_id_x", "orphanet_id_y"), np.int64))

In [70]:
lel.dtypes

struct_id             int64
relationship_name    object
umls_cui             object
snomed_id             int64
orphanet_id_x         int64
orphanet_id_y         int64
dtype: object

In [71]:
lel.head()

Unnamed: 0,struct_id,relationship_name,umls_cui,snomed_id,orphanet_id_x,orphanet_id_y
54,5189,indication,C0008533,41788008,169799,98879
55,5189,indication,C0008533,41788008,169796,98879
56,5189,indication,C0008533,41788008,169793,98879
57,5189,indication,C0008533,41788008,177929,98879
58,5189,indication,C0008533,41788008,98879,98879


In [72]:
# Count the rows where the SNOMED-derived Orphanet id (_x) differs from the
# UMLS-derived one (_y).
(lel["orphanet_id_x"] != lel["orphanet_id_y"]).sum()

6627

In [37]:
# Rare-disease indications: inner-join the indications to the
# Orphanet -> SNOMED map on SNOMED id, keeping only struct_id,
# relationship_name and snomed_id from the indications side.
res = (
    inds
    .drop(columns=["umls_cui", "cui_semantic_type", "concept_name", "snomed_full_name"])
    .rename(columns={"snomed_conceptid": "snomed_id"})
    .merge(dise_map, how="inner", on="snomed_id")
)

In [38]:
res.shape

(34578, 5)

In [39]:
res.shape

(34578, 5)

In [40]:
res.head()

Unnamed: 0,struct_id,relationship_name,snomed_id,dise_name,orphanet_id
0,76,indication,3744001,Major hypertriglyceridemia,181425
1,76,indication,3744001,Rare hyperlipidemia,181422
2,76,indication,3744001,Hyperlipoproteinemia type 3,412
3,76,indication,3744001,Familial lipoprotein lipase deficiency,309015
4,76,indication,3744001,Familial apolipoprotein C-II deficiency,309020


### Add in chemical names

In [29]:
# Drug id -> drug name lookup, with the column names used in this notebook.
# NOTE(review): this path starts at "../data/", unlike the other inputs —
# confirm the file location.
dnames = (pd
    .read_csv("../data/drug_names.csv")
    [["id", "name"]]
    .rename(columns={
        "id": "struct_id",
        "name": "drug_name"
    })
)

# Final table: one row per (drug, relationship, rare disease) with names.
# `res` is built from `dise_map`, which has no dise_type column, so selecting
# "dise_type" directly raises KeyError on a fresh kernel. Join it back in
# from `rare_dises` by Orphanet id first.
# NOTE(review): assumes each orphanet_id has a single dise_type; if not, the
# join yields one row per distinct type.
good = (res
    .merge(
        rare_dises[["orphanet_id", "dise_type"]].drop_duplicates(),
        how="left", on="orphanet_id"
    )
    [["struct_id", "relationship_name", "orphanet_id", "dise_name", "dise_type"]]
    .drop_duplicates()
    .merge(dnames, how="left", on="struct_id")
)

## Information about results

In [30]:
good.shape

(33087, 6)

In [31]:
good.head()

Unnamed: 0,struct_id,relationship_name,orphanet_id,dise_name,dise_type,drug_name
0,76,indication,181425,Major hypertriglyceridemia,rare_genetic,acipimox
1,76,indication,181422,Rare hyperlipidemia,rare_genetic,acipimox
2,76,indication,412,Hyperlipoproteinemia type 3,rare_genetic,acipimox
3,76,indication,309015,Familial lipoprotein lipase deficiency,rare_genetic,acipimox
4,76,indication,309020,Familial apolipoprotein C-II deficiency,rare_genetic,acipimox


### Statistics

In [32]:
good["struct_id"].nunique()

1488

In [33]:
good["orphanet_id"].nunique()

1753

In [34]:
good["relationship_name"].value_counts()

contraindication    25892
indication           5287
off-label use        1908
Name: relationship_name, dtype: int64

In [35]:
good["dise_type"].value_counts()

rare_genetic    24595
not_genetic      8492
Name: dise_type, dtype: int64

In [36]:
good.groupby(["dise_type", "relationship_name"]).size()

dise_type     relationship_name
not_genetic   contraindication      5412
              indication            2198
              off-label use          882
rare_genetic  contraindication     20480
              indication            3089
              off-label use         1026
dtype: int64

Non-genetic rare diseases account for a substantial share of the indication rows (2,198 of 5,287, roughly 40%).

In [37]:
good.groupby(["dise_type", "relationship_name"])["orphanet_id"].nunique()

dise_type     relationship_name
not_genetic   contraindication     286
              indication           410
              off-label use        210
rare_genetic  contraindication     872
              indication           879
              off-label use        663
Name: orphanet_id, dtype: int64

In [38]:
good.groupby("relationship_name")["orphanet_id"].nunique()

relationship_name
contraindication    1158
indication          1289
off-label use        873
Name: orphanet_id, dtype: int64

In [39]:
good.groupby("relationship_name")["struct_id"].nunique()

relationship_name
contraindication    1111
indication           754
off-label use        248
Name: struct_id, dtype: int64

In [40]:
# Count the indication rows per rare disease, then tally how many diseases
# share each row count; head() shows the five most frequent counts.
good.query("relationship_name == 'indication'").groupby("orphanet_id").size().value_counts().head()

1    381
3    315
4    160
2    158
8     79
dtype: int64

DrugCentral contains at least 5k drug indications for about 1,300 rare diseases, covering hundreds of different drugs. There appear to be plenty of rare disease indications that we can integrate into the Rephetio network.

---

## Output to file

In [41]:
# Write the final table as a tab-separated file without the index column.
# The statistics above show it holds contraindication and off-label rows
# as well as indications.
good.to_csv("rare_disease_indications.tsv", sep='\t', index=False)