# Format DrugCentral indications for rare diseases

In [1]:
import pandas as pd
import numpy as np

## Read rare disease data

In [2]:
# Load the tab-separated rare disease table.
rare_dises = pd.read_csv("rare_disease_info.tsv", delimiter="\t")

In [3]:
# Preview the first two rows to check the column layout.
rare_dises.head(2)

Unnamed: 0,dise_name,orphanet_id,ref_id,ref_name,dise_type
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,ICD-10,rare_genetic
1,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,607131,OMIM,rare_genetic


---

## Rare disease identifier conversion

Rare diseases are best identified by ICD-10 ids. However, the diseases in the DrugCentral indications are in the UMLS and SNOMED namespaces. We will use the ICD-10 identifier to convert to SNOMED, and fall back to UMLS if there is no ICD-10 mapping.

In [4]:
# Number of distinct OrphaNet ids in the raw table.
rare_dises["orphanet_id"].nunique()

7762

In [5]:
# Distinct OrphaNet ids per reference vocabulary (`ref_name`), largest first.
(
    rare_dises
    .groupby("ref_name")["orphanet_id"]
    .nunique()
    .sort_values(ascending=False)
)

ref_name
ICD-10    7076
OMIM      4381
UMLS      2879
MeSH      1760
MedDRA    1166
Name: orphanet_id, dtype: int64

## Convert rare diseases to snomed

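Rare OrphaNet diseases are best referenced by ICD-10. We plan to use UMLS if ICD-10 ids don't exist, but that fallback is not implemented in this notebook yet: only diseases with an ICD-10 reference are carried forward.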

In [6]:
# Keep only the ICD-10 references. Once filtered, `ref_name` is constant and is
# dropped, and `ref_id` is renamed to `icd_id` so it can serve as a join key.
rare_icd = (
    rare_dises
    .query("ref_name == 'ICD-10'")
    .drop(columns="ref_name")
    .rename(columns={"ref_id": "icd_id"})
)

In [7]:
# Number of distinct OrphaNet ids that have an ICD-10 reference.
rare_icd["orphanet_id"].nunique()

7076

In [8]:
# Preview the ICD-10-only table.
rare_icd.head(2)

Unnamed: 0,dise_name,orphanet_id,icd_id,dise_type
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,rare_genetic
2,"Multiple epiphyseal dysplasia, with miniepiphyses",166032,Q77.3,rare_genetic


### Convert ICD-10 to SNOMED

In [9]:
# Load the tab-separated SNOMED <-> ICD-10 mapping table.
mapping = pd.read_csv("snomed_icd_map.tsv", delimiter="\t")

In [10]:
# Preview the mapping table to check the column layout.
mapping.head(2)

Unnamed: 0,snomed_id,snomed_name,icd_id,icd_name
0,109006,Anxiety disorder of childhood OR adolescence (...,F93.0,Separation anxiety disorder of childhood
1,109006,Anxiety disorder of childhood OR adolescence (...,F40.8,Other phobic anxiety disorders


In [11]:
# Attach SNOMED concepts to each rare disease through its ICD-10 code.
# The merge must start from `rare_icd`: that frame carries the `icd_id` key,
# whereas the raw `rare_dises` table only has `ref_id`/`ref_name` and would
# raise a KeyError on `on="icd_id"`.
# A left join keeps rows whose ICD-10 code has no SNOMED match, so the gaps
# can be examined before they are removed.
dise_res = rare_icd.merge(mapping, how="left", on="icd_id")

In [12]:
# Size of the merged table (rows, columns).
dise_res.shape

(152606, 7)

In [13]:
# Preview the merged disease/SNOMED rows.
dise_res.head()

Unnamed: 0,dise_name,orphanet_id,icd_id,dise_type,snomed_id,snomed_name,icd_name
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,rare_genetic,76556008.0,Hyperphosphatasia-osteoectasia syndrome (disor...,Chondrodysplasia punctata
1,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,rare_genetic,205465004.0,Chondrodysplasia (disorder),Chondrodysplasia punctata
2,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,rare_genetic,254082007.0,"Chondrodysplasia punctata, X-linked recessive ...",Chondrodysplasia punctata
3,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,rare_genetic,254083002.0,"Chondrodysplasia punctata, tibia-metacarpal ty...",Chondrodysplasia punctata
4,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,rare_genetic,278715001.0,Chondrodysplasia punctata (stippled epiphyses)...,Chondrodysplasia punctata


### Examine missing information

In [14]:
# Count missing values per column; nulls in the SNOMED columns mark rows
# that found no match in the left merge.
dise_res.isnull().sum()

dise_name         0
orphanet_id       0
icd_id            0
dise_type         0
snomed_id      2640
snomed_name    2640
icd_name       2640
dtype: int64

In [15]:
# Peek at the rows that have no SNOMED id.
no_snomed = dise_res["snomed_id"].isnull()
dise_res.loc[no_snomed].head()

Unnamed: 0,dise_name,orphanet_id,icd_id,dise_type,snomed_id,snomed_name,icd_name
24,Alexander disease,58,E75.2,rare_genetic,,,
71,Multiple sulfatase deficiency,585,E75.2,rare_genetic,,,
89,Canavan disease,141,E75.2,rare_genetic,,,
223,Cystinosis,213,E72.0,rare_genetic,,,
308,Farber disease,333,E75.2,rare_genetic,,,


In [15]:
# Distinct OrphaNet ids that have at least one row without a SNOMED id.
dise_res.loc[dise_res["snomed_id"].isnull(), "orphanet_id"].nunique()

2357

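Some OrphaNet diseases cannot be mapped to SNOMED due to missing ICD-10 to SNOMED mappings. However, these diseases seem to have SNOMED ids based on a name lookup. For now we will drop the unmapped rows; a disease is lost entirely only when none of its ICD-10 codes has a SNOMED mapping (7,076 diseases before the drop, 4,814 after).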

### Remove diseases with missing SNOMED ids

In [16]:
# Remove every row that contains a missing value, cast the SNOMED ids to
# 64-bit integers (the preview above shows them as floats while NaN is
# present), then collapse exact duplicate rows.
# NOTE: this rebinds `dise_res`, so outputs displayed above this cell refer to
# the unfiltered frame.
dise_res = (
    dise_res
    .dropna()
    .astype({"snomed_id": "int64"})
    .drop_duplicates()
)

In [17]:
# Size of the table after removing rows with missing values.
dise_res.shape

(149966, 7)

In [18]:
# Preview the filtered table; `snomed_id` is now an integer column.
dise_res.head()

Unnamed: 0,dise_name,orphanet_id,icd_id,dise_type,snomed_id,snomed_name,icd_name
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,rare_genetic,76556008,Hyperphosphatasia-osteoectasia syndrome (disor...,Chondrodysplasia punctata
1,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,rare_genetic,205465004,Chondrodysplasia (disorder),Chondrodysplasia punctata
2,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,rare_genetic,254082007,"Chondrodysplasia punctata, X-linked recessive ...",Chondrodysplasia punctata
3,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,rare_genetic,254083002,"Chondrodysplasia punctata, tibia-metacarpal ty...",Chondrodysplasia punctata
4,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,Q77.3,rare_genetic,278715001,Chondrodysplasia punctata (stippled epiphyses)...,Chondrodysplasia punctata


In [19]:
# Distinct OrphaNet ids that still have at least one SNOMED id.
dise_res["orphanet_id"].nunique()

4814

In [20]:
# Distinct SNOMED ids reached from the rare diseases.
dise_res["snomed_id"].nunique()

15223

In [21]:
# Disease <-> SNOMED lookup: drop the ICD-10 columns and collapse the rows
# that become identical once they are gone.
dise_map = dise_res.drop(columns=["icd_id", "icd_name"]).drop_duplicates()

In [22]:
# Size of the disease <-> SNOMED lookup table.
dise_map.shape

(148960, 5)

In [23]:
# Preview the disease <-> SNOMED lookup table.
dise_map.head()

Unnamed: 0,dise_name,orphanet_id,dise_type,snomed_id,snomed_name
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,rare_genetic,76556008,Hyperphosphatasia-osteoectasia syndrome (disor...
1,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,rare_genetic,205465004,Chondrodysplasia (disorder)
2,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,rare_genetic,254082007,"Chondrodysplasia punctata, X-linked recessive ..."
3,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,rare_genetic,254083002,"Chondrodysplasia punctata, tibia-metacarpal ty..."
4,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,rare_genetic,278715001,Chondrodysplasia punctata (stippled epiphyses)...


---

## Match to indications

The DrugCentral indications identify diseases by SNOMED ids, so we will join on the SNOMED id to see which indications are about rare diseases.

In [21]:
# Load the tab-separated indications table.
inds = pd.read_csv("simple_inds.tsv", delimiter="\t")

In [22]:
# Preview the indications table to check the column layout.
inds.head(2)

Unnamed: 0,struct_id,relationship_name,concept_name,umls_cui,snomed_full_name,cui_semantic_type,snomed_conceptid
0,1253,indication,Tuberculosis,C0041296,Tuberculosis,T047,56717001
1,5203,indication,Malignant tumor of ovary,C1140680,Malignant tumor of ovary,T191,363443007


In [26]:
# Match the indication rows to rare diseases on the SNOMED id.
# The descriptive columns of `inds` are not needed downstream and are dropped
# up front; `snomed_conceptid` is renamed so it lines up with the key in
# `dise_map`. The inner join keeps only rows whose SNOMED id appears in both.
unused_ind_cols = ["umls_cui", "cui_semantic_type", "concept_name", "snomed_full_name"]
res = (
    inds
    .drop(columns=unused_ind_cols)
    .rename(columns={"snomed_conceptid": "snomed_id"})
    .merge(dise_map, how="inner", on="snomed_id")
)

In [27]:
# Size of the matched table (rows, columns).
res.shape

(34386, 7)

In [28]:
# Preview the matched drug/disease rows.
res.head()

Unnamed: 0,struct_id,relationship_name,snomed_id,dise_name,orphanet_id,dise_type,snomed_name
0,76,indication,3744001,Major hypertriglyceridemia,181425,rare_genetic,Hyperlipoproteinemia (disorder)
1,76,indication,3744001,Rare hyperlipidemia,181422,rare_genetic,Hyperlipoproteinemia (disorder)
2,76,indication,3744001,Hyperlipoproteinemia type 3,412,rare_genetic,Hyperlipoproteinemia (disorder)
3,76,indication,3744001,Familial lipoprotein lipase deficiency,309015,rare_genetic,Hyperlipoproteinemia (disorder)
4,76,indication,3744001,Familial apolipoprotein C-II deficiency,309020,rare_genetic,Hyperlipoproteinemia (disorder)


### Add in chemical names

In [29]:
# Drug id -> drug name lookup, with columns renamed to line up with `res`.
dnames = (
    pd.read_csv("../data/drug_names.csv")
    .loc[:, ["id", "name"]]
    .rename(columns={"id": "struct_id", "name": "drug_name"})
)

# Drop the SNOMED columns and de-duplicate, so that a drug/relationship/disease
# combination reached through several SNOMED ids is kept once. The left join
# then adds the drug name and keeps rows even when no name is found.
keep_cols = ["struct_id", "relationship_name", "orphanet_id", "dise_name", "dise_type"]
good = (
    res[keep_cols]
    .drop_duplicates()
    .merge(dnames, on="struct_id", how="left")
)

## Information about results

In [30]:
# Size of the final table (rows, columns).
good.shape

(33087, 6)

In [31]:
# Preview the final table, including the added `drug_name` column.
good.head()

Unnamed: 0,struct_id,relationship_name,orphanet_id,dise_name,dise_type,drug_name
0,76,indication,181425,Major hypertriglyceridemia,rare_genetic,acipimox
1,76,indication,181422,Rare hyperlipidemia,rare_genetic,acipimox
2,76,indication,412,Hyperlipoproteinemia type 3,rare_genetic,acipimox
3,76,indication,309015,Familial lipoprotein lipase deficiency,rare_genetic,acipimox
4,76,indication,309020,Familial apolipoprotein C-II deficiency,rare_genetic,acipimox


### Statistics

In [32]:
# Number of distinct drugs (`struct_id`) in the final table.
good["struct_id"].nunique()

1488

In [33]:
# Number of distinct rare diseases (`orphanet_id`) in the final table.
good["orphanet_id"].nunique()

1753

In [34]:
# Row count per relationship type.
good["relationship_name"].value_counts()

contraindication    25892
indication           5287
off-label use        1908
Name: relationship_name, dtype: int64

In [35]:
# Row count per disease type.
good["dise_type"].value_counts()

rare_genetic    24595
not_genetic      8492
Name: dise_type, dtype: int64

In [36]:
# Row count for every (disease type, relationship type) combination.
good.groupby(["dise_type", "relationship_name"]).size()

dise_type     relationship_name
not_genetic   contraindication      5412
              indication            2198
              off-label use          882
rare_genetic  contraindication     20480
              indication            3089
              off-label use         1026
dtype: int64

Roughly 40% of the indication rows (2,198 of 5,287) are for non-genetic rare diseases.

In [37]:
# Distinct diseases for every (disease type, relationship type) combination.
good.groupby(["dise_type", "relationship_name"])["orphanet_id"].nunique()

dise_type     relationship_name
not_genetic   contraindication     286
              indication           410
              off-label use        210
rare_genetic  contraindication     872
              indication           879
              off-label use        663
Name: orphanet_id, dtype: int64

In [38]:
# Distinct diseases per relationship type.
good.groupby("relationship_name")["orphanet_id"].nunique()

relationship_name
contraindication    1158
indication          1289
off-label use        873
Name: orphanet_id, dtype: int64

In [39]:
# Distinct drugs per relationship type.
good.groupby("relationship_name")["struct_id"].nunique()

relationship_name
contraindication    1111
indication           754
off-label use        248
Name: struct_id, dtype: int64

In [40]:
# For 'indication' rows only: count rows per disease, then tabulate how many
# diseases share each count, and show the five most common counts.
(
    good
    .query("relationship_name == 'indication'")
    .groupby("orphanet_id")
    .size()
    .value_counts()
    .head()
)

1    381
3    315
4    160
2    158
8     79
dtype: int64

DrugCentral contains at least 5k drug indications for about 1,300 rare diseases, involving hundreds of different drugs (754). It seems that there are plenty of rare disease indications that we can integrate into the Rephetio network.

---

## Output to file

In [41]:
# Write the final table as a tab-separated file without the row index.
good.to_csv("rare_disease_indications.tsv", index=False, sep="\t")