# Format DrugCentral indications for rare diseases

In [1]:
import pandas as pd
import numpy as np

## Read rare disease data

In [2]:
# Orphanet rare-disease table: one row per (disease, external cross-reference).
rare_dises = pd.read_csv("data/rare_disease_info.tsv", delimiter="\t")

In [3]:
# Peek at the first two rows of the raw table.
rare_dises.iloc[:2]

Unnamed: 0,dise_name,orphanet_id,ref_id,ref_name,dise_type
0,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,607131,OMIM,rare_genetic
1,"Multiple epiphyseal dysplasia, Al-Gazali type",166024,C1846722,UMLS,rare_genetic


---

## Rare disease identifier conversion

Rare diseases are best identified by ICD-10 ids. However, the diseases in the DrugCentral indications are identified in the UMLS and SNOMED namespaces.

We will use ICD-10 ids (converted to SNOMED) and UMLS ids to map the rare diseases to the diseases in the indications file.

In [4]:
# Number of distinct Orphanet diseases in the raw table.
rare_dises.loc[:, "orphanet_id"].nunique()

7945

In [5]:
# Distinct Orphanet diseases covered by each cross-reference namespace,
# largest first.
(
    rare_dises
    .groupby("ref_name")["orphanet_id"]
    .nunique()
    .sort_values(ascending=False)
)

ref_name
ICD-10    7058
UMLS      4838
OMIM      4436
MeSH      1758
MedDRA    1164
Name: orphanet_id, dtype: int64

## Convert rare diseases to SNOMED

Rare Orphanet diseases are best referenced by ICD-10. We keep the UMLS cross-references as well, so that diseases without an ICD-10 id can still be mapped.

In [6]:
# Keep only the cross-references expressed as UMLS CUIs or ICD-10 codes.
rare_dises = rare_dises[rare_dises["ref_name"].isin(["UMLS", "ICD-10"])]

In [7]:
# Distinct Orphanet diseases that remain after the namespace filter.
rare_dises.loc[:, "orphanet_id"].nunique()

7735

Using a combination of the two id types covers 7,735 of the 7,945 Orphanet diseases (about 97%). We will ignore the remaining ~3% because they are not likely to have drug indications anyway.

### Orphanet to UMLS map

In [8]:
# Orphanet -> UMLS lookup table.
# Note: a single UMLS CUI can be attached to several Orphanet ids.
umls_map = (
    rare_dises
    .loc[rare_dises["ref_name"] == "UMLS", ["orphanet_id", "ref_id"]]
    .rename(columns={"ref_id": "umls_cui"})
)

In [9]:
# First two Orphanet -> UMLS pairs.
umls_map.iloc[:2]

Unnamed: 0,orphanet_id,umls_cui
1,166024,C1846722
4,166032,C1836307


## Orphanet to ICD-10 to SNOMED map

In [10]:
# Orphanet -> ICD-10 lookup table (the ICD-10 subset of the cross-references).
rare_icd = (
    rare_dises
    .loc[rare_dises["ref_name"] == "ICD-10", ["orphanet_id", "ref_id"]]
    .rename(columns={"ref_id": "icd_id"})
)

In [11]:
# First two Orphanet -> ICD-10 pairs.
rare_icd.iloc[:2]

Unnamed: 0,orphanet_id,icd_id
2,166024,Q77.3
5,166032,Q77.3


### Convert ICD-10 to SNOMED

In [12]:
# SNOMED <-> ICD-10 cross-walk used to translate ICD-10 codes into SNOMED ids.
mapping = pd.read_csv("data/snomed_icd10_map.tsv", delimiter="\t")

In [13]:
# First two rows of the SNOMED <-> ICD-10 cross-walk.
mapping.iloc[:2]

Unnamed: 0,snomed_id,snomed_name,icd_id,icd_name
0,109006,Anxiety disorder of childhood OR adolescence (...,F93.0,Separation anxiety disorder of childhood
1,109006,Anxiety disorder of childhood OR adolescence (...,F40.8,Other phobic anxiety disorders


In [14]:
# Translate each Orphanet disease's ICD-10 code into the SNOMED ids that map
# to that code; diseases whose ICD-10 code has no SNOMED mapping are dropped.
rare_sno = pd.merge(
    rare_icd,
    mapping.loc[:, ["snomed_id", "icd_id"]],
    on="icd_id",
    how="inner",
)

In [15]:
# First two Orphanet -> ICD-10 -> SNOMED rows.
rare_sno.iloc[:2]

Unnamed: 0,orphanet_id,icd_id,snomed_id
0,166024,Q77.3,76556008
1,166024,Q77.3,205465004


---

## Match to indications

The DrugCentral indications identify diseases by SNOMED id, so we will use those ids to cross-reference and see which indications are about rare diseases.

## Read indications

In [16]:
# Load the DrugCentral indications, discard contraindications, and rename the
# SNOMED column so it lines up with the key used in rare_sno.
inds = pd.read_csv("data/simple_indications.tsv", sep="\t")
inds = inds[inds["relationship_name"] != "contraindication"]
inds = inds.rename(columns={"snomed_conceptid": "snomed_id"})

print(inds.shape[0])

10254


In [17]:
# First two retained indication rows.
inds.iloc[:2]

Unnamed: 0,struct_id,relationship_name,disease_name,umls_cui,snomed_id
0,1253,indication,Tuberculosis,C0041296,56717001
1,5203,indication,Malignant tumor of ovary,C1140680,363443007


---

## Cross reference with Orphanet

In [18]:
# Match indications to Orphanet diseases through two independent routes and
# stack the results:
#   1. directly on the UMLS CUI;
#   2. on the SNOMED id obtained from the ICD-10 cross-walk.
# A pair found by both routes appears twice here; duplicates are removed later.
res = pd.concat([
    pd.merge(inds, umls_map, on="umls_cui", how="inner"),
    pd.merge(
        inds,
        rare_sno.loc[:, ["orphanet_id", "snomed_id"]].drop_duplicates(),
        on="snomed_id",
        how="inner",
    ),
])

In [19]:
# (rows, columns) of the stacked UMLS- and SNOMED-matched indications.
res.shape

(9656, 6)

In [20]:
# First five matched indication rows.
res.head(n=5)

Unnamed: 0,struct_id,relationship_name,disease_name,umls_cui,snomed_id,orphanet_id
0,1253,indication,Tuberculosis,C0041296,56717001,3389
1,5203,indication,Malignant tumor of ovary,C1140680,363443007,213500
2,1667,off-label use,Malignant tumor of ovary,C1140680,363443007,213500
3,2707,indication,Malignant tumor of ovary,C1140680,363443007,213500
4,1678,indication,Malignant tumor of ovary,C1140680,363443007,213500


---

## Add in disease names from Orphanet

In [21]:
# Attach the Orphanet disease name and type to every matched indication.
#
# rare_dises holds one row per (disease, cross-reference), so the
# (orphanet_id, dise_name, dise_type) triples repeat. De-duplicate that lookup
# before joining so the merge does not fan out into redundant rows that would
# only be discarded again afterwards.
#
# The trailing drop_duplicates() is still required: the same drug/disease pair
# can have been matched through both the UMLS and the SNOMED route.
res = (res
    [["struct_id", "relationship_name", "orphanet_id"]]
    .merge(
        rare_dises[["orphanet_id", "dise_name", "dise_type"]].drop_duplicates(),
        how="inner", on="orphanet_id"
    )
    .drop_duplicates()
)

In [22]:
# (rows, columns) after adding disease names and removing duplicate rows.
res.shape

(8297, 5)

In [23]:
# First five rows, now carrying the Orphanet disease name and type.
res.head(n=5)

Unnamed: 0,struct_id,relationship_name,orphanet_id,dise_name,dise_type
0,1253,indication,3389,Tuberculosis,not_genetic
1,5203,indication,213500,Ovarian cancer,not_genetic
2,1667,off-label use,213500,Ovarian cancer,not_genetic
3,2707,indication,213500,Ovarian cancer,not_genetic
4,1678,indication,213500,Ovarian cancer,not_genetic


---

### Add in chemical names

In [24]:
# Drug id -> drug name lookup, with columns renamed to match the indications.
# NOTE(review): this path starts with "../data" while the other inputs are read
# from "data/" - confirm that this is the intended location.
dnames = pd.read_csv("../data/drug_names.csv")
dnames = dnames[["id", "name"]].rename(
    columns={"id": "struct_id", "name": "drug_name"}
)

In [25]:
# Final table: unique drug/disease relationships plus the drug name.
# A left join keeps relationships whose struct_id has no entry in dnames.
good = pd.merge(
    res.loc[
        :,
        ["struct_id", "relationship_name", "orphanet_id", "dise_name", "dise_type"],
    ].drop_duplicates(),
    dnames,
    on="struct_id",
    how="left",
)

## Information about results

In [26]:
# (rows, columns) of the final table; the row count should equal that of res,
# since the drug-name join is a left join.
good.shape

(8297, 6)

In [27]:
# First five rows of the final table.
good.head(n=5)

Unnamed: 0,struct_id,relationship_name,orphanet_id,dise_name,dise_type,drug_name
0,1253,indication,3389,Tuberculosis,not_genetic,ftivazide
1,5203,indication,213500,Ovarian cancer,not_genetic,rucaparib
2,1667,off-label use,213500,Ovarian cancer,not_genetic,megestrol acetate
3,2707,indication,213500,Ovarian cancer,not_genetic,topotecan
4,1678,indication,213500,Ovarian cancer,not_genetic,melphalan


### Statistics

In [28]:
# Number of distinct drugs (struct_id) in the final table.
good.loc[:, "struct_id"].nunique()

1066

In [29]:
# Number of distinct Orphanet diseases in the final table.
good.loc[:, "orphanet_id"].nunique()

1573

In [30]:
# Row count per relationship type.
good.loc[:, "relationship_name"].value_counts()

indication       6151
off-label use    2146
Name: relationship_name, dtype: int64

In [31]:
# Row count per disease type.
good.loc[:, "dise_type"].value_counts()

rare_genetic    4325
not_genetic     3972
Name: dise_type, dtype: int64

In [32]:
# Row count for every (disease type, relationship type) combination.
(
    good
    .groupby(["dise_type", "relationship_name"])
    .size()
)

dise_type     relationship_name
not_genetic   indication           2903
              off-label use        1069
rare_genetic  indication           3248
              off-label use        1077
dtype: int64

About half of the drug–disease pairs in the data (3,972 of 8,297) are for non-genetic rare diseases.

In [33]:
# Distinct Orphanet diseases for every (disease type, relationship type) pair.
(
    good
    .groupby(["dise_type", "relationship_name"])["orphanet_id"]
    .nunique()
)

dise_type     relationship_name
not_genetic   indication           505
              off-label use        258
rare_genetic  indication           930
              off-label use        679
Name: orphanet_id, dtype: int64

In [34]:
# Distinct Orphanet diseases per relationship type.
(
    good
    .groupby("relationship_name")["orphanet_id"]
    .nunique()
)

relationship_name
indication       1435
off-label use     937
Name: orphanet_id, dtype: int64

In [35]:
# Distinct drugs per relationship type.
(
    good
    .groupby("relationship_name")["struct_id"]
    .nunique()
)

relationship_name
indication       967
off-label use    292
Name: struct_id, dtype: int64

In [36]:
# How many diseases have 1, 2, 3, ... indication rows: count rows per disease,
# tally those per-disease counts, and show the five most common counts.
(
    good
    .loc[good["relationship_name"] == "indication"]
    .groupby("orphanet_id")
    .size()
    .value_counts()
    .head()
)

1    432
3    337
2    177
4    169
8     82
dtype: int64

DrugCentral contains roughly 6,000 indications (plus about 2,000 off-label uses) for more than 1,400 Orphanet diseases, involving close to 1,000 different drugs. It seems that there are plenty of rare disease indications that we can integrate into the Rephetio network.

---

## Output to file

In [37]:
# Persist the final table as a tab-separated file, without the row index.
good.to_csv(
    "rare_disease_indications.tsv",
    sep="\t",
    index=False,
)