# Extract DrugCentral drug indications

2017-06-21

In [1]:
import pandas as pd
import numpy as np

## Read indication information

This is a text file dump of the `omop_relationship` table.

In [2]:
# Load the raw DrugCentral indications export into a DataFrame.
# NOTE(review): snomed_conceptid is parsed as float because the column contains
# missing values; any id above 2**53 would lose precision -- confirm none are that large.
raw_indications_path = "../data/raw/drugcentral_indications.csv"
ind = pd.read_csv(raw_indications_path)

In [3]:
# (rows, columns) of the raw indications table
ind.shape

(40865, 9)

In [4]:
# Preview the first two rows of the raw table
ind.head(2)

Unnamed: 0,id,struct_id,concept_id,relationship_name,concept_name,umls_cui,snomed_full_name,cui_semantic_type,snomed_conceptid
0,132463,1253,21000041,indication,Tuberculosis,C0041296,Tuberculosis,T047,56717001.0
1,132464,5203,21000533,indication,Malignant tumor of ovary,C1140680,Malignant tumor of ovary,T191,363443007.0


### Basic information

In [5]:
# Number of missing values in each column
ind.isnull().sum()

id                      0
struct_id               0
concept_id              0
relationship_name       0
concept_name            0
umls_cui             3916
snomed_full_name     3916
cui_semantic_type    3916
snomed_conceptid     3916
dtype: int64

In [6]:
# Count the missing cells in each row, then tally how many rows share each count
missing_per_row = ind.isnull().sum(axis=1)
missing_per_row.value_counts()

0    36949
4     3916
dtype: int64

Missing values always occur together: each row is either complete or is missing all four of
umls_cui, snomed_full_name, cui_semantic_type, and snomed_conceptid.

**Can these diseases with missing ids be mapped with string searching?**

Manually searching the UMLS and SNOMED for a random sample of these disease names shows that mapping them through a direct text search would be very difficult.

In [7]:
# Preview the rows whose disease concept has no UMLS CUI
lacks_umls_cui = ind["umls_cui"].isnull()
ind[lacks_umls_cui].head()

Unnamed: 0,id,struct_id,concept_id,relationship_name,concept_name,umls_cui,snomed_full_name,cui_semantic_type,snomed_conceptid
711,133173,26,21003143,contraindication,Palmar-Plantar Erythrodysesthesia,,,,
743,133204,254,21003413,off-label use,Prevention of HIV Infection after Exposure,,,,
752,133213,348,21002783,indication,Systemic Dermatomyositis,,,,
795,133256,824,21001374,indication,Otitis Externa Eczema,,,,
806,133267,865,21002814,indication,Osteoarthritis in Patients at High Ulcer Risk,,,,


## Remove indications with missing disease identifiers

In [8]:
# Columns not carried into the cleaned table. snomed_full_name is dropped
# because it is noted to be the same as concept_name (not verified in this cell).
unneeded_columns = ["id", "concept_id", "snomed_full_name", "cui_semantic_type"]
ind_trimmed = ind.drop(unneeded_columns, axis="columns")

# Discard every row that still has a missing value in any remaining column
# (dropna defaults: drop rows, drop when any value is missing).
ind_mapped = ind_trimmed.dropna()

# With the missing values gone the SNOMED id can be stored as an integer;
# then give the disease column a clearer name and remove exact duplicate rows.
res = (
    ind_mapped
    .astype({"snomed_conceptid": np.int64})
    .rename(columns={"concept_name": "disease_name"})
    .drop_duplicates()
)

In [9]:
# (rows, columns) of the cleaned table
res.shape

(36949, 5)

In [10]:
# Preview the first rows of the cleaned table
res.head()

Unnamed: 0,struct_id,relationship_name,disease_name,umls_cui,snomed_conceptid
0,1253,indication,Tuberculosis,C0041296,56717001
1,5203,indication,Malignant tumor of ovary,C1140680,363443007
2,5202,indication,Rheumatoid arthritis,C0003873,69896004
3,67,indication,Atrial fibrillation,C0004238,49436004
4,67,indication,Congestive heart failure,C0018802,42343007


### Indication statistics

In [11]:
# Number of distinct struct_id values in the cleaned table
res["struct_id"].nunique()

2370

In [12]:
# Number of rows for each relationship type
res["relationship_name"].value_counts()

contraindication    26695
indication           8362
off-label use        1891
reduce risk             1
Name: relationship_name, dtype: int64

In [13]:
# Number of distinct UMLS CUIs in the cleaned table
res["umls_cui"].nunique()

2192

In [14]:
# Number of distinct SNOMED concept ids in the cleaned table
res["snomed_conceptid"].nunique()

2214

## Save cleaned indications to file

In [15]:
# Write the cleaned indications as a tab-separated file, omitting the row index.
# NOTE(review): the input was read from ../data/raw/ but this writes to data/
# relative to the working directory -- confirm this location is intended and exists.
output_path = "data/simple_indications.tsv"
res.to_csv(output_path, sep="\t", index=False)