# Remove indications from the Drugcentral gold standard that contain concepts which do not exist in SemmedDB

2018-12-10

The evaluation won't be fair if the gold standard contains indications whose chemical or disease concept doesn't exist in SemmedDB. This notebook filters those indications out.

In [1]:
import pandas as pd

## Read nodes

In [2]:
# Load the SemmedDB node table (tab-separated) produced by the no_treats pipeline.
nodes = pd.read_csv(
    filepath_or_buffer="../pipeline/no_treats/semmeddb_no_treats_nodes.tsv",
    sep="\t",
)

In [3]:
# (rows, columns) of the node table.
nodes.shape

(210375, 3)

In [4]:
# Preview the first rows: node_id, node_name and node_type.
nodes.head()

Unnamed: 0,node_id,node_name,node_type
0,DOID:0050545,visceral heterotaxy,Disorders
1,DOID:0050591,tooth agenesis,Disorders
2,DOID:0060061,cutaneous T cell lymphoma,Disorders
3,DOID:0060073,lymphatic system cancer,Disorders
4,DOID:0060119,pharynx cancer,Disorders


---

## Read Drugcentral gold standard

In [5]:
# Load the raw indications file, keep only the five columns needed here
# (in this order), and rename the id/name/type columns to the
# chemical_* / disease_id / edge_type names used by the rest of the notebook.
# "disease_name" is kept under its original name.
dcentral = (
    pd.read_csv("../data/raw/indications_slim.csv", sep=",")
    .loc[
        :,
        [
            "compound_umlscui",
            "compound_name",
            "disease_umlscui",
            "disease_name",
            "etype",
        ],
    ]
    .rename(
        columns={
            "compound_umlscui": "chemical_id",
            "compound_name": "chemical_name",
            "disease_umlscui": "disease_id",
            "etype": "edge_type",
        }
    )
)

In [6]:
# (rows, columns) of the raw gold standard after column selection.
dcentral.shape

(8176, 5)

In [7]:
# Preview the renamed gold standard columns.
dcentral.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,edge_type
0,C3661315,rucaparib,C1299247,ovarian cancer,TREATS_CDtDO
1,C4044947,baricitinib,C0003873,rheumatoid arthritis,TREATS_CDtDO
2,C0520442,acetyldigitoxin,C0004238,Atrial fibrillation,TREATS_CDtDO
3,C0520442,acetyldigitoxin,C0018802,Congestive heart failure,TREATS_CDtDO
4,C0771809,acexamic acid,C0037299,Skin ulcer,TREATS_CDtDO


In [8]:
# Count how many chemical ids (from the compound_umlscui column) start with "C".
dcentral["chemical_id"].str.startswith("C").value_counts()

True    8176
Name: chemical_id, dtype: int64

In [9]:
# Count how many disease ids (from the disease_umlscui column) start with "C".
dcentral["disease_id"].str.startswith("C").value_counts()

True     8134
False      42
Name: disease_id, dtype: int64

In [10]:
# Among the disease ids that do NOT start with "C", count how many start with "DOID:".
(
    dcentral
    .loc[lambda df: ~df["disease_id"].str.startswith("C"), "disease_id"]
    .str.startswith("DOID:")
    .value_counts()
)

True    42
Name: disease_id, dtype: int64

---

## Add the `UMLS:` prefix to the gold standard node ids

In [11]:
# Build the gold standard with prefixed ids:
#   * every chemical id gets a "UMLS:" prefix;
#   * a disease id gets the prefix only when it starts with "C";
#     any other disease id is passed through unchanged.
# The two new columns are computed independently from the original columns.
gold = dcentral.assign(
    chemical_id=lambda df: df["chemical_id"].map("UMLS:{}".format),
    disease_id=lambda df: df["disease_id"].map(
        lambda cui: cui if not cui.startswith("C") else "UMLS:{}".format(cui)
    ),
)

In [12]:
# (rows, columns) of the gold standard after prefixing the ids.
gold.shape

(8176, 5)

In [13]:
# Preview the prefixed ids.
gold.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,edge_type
0,UMLS:C3661315,rucaparib,UMLS:C1299247,ovarian cancer,TREATS_CDtDO
1,UMLS:C4044947,baricitinib,UMLS:C0003873,rheumatoid arthritis,TREATS_CDtDO
2,UMLS:C0520442,acetyldigitoxin,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO
3,UMLS:C0520442,acetyldigitoxin,UMLS:C0018802,Congestive heart failure,TREATS_CDtDO
4,UMLS:C0771809,acexamic acid,UMLS:C0037299,Skin ulcer,TREATS_CDtDO


In [14]:
# All distinct node ids (chemicals and diseases) referenced by the gold standard.
gold_nodes = set(gold["chemical_id"]).union(gold["disease_id"])

In [15]:
# Number of distinct node ids in the gold standard.
len(gold_nodes)

3409

## Remove gold standard indications which have concepts that don't exist in SemmedDB

In [16]:
# Number of gold standard node ids that are absent from the node table.
len(gold_nodes.difference(nodes["node_id"]))

656

In [17]:
# Set of all node ids in the node table, used for membership tests when filtering the gold standard.
node_set = set(nodes["node_id"])

In [18]:
# Keep only the indications whose chemical id AND disease id are both present
# in node_set, then sort by (chemical_id, disease_id) and renumber the index from 0.
good_gold = (
    gold
    .loc[lambda df: df["chemical_id"].isin(node_set) & df["disease_id"].isin(node_set)]
    .sort_values(by=["chemical_id", "disease_id"])
    .reset_index(drop=True)
)

In [19]:
# (rows, columns) of the filtered gold standard.
good_gold.shape

(6307, 5)

In [20]:
# Preview the filtered, sorted gold standard.
good_gold.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,edge_type
0,UMLS:C0000473,aminobenzoic acid,UMLS:C0011633,Dermatomyositis,TREATS_CDtDO
1,UMLS:C0000473,aminobenzoic acid,UMLS:C0011644,Systemic sclerosis,TREATS_CDtDO
2,UMLS:C0000473,aminobenzoic acid,UMLS:C0030807,Pemphigus,TREATS_CDtDO
3,UMLS:C0000473,aminobenzoic acid,UMLS:C0030848,Induratio penis plastica,TREATS_CDtDO
4,UMLS:C0000473,aminobenzoic acid,UMLS:C1527383,Morphea,TREATS_CDtDO


In [21]:
# Number of distinct chemicals remaining after filtering.
good_gold["chemical_id"].nunique()

1607

In [22]:
# Number of distinct diseases remaining after filtering.
good_gold["disease_id"].nunique()

954

## Save gold standard to disk

In [23]:
# Write the filtered gold standard as a tab-separated file, omitting the index column.
good_gold.to_csv(
    path_or_buf="../pipeline/no_treats/semmeddb_no_treats_gold_std.tsv",
    sep="\t",
    index=False,
)