# Format SemmedDB gold standard for use

In [1]:
import pandas as pd

## Read DrugCentral gold standard

In [2]:
# Load the indications table and keep only the compound/disease identifier and
# name columns plus the edge type. The two compound columns and the disease id
# column are renamed to the chemical_* / disease_id names used further down.
gold = (
    pd.read_csv("data/indications_slim.csv")  # comma is read_csv's default separator
    .loc[:, ["compound_umlscui", "compound_name", "disease_umlscui", "disease_name", "etype"]]
    .rename(
        columns={
            "compound_umlscui": "chemical_id",
            "compound_name": "chemical_name",
            "disease_umlscui": "disease_id",
        }
    )
)

In [3]:
# (rows, columns) of the gold standard table after column selection and renaming
gold.shape

(8176, 5)

In [4]:
# Preview the first rows of the gold standard table
gold.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype
0,C3661315,rucaparib,C1299247,ovarian cancer,TREATS_CDtDO
1,C4044947,baricitinib,C0003873,rheumatoid arthritis,TREATS_CDtDO
2,C0520442,acetyldigitoxin,C0004238,Atrial fibrillation,TREATS_CDtDO
3,C0520442,acetyldigitoxin,C0018802,Congestive heart failure,TREATS_CDtDO
4,C0771809,acexamic acid,C0037299,Skin ulcer,TREATS_CDtDO


---

## Read all nodes

In [5]:
# Load the node table and give the `:ID` / `:LABEL` header columns plain names
# (node_id / ntype); any other column is left untouched.
nodes = pd.read_csv("data/nodes_7_metanode_slim.csv").rename(
    columns={":ID": "node_id", ":LABEL": "ntype"}
)

In [6]:
# (rows, columns) of the node table
nodes.shape

(214590, 3)

In [7]:
# Preview the first rows of the node table
nodes.head()

Unnamed: 0,node_id,name,ntype
0,C0016192,Flagella,Anatomy
1,C0230349,Cubital fossa,Anatomy
2,C0447417,Entire retromolar area of mouth,Anatomy
3,C0033151,Primitive Gut,Anatomy
4,C0225861,Left auricular appendage,Anatomy


---

## Filter gold standard

In [8]:
# Keep only gold standard indications whose chemical AND disease identifiers
# both exist as nodes in the network.
#
# The node ids are deduplicated before merging: an inner merge against a key
# column with repeated values emits one output row per repeat and would
# silently inflate the gold standard. When node ids are already unique this
# is a no-op, so the result is unchanged.
#
# NOTE(review): membership is checked against every node regardless of `ntype`,
# so a chemical id that matches a non-chemical node still passes — confirm
# that this is intended.
known_node_ids = nodes[["node_id"]].drop_duplicates()

subg = (gold
    .merge(
        known_node_ids, how="inner",
        left_on="chemical_id", right_on="node_id"
    )
    .drop("node_id", axis=1)  # helper key column added by the chemical match
    .merge(
        known_node_ids, how="inner",
        left_on="disease_id", right_on="node_id"
    )
    .drop("node_id", axis=1)  # helper key column added by the disease match
)

In [9]:
# Preview the first rows of the filtered gold standard
subg.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype
0,C0520442,acetyldigitoxin,C0004238,Atrial fibrillation,TREATS_CDtDO
1,C0033497,propranolol,C0004238,Atrial fibrillation,TREATS_CDtDO
2,C0012265,digoxin,C0004238,Atrial fibrillation,TREATS_CDtDO
3,C0084273,quinidine,C0004238,Atrial fibrillation,TREATS_CDtDO
4,C0766326,dronedarone,C0004238,Atrial fibrillation,TREATS_CDtDO


In [10]:
# (rows, columns) of the gold standard after filtering against the node table
subg.shape

(6329, 5)

In [11]:
# Number of distinct chemical ids in the filtered gold standard
subg["chemical_id"].nunique()

1618

In [12]:
# Number of distinct disease ids in the filtered gold standard
subg["disease_id"].nunique()

963

In [13]:
# Sanity check: every chemical and disease id kept in `subg` is a known node id.
set(nodes["node_id"]).issuperset(set(subg["chemical_id"]) | set(subg["disease_id"]))

True

---

## Examine whether gold standard edges are in the network

In [14]:
# Load the tab-separated edge list and rename the `:START_ID` / `:END_ID` /
# `:TYPE` header columns to source_id / target_id / etype.
edges = pd.read_csv("data/edges_clean.tsv", sep="\t").rename(
    columns={":START_ID": "source_id", ":END_ID": "target_id", ":TYPE": "etype"}
)

In [15]:
# (rows, columns) of the full edge table as loaded
edges.shape

(10230594, 3)

In [16]:
# Preview the first rows of the edge table
edges.head()

Unnamed: 0,source_id,target_id,etype
0,C0000039,C0001026,REGULATES_CDreg>CD
1,C0000039,C0001041,REGULATES_CDreg>CD
2,C0000039,C0001492,REGULATES_CDreg>CD
3,C0000039,C0001511,TREATS_CDtDO
4,C0000039,C0001962,REGULATES_CDreg>CD


### Remove existing treats edges in the network

In [17]:
# Drop every edge typed TREATS_CDtDO; edges of all other types are kept.
edges = edges.loc[edges["etype"] != "TREATS_CDtDO"]

In [18]:
# (rows, columns) of the edge table after the TREATS_CDtDO edges were removed
edges.shape

(9646780, 3)

---

## Are gold standard treats pairs linked by any other relation in the network?

In [19]:
# Look up the gold standard (chemical, disease) pairs in `edges`, with the
# chemical as the edge source and the disease as the edge target.
edges.merge(
    subg[["chemical_id", "disease_id"]].rename(
        columns={"chemical_id": "source_id", "disease_id": "target_id"}
    ),
    on=["source_id", "target_id"],
    how="inner",
)

Unnamed: 0,source_id,target_id,etype
0,C0011134,C0021572,REGULATES_CDreg>CD
1,C0011134,C0021572,RELATED_TO_CDrtCD
2,C0016410,C0032961,AUGMENTS_CDagPS
3,C0016410,C0032961,DISRUPTS_CDdsPS


In [20]:
# Same lookup in the opposite direction: the disease is the edge source and
# the chemical is the edge target.
edges.merge(
    subg[["chemical_id", "disease_id"]].rename(
        columns={"chemical_id": "target_id", "disease_id": "source_id"}
    ),
    on=["source_id", "target_id"],
    how="inner",
)

Unnamed: 0,source_id,target_id,etype
0,C0021572,C0011134,RELATED_TO_CDrtCD
1,C1265175,C0663241,PRODUCES_LBpdCD


---

## Save to file

In [21]:
# Write the filtered gold standard as a tab-separated file, without the index column.
subg.to_csv("data/filtered_goldstd.tsv", index=False, sep="\t")

In [22]:
# Write the edge table (as currently held in `edges`) as a tab-separated file,
# without the index column.
edges.to_csv("data/edges_no_treats.tsv", index=False, sep="\t")