# Remove treats edges from SemmedDB

2018-12-05

Remove all Semrep-mined "treats" edges (edge type `TREATS_CDtDO`) from SemmedDB.

In [1]:
from pathlib import Path

import pandas as pd

## Read source files

### Read nodes

In [2]:
# Load the slim node table and give the colon-prefixed header columns
# (":ID", ":LABEL") the plain names used in the rest of the notebook.
raw_nodes = pd.read_csv(
    "data/raw/nodes_7_metanode_slim.csv",
    sep=",",
).rename(columns={":ID": "node_id", ":LABEL": "ntype"})

In [3]:
# (n_rows, n_columns) of the raw node table.
raw_nodes.shape

(214590, 3)

In [4]:
# Preview the first rows of the raw node table.
raw_nodes.head()

Unnamed: 0,node_id,name,ntype
0,C0016192,Flagella,Anatomy
1,C0230349,Cubital fossa,Anatomy
2,C0447417,Entire retromolar area of mouth,Anatomy
3,C0033151,Primitive Gut,Anatomy
4,C0225861,Left auricular appendage,Anatomy


### Read edges

In [5]:
# Load the slim edge table, then rename the colon-prefixed header columns
# (":START_ID", ":END_ID", ":TYPE") to the names used below.
raw_edges = pd.read_csv("data/raw/edges_7_metanode_slim.csv", sep=",")
raw_edges = raw_edges.rename(
    columns={
        ":START_ID": "source_id",
        ":END_ID": "target_id",
        ":TYPE": "edge_type",
    }
)

In [6]:
# (n_rows, n_columns) of the raw edge table.
raw_edges.shape

(10230594, 5)

In [7]:
# Preview the first rows of the raw edge table.
raw_edges.head()

Unnamed: 0,source_id,target_id,edge_type,pmids,n_pmids
0,C0000039,C0001026,REGULATES_CDreg>CD,{12832447},1
1,C0000039,C0001041,REGULATES_CDreg>CD,"{2725816, 3601005}",2
2,C0000039,C0001492,REGULATES_CDreg>CD,{6627084},1
3,C0000039,C0001511,TREATS_CDtDO,"{11117308, 16398821}",2
4,C0000039,C0001962,REGULATES_CDreg>CD,{7619843},1


---

## Look at most common edge labels

In [8]:
# Number of edges per edge type, most frequent first; head() keeps the top five.
raw_edges["edge_type"].value_counts().head()

LOCATION_OF_AloCD         996348
REGULATES_CDreg>CD        814972
INTERACTS_WITH_CDiwG      705394
ASSOCIATED_WITH_DOawDO    690653
TREATS_CDtDO              583814
Name: edge_type, dtype: int64

## Drop TREATS edges from the network

We don't want TREATS edges mined by Semrep to influence the drug predictions.

In [9]:
# Keep every edge whose edge_type is anything other than 'TREATS_CDtDO'.
edges = raw_edges.loc[raw_edges["edge_type"].ne("TREATS_CDtDO")]

In [10]:
# Shape after the filter; the row count can be compared with raw_edges.shape.
edges.shape

(9646780, 5)

In [11]:
# Preview the filtered edges (the original row labels are kept, so gaps may appear).
edges.head()

Unnamed: 0,source_id,target_id,edge_type,pmids,n_pmids
0,C0000039,C0001026,REGULATES_CDreg>CD,{12832447},1
1,C0000039,C0001041,REGULATES_CDreg>CD,"{2725816, 3601005}",2
2,C0000039,C0001492,REGULATES_CDreg>CD,{6627084},1
4,C0000039,C0001962,REGULATES_CDreg>CD,{7619843},1
5,C0000039,C0001975,RELATED_TO_CDrtCD,{2064754},1


---

## Use the resulting list of edges to generate the master node list

In [12]:
# Every node id that appears as the source or target of a remaining edge.
edge_nodes = set(edges["source_id"]).union(edges["target_id"])

In [13]:
# Distinct node ids listed in the raw node table.
all_raw_nodes = set(raw_nodes["node_id"].tolist())

In [14]:
# Number of distinct node ids referenced by the remaining edges.
len(edge_nodes)

210375

In [15]:
# Number of distinct node ids in the raw node table.
len(all_raw_nodes)

214590

### After dropping treats edges, some nodes in the raw node list are disconnected from the network

We will drop these unconnected nodes.

In [16]:
# True when every node id used by an edge also exists in the raw node table.
all_raw_nodes.issuperset(edge_nodes)

True

#### The disconnected nodes:

In [17]:
# Node ids that are in the raw node table but are not an endpoint of any
# remaining edge. A descriptive name is used instead of a throwaway one so
# this set is not confused with other intermediate values in the notebook.
disconnected_nodes = all_raw_nodes - edge_nodes

# Show a few of the corresponding rows from the raw node table.
raw_nodes.query("node_id in @disconnected_nodes").head()

Unnamed: 0,node_id,name,ntype
15106,C0727304,Ora-Sweet SF,Chemicals & Drugs
15138,C0992318,famciclovir 500 MG,Chemicals & Drugs
15173,C1330257,Estradot,Chemicals & Drugs
15186,C1445605,Oral dexketoprofen,Chemicals & Drugs
15201,C1176623,rosiglitazone 4 MG,Chemicals & Drugs


## Use the edge nodes as the master list of nodes

In [18]:
# Plain assignment: final_nodes refers to the same set object as edge_nodes (no copy is made).
final_nodes = edge_nodes

In [19]:
# Size of the final node set.
len(final_nodes)

210375

### Check that nodes are UMLS CUIs

In [20]:
# Rows of the raw node table whose node_id belongs to the final node set.
temp = raw_nodes[raw_nodes["node_id"].isin(final_nodes)]

In [21]:
# Preview the selected node rows.
temp.head()

Unnamed: 0,node_id,name,ntype
0,C0016192,Flagella,Anatomy
1,C0230349,Cubital fossa,Anatomy
2,C0447417,Entire retromolar area of mouth,Anatomy
3,C0033151,Primitive Gut,Anatomy
4,C0225861,Left auricular appendage,Anatomy


In [22]:
# How many selected node ids start with "C" (True) versus not (False).
temp["node_id"].str.startswith("C").value_counts()

True     210368
False         7
Name: node_id, dtype: int64

In [23]:
# Show the rows whose node_id does not start with "C".
temp[~temp["node_id"].str.startswith("C")]

Unnamed: 0,node_id,name,ntype
101249,DOID:12859,choreatic disease,Disorders
101695,DOID:0060073,lymphatic system cancer,Disorders
102333,DOID:0060061,cutaneous T cell lymphoma,Disorders
103707,DOID:0060119,pharynx cancer,Disorders
127328,DOID:0060668,anencephaly,Disorders
128351,DOID:0050591,tooth agenesis,Disorders
131126,DOID:0050545,visceral heterotaxy,Disorders


Not all nodes are CUIs: the seven exceptions above have `DOID:` identifiers. We will add the prefix "UMLS:" to the ids which are CUIs and leave the others unchanged.

---

## Format final node list

In [24]:
# Build the final node table:
#   1. keep only the rows whose node_id is in the final node set,
#   2. prepend "UMLS:" to every id that starts with "C"; ids that do not
#      start with "C" are left unchanged,
#   3. sort by the rewritten id and renumber the index from 0.
# The prefixing uses vectorised string operations (.str.startswith + mask)
# rather than calling a Python lambda once per row.
good_nodes = (raw_nodes
    .query("node_id in @final_nodes")
    .assign(
        node_id = lambda df: df["node_id"].mask(
            df["node_id"].str.startswith("C"),
            "UMLS:" + df["node_id"]
        )
    )
    .sort_values("node_id")
    .reset_index(drop=True)
)

In [25]:
# Row count equals len(final_nodes) when each final node id has exactly one row in raw_nodes.
good_nodes.shape

(210375, 3)

In [26]:
# Preview the formatted node table (sorted by node_id).
good_nodes.head()

Unnamed: 0,node_id,name,ntype
0,DOID:0050545,visceral heterotaxy,Disorders
1,DOID:0050591,tooth agenesis,Disorders
2,DOID:0060061,cutaneous T cell lymphoma,Disorders
3,DOID:0060073,lymphatic system cancer,Disorders
4,DOID:0060119,pharynx cancer,Disorders


In [27]:
# Number of distinct node ids; equals the row count when no id is duplicated.
good_nodes["node_id"].nunique()

210375

In [28]:
# Inspect the index produced by reset_index(drop=True).
good_nodes.index

RangeIndex(start=0, stop=210375, step=1)

In [29]:
# Count ids that now start with "UMLS:" (True) versus those that do not (False).
good_nodes["node_id"].str.startswith("UMLS:").value_counts()

True     210368
False         7
Name: node_id, dtype: int64

In [30]:
# Show the rows whose node_id does not start with "UMLS:".
good_nodes[~good_nodes["node_id"].str.startswith("UMLS:")]

Unnamed: 0,node_id,name,ntype
0,DOID:0050545,visceral heterotaxy,Disorders
1,DOID:0050591,tooth agenesis,Disorders
2,DOID:0060061,cutaneous T cell lymphoma,Disorders
3,DOID:0060073,lymphatic system cancer,Disorders
4,DOID:0060119,pharynx cancer,Disorders
5,DOID:0060668,anencephaly,Disorders
6,DOID:12859,choreatic disease,Disorders


---

## Change edge ids to also use "UMLS:" prefix

We already know from the node list that every node in the edge list is either a CUI or a DOID.

In [31]:
def _add_umls_prefix(ids):
    """Prepend "UMLS:" to every id in ``ids`` that starts with "C".

    Parameters
    ----------
    ids : pandas.Series of str
        Node identifiers.

    Returns
    -------
    pandas.Series
        Series with the same index as ``ids``. Values starting with "C"
        become "UMLS:<value>"; all other values are returned unchanged.
    """
    # Vectorised: build the prefixed version of every id, then take it only
    # where the original id starts with "C".
    return ids.mask(ids.str.startswith("C"), "UMLS:" + ids)


# Rewrite both endpoint columns with the same prefixing rule, then sort the
# edges by (source_id, target_id, edge_type) and renumber the index from 0.
good_edges = (edges
    .assign(
        source_id = lambda df: _add_umls_prefix(df["source_id"]),
        target_id = lambda df: _add_umls_prefix(df["target_id"])
    )
    .sort_values(["source_id", "target_id", "edge_type"])
    .reset_index(drop=True)
)

In [32]:
# Only ids were rewritten and rows reordered, so the shape should match edges.shape.
good_edges.shape

(9646780, 5)

In [33]:
# Preview the formatted edge table (sorted by source_id, target_id, edge_type).
good_edges.head()

Unnamed: 0,source_id,target_id,edge_type,pmids,n_pmids
0,DOID:0050545,UMLS:C0000768,ASSOCIATED_WITH_DOawDO,"{26177792, 21160705, 11723906, 22470785, 17674...",58
1,DOID:0050545,UMLS:C0000772,ASSOCIATED_WITH_DOawDO,"{23377898, 19493518}",2
2,DOID:0050545,UMLS:C0001792,AFFECTS_DOafLB,{15249896},1
3,DOID:0050545,UMLS:C0002871,ASSOCIATED_WITH_DOawDO,{5379521},1
4,DOID:0050545,UMLS:C0004245,ASSOCIATED_WITH_DOawDO,{24509635},1


## Save to disk

In [34]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing (no-op if it is already there).
Path("data/no_treats").mkdir(parents=True, exist_ok=True)

# Write the node table as tab-separated text, omitting the DataFrame index.
good_nodes.to_csv("data/no_treats/semmeddb_no_treats_nodes.tsv", sep='\t', index=False)

In [35]:
# Write the edge table as tab-separated text, omitting the DataFrame index.
good_edges.to_csv("data/no_treats/semmeddb_no_treats_edges.tsv", sep='\t', index=False)