# Convert Hetionet to UMLS space

## Workflow

1. Run Daniel's `integrate.ipynb` to get the `data/hetnet.json.bz2` file. All edges from PharmacotherapyDB were excluded.
2. Run `prepare_neo4j_import_csvs.ipynb` to convert to CSV.
3. Map all IDs to UMLS CUIs wherever possible.

In [1]:
import pandas as pd
import re

from collections import defaultdict

## Read Hetionet nodes

In [2]:
# Load the Hetionet node table that was exported for Neo4j import.
#
# The ":ID" column mixes purely numeric ids (e.g. gene 100996420) with prefixed
# ids (e.g. DB00795, N0000000151). It is read as `str` explicitly so that pandas
# never infers a numeric dtype for it: the id map built later stores its source
# ids as strings, and the merge on "uid" only matches values of the same type.
#
# The redundant "identifier:string" column is dropped and the Neo4j-style
# headers are renamed to plain column names.
nodes = (pd
    .read_csv("data/hetionet/hetnet_nodes.csv", sep=',', dtype={":ID": str})
    .drop("identifier:string", axis=1)
    .rename(columns={
        ":ID": "uid",
        "name:string": "name",
        ":LABEL": "node_type"
    })
)

In [3]:
# Sanity check: (number of nodes, number of columns) after the rename.
nodes.shape

(47031, 3)

In [4]:
# Preview the first rows of the node table (uid, name, node_type).
nodes.head()

Unnamed: 0,uid,name,node_type
0,DB00795,Sulfasalazine,Compound
1,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class
2,100996420,DNM1P50,Gene
3,DB04898,Ximelagatran,Compound
4,C0278151,Facial spasm,Side Effect


## Read Hetionet edges

In [5]:
# Load the Hetionet edge table that was exported for Neo4j import.
#
# Both endpoint columns mix purely numeric ids (e.g. 8568) with prefixed ids
# (e.g. GO:0042254, UBERON:0002450). With the default `low_memory=True`, pandas
# infers dtypes chunk by chunk and may leave such a column holding a mix of
# ints and strings. Integer ids would then fail to match the string node ids in
# the inner merges further down, and those edges would be dropped silently.
# Reading both columns as `str` rules that out.
edges = (pd
    .read_csv(
        "data/hetionet/hetnet_edges.csv", sep=',',
        dtype={":START_ID": str, ":END_ID": str}
    )
    .rename(columns={
        ":START_ID": "source_id",
        ":END_ID": "target_id",
        ":TYPE": "edge_type"
    })
)

In [6]:
# Preview the first rows of the edge table (source_id, target_id, edge_type).
edges.head()

Unnamed: 0,source_id,target_id,edge_type
0,8568,GO:0042254,PARTICIPATES_GpBP
1,UBERON:0002450,51316,EXPRESSES_AeG
2,4893,PC7_8430,PARTICIPATES_GpPW
3,UBERON:0002185,55186,EXPRESSES_AeG
4,119504,GO:0033047,PARTICIPATES_GpBP


In [7]:
# Sanity check: (number of edges, number of columns) before any id mapping.
edges.shape

(2249052, 3)

---

## Create unified ID map

In [8]:
# Accumulator for the unified id map. add_to_map appends to the parallel lists
# stored under the "orig" (source id) and "dest" (mapped id) keys.
id_map = defaultdict(list)

In [9]:
def add_to_map(df, start, stop):
    """Append one mapping table to the notebook-global `id_map`.

    Parameters
    ----------
    df : pandas.DataFrame
        A mapping table with one row per (source id, mapped id) pair.
    start : str
        Name of the column holding the source ids. Values are converted to
        `str` before being stored.
    stop : str
        Name of the column holding the mapped ids. Values are stored as-is.

    Returns
    -------
    None
        The function works purely by side effect: it extends
        `id_map["orig"]` and `id_map["dest"]` in place, so calling it twice
        with the same table stores every pair twice.
    """
    orig_ids = id_map["orig"]
    orig_ids.extend([str(raw_id) for raw_id in df[start]])

    mapped_ids = id_map["dest"]
    mapped_ids.extend(df[stop])

---

In [10]:
# Register the UBERON id -> CUI pairs in the unified id map.
add_to_map(pd.read_csv("maps/uberon.tsv", sep='\t'), "uberon_id", "cui")

In [11]:
# Register the DrugBank id -> CUI pairs in the unified id map.
add_to_map(pd.read_csv("maps/drugbank.tsv", sep='\t'), "drugbank_id", "cui")

In [12]:
# Register the DOID -> CUI pairs in the unified id map.
add_to_map(pd.read_csv("maps/doid_to_cui.tsv", sep='\t'), "doid", "cui")

In [13]:
# Register the Entrez gene id -> CUI pairs in the unified id map.
#
# "entrez_id" is read as `str` on purpose. If pandas inferred a numeric dtype
# and the column contained any missing value, the ids would become floats and
# add_to_map's str() conversion would store e.g. "8568.0", which never matches
# a node id such as "8568".
add_to_map(
    pd.read_csv("maps/gene_map.tsv", sep='\t', dtype={"entrez_id": str}),
    "entrez_id", "cui"
)

In [14]:
# Register the leftover code -> CUI pairs in the unified id map.
add_to_map(pd.read_csv("maps/remaining.tsv", sep='\t'), "code", "cui")

---

In [15]:
# Turn the accumulated lists into a two-column mapping table.
# The columns are named explicitly so that their order does not depend on dict
# ordering and so that both exist even when no mapping rows were collected.
#
# NOTE: this rebinds `id_map` from the dict of lists to a DataFrame. The
# add_to_map cells above can therefore not be re-run after this cell unless the
# `id_map = defaultdict(list)` cell is run again first.
id_map = pd.DataFrame(id_map, columns=["orig", "dest"])

In [16]:
# Sanity check: total number of (orig, dest) pairs collected from all map files.
id_map.shape

(545080, 2)

In [17]:
# Preview the mapping table; one source id may appear on several rows.
id_map.head()

Unnamed: 0,dest,orig
0,UMLS:C0007874,UBERON:0000002
1,UMLS:C0028429,UBERON:0000004
2,UMLS:C0458561,UBERON:0000004
3,UMLS:C0022131,UBERON:0000006
4,UMLS:C0032005,UBERON:0000007


---

## Perform mapping

In [18]:
def is_cui(s):
    """Return True if `s` is a bare CUI: "C" followed by exactly seven digits.

    Parameters
    ----------
    s : object
        The candidate id. Anything that is not a `str` (e.g. NaN from a missing
        value, or an integer id) is reported as not being a CUI instead of
        raising `TypeError`.

    Returns
    -------
    bool
        True only for strings such as "C0278151". Already-prefixed ids
        ("UMLS:C0278151") and strings with trailing characters, including a
        trailing newline, return False.
    """
    # fullmatch (rather than match with "^...$") is used because "$" also
    # matches just before a trailing newline, which would accept "C0278151\n".
    return isinstance(s, str) and re.fullmatch(r'C[0-9]{7}', s) is not None

In [19]:
# Attach the final id ("fuid") to every node.
#
# - The left merge keeps nodes that have no entry in `id_map`; a node with
#   several mapped ids yields one row per mapped id.
# - "fuid" is the mapped id when there is one, otherwise the node's own id.
#   Ids that are bare CUIs (see is_cui) get the "UMLS:" prefix so they look
#   like the mapped ids; every other id is kept unchanged.
# - drop_duplicates() removes repeated rows, which would otherwise appear
#   whenever the same (orig, dest) pair occurs more than once in `id_map`.
#   The edge table is deduplicated the same way.
hnodes = (nodes
    .merge(id_map, how="left", left_on="uid", right_on="orig")
    .assign(
        fuid = lambda df: df["dest"].fillna(df["uid"]).map(
            lambda v: "UMLS:{}".format(v) if is_cui(v) else v
        )
    )
    .drop(["orig", "dest"], axis=1)
    .drop_duplicates()
)

In [20]:
# Sanity check. The row count can exceed that of `nodes` because the left merge
# emits one row per (node, mapped id) pair.
hnodes.shape

(58700, 4)

In [21]:
# Preview the mapped nodes; "fuid" holds the final id for each row.
hnodes.head()

Unnamed: 0,uid,name,node_type,fuid
0,DB00795,Sulfasalazine,Compound,UMLS:C0036078
1,DB00795,Sulfasalazine,Compound,UMLS:C0699547
2,DB00795,Sulfasalazine,Compound,UMLS:C4255898
3,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class,UMLS:C2757005
4,100996420,DNM1P50,Gene,100996420


---

## Map Hetionet edges to new IDs

In [22]:
# Rewrite both endpoints of every edge to the final node ids.
#
# The uid -> fuid lookup taken from `hnodes` is renamed to line up with the
# endpoint column it replaces, then inner-joined on that column: once for the
# source side, once for the target side. An edge whose endpoint maps to several
# final ids is repeated once per combination; an edge whose endpoint is absent
# from `hnodes` is dropped by the inner join. Exact duplicate rows are removed
# at the end.
hedges = (edges
    .merge(
        hnodes[["uid", "fuid"]].rename(
            columns={"uid": "source_id", "fuid": "source_uid"}
        ),
        how="inner", on="source_id"
    )
    .drop("source_id", axis=1)

    .merge(
        hnodes[["uid", "fuid"]].rename(
            columns={"uid": "target_id", "fuid": "target_uid"}
        ),
        how="inner", on="target_id"
    )
    .drop("target_id", axis=1)

    .drop_duplicates()
)

In [23]:
# Preview the remapped edges (edge_type, source_uid, target_uid).
hedges.head()

Unnamed: 0,edge_type,source_uid,target_uid
0,PARTICIPATES_GpBP,UMLS:C1857700,UMLS:C1149564
1,PARTICIPATES_GpBP,UMLS:C1857700,UMLS:C1156140
2,PARTICIPATES_GpBP,UMLS:C1419751,UMLS:C1149564
3,PARTICIPATES_GpBP,UMLS:C1419751,UMLS:C1156140
4,PARTICIPATES_GpBP,UMLS:C1335641,UMLS:C1149564


In [24]:
# Sanity check. The row count can exceed that of `edges` because an endpoint
# with several mapped ids multiplies the edges it takes part in.
hedges.shape

(3434083, 3)

---

## Save to file

In [25]:
# Write the remapped node and edge tables as tab-separated files, without the
# pandas index column. The "hetionet/" directory is expected to exist already.
hnodes.to_csv(path_or_buf="hetionet/hetio_nodes.tsv", index=False, sep='\t')
hedges.to_csv(path_or_buf="hetionet/hetio_edges.tsv", index=False, sep='\t')