# Convert Hetionet to UMLS space

## Workflow

1. Run Daniel's `integrate.ipynb` to generate the `data/hetnet.json.bz2` file. All edges from PharmacotherapyDB were excluded.
2. Run `prepare_neo4j_import_csvs.ipynb` to convert to CSV.
3. Map all ids to UMLS CUIs whenever possible; ids without a mapping keep their original Hetionet identifier.

In [1]:
import pandas as pd
import re

from collections import defaultdict

## Read Hetionet nodes

In [2]:
# Load the Hetionet node table. The ":ID" column mixes purely numeric ids
# (e.g. gene 100996420) with alphanumeric ones (e.g. DB00795), so it is read
# explicitly as text: the merge further down joins it against string keys,
# and an inferred numeric dtype would not line up with them.
nodes = (pd
    .read_csv(
        "data/hetionet/hetnet_nodes.csv",
        sep=',',
        dtype={":ID": str},
    )
    .drop("identifier:string", axis=1)
    .rename(columns={
        ":ID": "hetio_id",
        "name:string": "name",
        ":LABEL": "het_type"
    })
)

In [3]:
# Sanity check: (rows, columns) of the renamed node table.
nodes.shape

(47031, 3)

In [4]:
# Preview the first rows: Hetionet id, display name and node type.
nodes.head()

Unnamed: 0,hetio_id,name,het_type
0,DB00795,Sulfasalazine,Compound
1,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class
2,100996420,DNM1P50,Gene
3,DB04898,Ximelagatran,Compound
4,C0278151,Facial spasm,Side Effect


---

## Create unified ID map

In [5]:
# Accumulator for the unified id map: add_to_map appends to its "orig" and "dest" lists.
id_map = defaultdict(list)

In [6]:
def add_to_map(df, start, stop):
    """Append one id-mapping table to the global `id_map` accumulator.

    Parameters
    ----------
    df : DataFrame holding the mapping.
    start : name of the column with the source ids; each value is converted
        with `str` before being stored under id_map["orig"].
    stop : name of the column with the target ids; values are stored
        unchanged under id_map["dest"].

    Returns None. Raises KeyError if either column is missing, in which case
    `id_map` is left untouched.
    """
    # Resolve both columns before touching id_map: if `stop` is missing, the
    # KeyError must not leave "orig" longer than "dest".
    originals = [str(value) for value in df[start]]
    destinations = list(df[stop])

    id_map["orig"].extend(originals)
    id_map["dest"].extend(destinations)

---

In [7]:
# UBERON ids -> UMLS CUIs. add_to_map returns None, so the cell shows no output.
add_to_map(
    pd.read_csv("maps/uberon.tsv", sep='\t'),
    "uberon_id",
    "cui",
)

In [8]:
# DrugBank ids -> UMLS CUIs. add_to_map returns None, so the cell shows no output.
add_to_map(
    pd.read_csv("maps/drugbank.tsv", sep='\t'),
    "drugbank_id",
    "cui",
)

In [9]:
# Disease Ontology ids -> UMLS CUIs. add_to_map returns None, so the cell shows no output.
add_to_map(
    pd.read_csv("maps/doid_to_cui.tsv", sep='\t'),
    "doid",
    "cui",
)

In [10]:
# Entrez gene ids -> UMLS CUIs.
# The ids are presumably numeric (Hetionet gene ids such as 100996420 are), so
# read them as text: with dtype inference a single blank cell would turn the
# column into floats, and add_to_map would then store "1234.0" instead of "1234".
(pd
    .read_csv("maps/gene_map.tsv", sep='\t', dtype={"entrez_id": str})
    .pipe(add_to_map, "entrez_id", "cui")
)

In [11]:
# Remaining codes -> UMLS CUIs. add_to_map returns None, so the cell shows no output.
add_to_map(
    pd.read_csv("maps/remaining.tsv", sep='\t'),
    "code",
    "cui",
)

---

In [12]:
# Turn the accumulated "orig"/"dest" lists into a two-column DataFrame.
# NOTE(review): this rebinds id_map, so the add_to_map cells presumably cannot be
# re-run after this one without re-creating the defaultdict first — confirm.
id_map = pd.DataFrame(id_map)

In [13]:
# Sanity check: total number of (orig, dest) pairs collected from all map files.
id_map.shape

(545080, 2)

In [14]:
# Preview the map; one `orig` id can appear on several rows, one per `dest` CUI.
id_map.head()

Unnamed: 0,dest,orig
0,UMLS:C0007874,UBERON:0000002
1,UMLS:C0028429,UBERON:0000004
2,UMLS:C0458561,UBERON:0000004
3,UMLS:C0022131,UBERON:0000006
4,UMLS:C0032005,UBERON:0000007


---

## Perform mapping

In [15]:
def is_cui(s):
    """Return True if `s` is a bare UMLS CUI: 'C' followed by exactly seven digits.

    The whole string must match. `re.fullmatch` is used rather than
    `re.match` with `^...$` because `$` also matches just before a trailing
    newline, which would let a value such as "C0278151\\n" through.
    """
    return re.fullmatch(r'C[0-9]{7}', s) is not None

In [16]:
def _with_umls_prefix(identifier):
    """Prefix a bare CUI with 'UMLS:'; return any other identifier unchanged."""
    if is_cui(identifier):
        return "UMLS:{}".format(identifier)
    return identifier


# Left-join every node onto the id map (one output row per matching CUI),
# fall back to the original Hetionet id where nothing matched, then make sure
# ids that already are bare CUIs carry the same "UMLS:" prefix as mapped ones.
hnodes = (nodes
    .merge(id_map, how="left", left_on="hetio_id", right_on="orig")
    .assign(
        cui = lambda df: df["dest"].fillna(df["hetio_id"]).map(_with_umls_prefix)
    )
    .drop(["orig", "dest"], axis=1)
)

In [17]:
# Row count can exceed nodes.shape[0]: the left merge emits one row per matching id_map entry.
hnodes.shape

(58700, 4)

In [18]:
# Preview the result: mapped nodes carry a "UMLS:"-prefixed CUI, unmapped ones keep their Hetionet id.
hnodes.head()

Unnamed: 0,hetio_id,name,het_type,cui
0,DB00795,Sulfasalazine,Compound,UMLS:C0036078
1,DB00795,Sulfasalazine,Compound,UMLS:C0699547
2,DB00795,Sulfasalazine,Compound,UMLS:C4255898
3,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class,UMLS:C2757005
4,100996420,DNM1P50,Gene,100996420


## Save to file

In [19]:
# Write the mapped node table as tab-separated text, without the index column.
# NOTE(review): the input was read from data/hetionet/ but this writes to hetionet/ —
# confirm the target directory is the intended one.
hnodes.to_csv("hetionet/hetio_nodes.tsv", sep='\t', index=False)