# Filter the semmeddb gold standard for use with hetionet

In [1]:
import pandas as pd

## Read hetionet to umls mapping

In [2]:
# Load the tab-separated hetionet node table; its `cui` column is what the
# gold standard identifiers are matched against further down.
hmap = pd.read_csv("../../merge/hetionet/hetio_nodes.tsv", sep='\t')

In [3]:
# (rows, columns) of the hetionet mapping table
hmap.shape

(58700, 4)

In [4]:
# Preview the mapping. One hetio_id can appear on several rows with different
# `cui` values, and for Gene rows `cui` holds a bare id without a "UMLS:" prefix.
hmap.head()

Unnamed: 0,hetio_id,name,het_type,cui
0,DB00795,Sulfasalazine,Compound,UMLS:C0036078
1,DB00795,Sulfasalazine,Compound,UMLS:C0699547
2,DB00795,Sulfasalazine,Compound,UMLS:C4255898
3,N0000000151,Histamine H2 Receptor Antagonists,Pharmacologic Class,UMLS:C2757005
4,100996420,DNM1P50,Gene,100996420


## Read semmeddb gold standard

In [5]:
# Load the gold standard indications and discard the two *_uid columns,
# which nothing later in this notebook reads.
gold_path = "../data/merged_gold.tsv"
gold = pd.read_csv(gold_path, sep='\t').drop(
    columns=["chemical_uid", "disease_uid"]
)

In [6]:
# (rows, columns) of the gold standard after dropping the *_uid columns
gold.shape

(6329, 5)

In [7]:
# Preview the gold standard: one chemical/disease indication per row
gold.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype
0,UMLS:C0520442,acetyldigitoxin,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO
1,UMLS:C0033497,propranolol,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO
2,UMLS:C0012265,digoxin,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO
3,UMLS:C0084273,quinidine,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO
4,UMLS:C0766326,dronedarone,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO


---

## Filter down to existing hetio nodes

In [8]:
# Every identifier in the mapping's `cui` column, as a set for membership tests
hcuis = set(hmap["cui"])

In [9]:
# Keep an indication only when both of its endpoints have a hetionet mapping.
chem_is_mapped = gold["chemical_id"].isin(hcuis)
dise_is_mapped = gold["disease_id"].isin(hcuis)
hgold = gold[chem_is_mapped & dise_is_mapped]

In [10]:
# (rows, columns) of the indications that survive the hetionet filter
hgold.shape

(2924, 5)

In [11]:
# Preview the filtered indications; the original row labels are retained,
# so gaps in the index mark rows that were filtered out.
hgold.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype
0,UMLS:C0520442,acetyldigitoxin,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO
1,UMLS:C0033497,propranolol,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO
2,UMLS:C0012265,digoxin,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO
4,UMLS:C0766326,dronedarone,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO
5,UMLS:C0116569,esmolol,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO


---

## Convert to hetio ids

In [12]:
def _attach_hetio_ids(indications, on, id_col, type_col):
    """Inner-join the hetionet mapping onto `indications` via the `on` column.

    The mapping's `hetio_id` and `het_type` columns are returned under the
    names `id_col` and `type_col`; the join key `cui` is dropped again.
    Because this is a plain inner join, an identifier that matches several
    mapping rows produces one output row per match.
    """
    id_lookup = hmap.drop(columns="name")
    joined = indications.merge(
        id_lookup, how="inner", left_on=on, right_on="cui"
    )
    return joined.drop(columns="cui").rename(
        columns={"hetio_id": id_col, "het_type": type_col}
    )


# Translate the chemical side first, then the disease side.
fgold = (hgold
    .pipe(
        _attach_hetio_ids, on="chemical_id",
        id_col="chemical_hetid", type_col="chemical_htype"
    )
    .pipe(
        _attach_hetio_ids, on="disease_id",
        id_col="disease_hetid", type_col="disease_htype"
    )
)

In [13]:
# (rows, columns) after attaching hetionet ids; compare the row count with hgold
fgold.shape

(3716, 9)

In [14]:
# Preview the indications with their hetionet ids and node types attached
fgold.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype,chemical_hetid,chemical_htype,disease_hetid,disease_htype
0,UMLS:C0520442,acetyldigitoxin,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,DB00511,Compound,C0004238,Side Effect
1,UMLS:C0033497,propranolol,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,DB00571,Compound,C0004238,Side Effect
2,UMLS:C0012265,digoxin,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,DB00390,Compound,C0004238,Side Effect
3,UMLS:C0766326,dronedarone,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,DB04855,Compound,C0004238,Side Effect
4,UMLS:C0116569,esmolol,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,DB00187,Compound,C0004238,Side Effect


In [15]:
# Number of distinct (chemical_id, disease_id) pairs left after the id mapping
fgold[["chemical_id", "disease_id"]].drop_duplicates().shape[0]

2924

Converting the gold standard into hetionet id space expands it. We started with 2924 unique gold standard chemical/disease pairs, but because a single identifier can map to more than one hetionet node, we now have 3716 hetionet chemical/disease pairs. This is a 27% expansion, which would inflate our gold standard if left uncorrected.

In [17]:
# Rows whose (chemical_id, disease_id) pair occurs more than once in fgold
is_repeated_pair = fgold.duplicated(["chemical_id", "disease_id"], keep=False)
a = fgold[is_repeated_pair]

In [19]:
# Sort so that rows belonging to the same pair sit next to each other
a.sort_values(["chemical_id", "disease_id"]).head(20)

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype,chemical_hetid,chemical_htype,disease_hetid,disease_htype
466,UMLS:C0000477,fampridine,UMLS:C0026769,multiple sclerosis,TREATS_CDtDO,DB06637,Compound,DOID:2377,Disease
467,UMLS:C0000477,fampridine,UMLS:C0026769,multiple sclerosis,TREATS_CDtDO,DB06637,Compound,C0026769,Side Effect
3613,UMLS:C0000545,eicosapentaenoic acid,UMLS:C0032961,"Pregnancy, function",TREATS_CDtDO,DB00159,Compound,GO:0007565,Biological Process
3614,UMLS:C0000545,eicosapentaenoic acid,UMLS:C0032961,"Pregnancy, function",TREATS_CDtDO,DB00159,Compound,C0032961,Side Effect
1646,UMLS:C0000970,paracetamol,UMLS:C0003862,Joint pain,TREATS_CDtDO,DB00316,Compound,D018771,Symptom
1647,UMLS:C0000970,paracetamol,UMLS:C0003862,Joint pain,TREATS_CDtDO,DB00316,Compound,C0003862,Side Effect
1743,UMLS:C0000970,paracetamol,UMLS:C0004604,Backache,TREATS_CDtDO,DB00316,Compound,C0004604,Side Effect
1744,UMLS:C0000970,paracetamol,UMLS:C0004604,Backache,TREATS_CDtDO,DB00316,Compound,D001416,Symptom
1703,UMLS:C0000970,paracetamol,UMLS:C0015967,Fever,TREATS_CDtDO,DB00316,Compound,GO:0001660,Biological Process
1704,UMLS:C0000970,paracetamol,UMLS:C0015967,Fever,TREATS_CDtDO,DB00316,Compound,D005334,Symptom


---

### How many duplicates per unique drugcentral indication?

In [16]:
# Rows per (chemical_id, disease_id) pair, then how many pairs have each row count
fgold.groupby(["chemical_id", "disease_id"]).size().value_counts()

1    2178
2     708
3      30
4       8
dtype: int64

---

## How to rank the duplicates?

In [17]:
# Frequency of each hetionet node type on the chemical side
fgold["chemical_htype"].value_counts()

Compound               3677
Pharmacologic Class      39
Name: chemical_htype, dtype: int64

In [18]:
# Frequency of each hetionet node type on the disease side
fgold["disease_htype"].value_counts()

Side Effect           2819
Disease                522
Symptom                355
Biological Process      20
Name: disease_htype, dtype: int64

### We will rank the semantic types by frequency and, within each set of duplicates, keep the mapping whose types are the most common

In [19]:
def _frequency_ranks(htypes, weight=1):
    """Rank the labels in `htypes` from rarest (rank 0) to most common.

    Returns a list of `(weight * rank, label)` tuples. A stable sort is used
    so that labels with equal counts keep the order `value_counts` reports
    them in, which makes the ranking reproducible between runs.
    """
    rarest_first = htypes.value_counts().sort_values(kind="mergesort").index
    return [(weight * rank, label) for rank, label in enumerate(rarest_first)]


# Chemical ranks are scaled by 10 and disease ranks are left unscaled, so the
# sum of the two identifies a (chemical type, disease type) combination
# uniquely only while every disease rank stays below 10.
a = _frequency_ranks(fgold["chemical_htype"], weight=10)
b = _frequency_ranks(fgold["disease_htype"])

assert len(b) <= 10, "disease ranks would collide with the chemical weight of 10"

# higher score is better
scores = pd.DataFrame(a + b, columns=["score", "htype"])

In [20]:
# Show the score assigned to each node type (chemical types first, then disease types)
scores

Unnamed: 0,score,htype
0,0,Pharmacologic Class
1,10,Compound
2,0,Biological Process
3,1,Symptom
4,2,Disease
5,3,Side Effect


### add in scores to the indications

In [21]:
# Look the scores up through a Series keyed by node type instead of merging.
# verify_integrity makes this fail loudly if a type label is listed twice in
# `scores`; a merge would silently duplicate indication rows in that case.
score_by_htype = scores.set_index("htype", verify_integrity=True)["score"]

temp = fgold.assign(
    chem_score=fgold["chemical_htype"].map(score_by_htype),
    dise_score=fgold["disease_htype"].map(score_by_htype),
    # combined score used to choose between duplicate mappings
    fscore=lambda df: df["chem_score"] + df["dise_score"],
)

In [22]:
# Preview the indications with chem_score, dise_score and their sum fscore
temp.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype,chemical_hetid,chemical_htype,disease_hetid,disease_htype,chem_score,dise_score,fscore
0,UMLS:C0520442,acetyldigitoxin,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,DB00511,Compound,C0004238,Side Effect,10,3,13
1,UMLS:C0033497,propranolol,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,DB00571,Compound,C0004238,Side Effect,10,3,13
2,UMLS:C0012265,digoxin,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,DB00390,Compound,C0004238,Side Effect,10,3,13
3,UMLS:C0766326,dronedarone,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,DB04855,Compound,C0004238,Side Effect,10,3,13
4,UMLS:C0116569,esmolol,UMLS:C0004238,Atrial fibrillation,TREATS_CDtDO,DB00187,Compound,C0004238,Side Effect,10,3,13


### Check that the scores within each set of duplicates are unique

In [23]:
# Within each (chemical_id, disease_id) pair every row must carry a distinct
# fscore; otherwise picking "the highest score" for that pair is ambiguous.
pair_keys = ["chemical_id", "disease_id"]
fscore_by_pair = temp.groupby(pair_keys)["fscore"]

# nunique() ignores missing values, so a row without a score also fails here
scores_are_unique = bool(
    (fscore_by_pair.size() == fscore_by_pair.nunique()).all()
)

# Enforce the invariant rather than only displaying it, so a full re-run
# stops here if the data changes and ties appear.
assert scores_are_unique, "duplicate fscore within a chemical/disease pair"
scores_are_unique

True

## Take the hetio id pair with the highest score

In [24]:
# For every (chemical_id, disease_id) pair keep the single row with the
# highest fscore. Sorting by the pair and then by score places each pair's
# best row last, so drop_duplicates(keep="last") selects it. This avoids a
# Python-level groupby.apply and keeps the key columns as ordinary columns.
# The result is ordered by chemical_id, then disease_id.
pair_keys = ["chemical_id", "disease_id"]
res = (temp
    .sort_values(pair_keys + ["fscore"])
    .drop_duplicates(subset=pair_keys, keep="last")
    .reset_index(drop=True)
)

In [25]:
# (rows, columns) after keeping one row per chemical/disease pair
res.shape

(2924, 12)

In [26]:
# Preview the deduplicated indications
res.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype,chemical_hetid,chemical_htype,disease_hetid,disease_htype,chem_score,dise_score,fscore
0,UMLS:C0000477,fampridine,UMLS:C0026769,multiple sclerosis,TREATS_CDtDO,DB06637,Compound,C0026769,Side Effect,10,3,13
1,UMLS:C0000545,eicosapentaenoic acid,UMLS:C0032961,"Pregnancy, function",TREATS_CDtDO,DB00159,Compound,C0032961,Side Effect,10,3,13
2,UMLS:C0000618,mercaptopurine,UMLS:C0023449,acute lymphocytic leukemia,TREATS_CDtDO,DB01033,Compound,C0023449,Side Effect,10,3,13
3,UMLS:C0000618,mercaptopurine,UMLS:C0023487,acute promyelocytic leukemia,TREATS_CDtDO,DB01033,Compound,C0023487,Side Effect,10,3,13
4,UMLS:C0000956,acenocoumarol,UMLS:C0034065,Pulmonary embolism,TREATS_CDtDO,DB01418,Compound,C0034065,Side Effect,10,3,13


In [27]:
# Sanity check: every (chemical_id, disease_id) pair should now occur exactly once
res.groupby(["chemical_id", "disease_id"]).size().value_counts()

1    2924
dtype: int64

In [28]:
# Distribution of the winning scores across the retained rows
res["fscore"].value_counts()

13    2789
12     117
11      11
3        7
Name: fscore, dtype: int64

---

## Final gold standard for use with hetionet and deepwalk

In [29]:
# The score columns were only needed to pick a winner; remove them again.
score_columns = ["chem_score", "dise_score", "fscore"]
fres = res.drop(columns=score_columns)

In [30]:
# (rows, columns) of the final gold standard
fres.shape

(2924, 9)

In [31]:
# Preview the final gold standard that is written to disk
fres.head()

Unnamed: 0,chemical_id,chemical_name,disease_id,disease_name,etype,chemical_hetid,chemical_htype,disease_hetid,disease_htype
0,UMLS:C0000477,fampridine,UMLS:C0026769,multiple sclerosis,TREATS_CDtDO,DB06637,Compound,C0026769,Side Effect
1,UMLS:C0000545,eicosapentaenoic acid,UMLS:C0032961,"Pregnancy, function",TREATS_CDtDO,DB00159,Compound,C0032961,Side Effect
2,UMLS:C0000618,mercaptopurine,UMLS:C0023449,acute lymphocytic leukemia,TREATS_CDtDO,DB01033,Compound,C0023449,Side Effect
3,UMLS:C0000618,mercaptopurine,UMLS:C0023487,acute promyelocytic leukemia,TREATS_CDtDO,DB01033,Compound,C0023487,Side Effect
4,UMLS:C0000956,acenocoumarol,UMLS:C0034065,Pulmonary embolism,TREATS_CDtDO,DB01418,Compound,C0034065,Side Effect


## save gold to file

In [32]:
# Write a tab-separated file into the current working directory, without the row index
fres.to_csv("filtered_semmed_gold_for_hetionet.tsv", sep='\t', index=False)