# Map all remaining IDs to CUI space

All remaining categories of terms use MeSH, NDFRT, or Gene Ontology identifiers, so this notebook maps those identifiers to UMLS CUIs. The "Living Beings" category in the SemMedDB network cannot be resolved because Hetionet has no corresponding concept type.

In [1]:
import pandas as pd

from collections import defaultdict

from tqdm import tqdm

---

In [2]:
def read_umls(fname, sources=("GO", "MSH", "NDFRT"), total=13897048, progress=True):
    """Extract (CUI, code, source) triples from a UMLS MRCONSO.RRF file.

    Every non-blank line is split on "|"; field 0 is taken as the CUI,
    field 11 as the source abbreviation and field 13 as the code within
    that source. Only rows whose source abbreviation is in `sources` are kept.

    Parameters
    ----------
    fname : str
        Path to the MRCONSO.RRF file.
    sources : str or iterable of str, optional
        Source abbreviation(s) to keep. Defaults to GO, MSH and NDFRT.
    total : int or None, optional
        Expected number of lines, used only to scale the progress bar.
        The default is the line count of the file this notebook was run
        against; pass None if the count is unknown.
    progress : bool, optional
        Wrap the file in a tqdm progress bar (default True).

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows with the columns "cui", "code" and "source".
        If no row matches, an empty frame with those columns is returned.
    """
    # A lone string would otherwise be split into single characters.
    wanted = frozenset([sources] if isinstance(sources, str) else sources)

    res = defaultdict(list)
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale default, which may fail on non-ASCII term strings.
    with open(fname, "r", encoding="utf-8") as fin:
        lines = tqdm(fin, total=total) if progress else fin
        for line in lines:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            vals = line.rstrip("\n").split("|")
            cui, sab, code = vals[0], vals[11], vals[13]

            if sab in wanted:
                res["cui"].append(cui)
                res["code"].append(code)
                res["source"].append(sab)

    if not res:
        # Nothing matched: still hand back the expected columns so that
        # downstream column lookups and sorts do not raise KeyError.
        return pd.DataFrame(columns=["cui", "code", "source"])

    return pd.DataFrame(res).drop_duplicates()

In [3]:
# Parse the UMLS MRCONSO file into a de-duplicated frame of CUI/code/source rows.
umls = read_umls(fname="../data/MRCONSO.RRF")

100%|██████████| 13897048/13897048 [00:34<00:00, 400211.68it/s]


In [4]:
# Order the rows by CUI, then source vocabulary, then code, and renumber the
# index from zero so the sorted frame no longer carries the pre-sort labels.
res = umls.sort_values(by=["cui", "source", "code"]).reset_index(drop=True)

In [5]:
# Number of de-duplicated rows contributed by each source vocabulary.
umls["source"].value_counts()

MSH      370689
GO        71615
NDFRT     45465
Name: source, dtype: int64

In [6]:
# (rows, columns) of the sorted mapping.
res.shape

(487769, 3)

In [7]:
# Preview the first rows of the sorted mapping.
res.head()

Unnamed: 0,code,cui,source
0,D012711,C0000005,MSH
1,D015060,C0000039,MSH
2,N0000007747,C0000039,NDFRT
3,D015061,C0000052,MSH
4,N0000168345,C0000052,NDFRT


In [8]:
# Save the sorted mapping as a tab-separated file, leaving out the row index.
res.to_csv(
    "../maps/remaining.tsv",
    index=False,
    sep="\t",
)