# Map genes between UMLS CUIs and Entrez Gene IDs using HGNC

In [1]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

In [2]:
def parse_hgnc(fname="../data/hgnc_complete_set.txt"):
    """Load the HGNC gene table and keep only genes with an Entrez Gene id.

    Parameters
    ----------
    fname : str or path-like, optional
        Tab-separated HGNC file containing at least the columns
        ``hgnc_id``, ``symbol``, ``status`` and ``entrez_id``.
        Defaults to the path this notebook reads from.

    Returns
    -------
    pandas.DataFrame
        Columns ``hgnc_id``, ``symbol``, ``status``, ``entrez_id`` (in that
        order). Rows whose ``entrez_id`` is missing are dropped and the
        remaining ids are cast to ``int``. The row index of the input file
        is kept as-is (it is not renumbered after dropping rows).
    """
    columns = ["hgnc_id", "symbol", "status", "entrez_id"]

    return (pd
        # usecols: only parse the four columns we keep instead of the whole file
        .read_csv(fname, sep='\t', usecols=columns, low_memory=False)
        # read_csv returns usecols in file order; re-select to fix the order
        [columns]
        # drops any HGNC genes with missing Entrez ids
        .dropna(axis=0, how="any", subset=["entrez_id"])
        # missing values force a float column on read; safe to cast once they are gone
        .assign(entrez_id = lambda df: df["entrez_id"].astype(int))
    )

In [3]:
# Load the HGNC table, restricted to genes that have an Entrez Gene id.
hgnc = parse_hgnc()

In [4]:
# Preview the first rows of the parsed HGNC table.
hgnc.head()

Unnamed: 0,hgnc_id,symbol,status,entrez_id
0,HGNC:5,A1BG,Approved,1
1,HGNC:37133,A1BG-AS1,Approved,503538
2,HGNC:24086,A1CF,Approved,29974
3,HGNC:7,A2M,Approved,2
4,HGNC:27057,A2M-AS1,Approved,144571


In [5]:
# (rows, columns) of the HGNC table after dropping genes without an Entrez id.
hgnc.shape

(41203, 4)

In [6]:
# Tally the HGNC status values present among the retained genes.
hgnc["status"].value_counts()

Approved    41203
Name: status, dtype: int64

---

In [7]:
def read_umls(fname, total=13897048):
    """Extract the distinct (CUI, HGNC id) pairs from a UMLS MRCONSO.RRF file.

    Each line is split on ``|``; field 0 is taken as the CUI, field 11 as the
    source abbreviation and field 13 as the source-assigned code. Only lines
    whose source abbreviation is exactly ``"HGNC"`` are kept.

    Parameters
    ----------
    fname : str or path-like
        Path to the pipe-delimited MRCONSO.RRF file.
    total : int or None, optional
        Expected number of lines, used only to scale the tqdm progress bar.
        The default is the line count of the file this notebook was run
        against; pass ``None`` (or the real count) for any other file.

    Returns
    -------
    pandas.DataFrame
        Columns ``cui`` and ``hgnc_id`` with exact duplicate rows removed.
        The index is not renumbered after de-duplication. Both columns are
        present even when the file contains no HGNC rows.

    Raises
    ------
    IndexError
        If a line has fewer than 14 pipe-delimited fields.
    """
    res = defaultdict(list)
    # Explicit encoding: UMLS RRF files are UTF-8, so do not rely on the
    # platform's locale default when decoding them.
    with open(fname, "r", encoding="utf-8") as fin:
        for line in tqdm(fin, total=total):
            vals = line.rstrip("\n").split("|")

            cui, sab, code = vals[0], vals[11], vals[13]

            if sab == "HGNC":
                res["cui"].append(cui)
                res["hgnc_id"].append(code)

    # Naming the columns keeps the schema stable when nothing matched;
    # otherwise an empty, column-less frame would break a later merge on
    # "hgnc_id".
    return pd.DataFrame(dict(res), columns=["cui", "hgnc_id"]).drop_duplicates()

In [8]:
# Scan MRCONSO.RRF for HGNC-sourced rows; tqdm shows progress over the file's lines.
umls = read_umls("../data/MRCONSO.RRF")

100%|██████████| 13897048/13897048 [00:32<00:00, 426238.20it/s]


In [9]:
# (rows, columns): number of distinct (CUI, HGNC id) pairs found.
umls.shape

(40978, 2)

In [10]:
# Preview the CUI -> HGNC id pairs. The index has gaps because duplicate
# rows were dropped without renumbering.
umls.head()

Unnamed: 0,cui,hgnc_id
0,C0008288,HGNC:20365
4,C0017351,HGNC:5477
7,C0026574,HGNC:7199
10,C0026900,HGNC:7545
14,C0035018,HGNC:9954


---

In [11]:
# Join the UMLS pairs to the HGNC table on the shared HGNC id. The inner join
# keeps only ids present on both sides; the "status" column is then removed.
umls_with_entrez = pd.merge(umls, hgnc, how="inner", on="hgnc_id")
res = umls_with_entrez.drop(columns="status")

In [12]:
# Preview the joined CUI / HGNC id / symbol / Entrez id mapping.
res.head()

Unnamed: 0,cui,hgnc_id,symbol,entrez_id
0,C0008288,HGNC:20365,CIPC,85457
1,C0017351,HGNC:5477,IGH,3492
2,C0026574,HGNC:7199,MOS,4342
3,C0026900,HGNC:7545,MYB,4602
4,C0035018,HGNC:9954,REL,5966


In [13]:
# (rows, columns) of the final mapping after the inner join.
res.shape

(40867, 4)

In [14]:
# Distinct CUIs in the mapping; a value below the row count means some CUI
# appears in more than one row.
res["cui"].nunique()

40862

In [15]:
# Distinct Entrez ids in the mapping; equal to the row count when no Entrez
# id is repeated.
res["entrez_id"].nunique()

40867

---

In [16]:
# Write the mapping as a tab-separated file, omitting the DataFrame index.
res.to_csv("../maps/gene_map.tsv", sep='\t', index=False)