Map genes between UMLS CUIs and Entrez Gene IDs

In [1]:
import pandas as pd
import mygene

In [2]:
# Load the node table and give the ":LABEL" / ":ID" header columns short
# names ("ntype" / "cui") that are easier to use in later queries.
semnodes = pd.read_csv(
    "semmed/data/nodes_7_metanode_slim.csv",
    sep=",",
).rename(
    columns={
        ":LABEL": "ntype",
        ":ID": "cui",
    }
)

In [3]:
# Preview the first rows to confirm the renamed columns (cui, name, ntype).
semnodes.head()

Unnamed: 0,cui,name,ntype
0,C0016192,Flagella,Anatomy
1,C0230349,Cubital fossa,Anatomy
2,C0447417,Entire retromolar area of mouth,Anatomy
3,C0033151,Primitive Gut,Anatomy
4,C0225861,Left auricular appendage,Anatomy


In [4]:
# Count how many nodes fall under each value of the "ntype" column.
semnodes["ntype"].value_counts()

Chemicals & Drugs              84614
Living Beings                  48191
Disorders                      37284
Genes & Molecular Sequences    20539
Anatomy                        15100
Physiology                      7727
Phenomena                       1135
Name: ntype, dtype: int64

---

## Convert genes

In [5]:
# Keep only the rows whose ntype is 'Genes & Molecular Sequences';
# these are the CUIs that get mapped to Entrez gene ids below.
genes = semnodes.query("ntype == 'Genes & Molecular Sequences'")

In [6]:
# Preview the first rows of the gene subset.
genes.head()

Unnamed: 0,cui,name,ntype
136998,C1335205,PECAM1 gene,Genes & Molecular Sequences
136999,C1421346,UGT2B7 gene,Genes & Molecular Sequences
137000,C1537426,ISOC1 gene,Genes & Molecular Sequences
137001,C1420171,SLC29A1 gene,Genes & Molecular Sequences
137002,C1417913,OAS2 gene,Genes & Molecular Sequences


In [7]:
# (rows, columns) of the gene subset; the row count is the number of CUIs
# submitted to the lookup below.
genes.shape

(20539, 3)

---

In [8]:
# Client object from the mygene package, used for the batch lookup below.
mg = mygene.MyGeneInfo()

In [9]:
# Look up every gene CUI by its UMLS identifier, requesting only the
# Entrez gene id and restricting matches to human. Some CUIs return
# several hits and some return none (see the summary printed below).
res = mg.querymany(
    genes["cui"].tolist(),
    scopes="umls.cui",
    fields="entrezgene",
    species="human",
)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-20539...done.
Finished.
2 input query terms found dup hits:
	[('C1424161', 2), ('C1423043', 2)]
2492 input query terms found no hit:
	['C0796371', 'C0079062', 'C1336777', 'C0017429', 'C1327780', 'C1428078', 'C1424711', 'C0085303', 'C1
Pass "returnall=True" to return complete lists of duplicate or missing query terms.


In [10]:
# Turn the query results into a (cui, geneid) table. Records lacking an
# "entrezgene" key are skipped; a CUI with several hits contributes one
# row per hit.
genemap = pd.DataFrame(
    data=[
        (hit["query"], hit["entrezgene"])
        for hit in res
        if "entrezgene" in hit
    ],
    columns=["cui", "geneid"],
)

# Tag every row as a gene mapping and put the rows in a stable order.
genemap = (
    genemap
    .assign(ntype="gene")
    .sort_values(by=["cui", "geneid"])
    .reset_index(drop=True)
)

In [11]:
# Preview the first rows of the CUI -> Entrez gene id mapping.
genemap.head()

Unnamed: 0,cui,geneid,ntype
0,C0008288,85457,gene
1,C0017351,3492,gene
2,C0026574,4342,gene
3,C0026900,4602,gene
4,C0035018,5966,gene


In [12]:
# Number of distinct CUIs that received at least one Entrez gene id.
genemap["cui"].nunique()

18047

In [13]:
# Write the mapping to disk as a tab-separated file, omitting the index.
genemap.to_csv(
    "gene_map.tsv",
    sep="\t",
    index=False,
)