# Examine genes in Hetionet which do not have UMLS CUIs

2019-01-25

Here we will see if there are ways to map missed genes in Hetionet to UMLS CUIs.

In [1]:
import pandas as pd
import requests
from tqdm import tqdm
from collections import defaultdict

## Read Hetionet to UMLS mapping file

In [2]:
hnodes = pd.read_csv("../pipeline/hetionet/hetionet_nodes_umls.tsv", sep='\t')

In [3]:
hnodes.shape

(58700, 4)

In [4]:
hnodes.head()

Unnamed: 0,hetio_id,name,het_type,cui
0,1,A1BG,Gene,UMLS:C1412045
1,10,NAT2,Gene,UMLS:C0796518
2,100,ADA,Gene,UMLS:C1412179
3,1000,CDH2,Gene,UMLS:C1413277
4,10000,AKT3,Gene,UMLS:C1332074


## Grab unmapped genes

In [5]:
missed_genes = (hnodes
    .assign(is_cui = lambda df: df["cui"].str.startswith("UMLS:C"))
    .query("~is_cui and het_type == 'Gene'")
    .reset_index(drop=True)
)

In [6]:
missed_genes.shape

(1562, 5)

In [7]:
missed_genes.head()

Unnamed: 0,hetio_id,name,het_type,cui,is_cui
0,100049579,OCTN3,Gene,100049579,False
1,100127889,C10orf131,Gene,100127889,False
2,100127971,LOC100127971,Gene,100127971,False
3,100127991,LOC100127991,Gene,100127991,False
4,100128001,LOC100128001,Gene,100128001,False


---

## LOC genes vs named genes

In [8]:
missed_genes = missed_genes.assign(is_loc = lambda df: df["name"].str.startswith("LOC"))

In [9]:
missed_genes.head()

Unnamed: 0,hetio_id,name,het_type,cui,is_cui,is_loc
0,100049579,OCTN3,Gene,100049579,False,False
1,100127889,C10orf131,Gene,100127889,False,False
2,100127971,LOC100127971,Gene,100127971,False,True
3,100127991,LOC100127991,Gene,100127991,False,True
4,100128001,LOC100128001,Gene,100128001,False,True


In [10]:
missed_genes["is_loc"].value_counts()

True     1460
False     102
Name: is_loc, dtype: int64

## Functions for checking gene status

In [11]:
def make_clickable(val):
    return '<a href="https://www.ncbi.nlm.nih.gov/gene/{}">{}</a>'.format(val, val)

In [12]:
def parse(gene_id):
    """Get the status of a NCBI gene id"""

    def process(html):
        keywords = ["This record was replaced", "WITHDRAWN"]
        status = ["replaced", "withdrawn"]

        # check
        res = [v in html for v in keywords]

        if sum(res) == 2:
            print("Gene id {} is both withdrawn and replaced".format(gene_id))

        for ans, value in zip(status, res):
            if value:
                return ans

        return "normal"

    
    url = "https://www.ncbi.nlm.nih.gov/gene/{}".format(gene_id)

    resp = requests.get(url)
    assert resp.status_code == 200
    
    return process(resp.text)

In [13]:
def get_status(df):
    return df.assign(
        gene_status = lambda df: df["hetio_id"].map(parse)
    )

---

## Sample named genes

In [14]:
(missed_genes
    .query("~is_loc")
    .sample(10)
    [["hetio_id", "name"]]
    .pipe(get_status)
).style.format({"hetio_id": make_clickable})

Unnamed: 0,hetio_id,name,gene_status
239,100861437,NARR,replaced
1435,389422,C6orf183,replaced
175,100287859,FAM197Y6,replaced
1460,408186,OVOS,normal
1349,2575,GAGE3,withdrawn
1356,26120,DKFZP564C152,withdrawn
1425,353138,LEP7,withdrawn
1559,80761,UPK3B,replaced
1428,388439,FLJ12120,withdrawn
1446,399832,DKFZp686M1136,withdrawn


## Sample LOC genes

In [15]:
(missed_genes
    .query("is_loc")
    .sample(10)
    [["hetio_id", "name"]]
    .pipe(get_status)
).style.format({"hetio_id": make_clickable})

Unnamed: 0,hetio_id,name,gene_status
827,105371935,LOC105371935,withdrawn
469,102723330,LOC102723330,normal
491,102723730,LOC102723730,normal
394,101928948,LOC101928948,normal
456,101930075,LOC101930075,withdrawn
136,100131894,LOC100131894,withdrawn
29,100128691,LOC100128691,withdrawn
318,101927260,LOC101927260,replaced
1281,105379689,LOC105379689,withdrawn
1426,388282,LOC388282,normal


---

## Process a larger sample of genes

In [16]:
big_sample = (missed_genes
    .query("is_loc")
    .sample(50)
    .append(
        missed_genes.query("~is_loc").sample(50)
    )
    .sort_values("hetio_id")
    .reset_index(drop=True)
)

In [17]:
big_sample.shape

(100, 6)

In [18]:
big_sample.head()

Unnamed: 0,hetio_id,name,het_type,cui,is_cui,is_loc
0,100049579,OCTN3,Gene,100049579,False,False
1,100127889,C10orf131,Gene,100127889,False,False
2,100128795,LOC100128795,Gene,100128795,False,True
3,100129069,LOC100129069,Gene,100129069,False,True
4,100129126,LOC100129126,Gene,100129126,False,True


In [19]:
big_sample["is_loc"].value_counts()

True     50
False    50
Name: is_loc, dtype: int64

### Get gene status from NCBI Gene

In [20]:
sample_size = 100

res = defaultdict(list)

for row in tqdm(big_sample.itertuples(), total=sample_size):
    hetio_id = row.hetio_id
    status = parse(hetio_id)
    
    res["hetio_id"].append(hetio_id)
    res["gene_status"].append(status)

100%|██████████| 100/100 [02:40<00:00,  1.37s/it]


In [21]:
res = pd.DataFrame(res)

In [22]:
res.shape

(100, 2)

In [23]:
res.head()

Unnamed: 0,hetio_id,gene_status
0,100049579,normal
1,100127889,replaced
2,100128795,withdrawn
3,100129069,withdrawn
4,100129126,withdrawn


---

## Merge gene status results with the large sample

In [24]:
sample_res = (big_sample
    [["hetio_id", "name", "is_loc"]]
    .merge(res, how="inner", on="hetio_id")
)

In [25]:
sample_res.shape

(100, 4)

In [26]:
sample_res.head()

Unnamed: 0,hetio_id,name,is_loc,gene_status
0,100049579,OCTN3,False,normal
1,100127889,C10orf131,False,replaced
2,100128795,LOC100128795,True,withdrawn
3,100129069,LOC100129069,True,withdrawn
4,100129126,LOC100129126,True,withdrawn


### Gene status of large sample

In [27]:
sample_res["gene_status"].value_counts(normalize=True).multiply(100)

withdrawn    40.0
normal       35.0
replaced     25.0
Name: gene_status, dtype: float64

### Gene status divided by LOC status

In [28]:
sample_res.groupby("is_loc")["gene_status"].value_counts()

is_loc  gene_status
False   replaced       20
        normal         15
        withdrawn      15
True    withdrawn      25
        normal         20
        replaced        5
Name: gene_status, dtype: int64

In [29]:
sample_res.groupby("is_loc")["gene_status"].value_counts(normalize=True).multiply(100)

is_loc  gene_status
False   replaced       40.0
        normal         30.0
        withdrawn      30.0
True    withdrawn      50.0
        normal         40.0
        replaced       10.0
Name: gene_status, dtype: float64

# Conclusion

From a sample of 100 out of 1562 genes it seems like most of the LOC genes were withdrawn. Some of the genes may be mappable to HGNC ids and therefore UMLS CUIs by updating the mapping file.