# Entrez ID Scraper

In [1]:
import csv
import pandas as pd
import requests
import re

from collections import defaultdict
# URL format of the NCBI Gene pages scraped below: https://www.ncbi.nlm.nih.gov/gene/13188983?report=xml&format=text

Helper functions from the main file

In [2]:
# Fantastic function from http://stackoverflow.com/a/39946744/4943106
# to split columns with separators
def tidy_split(df, column, sep='|', keep=False):
    """
    Expand a delimited column so that every split value gets its own row.

    Rows whose `column` value is missing are dropped first; the remaining
    values are cast to `str` and split on `sep`.

    Params
    ------
    df : pandas.DataFrame
        dataframe containing the column to expand
    column : str
        name of the column whose values are split
    sep : str
        delimiter used to split each value
    keep : bool
        if True, a value that splits into more than one piece is also kept
        unsplit, as an extra row placed before its pieces

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same columns as `df`. Source rows are
        repeated once per emitted value, so index labels may repeat.
    """
    present = df.dropna(subset=[column])
    row_positions = []
    split_values = []
    for position, raw in enumerate(present[column].astype(str)):
        pieces = raw.split(sep)
        # With `keep`, the unsplit value precedes its pieces (multi-piece values only).
        emitted = [raw] + pieces if keep and len(pieces) > 1 else pieces
        row_positions.extend([position] * len(emitted))
        split_values.extend(emitted)
    expanded = present.iloc[row_positions, :].copy()
    expanded[column] = split_values
    return expanded

def _get_wb_id_for_entrez(entry):
    """Return the WormBase gene IDs found in `entry`, comma-joined, or None.

    Only IDs written as ``WormBase:WBGene`` followed by eight digits are
    picked up; the ``WormBase:`` prefix is not part of the returned value.
    None is returned when `entry` contains no such ID.
    """
    wb_ids = re.findall(r'WormBase:(WBGene[0-9]{8})', entry)
    if not wb_ids:
        return None
    return ','.join(wb_ids)

def _get_ensg_id_for_entrez(entry):
    """Return the Ensembl gene IDs found in `entry`, comma-joined, or None.

    An ID is ``ENSG`` followed by eleven digits, matched anywhere in the
    text. None is returned when `entry` contains no such ID.
    """
    ensg_ids = re.findall(r'ENSG[0-9]{11}', entry)
    if not ensg_ids:
        return None
    return ','.join(ensg_ids)

def get_mapping_ce(entrez_list, timeout=30):
    """Scrape the NCBI Gene page of each Entrez ID for WormBase gene IDs.

    For every ID in `entrez_list` the page
    ``https://www.ncbi.nlm.nih.gov/gene/<id>?report=xml&format=text`` is
    fetched, and every distinct ``WBGene`` + eight-digit ID in the response
    text is collected. The URL and the result are printed for each ID.

    Params
    ------
    entrez_list : iterable
        Entrez gene IDs to look up
    timeout : float
        seconds passed to `requests.get`, so a stalled connection raises
        instead of hanging the notebook

    Returns
    -------
    set
        `(entrez_id, wb_id)` tuples. IDs whose page has no match, or whose
        request returned an HTTP error status, contribute nothing.
    """
    entrez_set = set()
    for entrez_id in entrez_list:
        url = 'https://www.ncbi.nlm.nih.gov/gene/{}?report=xml&format=text'.format(entrez_id)
        r = requests.get(url, timeout=timeout)
        if not r.ok:
            # An error page (e.g. rate limiting) must not be mistaken for
            # "this gene has no WormBase ID": report it and move on.
            print("{}\n{} skipped: HTTP {}".format(url, entrez_id, r.status_code))
            continue
        wb_ids = set(re.findall(r'WBGene[0-9]{8}', r.text))

        print("{}\n{} {}".format(url, entrez_id, wb_ids))
        entrez_set.update((entrez_id, wb_id) for wb_id in wb_ids)

    return entrez_set

def get_mapping_hs(entrez_list, timeout=30):
    """Scrape the NCBI Gene page of each Entrez ID for Ensembl gene IDs.

    For every ID in `entrez_list` the page
    ``https://www.ncbi.nlm.nih.gov/gene/<id>?report=xml&format=text`` is
    fetched, and every distinct ``ENSG`` + eleven-digit ID in the response
    text is collected. The URL and the result are printed for each ID.

    Params
    ------
    entrez_list : iterable
        Entrez gene IDs to look up
    timeout : float
        seconds passed to `requests.get`, so a stalled connection raises
        instead of hanging the notebook

    Returns
    -------
    set
        `(entrez_id, ensembl_id)` tuples. IDs whose page has no match, or
        whose request returned an HTTP error status, contribute nothing.
    """
    entrez_set = set()
    for entrez_id in entrez_list:
        url = 'https://www.ncbi.nlm.nih.gov/gene/{}?report=xml&format=text'.format(entrez_id)
        r = requests.get(url, timeout=timeout)
        if not r.ok:
            # An error page (e.g. rate limiting) must not be mistaken for
            # "this gene has no Ensembl ID": report it and move on.
            print("{}\n{} skipped: HTTP {}".format(url, entrez_id, r.status_code))
            continue
        ensembl_ids = set(re.findall(r'ENSG[0-9]{11}', r.text))

        print("{}\n{} {}".format(url, entrez_id, ensembl_ids))
        entrez_set.update((entrez_id, ensembl_id) for ensembl_id in ensembl_ids)

    return entrez_set

## _C. elegans_

In [3]:
# Entrez IDs of interest: single-column CSV without a header row
homologene_ce_full = pd.read_csv(
    '../homologene/entrez_list_ce.csv',
    names=['CE_ENTREZ'],
    header=None,
)

# Map obtained from NIH; only columns 1 and 5 of the table are kept
homologene_ce_mapping = pd.read_csv(
    'Caenorhabditis_elegans.gene_info.gz',
    sep='\t',
    header=0,
    usecols=[1, 5],
    names=['CE_ENTREZ', 'CE_WB_OLD'],
)

## Reduce each cross-reference string to its comma-joined WormBase IDs
## (None when there is none)
homologene_ce_mapping['CE_WB_OLD'] = homologene_ce_mapping['CE_WB_OLD'].apply(_get_wb_id_for_entrez)

## One row per WormBase ID; rows without any ID are dropped by tidy_split
homologene_ce_mapping = tidy_split(homologene_ce_mapping, 'CE_WB_OLD', sep=',')

## Restrict the map to the Entrez IDs of interest
homologene_ce_mapping = homologene_ce_full.merge(homologene_ce_mapping, how='inner', on='CE_ENTREZ')

# Entrez IDs the downloaded map does not cover are scraped from NCBI instead
homologene_ce_full_set = set(homologene_ce_full['CE_ENTREZ'])
homologene_ce_mapping_set = set(homologene_ce_mapping['CE_ENTREZ']).intersection(homologene_ce_full_set)
homologene_ce_to_scrape = sorted(homologene_ce_full_set.difference(homologene_ce_mapping_set))
homologene_ce_scraped = get_mapping_ce(homologene_ce_to_scrape)

print("\nFull set size: {}".format(len(homologene_ce_full_set)))
print("Mapping set size: {}".format(len(homologene_ce_mapping_set)))
print("Scraping set size: {}".format(len(homologene_ce_to_scrape)))

# Number of distinct Entrez IDs for which scraping found at least one ID
scraped_size = len({entrez_id for entrez_id, _wb_id in homologene_ce_scraped})
print("Scraped set size: {}".format(scraped_size))

# Manual entries, added in one go.
# Each pair is preceded by its NCBI Gene page and the matching WormBase page.
homologene_ce_scraped.update([
    # https://www.ncbi.nlm.nih.gov/gene/178598
    # -> http://www.wormbase.org/species/c_elegans/gene/WBGene00021747
    (178598, 'WBGene00021747'),

    # https://www.ncbi.nlm.nih.gov/gene/185648
    # -> http://www.wormbase.org/species/c_elegans/gene/WBGene00018326
    (185648, 'WBGene00018326'),

    # https://www.ncbi.nlm.nih.gov/gene/187992
    # -> http://www.wormbase.org/species/c_elegans/gene/WBGene00020164
    (187992, 'WBGene00020164'),

    # https://www.ncbi.nlm.nih.gov/gene/191348
    # -> http://www.wormbase.org/species/c_elegans/gene/WBGene00013995
    (191348, 'WBGene00013995'),

    # https://www.ncbi.nlm.nih.gov/gene/191959
    # -> http://www.wormbase.org/species/c_elegans/gene/WBGene00005646
    (191959, 'WBGene00005646'),

    # https://www.ncbi.nlm.nih.gov/gene/353489
    # -> http://www.wormbase.org/species/c_elegans/gene/WBGene00022432
    (353489, 'WBGene00022432'),

    # https://www.ncbi.nlm.nih.gov/gene/2565697
    # -> http://www.wormbase.org/species/c_elegans/gene/WBGene00010965
    (2565697, 'WBGene00010965'),

    # https://www.ncbi.nlm.nih.gov/gene/2565698
    # -> http://www.wormbase.org/species/c_elegans/gene/WBGene00010959
    (2565698, 'WBGene00010959'),

    # https://www.ncbi.nlm.nih.gov/gene/2565700
    # -> http://www.wormbase.org/species/c_elegans/gene/WBGene00010964
    (2565700, 'WBGene00010964'),

    # https://www.ncbi.nlm.nih.gov/gene/2565705
    # -> http://www.wormbase.org/species/c_elegans/gene/WBGene00010963
    (2565705, 'WBGene00010963'),
])

# Distinct Entrez IDs gained from the manual entries (total minus scraped count)
print("Manually matched set size: {}".format(
    len({entrez_id for entrez_id, _wb_id in homologene_ce_scraped}) - scraped_size))
# Distinct WormBase IDs across the scraped/manual pairs and the downloaded map
print("Unique target entries: {}".format(
    len({wb_id for _entrez_id, wb_id in homologene_ce_scraped}
        | set(homologene_ce_mapping['CE_WB_OLD']))))

# Write to file.
# newline='' is what the csv module requires of the file object; without it
# every row is followed by a blank line on Windows.
with open('../homologene/entrez_wb_map_scraped.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Sort on the whole (entrez, wb) pair, not just the Entrez ID, so rows that
    # share an Entrez ID come out in the same order on every run.
    writer.writerows(sorted(homologene_ce_scraped))

https://www.ncbi.nlm.nih.gov/gene/172113?report=xml&format=text
172113 {'WBGene00017882'}
https://www.ncbi.nlm.nih.gov/gene/176677?report=xml&format=text
176677 {'WBGene00013732'}
https://www.ncbi.nlm.nih.gov/gene/178598?report=xml&format=text
178598 set()
https://www.ncbi.nlm.nih.gov/gene/185648?report=xml&format=text
185648 set()
https://www.ncbi.nlm.nih.gov/gene/187992?report=xml&format=text
187992 set()
https://www.ncbi.nlm.nih.gov/gene/188636?report=xml&format=text
188636 {'WBGene00011862'}
https://www.ncbi.nlm.nih.gov/gene/189136?report=xml&format=text
189136 {'WBGene00020959'}
https://www.ncbi.nlm.nih.gov/gene/190265?report=xml&format=text
190265 {'WBGene00021851'}
https://www.ncbi.nlm.nih.gov/gene/191348?report=xml&format=text
191348 set()
https://www.ncbi.nlm.nih.gov/gene/191959?report=xml&format=text
191959 set()
https://www.ncbi.nlm.nih.gov/gene/353489?report=xml&format=text
353489 set()
https://www.ncbi.nlm.nih.gov/gene/2565697?report=xml&format=text
2565697 set()
https://w

## _H. sapiens_

In [4]:
# Entrez IDs of interest: single-column CSV without a header row
homologene_hs_full = pd.read_csv('../homologene/entrez_list_hs.csv',
                                 header=None, names=['HS_ENTREZ'])

# Map obtained from NIH; only columns 1 and 5 of the table are kept
homologene_hs_mapping = pd.read_csv('Homo_sapiens.gene_info.gz', sep='\t',
                                    header=0, usecols=[1, 5], names=['HS_ENTREZ', 'HS_ENSG'])

## Pick out ENSG ID entries and separate with commas
homologene_hs_mapping['HS_ENSG'] = homologene_hs_mapping['HS_ENSG'].apply(_get_ensg_id_for_entrez)

## Split comma-separated values in one row into multiple rows
homologene_hs_mapping = tidy_split(homologene_hs_mapping, 'HS_ENSG', sep=',')

## Only keep the intersecting entries
homologene_hs_mapping = pd.merge(homologene_hs_full, homologene_hs_mapping, how='inner', on='HS_ENTREZ')

# Find missing genes to scrape
homologene_hs_full_set = set(homologene_hs_full['HS_ENTREZ'])
homologene_hs_mapping_set = set(homologene_hs_mapping['HS_ENTREZ'])
homologene_hs_to_scrape = sorted(homologene_hs_full_set - homologene_hs_mapping_set)
# Must be the human scraper: get_mapping_ce searches for WBGene IDs only, so
# it can never return an Ensembl ID for a human gene page.
homologene_hs_scraped = get_mapping_hs(homologene_hs_to_scrape)

print("\nFull set size: {}".format(len(homologene_hs_full_set)))
print("Mapping set size: {}".format(len(homologene_hs_mapping_set)))
print("Scraping set size: {}".format(len(homologene_hs_to_scrape)))

# Number of distinct Entrez IDs for which scraping found at least one ID
scraped_size = len(set([i[0] for i in homologene_hs_scraped]))
print("Scraped set size: {}".format(scraped_size))

# Manual entries, added in one go.
# Each pair is preceded by its NCBI Gene page and the Ensembl page(s) it was matched to.
homologene_hs_scraped.update([
    # https://www.ncbi.nlm.nih.gov/gene/4512
    # -> http://uswest.ensembl.org/Homo_sapiens/Gene/Summary?g=ENSG00000198804
    (4512, 'ENSG00000198804'),

    # https://www.ncbi.nlm.nih.gov/gene/4513
    # -> http://uswest.ensembl.org/Homo_sapiens/Gene/Summary?g=ENSG00000198712
    (4513, 'ENSG00000198712'),

    # https://www.ncbi.nlm.nih.gov/gene/4535
    # -> http://uswest.ensembl.org/Homo_sapiens/Gene/Summary?g=ENSG00000198888
    (4535, 'ENSG00000198888'),

    # https://www.ncbi.nlm.nih.gov/gene/4538
    # -> http://uswest.ensembl.org/Homo_sapiens/Gene/Summary?g=ENSG00000198886
    (4538, 'ENSG00000198886'),

    # https://www.ncbi.nlm.nih.gov/gene/100130097
    # -> http://uswest.ensembl.org/Homo_sapiens/Gene/Summary?g=ENSG00000223519
    (100130097, 'ENSG00000223519'),

    # https://www.ncbi.nlm.nih.gov/gene/100132874
    # -> http://may2009.archive.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000204811
    # -> http://www.bcgsc.ca/people/malachig/htdocs/alexa_platform/alexa_seq/Neuroblastoma/genes/chr9_10/ENSG00000204811.htm
    (100132874, 'ENSG00000204811'),
])

# Distinct Entrez IDs gained from the manual entries (total minus scraped count)
print("Manually matched set size: {}".format(
    len({entrez_id for entrez_id, _ensg_id in homologene_hs_scraped}) - scraped_size))
# Distinct Ensembl IDs across the scraped/manual pairs and the downloaded map
print("Unique target entries: {}".format(
    len({ensg_id for _entrez_id, ensg_id in homologene_hs_scraped}
        | set(homologene_hs_mapping['HS_ENSG']))))

# Write to file.
# newline='' is what the csv module requires of the file object; without it
# every row is followed by a blank line on Windows.
with open('../homologene/entrez_ensembl_map_scraped.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    # Sort on the whole (entrez, ensembl) pair, not just the Entrez ID, so rows
    # that share an Entrez ID come out in the same order on every run.
    writer.writerows(sorted(homologene_hs_scraped))

https://www.ncbi.nlm.nih.gov/gene/4512?report=xml&format=text
4512 set()
https://www.ncbi.nlm.nih.gov/gene/4513?report=xml&format=text
4513 set()
https://www.ncbi.nlm.nih.gov/gene/4535?report=xml&format=text
4535 set()
https://www.ncbi.nlm.nih.gov/gene/4538?report=xml&format=text
4538 set()
https://www.ncbi.nlm.nih.gov/gene/100130097?report=xml&format=text
100130097 set()
https://www.ncbi.nlm.nih.gov/gene/100132874?report=xml&format=text
100132874 set()

Full set size: 3184
Mapping set size: 3178
Scraping set size: 6
Scraped set size: 0
Manually matched set size: 6
Unique target entries: 3205


----