# Scraper for deprecated / merged UniProt IDs

In [1]:
import csv
import pandas as pd
import requests
import re

# Destination CSV files for the scraped ID pairs, one per orthology source
# (InParanoid / OrthoInspector) and species: the *_HS_* files receive
# (UniProt, Ensembl gene) rows, the *_CE_* files (UniProt, WormBase gene) rows.
INPARANOID_HS_OUTPUT = '../inparanoid/uniprot_ensembl_map_scraped.csv'
INPARANOID_CE_OUTPUT = '../inparanoid/uniprot_wb_map_scraped.csv'
ORTHOINSPECTOR_HS_OUTPUT = '../orthoinspector/uniprot_ensembl_map_scraped.csv'
ORTHOINSPECTOR_CE_OUTPUT = '../orthoinspector/uniprot_wb_map_scraped.csv'

# Example URL: http://www.uniprot.org/uniprot/A4UVJ9.txt?version=47
def get_mapping_ce(uniprot_list, session=None):
    """Scrape WormBase gene IDs for UniProt accessions from archived entries.

    For each accession the version-history page is fetched, the first
    version number linked on it is extracted (the page is presumed to list
    the newest version first -- confirm against the live site), and the
    flat-text entry of that version is searched for ``WBGene`` identifiers.

    No HTTP status check is performed: a response whose body contains no
    version link is reported on stdout and the accession is skipped, so one
    dead accession does not abort the whole run.

    Args:
        uniprot_list: Iterable of UniProt accession strings.
        session: Optional object exposing ``get(url)`` and returning a
            response with a ``text`` attribute (e.g. ``requests.Session()``
            to reuse connections). Defaults to the ``requests`` module.

    Returns:
        Set of ``(uniprot_id, wb_id)`` tuples; accessions without a version
        link or without any WBGene ID contribute no tuple.
    """
    uniprot_set = set()
    for uniprot_id in uniprot_list:
        # Resolved per request so the default is only touched when needed.
        fetch = requests.get if session is None else session.get

        history_url = "http://www.uniprot.org/uniprot/{}?version=*".format(uniprot_id)
        # Escape the accession and the literal dots; accept any number of
        # digits so versions >= 1000 are not truncated.
        pattern = r'\./{}\.txt\?version=([0-9]+)'.format(re.escape(uniprot_id))
        versions = re.findall(pattern, fetch(history_url).text)
        if not versions:
            print(uniprot_id, 'no version found at', history_url)
            continue
        most_recent = versions[0]

        url = 'http://www.uniprot.org/uniprot/{}.txt?version={}'.format(uniprot_id, most_recent)
        wb_ids = set(re.findall(r'WBGene[0-9]{8}', fetch(url).text))

        print(url)
        print(uniprot_id, wb_ids)
        uniprot_set.update((uniprot_id, wb_id) for wb_id in wb_ids)

    return uniprot_set

def get_mapping_hs(uniprot_list, session=None):
    """Scrape Ensembl gene IDs for UniProt accessions from archived entries.

    For each accession the version-history page is fetched, the first
    version number linked on it is extracted (the page is presumed to list
    the newest version first -- confirm against the live site), and the
    flat-text entry of that version is searched for space-prefixed
    ``ENSG`` identifiers with 11 digits.

    No HTTP status check is performed: a response whose body contains no
    version link is reported on stdout and the accession is skipped, so one
    dead accession does not abort the whole run.

    Args:
        uniprot_list: Iterable of UniProt accession strings.
        session: Optional object exposing ``get(url)`` and returning a
            response with a ``text`` attribute (e.g. ``requests.Session()``
            to reuse connections). Defaults to the ``requests`` module.

    Returns:
        Set of ``(uniprot_id, ensembl_id)`` tuples; accessions without a
        version link or without any ENSG ID contribute no tuple.
    """
    uniprot_set = set()
    for uniprot_id in uniprot_list:
        # Resolved per request so the default is only touched when needed.
        fetch = requests.get if session is None else session.get

        history_url = "http://www.uniprot.org/uniprot/{}?version=*".format(uniprot_id)
        # Escape the accession and the literal dots; accept any number of
        # digits so versions >= 1000 are not truncated.
        pattern = r'\./{}\.txt\?version=([0-9]+)'.format(re.escape(uniprot_id))
        versions = re.findall(pattern, fetch(history_url).text)
        if not versions:
            print(uniprot_id, 'no version found at', history_url)
            continue
        most_recent = versions[0]

        url = 'http://www.uniprot.org/uniprot/{}.txt?version={}'.format(uniprot_id, most_recent)
        ensembl_ids = set(re.findall(r' (ENSG[0-9]{11})', fetch(url).text))

        print(url)
        print(uniprot_id, ensembl_ids)
        uniprot_set.update((uniprot_id, ensembl_id) for ensembl_id in ensembl_ids)

    return uniprot_set

## InParanoid
### _C. elegans_

In [2]:
# Full list of C. elegans UniProt accessions (single unnamed column).
inparanoid_ce_full = pd.read_csv('../inparanoid/uniprot_list_ce.csv',
                                 header=None, names=['CE_UNIPROT'])

# Map obtained by using the ID mapping tool from UniProt available at <http://www.uniprot.org/uploadlists/>
inparanoid_ce_mapping = pd.read_csv('../inparanoid/uniprot_wb_map.tsv', sep='\t', header=0,
                                    usecols=[0, 1], names=['CE_UNIPROT', 'CE_WB_OLD'])

# Find missing genes to scrape: accessions in the full list that the
# ID-mapping file does not cover.
inparanoid_ce_full_set = set(inparanoid_ce_full['CE_UNIPROT'])
inparanoid_ce_mapping_set = set(inparanoid_ce_mapping['CE_UNIPROT'])
inparanoid_ce_to_scrape = sorted(inparanoid_ce_full_set - inparanoid_ce_mapping_set)

print("Full set size: {}".format(len(inparanoid_ce_full_set)))
print("Mapping set size: {}".format(len(inparanoid_ce_mapping_set)))
print("Scraping set size: {}\n".format(len(inparanoid_ce_to_scrape)))

inparanoid_ce_scraped = get_mapping_ce(inparanoid_ce_to_scrape)

# Write to file.
# newline='' hands line-ending control to the csv module (avoids blank rows
# on Windows); sorting on the whole (uniprot, wb) tuple keeps the row order
# reproducible when one accession maps to several genes.
with open(INPARANOID_CE_OUTPUT, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(sorted(inparanoid_ce_scraped))

Full set size: 5545
Mapping set size: 5482
Scraping set size: 63

http://www.uniprot.org/uniprot/A4UVJ9.txt?version=47
A4UVJ9 {'WBGene00019439'}
http://www.uniprot.org/uniprot/A7DT48.txt?version=41
A7DT48 {'WBGene00043357'}
http://www.uniprot.org/uniprot/A7WK42.txt?version=44
A7WK42 {'WBGene00044651'}
http://www.uniprot.org/uniprot/B0M0N6.txt?version=38
B0M0N6 {'WBGene00000642'}
http://www.uniprot.org/uniprot/G5EE85.txt?version=56
G5EE85 {'WBGene00002221'}
http://www.uniprot.org/uniprot/G5EFJ2.txt?version=22
G5EFJ2 {'WBGene00003085'}
http://www.uniprot.org/uniprot/H2KYD6.txt?version=37
H2KYD6 {'WBGene00017868'}
http://www.uniprot.org/uniprot/H2KYD7.txt?version=36
H2KYD7 {'WBGene00006890'}
http://www.uniprot.org/uniprot/H2KZ12.txt?version=35
H2KZ12 {'WBGene00000092'}
http://www.uniprot.org/uniprot/H2KZB0.txt?version=18
H2KZB0 {'WBGene00007007'}
http://www.uniprot.org/uniprot/H2KZK6.txt?version=35
H2KZK6 {'WBGene00019467'}
http://www.uniprot.org/uniprot/O01264.txt?version=91
O01264 {'WBG

### _H. sapiens_

In [3]:
# Full list of H. sapiens UniProt accessions (single unnamed column).
inparanoid_hs_full = pd.read_csv('../inparanoid/uniprot_list_hs.csv',
                                 header=None, names=['HS_UNIPROT'])

# Map obtained by using the ID mapping tool from UniProt available at <http://www.uniprot.org/uploadlists/>
inparanoid_hs_mapping = pd.read_csv('../inparanoid/uniprot_ensembl_map.tsv', sep='\t', header=0,
                                    usecols=[0, 1], names=['HS_UNIPROT', 'HS_ENSG'])

# Map obtained from Ensembl 74 BioMart (SwissProt & Trembl)
inparanoid_hs_biomart1 = pd.read_csv('../inparanoid/uniprot_ensembl_map_swissprot.tsv',
                                     sep='\t', header=0, names=['HS_ENSG', 'HS_UNIPROT'])
inparanoid_hs_biomart2 = pd.read_csv('../inparanoid/uniprot_ensembl_map_trembl.tsv',
                                     sep='\t', header=0, names=['HS_ENSG', 'HS_UNIPROT'])


# Find missing genes to scrape: accessions in the full list that none of
# the three mapping files cover.
inparanoid_hs_full_set = set(inparanoid_hs_full['HS_UNIPROT'])
inparanoid_hs_mapping_set = set(inparanoid_hs_mapping['HS_UNIPROT'])
inparanoid_hs_biomart1_set = set(inparanoid_hs_biomart1['HS_UNIPROT'])
inparanoid_hs_biomart2_set = set(inparanoid_hs_biomart2['HS_UNIPROT'])
inparanoid_hs_to_scrape = sorted(inparanoid_hs_full_set
                                 - inparanoid_hs_mapping_set
                                 - inparanoid_hs_biomart1_set
                                 - inparanoid_hs_biomart2_set)

print("Full set size: {}".format(len(inparanoid_hs_full_set)))
print("Mapping set size: {}".format(len(inparanoid_hs_mapping_set)))
print("SwissProt set size: {}".format(len(inparanoid_hs_biomart1_set)))
print("Trembl set size: {}".format(len(inparanoid_hs_biomart2_set)))
print("Scraping set size: {}\n".format(len(inparanoid_hs_to_scrape)))

inparanoid_hs_scraped = get_mapping_hs(inparanoid_hs_to_scrape)

# Write to file.
# newline='' hands line-ending control to the csv module (avoids blank rows
# on Windows); sorting on the whole (uniprot, ensembl) tuple keeps the row
# order reproducible when one accession maps to several genes.
with open(INPARANOID_HS_OUTPUT, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(sorted(inparanoid_hs_scraped))

Full set size: 8395
Mapping set size: 8220
SwissProt set size: 27
Trembl set size: 13
Scraping set size: 135

http://www.uniprot.org/uniprot/A2A3N6.txt?version=79
A2A3N6 set()
http://www.uniprot.org/uniprot/A2BEX4.txt?version=50
A2BEX4 {'ENSG00000264619'}
http://www.uniprot.org/uniprot/A4D256.txt?version=79
A4D256 set()
http://www.uniprot.org/uniprot/A6NC97.txt?version=49
A6NC97 set()
http://www.uniprot.org/uniprot/A6NCK2.txt?version=80
A6NCK2 {'ENSG00000144010'}
http://www.uniprot.org/uniprot/A6NDR6.txt?version=62
A6NDR6 set()
http://www.uniprot.org/uniprot/A6NDX5.txt?version=78
A6NDX5 set()
http://www.uniprot.org/uniprot/A6NEC2.txt?version=84
A6NEC2 set()
http://www.uniprot.org/uniprot/A6NFC9.txt?version=75
A6NFC9 set()
http://www.uniprot.org/uniprot/A6NGU5.txt?version=80
A6NGU5 set()
http://www.uniprot.org/uniprot/A6NK02.txt?version=81
A6NK02 {'ENSG00000250374'}
http://www.uniprot.org/uniprot/A6NKH3.txt?version=64
A6NKH3 set()
http://www.uniprot.org/uniprot/A6NL99.txt?version=73
A6N

----
## OrthoInspector
### _C. elegans_

In [4]:
# Full list of C. elegans UniProt accessions (single unnamed column).
orthoinspector_ce_full = pd.read_csv('../orthoinspector/uniprot_list_ce.csv',
                                     header=None, names=['CE_UNIPROT'])

# Map obtained by using the ID mapping tool from UniProt available at <http://www.uniprot.org/uploadlists/>
orthoinspector_ce_mapping = pd.read_csv('../orthoinspector/uniprot_wb_map.tsv', sep='\t', header=0,
                                        usecols=[0, 1], names=['CE_UNIPROT', 'CE_WB_OLD'])

# Find missing genes to scrape: accessions in the full list that the
# ID-mapping file does not cover.
orthoinspector_ce_full_set = set(orthoinspector_ce_full['CE_UNIPROT'])
orthoinspector_ce_mapping_set = set(orthoinspector_ce_mapping['CE_UNIPROT'])
orthoinspector_ce_to_scrape = sorted(orthoinspector_ce_full_set - orthoinspector_ce_mapping_set)

print("Full set size: {}".format(len(orthoinspector_ce_full_set)))
print("Mapping set size: {}".format(len(orthoinspector_ce_mapping_set)))
print("Scraping set size: {}\n".format(len(orthoinspector_ce_to_scrape)))

orthoinspector_ce_scraped = get_mapping_ce(orthoinspector_ce_to_scrape)

# Write to file.
# newline='' hands line-ending control to the csv module (avoids blank rows
# on Windows); sorting on the whole (uniprot, wb) tuple keeps the row order
# reproducible when one accession maps to several genes.
with open(ORTHOINSPECTOR_CE_OUTPUT, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(sorted(orthoinspector_ce_scraped))

Full set size: 5547
Mapping set size: 5480
Scraping set size: 67

http://www.uniprot.org/uniprot/A4UVJ9.txt?version=47
A4UVJ9 {'WBGene00019439'}
http://www.uniprot.org/uniprot/D3YT15.txt?version=31
D3YT15 {'WBGene00002222'}
http://www.uniprot.org/uniprot/D3YT25.txt?version=39
D3YT25 {'WBGene00001487'}
http://www.uniprot.org/uniprot/G5EFJ2.txt?version=22
G5EFJ2 {'WBGene00003085'}
http://www.uniprot.org/uniprot/G8JY08.txt?version=15
G8JY08 {'WBGene00001241'}
http://www.uniprot.org/uniprot/G8JY09.txt?version=17
G8JY09 {'WBGene00015540'}
http://www.uniprot.org/uniprot/G8JY66.txt?version=38
G8JY66 {'WBGene00000857'}
http://www.uniprot.org/uniprot/H2KYD6.txt?version=37
H2KYD6 {'WBGene00017868'}
http://www.uniprot.org/uniprot/H2KYD7.txt?version=36
H2KYD7 {'WBGene00006890'}
http://www.uniprot.org/uniprot/H2KZ12.txt?version=35
H2KZ12 {'WBGene00000092'}
http://www.uniprot.org/uniprot/H2KZB0.txt?version=18
H2KZB0 {'WBGene00007007'}
http://www.uniprot.org/uniprot/H2KZK6.txt?version=35
H2KZK6 {'WBG

### _H. sapiens_

In [5]:
# Full list of H. sapiens UniProt accessions (single unnamed column).
orthoinspector_hs_full = pd.read_csv('../orthoinspector/uniprot_list_hs.csv',
                                     header=None, names=['HS_UNIPROT'])

# Map obtained by using the ID mapping tool from UniProt available at <http://www.uniprot.org/uploadlists/>
orthoinspector_hs_mapping = pd.read_csv('../orthoinspector/uniprot_ensembl_map.tsv', sep='\t', header=0,
                                        usecols=[0, 1], names=['HS_UNIPROT', 'HS_ENSG'])

# Map obtained from Ensembl 74 BioMart (SwissProt & Trembl)
orthoinspector_hs_biomart1 = pd.read_csv('../orthoinspector/uniprot_ensembl_map_swissprot.tsv',
                                         sep='\t', header=0, names=['HS_ENSG', 'HS_UNIPROT'])
orthoinspector_hs_biomart2 = pd.read_csv('../orthoinspector/uniprot_ensembl_map_trembl.tsv',
                                         sep='\t', header=0, names=['HS_ENSG', 'HS_UNIPROT'])


# Find missing genes to scrape: accessions in the full list that none of
# the three mapping files cover.
orthoinspector_hs_full_set = set(orthoinspector_hs_full['HS_UNIPROT'])
orthoinspector_hs_mapping_set = set(orthoinspector_hs_mapping['HS_UNIPROT'])
orthoinspector_hs_biomart1_set = set(orthoinspector_hs_biomart1['HS_UNIPROT'])
orthoinspector_hs_biomart2_set = set(orthoinspector_hs_biomart2['HS_UNIPROT'])
orthoinspector_hs_to_scrape = sorted(orthoinspector_hs_full_set
                                     - orthoinspector_hs_mapping_set
                                     - orthoinspector_hs_biomart1_set
                                     - orthoinspector_hs_biomart2_set)

print("Full set size: {}".format(len(orthoinspector_hs_full_set)))
print("Mapping set size: {}".format(len(orthoinspector_hs_mapping_set)))
print("SwissProt set size: {}".format(len(orthoinspector_hs_biomart1_set)))
print("Trembl set size: {}".format(len(orthoinspector_hs_biomart2_set)))
print("Scraping set size: {}\n".format(len(orthoinspector_hs_to_scrape)))

orthoinspector_hs_scraped = get_mapping_hs(orthoinspector_hs_to_scrape)

# Manual entry:
# http://www.uniprot.org/uniprot/O75044.txt?version=151
#    http://www.ensembl.org/Homo_sapiens/Gene/Matches?g=ENSG00000266028
orthoinspector_hs_scraped.add(('O75044', 'ENSG00000266028'))

# Write to file.
# newline='' hands line-ending control to the csv module (avoids blank rows
# on Windows); sorting on the whole (uniprot, ensembl) tuple keeps the row
# order reproducible when one accession maps to several genes.
with open(ORTHOINSPECTOR_HS_OUTPUT, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(sorted(orthoinspector_hs_scraped))

Full set size: 7317
Mapping set size: 7279
SwissProt set size: 15
Trembl set size: 10
Scraping set size: 13

http://www.uniprot.org/uniprot/C9JQ55.txt?version=26
C9JQ55 {'ENSG00000186008'}
http://www.uniprot.org/uniprot/E7EMT2.txt?version=17
E7EMT2 {'ENSG00000232856'}
http://www.uniprot.org/uniprot/F5H7H8.txt?version=13
F5H7H8 {'ENSG00000157152'}
http://www.uniprot.org/uniprot/F8VZW7.txt?version=12
F8VZW7 {'ENSG00000257717'}
http://www.uniprot.org/uniprot/F8WBX1.txt?version=13
F8WBX1 {'ENSG00000257961'}
http://www.uniprot.org/uniprot/F8WCE9.txt?version=11
F8WCE9 {'ENSG00000258085'}
http://www.uniprot.org/uniprot/F8WEP6.txt?version=15
F8WEP6 {'ENSG00000257882'}
http://www.uniprot.org/uniprot/H0Y5L6.txt?version=18
H0Y5L6 {'ENSG00000196468'}
http://www.uniprot.org/uniprot/H0YHH2.txt?version=9
H0YHH2 {'ENSG00000257278'}
http://www.uniprot.org/uniprot/H0YIG9.txt?version=9
H0YIG9 {'ENSG00000257887'}
http://www.uniprot.org/uniprot/H0YIQ1.txt?version=9
H0YIQ1 {'ENSG00000258246', 'ENSG000002573