In [11]:
import requests

def url_func_req(accession):
	"""Build the UniProt REST URL for one entry's function comments.

	The URL points at the JSON representation of the UniProtKB entry named by
	`accession` and restricts the response to the `cc_function` field.
	"""
	endpoint = f"https://rest.uniprot.org/uniprotkb/{accession}.json?fields=cc_function"
	return endpoint

def get_func_desc(accession, timeout=30):
	"""Fetch the function-comment JSON for a single accession (blocking call).

	Args:
		accession: UniProtKB accession, e.g. "A1A519".
		timeout: Seconds to wait for the server before giving up. Without a
			timeout, requests waits indefinitely on a stalled connection.

	Returns:
		The decoded JSON body of the response. The HTTP status is not checked,
		so an error payload returned by the server is passed through as-is.

	Raises:
		requests.exceptions.Timeout: if the server does not answer in time.
		requests.exceptions.RequestException: on other transport failures.
		ValueError: if the response body is not valid JSON.
	"""
	url = url_func_req(accession)
	resp = requests.get(url, timeout=timeout)
	return resp.json()

# Smoke test: fetch a single entry synchronously and keep its JSON payload in `j`.
j = get_func_desc("A1A519")

In [12]:
def parse_func_desc(desc):
	"""Extract the first function-comment text from a decoded entry payload.

	Args:
		desc: Decoded JSON for one entry (a dict), or None when the fetch
			for that entry failed and produced no payload.

	Returns:
		The "value" of the first text of the first comment, or None when
		`desc` is not a dict or any step of that path is missing or empty.
	"""
	# A failed fetch yields None rather than a dict; treat it as "no description"
	# instead of raising, so one bad response cannot abort a whole batch.
	if not isinstance(desc, dict):
		return None
	comments = desc.get("comments")
	if not comments:
		return None
	# Only the first comment / first text is used; later ones are ignored.
	texts = comments[0].get("texts")
	if not texts:
		return None
	return texts[0].get("value")

In [13]:
import pandas as pd

In [14]:
# Load the input table from parquet; its "accession" column drives the lookups below.
df = pd.read_parquet("../data/569k-protein-embeddings.parquet")

In [15]:
# Accessions as a plain Python list, in the row order of df.
accs = df["accession"].tolist()

In [16]:
import asyncio
import aiohttp

In [27]:
async def get(url, session):
    """Fetch `url` through `session` and return its decoded JSON body.

    Best-effort: any exception raised while requesting or decoding is
    reported on stdout (exception class only) and None is returned instead.
    """
    try:
        async with session.get(url=url) as reply:
            payload = await reply.json()
    except Exception as err:
        print("Unable to get url {} due to {}.".format(url, err.__class__))
        return None
    return payload

In [28]:
async def parallel_fetch(urls):
    """Fetch every URL in `urls` concurrently over one shared client session.

    Returns a list with one entry per URL, each being whatever `get`
    returned for it; asyncio.gather yields results in the order the
    awaitables were passed, i.e. the order of `urls`.
    """
    async with aiohttp.ClientSession() as client:
        pending = [get(link, client) for link in urls]
        fetched = await asyncio.gather(*pending)
    return fetched

In [29]:
# One request URL per accession, in the same order as `accs`.
urls = [url_func_req(acc) for acc in accs]

In [30]:
from tqdm import tqdm

# Number of URLs handed to parallel_fetch at once; the next batch starts only
# after every request of the previous batch has completed.
batch_size = 1000
# One parsed entry per URL, appended batch by batch. This list is later assigned
# as a DataFrame column, so it must stay the same length and order as `urls`.
results = []
for i in tqdm(range(0, len(urls), batch_size)):
	b = urls[i:i+batch_size]
	# Top-level await: works inside an IPython/Jupyter cell; a plain script
	# would need to drive this through asyncio.run instead.
	res = await parallel_fetch(b)
	results.extend([parse_func_desc(d) for d in res])

100%|██████████| 570/570 [18:31<00:00,  1.95s/it]


In [34]:
# Attach the parsed descriptions as a new "function" column; this mutates df in place.
df["function"] = results;

In [37]:
# Write the augmented table to a separate parquet file, leaving the input file untouched.
df.to_parquet("../data/569k-protein-embeddings-large.parquet")