In [1]:
import time
import warnings

import requests

def _extract_uniprot_ids(data):
    """Return the UniProt xref IDs found in a decoded ChEMBL target record."""
    if not isinstance(data, dict):
        return []

    uniprot_ids = []
    # `or []` tolerates keys that are present in the JSON but set to null.
    for component in data.get('target_components') or []:
        for xref in component.get('target_component_xrefs') or []:
            if xref.get('xref_src_db') == 'UniProt':
                uniprot_ids.append(xref['xref_id'])
    return uniprot_ids


def get_uniprot_ids(chembl_id, timeout=30):
    """Fetch the UniProt IDs cross-referenced by a ChEMBL target record.

    Requests ``/chembl/api/data/target/<chembl_id>.json`` from the EBI web
    service and collects the ``xref_id`` of every component cross-reference
    whose ``xref_src_db`` is ``'UniProt'``.

    Args:
        chembl_id: ChEMBL target identifier interpolated into the request URL.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang a long batch loop.

    Returns:
        A list of UniProt IDs in the order they appear in the response. The
        list is empty when the record has no UniProt cross-references, or
        when the request or the parsing fails (a warning is emitted in the
        failure cases so the dropped ID is visible).
    """
    url = f"https://www.ebi.ac.uk/chembl/api/data/target/{chembl_id}.json"

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network/HTTP failures and undecodable bodies stay best-effort, but
        # are reported; anything else (e.g. KeyboardInterrupt) propagates.
        warnings.warn(f"Could not fetch ChEMBL target {chembl_id}: {exc}")
        return []

    try:
        return _extract_uniprot_ids(data)
    except (AttributeError, KeyError, TypeError) as exc:
        warnings.warn(f"Unexpected ChEMBL payload for {chembl_id}: {exc!r}")
        return []


In [2]:
# Read the ChEMBL target IDs for each predictor: one ID per line, with
# surrounding whitespace stripped and blank lines dropped.
with open('./data/target_prediction/ppb2/ppb_chembl_ids.txt', 'r') as f:
    ppb_chembl_ids = list(filter(None, map(str.strip, f)))

with open('./data/target_prediction/multitask/multitask_chembl_ids.txt', 'r') as f:
    multitask_chembl_ids = list(filter(None, map(str.strip, f)))
        
    

In [5]:
# Map every PPB2 ChEMBL target ID to its UniProt ID(s).
all_uniprot_ids = []
for chembl_id in ppb_chembl_ids:
    uniprot_ids = get_uniprot_ids(chembl_id)
    # A target may map to zero, one, or several UniProt IDs; record one
    # (ChEMBL ID, UniProt ID) pair per match.
    all_uniprot_ids.extend((chembl_id, uniprot_id) for uniprot_id in uniprot_ids)
    time.sleep(0.1)  # Be nice to the API

# Remove duplicate pairs and sort them so the output file is identical across
# runs (plain set iteration order changes with string hash randomization).
unique_uniprot_ids = sorted(set(all_uniprot_ids))
with open('data/target_prediction/ppb2/ppb_uniprot_ids.txt', 'w') as f:
    for chembl_id, uniprot_id in unique_uniprot_ids:
        f.write(f"{chembl_id}\t{uniprot_id}\n")

# NOTE: the second figure counts unique (ChEMBL ID, UniProt ID) pairs.
print(f"Processed {len(ppb_chembl_ids)} ChEMBL IDs")
print(f"Found {len(unique_uniprot_ids)} unique UniProt IDs")


Processed 962 ChEMBL IDs
Found 4748 unique UniProt IDs


In [None]:
# Map every multitask ChEMBL target ID to its UniProt ID(s).
all_uniprot_ids = []
for chembl_id in multitask_chembl_ids:
    uniprot_ids = get_uniprot_ids(chembl_id)
    # A target may map to zero, one, or several UniProt IDs; record one
    # (ChEMBL ID, UniProt ID) pair per match.
    all_uniprot_ids.extend((chembl_id, uniprot_id) for uniprot_id in uniprot_ids)
    time.sleep(0.1)  # Be nice to the API

# Remove duplicate pairs and sort them so the output file is identical across
# runs (plain set iteration order changes with string hash randomization).
unique_uniprot_ids = sorted(set(all_uniprot_ids))
with open('data/target_prediction/multitask/multitask_uniprot_ids.txt', 'w') as f:
    for chembl_id, uniprot_id in unique_uniprot_ids:
        f.write(f"{chembl_id}\t{uniprot_id}\n")

# NOTE: the second figure counts unique (ChEMBL ID, UniProt ID) pairs.
print(f"Processed {len(multitask_chembl_ids)} ChEMBL IDs")
print(f"Found {len(unique_uniprot_ids)} unique UniProt IDs")
