In [1]:
import time
import polars as pl
import requests
import json
import pathlib
from typing import List, Tuple
from unipressed import IdMappingClient

In [2]:
gene_id_tsv = "../test/zea_mays_test/zea_mays_random_gene_list.tsv"
query_db = "Ensembl_Genomes"
target_db = "UniProtKB"
json_dir = "zea_mays_random_gene_afinfo"
data_url = "cifUrl" # or "pdbUrl", "bcifUrl", "paeImageUrl", "paeDocUrl"
structure_dir = "zea_mays_random_gene_mmcif"
id_mapping_all_file = "zea_mays_random_gene_idmapping_all.tsv"

In [3]:
# Parameters
gene_id_tsv = "/tmp/l757vlf2/stg9262bae7-7af8-4d7d-a88b-b70871629919/arabidopsis_random_100genes_list.tsv"
query_db = "Ensembl_Genomes"
target_db = "UniProtKB"
json_dir = "at_100_genes_afinfo_json"
data_url = "cifUrl"
structure_dir = "at_100_genes_mmcif"
id_mapping_all_file = "at_100_genes_idmapping_all.tsv"


&nbsp;

&nbsp;

## 1. UniProt ID mapping step

In [4]:
def chunk_list(lst: List, chunk_size: int) -> List[List]:
    """Split a gene list into chunks"""
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

In [5]:
def batch_id_mapping(from_db: str, to_db: str, ids: List[str], chunk_size: int = 100) -> Tuple[pl.DataFrame, List[str]]:
    """function for batch id mapping"""
    all_results = []
    all_unmapped = []
    chunked_ids = chunk_list(ids, chunk_size)

    for i, chunk in enumerate(chunked_ids):
        print(f"Processing chunk {i+1}/{len(chunked_ids)}...")
        
        # create request and run
        request = IdMappingClient.submit(source=from_db, dest=to_db, ids=chunk)
        
        # process results
        chunk_results = list(request.each_result())
        mapped_results = [{"from": item["from"], "to": item["to"]} for item in chunk_results]
        all_results.extend(mapped_results)

        # record unmapped ids
        mapped_ids = set(item["from"] for item in mapped_results)
        unmapped = [id for id in chunk if id not in mapped_ids]
        all_unmapped.extend(unmapped)

        # avoid API rate limit
        time.sleep(3)

    # convert results to DataFrame
    final_df = pl.DataFrame(all_results)
    return final_df, all_unmapped

In [6]:
gene_ids = pl.read_csv(
    gene_id_tsv,
    separator='\t'
).get_column("From").to_list()

mapped_df, unmapped_ids = batch_id_mapping(
    query_db,
    target_db,
    gene_ids
)

display(mapped_df)
display(unmapped_ids)

Processing chunk 1/1...


from,to
str,str
"""AT5G63380""","""Q84P23"""
"""AT3G60530""","""O49743"""
"""AT2G38220""","""B3H754"""
"""AT5G57190""","""F4KAK5"""
"""AT1G31770""","""Q9C6W5"""
…,…
"""AT2G29570""","""Q9ZW35"""
"""AT5G62160""","""Q9FIS2"""
"""AT5G61690""","""Q9FKF2"""
"""AT2G24693""","""Q3E7S1"""


['AT5G06610',
 'AT5G39140',
 'AT4G04180',
 'AT2G16016',
 'AT5G06065',
 'AT3G57180',
 'AT1G37000',
 'AT3G07875',
 'AT1G54953',
 'AT1G40087',
 'AT1G15370',
 'AT1G15140',
 'AT1G78400',
 'AT4G21192',
 'AT3G45190',
 'AT2G47190',
 'AT5G40430',
 'AT5G67020',
 'AT1G72500',
 'AT4G01870',
 'AT5G04025',
 'AT5G16940',
 'AT4G06085',
 'AT3G50170',
 'AT5G40470',
 'AT5G36810',
 'AT4G08005',
 'AT5G09070',
 'AT1G12530',
 'AT3G03145',
 'ATMG09450',
 'AT2G23890',
 'AT1G09947',
 'AT1G15840',
 'AT3G05100',
 'AT5G03430',
 'AT4G06080',
 'AT5G59616',
 'AT1G65340',
 'AT2G07695',
 'AT3G03615',
 'AT3G01665',
 'AT4G07535',
 'AT1G54650',
 'AT4G08874',
 'AT4G05655',
 'AT1G08310',
 'AT5G10830',
 'AT2G09480',
 'AT1G80140',
 'AT3G07975',
 'AT5G01075',
 'AT1G80990',
 'AT1G06967']

In [7]:
mapped_df2 = mapped_df.rename(
    {
        "from": "From",
        "to": "UniProt Accession"
    }
)

display(mapped_df2)

From,UniProt Accession
str,str
"""AT5G63380""","""Q84P23"""
"""AT3G60530""","""O49743"""
"""AT2G38220""","""B3H754"""
"""AT5G57190""","""F4KAK5"""
"""AT1G31770""","""Q9C6W5"""
…,…
"""AT2G29570""","""Q9ZW35"""
"""AT5G62160""","""Q9FIS2"""
"""AT5G61690""","""Q9FKF2"""
"""AT2G24693""","""Q3E7S1"""


In [8]:
print(mapped_df.is_empty())
print(len(unmapped_ids) == 0)


False
False


&nbsp;

&nbsp;

## 2. UniProt re-mapping 

## 3. Concatenate the two dataframes

## 4. AlphaFoldDB metadata JSON files

In [9]:
def fetch_uniprot_data(ensembl_ids: List[str]) -> pl.DataFrame:
    results = []

    for id in ensembl_ids:
        print(f"Processing {id}...")
        url = (
            f"https://rest.uniprot.org/uniprotkb/search?"
            f"query=gene:{id}&format=json"
        )
        response = requests.get(url)
        
        if response.status_code == 200:
            data = json.loads(response.text)
            for item in data.get('results', []):
                primary_accession = item.get('primaryAccession', '')
                secondary_accessions = item.get('secondaryAccessions', [])
                all_accessions = [primary_accession] + secondary_accessions
                
                for accession in all_accessions:
                    entry = {
                        "From": id,
                        "UniProt Accession": accession
                    }
                    
                    # Check if the accession is a match for the gene 
                    # (e.g. Os03g0293000 matches OrderedLocusNames)
                    match_found = False
                    for gene in item.get('genes', []):
                        for locus in gene.get('orderedLocusNames', []):
                            if locus.get('value', '') == id:
                                match_found = True
                                break
                        if match_found:
                            break
                    
                    if match_found:
                        results.append(entry)
        else:
            print(f"Error fetching data for {id}: {response.status_code}")
        
        time.sleep(1)

    return pl.DataFrame(results)

In [10]:
def get_af_json(dataframe: pl.DataFrame, target_dir: str):
    """
    Get JSON file from AlphaFoldDB
    """
    pathlib.Path(target_dir).mkdir(parents=True, exist_ok=True)
    
    for row in dataframe.iter_rows():
        gene_id = row[0]
        uniprot_id = row[1]
        
        json_file_name = pathlib.Path(target_dir) / f"{gene_id}_{uniprot_id}_info.json"
        
        if json_file_name.exists():
            message_1 = f"{json_file_name} already exists"
            print(message_1)
            continue
        
        request_url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}"
        
        try:
            response = requests.get(request_url, headers={"Accept": "application/json"}, timeout=30)
            response.raise_for_status()
            
            if response.text:
                data = json.loads(response.text) # parse json
                if isinstance(data, list) and len(data) > 0:
                    message_2 = f"AlphaFold ID {uniprot_id} found in AlphaFold"
                    print(message_2)
                    with open(json_file_name, 'w') as f:
                        json.dump(data[0], f, indent=4)
                else:
                    message_3 = f"AlphaFold ID {uniprot_id} not found in AlphaFold"
                    print(message_3)
            else:
                message_4 = f"Empty response for AlphaFold ID {uniprot_id}"
                print(message_4)
        except requests.exceptions.RequestException as e:
            message_5 = f"Request failed: {e}"
            print(message_5)
            message_6 = f"AlphaFold ID {uniprot_id} not found in AlphaFold"
            print(message_6)
        time.sleep(5)

In [11]:
unmapped_df = fetch_uniprot_data(unmapped_ids)
display(unmapped_df)

Processing AT5G06610...


Processing AT5G39140...


Processing AT4G04180...


Processing AT2G16016...


Processing AT5G06065...


Processing AT3G57180...


Processing AT1G37000...


Processing AT3G07875...


Processing AT1G54953...


Processing AT1G40087...


Processing AT1G15370...


Processing AT1G15140...


Processing AT1G78400...


Processing AT4G21192...


Processing AT3G45190...


Processing AT2G47190...


Processing AT5G40430...


Processing AT5G67020...


Processing AT1G72500...


Processing AT4G01870...


Processing AT5G04025...


Processing AT5G16940...


Processing AT4G06085...


Processing AT3G50170...


Processing AT5G40470...


Processing AT5G36810...


Processing AT4G08005...


Processing AT5G09070...


Processing AT1G12530...


Processing AT3G03145...


Processing ATMG09450...


Processing AT2G23890...


Processing AT1G09947...


Processing AT1G15840...


Processing AT3G05100...


Processing AT5G03430...


Processing AT4G06080...


Processing AT5G59616...


Processing AT1G65340...


Processing AT2G07695...


Processing AT3G03615...


Processing AT3G01665...


Processing AT4G07535...


Processing AT1G54650...


Processing AT4G08874...


Processing AT4G05655...


Processing AT1G08310...


Processing AT5G10830...


Processing AT2G09480...


Processing AT1G80140...


Processing AT3G07975...


Processing AT5G01075...


Processing AT1G80990...


Processing AT1G06967...


In [12]:
if len(unmapped_ids) > 0:
    unmapped_df = fetch_uniprot_data(unmapped_ids)
    display(unmapped_df)

    # Concatenate the two dataframes
    if not unmapped_df.is_empty():
        id_mapping_df = pl.concat(
            [
                mapped_df2,
                unmapped_df
            ],
            how="vertical_relaxed"
        ).sort(
            by="From",
            descending=False
        )
        display(id_mapping_df)
        # Get AlphaFold metadata JSON files
        get_af_json(id_mapping_df, json_dir)
    else:
        print("unmapped dataframe is empty, skipping get_af_json.")
        get_af_json(mapped_df2, json_dir)
else:
    print("unmapped_ids is empty, skipping fetch_uniprot_data.")
    get_af_json(mapped_df2, json_dir)

Processing AT5G06610...


Processing AT5G39140...


Processing AT4G04180...


Processing AT2G16016...


Processing AT5G06065...


Processing AT3G57180...


Processing AT1G37000...


Processing AT3G07875...


Processing AT1G54953...


Processing AT1G40087...


Processing AT1G15370...


Processing AT1G15140...


Processing AT1G78400...


Processing AT4G21192...


Processing AT3G45190...


Processing AT2G47190...


Processing AT5G40430...


Processing AT5G67020...


Processing AT1G72500...


Processing AT4G01870...


Processing AT5G04025...


Processing AT5G16940...


Processing AT4G06085...


Processing AT3G50170...


Processing AT5G40470...


Processing AT5G36810...


Processing AT4G08005...


Processing AT5G09070...


Processing AT1G12530...


Processing AT3G03145...


Processing ATMG09450...


Processing AT2G23890...


Processing AT1G09947...


Processing AT1G15840...


Processing AT3G05100...


Processing AT5G03430...


Processing AT4G06080...


Processing AT5G59616...


Processing AT1G65340...


Processing AT2G07695...


Processing AT3G03615...


Processing AT3G01665...


Processing AT4G07535...


Processing AT1G54650...


Processing AT4G08874...


Processing AT4G05655...


Processing AT1G08310...


Processing AT5G10830...


Processing AT2G09480...


Processing AT1G80140...


Processing AT3G07975...


Processing AT5G01075...


Processing AT1G80990...


Processing AT1G06967...


unmapped dataframe is empty, skipping get_af_json.
AlphaFold ID Q84P23 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/O49743
AlphaFold ID O49743 not found in AlphaFold


AlphaFold ID B3H754 found in AlphaFold


AlphaFold ID F4KAK5 found in AlphaFold


AlphaFold ID Q9C6W5 found in AlphaFold


AlphaFold ID Q9ZVK1 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/O23038
AlphaFold ID O23038 not found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/Q6R8G8
AlphaFold ID Q6R8G8 not found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/Q8LCH5
AlphaFold ID Q8LCH5 not found in AlphaFold


AlphaFold ID O04378 found in AlphaFold


AlphaFold ID P49107 found in AlphaFold


AlphaFold ID O80795 found in AlphaFold


AlphaFold ID Q9FX36 found in AlphaFold


AlphaFold ID Q9LR78 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/Q8W566
AlphaFold ID Q8W566 not found in AlphaFold


AlphaFold ID Q9FML6 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/O23469
AlphaFold ID O23469 not found in AlphaFold


AlphaFold ID Q2V3K0 found in AlphaFold


AlphaFold ID Q8W593 found in AlphaFold


AlphaFold ID Q9MA83 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/Q9C5C4
AlphaFold ID Q9C5C4 not found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/Q93Y00
AlphaFold ID Q93Y00 not found in AlphaFold


AlphaFold ID Q9SAJ4 found in AlphaFold


AlphaFold ID A7XDQ9 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/P0C6E7
AlphaFold ID P0C6E7 not found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/Q9FK59
AlphaFold ID Q9FK59 not found in AlphaFold


AlphaFold ID Q9MAT5 found in AlphaFold


AlphaFold ID Q9FID8 found in AlphaFold


AlphaFold ID Q9LYD8 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/Q8RXN0
AlphaFold ID Q8RXN0 not found in AlphaFold


AlphaFold ID P30187 found in AlphaFold


AlphaFold ID Q9LE86 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/Q9SCM4
AlphaFold ID Q9SCM4 not found in AlphaFold


AlphaFold ID Q8RWE5 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/Q9SQN9
AlphaFold ID Q9SQN9 not found in AlphaFold


AlphaFold ID Q9C7K0 found in AlphaFold


AlphaFold ID Q9SJH7 found in AlphaFold


AlphaFold ID Q8LF89 found in AlphaFold


AlphaFold ID F4JCB2 found in AlphaFold


AlphaFold ID Q93ZH0 found in AlphaFold


AlphaFold ID Q84K35 found in AlphaFold


AlphaFold ID Q9ZW35 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/Q9FIS2
AlphaFold ID Q9FIS2 not found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/Q9FKF2
AlphaFold ID Q9FKF2 not found in AlphaFold


AlphaFold ID Q3E7S1 found in AlphaFold


AlphaFold ID Q38997 found in AlphaFold


&nbsp;

&nbsp;

## 5. Download CIF files

In [13]:
def get_cif_file(json_dir_path: str, output_dir_path: str):
    """
    Get CIF file from JSON files retrieved from AlphaFoldDB
    
    Args:
        json_dir_path: Directory containing AlphaFold JSON metadata files
        output_dir_path: Directory to save downloaded CIF files
    """
    pathlib.Path(output_dir_path).mkdir(parents=True, exist_ok=True)

    for json_file in pathlib.Path(json_dir_path).glob("*.json"):
        with open(json_file, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
                cif_url = data.get(data_url)
                
                if not cif_url:
                    print(f"No {data_url} found in {json_file}")
                    continue
                    
                # Extract filename from URL and create output path
                cif_filename = pathlib.Path(cif_url).name
                output_file = pathlib.Path(output_dir_path) / cif_filename
                
                # Skip if file already exists
                if output_file.exists():
                    print(f"{output_file} already exists")
                    continue
                
                print(f"Downloading {cif_url}")
                response = requests.get(cif_url, timeout=30)
                response.raise_for_status()
                
                # Save CIF file
                output_file.write_bytes(response.content)
                print(f"Saved {output_file}")
                
            except json.JSONDecodeError:
                print(f"Error parsing JSON file: {json_file}")
            except requests.exceptions.RequestException as e:
                print(f"Error downloading CIF file: {e}")
            
            # Rate limiting
            time.sleep(1)

In [14]:
get_cif_file(json_dir, structure_dir)

Downloading https://alphafold.ebi.ac.uk/files/AF-Q38997-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q38997-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q3E7S1-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q3E7S1-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9ZW35-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9ZW35-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q84K35-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q84K35-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q93ZH0-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q93ZH0-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-F4JCB2-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-F4JCB2-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q8LF89-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q8LF89-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9SJH7-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9SJH7-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9C7K0-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9C7K0-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q8RWE5-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q8RWE5-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9LE86-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9LE86-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-P30187-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-P30187-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9LYD8-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9LYD8-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9FID8-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9FID8-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9MAT5-2-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9MAT5-2-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-A7XDQ9-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-A7XDQ9-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9SAJ4-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9SAJ4-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9MA83-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9MA83-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q8W593-2-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q8W593-2-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q2V3K0-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q2V3K0-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9FML6-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9FML6-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9LR78-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9LR78-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9FX36-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9FX36-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-O80795-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-O80795-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-P49107-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-P49107-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-O04378-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-O04378-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9ZVK1-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9ZVK1-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q9C6W5-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q9C6W5-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-F4KAK5-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-F4KAK5-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-B3H754-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-B3H754-F1-model_v6.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q84P23-F1-model_v6.cif


Saved at_100_genes_mmcif/AF-Q84P23-F1-model_v6.cif


&nbsp;

&nbsp;

## 6. Save all results

In [15]:
# Save all results
if len(unmapped_ids) > 0 and not unmapped_df.is_empty():
    id_mapping_df.write_csv(id_mapping_all_file, separator="\t")

elif unmapped_df.is_empty():
    print("re-mapping process is skipped, mapping results are saved in id_mapping_df.write_csv.")
    mapped_df2.write_csv(id_mapping_all_file, separator="\t")

else:
    print("unmapped_ids is empty, skipping re-mapping process.")
    mapped_df2.write_csv(id_mapping_all_file, separator="\t")

re-mapping process is skipped, mapping results are saved in id_mapping_df.write_csv.
