In [1]:
import time
import polars as pl
import requests
import json
import pathlib
from typing import List, Tuple
from unipressed import IdMappingClient

In [2]:
# Default notebook parameters.
# TSV file holding the query gene IDs; its "From" column is read further down.
gene_id_tsv = "../test/oryza_sativa_test/oryza_sativa_random_gene_list.tsv"
# Source and destination database names handed to the UniProt ID mapping call.
query_db = "Ensembl_Genomes"
target_db = "UniProtKB"
# Directory that receives the AlphaFoldDB metadata JSON files.
json_dir = "rice_up_afinfo"
# Key of the metadata field whose URL is downloaded by get_cif_file.
data_url = "cifUrl" # or "pdbUrl", "bcifUrl", "paeImageUrl", "paeDocUrl"
# Directory that receives the downloaded structure files.
structure_dir = "rice_up_mmcif"
# Output TSV for the final gene ID -> UniProt accession table.
id_mapping_all_file = "rice_up_idmapping_all.tsv"

In [3]:
# Parameters
# These assignments override the defaults set in the previous cell (same names).
# NOTE(review): gene_id_tsv is an absolute temporary path — presumably injected
# by a parameterised run (e.g. papermill); confirm before re-running by hand.
gene_id_tsv = "/tmp/08g7e6sx/stg188bc4ca-f07d-41a8-a973-88f74a9ceef6/oryza_sativa_random_gene_list.tsv"
query_db = "Ensembl_Genomes"
target_db = "UniProtKB"
json_dir = "rice_random_gene_afinfo"
data_url = "cifUrl"
structure_dir = "rice_random_gene_mmcif"
id_mapping_all_file = "rice_random_gene_idmapping_all.tsv"


&nbsp;

&nbsp;

## 1. UniProt ID mapping step

In [4]:
def chunk_list(lst: List, chunk_size: int) -> List[List]:
    """Split a list into consecutive chunks of at most ``chunk_size`` items.

    Args:
        lst: Items to split (gene IDs in this notebook).
        chunk_size: Maximum number of items per chunk; must be >= 1.

    Returns:
        Slices of ``lst`` in their original order. The last chunk may be
        shorter than ``chunk_size``; an empty ``lst`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step makes range() empty, which would silently drop every
    # item; zero fails with an unrelated "arg 3 must not be zero" message.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

In [5]:
def batch_id_mapping(from_db: str, to_db: str, ids: List[str], chunk_size: int = 100) -> Tuple[pl.DataFrame, List[str]]:
    """Map identifiers between databases with the UniProt ID mapping service.

    The IDs are submitted in chunks of ``chunk_size``; each chunk is one
    ID mapping job submitted through ``IdMappingClient``.

    Args:
        from_db: Source database name (e.g. "Ensembl_Genomes").
        to_db: Destination database name (e.g. "UniProtKB").
        ids: Identifiers to map.
        chunk_size: Number of identifiers submitted per job.

    Returns:
        A tuple ``(mapped, unmapped)``: ``mapped`` is a DataFrame with the
        string columns "from" and "to" (one row per mapping, so an ID may
        appear several times); ``unmapped`` lists the submitted IDs that have
        no row in ``mapped``.

    Raises:
        RuntimeError: If a mapping job reports the status "ERROR".
        TimeoutError: If a mapping job does not finish within the polling
            budget.
    """
    poll_interval = 1.0  # seconds between status checks
    max_polls = 120      # give up on a single job after about two minutes

    all_results = []
    all_unmapped = []
    chunked_ids = chunk_list(ids, chunk_size)

    for i, chunk in enumerate(chunked_ids):
        print(f"Processing chunk {i+1}/{len(chunked_ids)}...")

        # create request and run
        request = IdMappingClient.submit(source=from_db, dest=to_db, ids=chunk)

        # The job is processed remotely; wait until it is finished before
        # asking for results instead of reading them straight after submit.
        for _ in range(max_polls):
            status = request.get_status()
            if status == "FINISHED":
                break
            if status == "ERROR":
                raise RuntimeError(f"ID mapping job failed for chunk {i+1}")
            time.sleep(poll_interval)
        else:
            raise TimeoutError(f"ID mapping job for chunk {i+1} did not finish in time")

        # process results
        mapped_results = [
            {"from": item["from"], "to": item["to"]}
            for item in request.each_result()
        ]
        all_results.extend(mapped_results)

        # record unmapped ids
        mapped_ids = {item["from"] for item in mapped_results}
        all_unmapped.extend(gene_id for gene_id in chunk if gene_id not in mapped_ids)

        # avoid API rate limit (no need to wait after the final chunk)
        if i < len(chunked_ids) - 1:
            time.sleep(3)

    # Without any rows polars cannot infer the columns, and callers that
    # rename "from"/"to" would fail; return an explicitly typed empty frame.
    if not all_results:
        return pl.DataFrame(schema={"from": pl.Utf8, "to": pl.Utf8}), all_unmapped

    # convert results to DataFrame
    final_df = pl.DataFrame(all_results)
    return final_df, all_unmapped

In [6]:
# Read the query gene IDs from the "From" column of the input TSV.
gene_ids = pl.read_csv(gene_id_tsv, separator="\t")["From"].to_list()

# Map the gene IDs from query_db to target_db in batches.
mapped_df, unmapped_ids = batch_id_mapping(from_db=query_db, to_db=target_db, ids=gene_ids)

# Show the mapping table and the IDs that could not be mapped.
display(mapped_df)
display(unmapped_ids)

Processing chunk 1/1...


from,to
str,str
"""Os01g0187600""","""A0A0P0UZ77"""
"""Os12g0129300""","""A0A0P0Y6G7"""
"""Os12g0129300""","""B9GBP4"""
"""Os12g0159500""","""A0A0P0Y794"""
"""Os12g0159500""","""A0A8J8YJ44"""
…,…
"""Os04g0391500""","""Q01LC6"""
"""Os01g0795250""","""A0A0P0V975"""
"""Os01g0859200""","""A0A0P0VAK7"""
"""Os01g0859200""","""A0A0P0VAM0"""


[]

In [7]:
# Use the same column names as the input TSV ("From") plus a descriptive
# name for the mapped accession.
mapped_df2 = mapped_df.rename({"from": "From", "to": "UniProt Accession"})

display(mapped_df2)

From,UniProt Accession
str,str
"""Os01g0187600""","""A0A0P0UZ77"""
"""Os12g0129300""","""A0A0P0Y6G7"""
"""Os12g0129300""","""B9GBP4"""
"""Os12g0159500""","""A0A0P0Y794"""
"""Os12g0159500""","""A0A8J8YJ44"""
…,…
"""Os04g0391500""","""Q01LC6"""
"""Os01g0795250""","""A0A0P0V975"""
"""Os01g0859200""","""A0A0P0VAK7"""
"""Os01g0859200""","""A0A0P0VAM0"""


In [8]:
# True when the mapping produced no rows at all.
print(mapped_df.height == 0)
# True when every submitted ID was mapped.
print(not unmapped_ids)


False
True


&nbsp;

&nbsp;

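## 2. UniProt re-mapping

Gene IDs that step 1 could not map are looked up again through the UniProtKB search API (`fetch_uniprot_data`). Steps 2–4 are defined in the next two cells and executed together in the cell after them.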

## 3. Concatenate the two dataframes

## 4. AlphaFoldDB metadata JSON files

In [9]:
def fetch_uniprot_data(ensembl_ids: List[str]) -> pl.DataFrame:
    """Look up UniProt accessions for gene IDs through the UniProtKB search API.

    For every gene ID one ``gene:<id>`` search is sent. An entry is kept only
    when one of its ``orderedLocusNames`` values equals the gene ID exactly
    (e.g. Os03g0293000); for such an entry the primary accession and all
    secondary accessions are reported. Only the first page of search results
    is read.

    Args:
        ensembl_ids: Gene IDs to search for.

    Returns:
        A DataFrame with the string columns "From" (gene ID) and
        "UniProt Accession"; it has these columns even when nothing matched.
    """
    results = []

    for gene_id in ensembl_ids:
        print(f"Processing {gene_id}...")
        try:
            # Let requests build and URL-encode the query string.
            response = requests.get(
                "https://rest.uniprot.org/uniprotkb/search",
                params={"query": f"gene:{gene_id}", "format": "json"},
                timeout=30,
            )

            if response.status_code == 200:
                for item in response.json().get('results', []):
                    # The match depends only on the entry, not on the
                    # individual accession, so it is evaluated once per entry.
                    match_found = any(
                        locus.get('value', '') == gene_id
                        for gene in item.get('genes', [])
                        for locus in gene.get('orderedLocusNames', [])
                    )
                    if not match_found:
                        continue

                    primary_accession = item.get('primaryAccession', '')
                    secondary_accessions = item.get('secondaryAccessions', [])
                    for accession in [primary_accession] + secondary_accessions:
                        results.append(
                            {
                                "From": gene_id,
                                "UniProt Accession": accession
                            }
                        )
            else:
                print(f"Error fetching data for {gene_id}: {response.status_code}")
        except (requests.exceptions.RequestException, ValueError) as e:
            # Network failure or a body that is not valid JSON: report it and
            # carry on with the next ID instead of aborting the whole run.
            print(f"Error fetching data for {gene_id}: {e}")

        # stay below the API rate limit
        time.sleep(1)

    # Without rows polars cannot infer the columns, and concatenating the
    # result with the step-1 table would fail; return a typed empty frame.
    if not results:
        return pl.DataFrame(schema={"From": pl.Utf8, "UniProt Accession": pl.Utf8})

    return pl.DataFrame(results)

In [10]:
def get_af_json(dataframe: pl.DataFrame, target_dir: str) -> None:
    """
    Download AlphaFoldDB prediction metadata as one JSON file per table row.

    Args:
        dataframe: Table whose first column is the gene ID and whose second
            column is the UniProt accession (columns are read by position).
        target_dir: Directory for the JSON files; created if missing.

    For each row the AlphaFoldDB prediction API is queried for the accession
    and the first element of the returned list is saved as
    ``<gene_id>_<accession>_info.json``. Rows whose file already exists are
    skipped without a request. Failures are printed and do not stop the loop.
    """
    target_path = pathlib.Path(target_dir)
    target_path.mkdir(parents=True, exist_ok=True)

    for row in dataframe.iter_rows():
        gene_id, uniprot_id = row[0], row[1]

        json_file_name = target_path / f"{gene_id}_{uniprot_id}_info.json"

        if json_file_name.exists():
            print(f"{json_file_name} already exists")
            continue

        request_url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}"

        try:
            response = requests.get(request_url, headers={"Accept": "application/json"}, timeout=30)
            response.raise_for_status()

            if response.text:
                data = json.loads(response.text) # parse json
                if isinstance(data, list) and len(data) > 0:
                    print(f"AlphaFold ID {uniprot_id} found in AlphaFold")
                    # Written as UTF-8 because get_cif_file reads these files
                    # back with encoding="utf-8".
                    with open(json_file_name, 'w', encoding="utf-8") as f:
                        json.dump(data[0], f, indent=4)
                else:
                    print(f"AlphaFold ID {uniprot_id} not found in AlphaFold")
            else:
                print(f"Empty response for AlphaFold ID {uniprot_id}")
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            failed_response = getattr(e, "response", None)
            if failed_response is not None and failed_response.status_code == 404:
                print(f"AlphaFold ID {uniprot_id} not found in AlphaFold")
            else:
                # Timeouts, connection errors and non-404 statuses do not
                # show that the entry is absent, so do not report "not found".
                print(f"AlphaFold ID {uniprot_id} could not be retrieved from AlphaFold")
        except json.JSONDecodeError as e:
            # A malformed body must not abort the remaining downloads.
            print(f"Invalid JSON in response for AlphaFold ID {uniprot_id}: {e}")
        # rate limiting between API requests
        time.sleep(5)

In [11]:
if len(unmapped_ids) > 0:
    # Second attempt for IDs that the ID mapping service could not resolve
    unmapped_df = fetch_uniprot_data(unmapped_ids)
    display(unmapped_df)

    if unmapped_df.is_empty():
        # Nothing was recovered. An empty frame may have no columns, and
        # concatenating it with the two-column mapping table would raise,
        # so only the step-1 table is used (sorted like the combined one).
        id_mapping_df = mapped_df2.sort(
            by="From",
            descending=False
        )
    else:
        # Concatenate the two dataframes
        id_mapping_df = pl.concat(
            [
                mapped_df2,
                unmapped_df
            ],
            how="vertical_relaxed"
        ).sort(
            by="From",
            descending=False
        )
    display(id_mapping_df)

    # Get AlphaFold metadata JSON files
    get_af_json(id_mapping_df, json_dir)
else:
    print("unmapped_ids is empty, skipping fetch_uniprot_data.")
    get_af_json(mapped_df2, json_dir)

unmapped_ids is empty, skipping fetch_uniprot_data.


AlphaFold ID A0A0P0UZ77 found in AlphaFold


AlphaFold ID A0A0P0Y6G7 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/B9GBP4
AlphaFold ID B9GBP4 not found in AlphaFold


AlphaFold ID A0A0P0Y794 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/A0A8J8YJ44
AlphaFold ID A0A8J8YJ44 not found in AlphaFold


AlphaFold ID B9GBZ8 found in AlphaFold


Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/B9F105
AlphaFold ID B9F105 not found in AlphaFold


AlphaFold ID Q6K1X4 found in AlphaFold


AlphaFold ID A0A0P0WNJ6 found in AlphaFold


AlphaFold ID A0A0N7KKL4 found in AlphaFold


AlphaFold ID Q5VPE3 found in AlphaFold


AlphaFold ID A0A0P0W9M0 found in AlphaFold


AlphaFold ID Q0JDM8 found in AlphaFold


AlphaFold ID Q7XVN5 found in AlphaFold


AlphaFold ID Q01LC6 found in AlphaFold


AlphaFold ID A0A0P0V975 found in AlphaFold


AlphaFold ID A0A0P0VAK7 found in AlphaFold


AlphaFold ID A0A0P0VAM0 found in AlphaFold


AlphaFold ID Q94DD0 found in AlphaFold


&nbsp;

&nbsp;

## 5. Download CIF files

In [12]:
def get_cif_file(json_dir_path: str, output_dir_path: str, url_key: str = ""):
    """
    Download the structure file referenced by each AlphaFoldDB JSON metadata file.

    Args:
        json_dir_path: Directory containing AlphaFold JSON metadata files
        output_dir_path: Directory to save downloaded files (created if missing)
        url_key: Name of the JSON field that holds the download URL. When
            empty (the default) the notebook-level ``data_url`` setting is used.

    Every ``*.json`` file in ``json_dir_path`` is read in sorted order. Files
    that cannot be parsed, that lack the URL field, or whose target file
    already exists are reported and skipped without a request. Download
    errors are printed and do not stop the loop.
    """
    # Local import: needed only to take the path component out of the URL.
    from urllib.parse import urlparse

    url_field = url_key or data_url
    output_dir = pathlib.Path(output_dir_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    for json_file in sorted(pathlib.Path(json_dir_path).glob("*.json")):
        # Read and close the metadata file before any network activity.
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(f"Error parsing JSON file: {json_file}")
            continue

        # Valid JSON that is not an object (e.g. a list) has no fields.
        cif_url = data.get(url_field) if isinstance(data, dict) else None
        if not cif_url or not isinstance(cif_url, str):
            print(f"No {url_field} found in {json_file}")
            continue

        # Extract filename from the URL path (ignores any query string and
        # does not depend on the local OS path rules).
        cif_filename = pathlib.PurePosixPath(urlparse(cif_url).path).name
        output_file = output_dir / cif_filename

        # Skip if file already exists
        if output_file.exists():
            print(f"{output_file} already exists")
            continue

        print(f"Downloading {cif_url}")
        try:
            response = requests.get(cif_url, timeout=30)
            response.raise_for_status()

            # Save downloaded file
            output_file.write_bytes(response.content)
            print(f"Saved {output_file}")
        except requests.exceptions.RequestException as e:
            print(f"Error downloading CIF file: {e}")

        # Rate limiting (only after an actual download attempt)
        time.sleep(1)

In [13]:
# Download the structure file referenced by every metadata JSON in json_dir.
get_cif_file(json_dir_path=json_dir, output_dir_path=structure_dir)

Downloading https://alphafold.ebi.ac.uk/files/AF-A0A0P0UZ77-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-A0A0P0UZ77-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-A0A0P0Y6G7-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-A0A0P0Y6G7-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-A0A0P0Y794-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-A0A0P0Y794-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-B9GBZ8-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-B9GBZ8-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q6K1X4-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-Q6K1X4-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-A0A0P0WNJ6-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-A0A0P0WNJ6-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-A0A0N7KKL4-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-A0A0N7KKL4-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q5VPE3-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-Q5VPE3-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-A0A0P0W9M0-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-A0A0P0W9M0-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q0JDM8-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-Q0JDM8-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q7XVN5-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-Q7XVN5-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q01LC6-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-Q01LC6-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-A0A0P0V975-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-A0A0P0V975-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-A0A0P0VAK7-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-A0A0P0VAK7-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-A0A0P0VAM0-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-A0A0P0VAM0-F1-model_v4.cif


Downloading https://alphafold.ebi.ac.uk/files/AF-Q94DD0-F1-model_v4.cif
Saved rice_random_gene_mmcif/AF-Q94DD0-F1-model_v4.cif


&nbsp;

&nbsp;

## 6. Save all results

In [14]:
# Write the final gene ID -> UniProt accession table as TSV.
if not unmapped_ids:
    # No re-mapping took place, so the step-1 table is the full result.
    print("unmapped_ids is empty, mapping results are saved in id_mapping_df.write_csv.")
    mapped_df2.write_csv(id_mapping_all_file, separator="\t")
else:
    # Combined table built from step 1 plus the re-mapped IDs.
    id_mapping_df.write_csv(id_mapping_all_file, separator="\t")


unmapped_ids is empty, mapping results are saved in id_mapping_df.write_csv.
