In [2]:
import time
import polars as pl
import requests
import json
import pathlib
from typing import List, Tuple
from unipressed import IdMappingClient

In [3]:
# Input TSV with the gene list; the "From" column holds the gene IDs to map.
gene_id_tsv = "../Data/Data_HN5_genelist_rice_2402/HN5_genes_down_rice.tsv"
# Source and destination database names passed to the UniProt ID mapping step.
query_db = "Ensembl_Genomes"
target_db = "UniProtKB"
# Directory where the per-accession AlphaFold metadata JSON files are written.
json_dir = "../out/rice_down/rice_down_afinfo"

## 1. UniProt ID mapping step

In [4]:
def chunk_list(lst: List, chunk_size: int) -> List[List]:
    """Split a list into consecutive chunks of at most ``chunk_size`` items.

    Args:
        lst: Items to split (e.g. gene IDs).
        chunk_size: Maximum number of items per chunk; must be positive.

    Returns:
        Slices of ``lst`` in their original order. The last chunk may be
        shorter than ``chunk_size``; an empty ``lst`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step makes range() empty, which would silently drop every
    # item; a zero step fails with an unrelated-looking range() error.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    return [lst[start:start + chunk_size] for start in range(0, len(lst), chunk_size)]

In [5]:
def batch_id_mapping(from_db: str, to_db: str, ids: List[str], chunk_size: int = 100, poll_interval: float = 1.0, max_wait: float = 300.0) -> Tuple[pl.DataFrame, List[str]]:
    """Map IDs between databases with the UniProt ID mapping service, in batches.

    Args:
        from_db: Source database name passed to ``IdMappingClient.submit``.
        to_db: Destination database name passed to ``IdMappingClient.submit``.
        ids: Identifiers to map.
        chunk_size: Number of IDs submitted per mapping job.
        poll_interval: Seconds to wait between job status checks.
        max_wait: Maximum seconds to wait for a single job to finish.

    Returns:
        A tuple ``(mapped, unmapped)``: ``mapped`` is a DataFrame with string
        columns ``from`` and ``to`` (one row per mapping, so one input ID may
        appear several times); ``unmapped`` lists the input IDs for which no
        mapping was returned, in input order.

    Raises:
        RuntimeError: If a mapping job reports the ``ERROR`` status.
        TimeoutError: If a job does not finish within ``max_wait`` seconds.
    """
    all_results = []
    all_unmapped = []
    chunked_ids = chunk_list(ids, chunk_size)

    for i, chunk in enumerate(chunked_ids):
        print(f"Processing chunk {i+1}/{len(chunked_ids)}...")

        # create request and run
        request = IdMappingClient.submit(source=from_db, dest=to_db, ids=chunk)

        # The job runs asynchronously on the server: wait for it to finish
        # before reading results instead of assuming it is already done.
        deadline = time.monotonic() + max_wait
        while True:
            status = request.get_status()
            if status == "FINISHED":
                break
            if status == "ERROR":
                raise RuntimeError(f"ID mapping job for chunk {i+1} failed")
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"ID mapping job for chunk {i+1} did not finish within {max_wait} s"
                )
            time.sleep(poll_interval)

        # process results
        mapped_results = [
            {"from": item["from"], "to": item["to"]}
            for item in request.each_result()
        ]
        all_results.extend(mapped_results)

        # record unmapped ids
        mapped_ids = {item["from"] for item in mapped_results}
        all_unmapped.extend(
            query_id for query_id in chunk if query_id not in mapped_ids
        )

        # avoid API rate limit
        time.sleep(1)

    # An explicit schema keeps the "from"/"to" columns even when nothing was
    # mapped, so downstream rename/concat steps do not fail on an empty frame.
    final_df = pl.DataFrame(all_results, schema={"from": pl.Utf8, "to": pl.Utf8})
    return final_df, all_unmapped

In [6]:
# Read the gene list and pull the IDs out of its "From" column.
gene_table = pl.read_csv(gene_id_tsv, separator='\t')
gene_ids = gene_table.get_column("From").to_list()

# Run the batched ID mapping from the query database to the target database.
mapped_df, unmapped_ids = batch_id_mapping(query_db, target_db, gene_ids)

# Show the mapping table first, then the IDs that received no mapping.
display(mapped_df)
display(unmapped_ids)

Processing chunk 1/4...
Processing chunk 2/4...
Processing chunk 3/4...
Processing chunk 4/4...


from,to
str,str
"""Os03g0859100""","""A0A0P0W5R9"""
"""Os03g0859100""","""A0A8J8XPT0"""
"""Os03g0859100""","""A0A8J8YIZ0"""
"""Os03g0859100""","""A3APY6"""
"""Os03g0859100""","""A3APY7"""
…,…
"""Os03g0307200""","""Q10MI9"""
"""Os07g0142200""","""A0A0P0X2Q0"""
"""Os07g0142200""","""Q8H4K4"""
"""Os03g0307300""","""Q0DSH9"""


['Os01g0192900',
 'Os09g0249750',
 'Os12g0116800',
 'Os10g0556000',
 'Os04g0578600',
 'Os07g0142100']

In [7]:
# Align the column names with those produced by the re-mapping step so the
# two tables can be concatenated later.
column_renames = {"from": "From", "to": "UniProt Accession"}
mapped_df2 = mapped_df.rename(column_renames)

display(mapped_df2)

From,UniProt Accession
str,str
"""Os03g0859100""","""A0A0P0W5R9"""
"""Os03g0859100""","""A0A8J8XPT0"""
"""Os03g0859100""","""A0A8J8YIZ0"""
"""Os03g0859100""","""A3APY6"""
"""Os03g0859100""","""A3APY7"""
…,…
"""Os03g0307200""","""Q10MI9"""
"""Os07g0142200""","""A0A0P0X2Q0"""
"""Os07g0142200""","""Q8H4K4"""
"""Os03g0307300""","""Q0DSH9"""


## 2. UniProt re-mapping

In [8]:
def fetch_uniprot_data(ensembl_ids: List[str]) -> pl.DataFrame:
    """Look up UniProt accessions for gene IDs via the UniProtKB search API.

    For each ID a ``gene:<id>`` search is issued. An entry is kept only when
    one of its ``orderedLocusNames`` values equals the queried ID exactly;
    for kept entries the primary and all secondary accessions are emitted.

    Args:
        ensembl_ids: Gene IDs to look up (e.g. ``Os03g0293000``).

    Returns:
        A DataFrame with string columns ``From`` (the queried ID) and
        ``UniProt Accession``; it has zero rows (but both columns) when
        nothing matched.
    """
    search_url = "https://rest.uniprot.org/uniprotkb/search"
    results = []

    for gene_id in ensembl_ids:
        print(f"Processing {gene_id}...")
        try:
            # Passing the query via `params` lets requests URL-encode it, and
            # the timeout prevents one stalled connection from hanging the loop.
            response = requests.get(
                search_url,
                params={"query": f"gene:{gene_id}", "format": "json"},
                timeout=30,
            )
        except requests.exceptions.RequestException as e:
            # Best effort: report the failure and continue with the next ID.
            print(f"Error fetching data for {gene_id}: {e}")
            time.sleep(1)
            continue

        if response.status_code == 200:
            data = json.loads(response.text)
            # NOTE(review): only the first page of search results is read;
            # confirm that is sufficient for these per-gene queries.
            for item in data.get('results', []):
                # Check if the entry is a match for the gene
                # (e.g. Os03g0293000 matches OrderedLocusNames).
                # The check depends on the entry only, so do it once per entry.
                match_found = any(
                    locus.get('value', '') == gene_id
                    for gene in item.get('genes', [])
                    for locus in gene.get('orderedLocusNames', [])
                )
                if not match_found:
                    continue

                primary_accession = item.get('primaryAccession', '')
                secondary_accessions = item.get('secondaryAccessions', [])
                for accession in [primary_accession] + secondary_accessions:
                    # Skip a missing accession rather than emit an empty ID.
                    if accession:
                        results.append(
                            {"From": gene_id, "UniProt Accession": accession}
                        )
        else:
            print(f"Error fetching data for {gene_id}: {response.status_code}")

        # avoid API rate limit
        time.sleep(1)

    # Explicit schema so an empty result still has both columns and can be
    # concatenated with the ID-mapping table.
    return pl.DataFrame(
        results,
        schema={"From": pl.Utf8, "UniProt Accession": pl.Utf8},
    )

In [9]:
# Retry the IDs that the ID mapping step could not map, this time through the
# UniProtKB search. Despite its name, `unmapped_df` holds the accessions that
# this second pass found for those IDs.
unmapped_df = fetch_uniprot_data(unmapped_ids)
display(unmapped_df)

Processing Os01g0192900...
Processing Os09g0249750...
Processing Os12g0116800...
Processing Os10g0556000...
Processing Os04g0578600...
Processing Os07g0142100...


From,UniProt Accession
str,str
"""Os01g0192900""","""A0A0P0UZP7"""
"""Os01g0192900""","""O24220"""
"""Os01g0192900""","""Q9FU17"""
"""Os09g0249750""","""A0A0P0XK13"""
"""Os12g0116800""","""A0A0P0Y6C0"""
"""Os10g0556000""","""A0A0P0XXI3"""
"""Os04g0578600""","""Q0JAT2"""
"""Os04g0578600""","""A0A0P0WE87"""
"""Os07g0142100""","""Q0D8P1"""
"""Os07g0142100""","""A0A0N7KMX3"""


## 3. Concatenate the two dataframes

In [10]:
# Stack the ID-mapping hits and the search-based hits, then order by gene ID.
frames_to_stack = [mapped_df2, unmapped_df]
stacked_df = pl.concat(frames_to_stack, how="vertical_relaxed")
id_mapping_df = stacked_df.sort(by="From", descending=False)

display(id_mapping_df)

From,UniProt Accession
str,str
"""Os01g0105300""","""A0A0P0UWX1"""
"""Os01g0136300""","""A0A8J8YJH9"""
"""Os01g0136300""","""Q5ZC68"""
"""Os01g0137950""","""A0A0P0UXR3"""
"""Os01g0179600""","""B7FAP2"""
…,…
"""Os12g0610800""","""A0A0N7KUC5"""
"""Os12g0630100""","""Q0ILR1"""
"""Os12g0630100""","""Q2QLS8"""
"""Os12g0630200""","""A3CJP3"""


## 4. Get AlphaFold metadata JSON files

In [11]:
def get_af_json(dataframe: pl.DataFrame, target_dir: str, delay: float = 5.0):
    """Download AlphaFold DB prediction metadata as one JSON file per row.

    For every row, the first column is taken as the gene ID and the second as
    the UniProt accession. The first prediction returned by the AlphaFold API
    for that accession is written to
    ``<target_dir>/<gene_id>_<uniprot_id>_info.json``. Rows whose file already
    exists are skipped without a request. Failures are reported with
    ``print`` and do not stop the loop.

    Args:
        dataframe: Table whose first two columns are gene ID and UniProt
            accession, in that order.
        target_dir: Output directory; created (with parents) if missing.
        delay: Seconds to sleep after each request attempt.
    """
    out_dir = pathlib.Path(target_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for row in dataframe.iter_rows():
        gene_id, uniprot_id = row[0], row[1]

        json_file_name = out_dir / f"{gene_id}_{uniprot_id}_info.json"

        if json_file_name.exists():
            print(f"{json_file_name} already exists")
            continue

        request_url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}"

        try:
            response = requests.get(request_url, headers={"Accept": "application/json"}, timeout=30)
            response.raise_for_status()

            if response.text:
                data = json.loads(response.text)  # parse json
                if isinstance(data, list) and len(data) > 0:
                    print(f"AlphaFold ID {uniprot_id} found in AlphaFold")
                    with open(json_file_name, 'w', encoding="utf-8") as f:
                        json.dump(data[0], f, indent=4)
                else:
                    print(f"AlphaFold ID {uniprot_id} not found in AlphaFold")
            else:
                print(f"Empty response for AlphaFold ID {uniprot_id}")
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            # Only a 404 means the accession is absent from AlphaFold DB;
            # timeouts, connection errors and 5xx responses say nothing about
            # that, so do not report them as "not found".
            failed_response = getattr(e, "response", None)
            if failed_response is not None and failed_response.status_code == 404:
                print(f"AlphaFold ID {uniprot_id} not found in AlphaFold")
        except ValueError as e:
            # json.loads raises a ValueError subclass on a malformed body;
            # report it and keep going instead of aborting the whole batch.
            print(f"Invalid JSON for AlphaFold ID {uniprot_id}: {e}")
        # Pause between requests.
        time.sleep(delay)

In [12]:
# Fetch AlphaFold metadata for every (gene ID, UniProt accession) pair;
# files already present in `json_dir` are skipped.
get_af_json(id_mapping_df, json_dir)

../out/rice_down/rice_down_afinfo/Os01g0105300_A0A0P0UWX1_info.json already exists
Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/A0A8J8YJH9
AlphaFold ID A0A8J8YJH9 not found in AlphaFold
../out/rice_down/rice_down_afinfo/Os01g0136300_Q5ZC68_info.json already exists
../out/rice_down/rice_down_afinfo/Os01g0137950_A0A0P0UXR3_info.json already exists
../out/rice_down/rice_down_afinfo/Os01g0179600_B7FAP2_info.json already exists
../out/rice_down/rice_down_afinfo/Os01g0179600_Q0JQ63_info.json already exists
../out/rice_down/rice_down_afinfo/Os01g0192900_A0A0P0UZP7_info.json already exists
../out/rice_down/rice_down_afinfo/Os01g0192900_O24220_info.json already exists
../out/rice_down/rice_down_afinfo/Os01g0192900_Q9FU17_info.json already exists
Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/A0A8J8YP02
AlphaFold ID A0A8J8YP02 not found in AlphaFold
../out/rice_down/rice_down_afinfo/Os01g0208400_Q

## 5. Download mmCIF files

In [14]:
# Run the project's download script with the AlphaFold metadata directory and
# the mmCIF output directory as arguments.
# NOTE(review): the first path duplicates `json_dir` from the config cell; keep the two in sync.
!bash ../scripts/afurl_download.sh ../out/rice_down/rice_down_afinfo/ ../out/rice_down/rice_down_mmcif/

File ../out/rice_down/rice_down_mmcif//AF-Q7G7W2-F1-model_v4.cif already exists. Skipping download.
File ../out/rice_down/rice_down_mmcif//AF-A0A0P0V528-F1-model_v4.cif already exists. Skipping download.
File ../out/rice_down/rice_down_mmcif//AF-Q69MM2-F1-model_v4.cif already exists. Skipping download.
File ../out/rice_down/rice_down_mmcif//AF-A0A0P0X4W3-F1-model_v4.cif already exists. Skipping download.
File ../out/rice_down/rice_down_mmcif//AF-A0A0P0V5L9-F1-model_v4.cif already exists. Skipping download.
File ../out/rice_down/rice_down_mmcif//AF-A0A0P0V3I1-F1-model_v4.cif already exists. Skipping download.
File ../out/rice_down/rice_down_mmcif//AF-Q8LRI2-F1-model_v4.cif already exists. Skipping download.
File ../out/rice_down/rice_down_mmcif//AF-A0A0P0VAZ6-F1-model_v4.cif already exists. Skipping download.
File ../out/rice_down/rice_down_mmcif//AF-Q7XDM8-F1-model_v4.cif already exists. Skipping download.
File ../out/rice_down/rice_down_mmcif//AF-Q6Z7E4-F1-model_v4.cif already exists.

## 6. Save all results

In [16]:
# Write the search-based re-mapping table and the combined mapping table as TSV.
tsv_outputs = (
    (unmapped_df, "../out/rice_down/rice_down_unmapped_idmapping.tsv"),
    (id_mapping_df, "../out/rice_down/rice_down_idmapping.tsv"),
)
for frame, tsv_path in tsv_outputs:
    frame.write_csv(tsv_path, separator="\t")