In [1]:
import time
import polars as pl
import requests
import json
import pathlib
from typing import List, Tuple
from unipressed import IdMappingClient

In [2]:
# Notebook configuration: input file, UniProt database names and output locations.
# Tab-separated input table; its "From" column supplies the gene IDs to map.
gene_id_tsv = "../Data/Data_HN5_genelist_rice_2402/HN5_genes_up_rice.tsv"
# Source and destination database names handed to the UniProt ID mapping client.
query_db = "Ensembl_Genomes"
target_db = "UniProtKB"
# Directory that receives one AlphaFold metadata JSON per (gene, accession) pair.
json_dir = "rice_up_afinfo"
# Key looked up in each metadata JSON to find the URL of the file to download.
data_url = "cifUrl" # or "pdbUrl", "bcifUrl", "paeImageUrl", "paeDocUrl"
# Directory that receives the downloaded structure files.
structure_dir = "rice_up_mmcif"
# TSV outputs written in the last cell: the combined mapping table and the
# table obtained for the IDs that the ID mapping step could not resolve.
id_mapping_all_file = "rice_up_idmapping_all.tsv"
unmapped_file = "rice_up_unmapped_idmapping.tsv"

&nbsp;

&nbsp;

## 1. UniProt ID mapping step

In [3]:
def chunk_list(lst: List, chunk_size: int) -> List[List]:
    """Split a list into consecutive chunks of at most ``chunk_size`` items.

    Args:
        lst: Items to split; their order is preserved.
        chunk_size: Maximum number of items per chunk. Must be positive.

    Returns:
        A list of slices of ``lst``. The last chunk may be shorter than
        ``chunk_size``; an empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step makes range() empty, which would silently drop every
    # item; a zero step raises an unrelated-looking error. Fail clearly instead.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    return [lst[start:start + chunk_size] for start in range(0, len(lst), chunk_size)]

In [4]:
def batch_id_mapping(
    from_db: str,
    to_db: str,
    ids: List[str],
    chunk_size: int = 100,
    poll_interval: float = 1.0,
    max_wait: float = 300.0,
) -> Tuple[pl.DataFrame, List[str]]:
    """Map IDs between UniProt databases in batches.

    The IDs are split with ``chunk_list`` and each chunk is submitted as a
    separate ID mapping job. Because the job runs asynchronously on the
    server, its status is polled until it reports ``FINISHED`` before the
    results are read.

    Args:
        from_db: Source database name passed to ``IdMappingClient.submit``.
        to_db: Destination database name passed to ``IdMappingClient.submit``.
        ids: IDs to map.
        chunk_size: Number of IDs submitted per job.
        poll_interval: Seconds to wait between two status checks of a job.
        max_wait: Maximum seconds to wait for a single job to finish.

    Returns:
        A tuple ``(mapping, unmapped)``: a DataFrame with the columns
        ``from`` and ``to`` (one row per mapping), and the list of submitted
        IDs that do not appear in any ``from`` value.

    Raises:
        RuntimeError: If a job reports the status ``ERROR``.
        TimeoutError: If a job is not finished after ``max_wait`` seconds.
    """
    all_results = []
    all_unmapped = []
    chunked_ids = chunk_list(ids, chunk_size)

    for i, chunk in enumerate(chunked_ids):
        print(f"Processing chunk {i+1}/{len(chunked_ids)}...")

        # create request and run
        request = IdMappingClient.submit(source=from_db, dest=to_db, ids=chunk)

        # Reading results from an unfinished job could make every ID of the
        # chunk look unmapped, so wait for the job to complete first.
        deadline = time.monotonic() + max_wait
        status = request.get_status()
        while status != "FINISHED":
            if status == "ERROR":
                raise RuntimeError(f"ID mapping job failed for chunk {i+1}/{len(chunked_ids)}")
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"ID mapping job for chunk {i+1}/{len(chunked_ids)} "
                    f"did not finish within {max_wait} seconds"
                )
            time.sleep(poll_interval)
            status = request.get_status()

        # process results
        mapped_results = [
            {"from": item["from"], "to": item["to"]}
            for item in request.each_result()
        ]
        all_results.extend(mapped_results)

        # record unmapped ids
        mapped_ids = {item["from"] for item in mapped_results}
        all_unmapped.extend(query_id for query_id in chunk if query_id not in mapped_ids)

        # avoid API rate limit; there is nothing to wait for after the last chunk
        if i < len(chunked_ids) - 1:
            time.sleep(3)

    # Without any result row the frame would have no columns at all and a
    # later column rename would fail, so give the empty frame its schema.
    if not all_results:
        return pl.DataFrame(schema={"from": pl.Utf8, "to": pl.Utf8}), all_unmapped

    # convert results to DataFrame
    final_df = pl.DataFrame(all_results)
    return final_df, all_unmapped

In [5]:
# Load the gene IDs from the "From" column of the tab-separated input table.
gene_ids = pl.read_csv(
    gene_id_tsv,
    separator='\t'
).get_column("From").to_list()

# Map the gene IDs from query_db to target_db; IDs without any mapping are
# returned separately so they can be looked up another way later.
mapped_df, unmapped_ids = batch_id_mapping(
    query_db,
    target_db,
    gene_ids
)

# Show the mapping table and the list of IDs that could not be mapped.
display(mapped_df)
display(unmapped_ids)

Processing chunk 1/4...
Processing chunk 2/4...
Processing chunk 3/4...
Processing chunk 4/4...


from,to
str,str
"""Os04g0107900""","""A0A0P0W5Z1"""
"""Os04g0107900""","""A0A0P0W604"""
"""Os04g0107900""","""A0A0P0W643"""
"""Os04g0107900""","""A0A0P0W6G9"""
"""Os04g0107900""","""C7J151"""
…,…
"""Os01g0160800""","""Q9LGK6"""
"""Os01g0160800""","""Q7GC65"""
"""Os06g0651200""","""A0A0N7KMI4"""
"""Os06g0651200""","""Q0DAI2"""


['Os03g0293000',
 'Os05g0156500',
 'Os05g0156401',
 'Os01g0699400',
 'Os10g0440500']

In [6]:
# Rename the columns to "From" / "UniProt Accession", the same names that
# fetch_uniprot_data produces, so both tables can be concatenated later.
mapped_df2 = mapped_df.rename(
    {
        "from": "From",
        "to": "UniProt Accession"
    }
)

display(mapped_df2)

From,UniProt Accession
str,str
"""Os04g0107900""","""A0A0P0W5Z1"""
"""Os04g0107900""","""A0A0P0W604"""
"""Os04g0107900""","""A0A0P0W643"""
"""Os04g0107900""","""A0A0P0W6G9"""
"""Os04g0107900""","""C7J151"""
…,…
"""Os01g0160800""","""Q9LGK6"""
"""Os01g0160800""","""Q7GC65"""
"""Os06g0651200""","""A0A0N7KMI4"""
"""Os06g0651200""","""Q0DAI2"""


&nbsp;

&nbsp;

## 2. UniProt re-mapping

In [7]:
def _has_ordered_locus(entry: dict, locus_id: str) -> bool:
    """Return True if ``locus_id`` is one of the entry's orderedLocusNames values."""
    return any(
        locus.get('value', '') == locus_id
        for gene in entry.get('genes', [])
        for locus in gene.get('orderedLocusNames', [])
    )


def fetch_uniprot_data(
    ensembl_ids: List[str],
    delay: float = 5.0,
    timeout: float = 30.0,
) -> pl.DataFrame:
    """Look up UniProt accessions for gene IDs through the UniProtKB search endpoint.

    For every ID a ``gene:<id>`` query is sent. An entry of the response is
    kept only when the ID appears among its ``orderedLocusNames`` (e.g.
    Os03g0293000); for such an entry the primary accession and all secondary
    accessions are reported. Failed requests are printed and skipped so the
    remaining IDs are still processed.

    NOTE(review): only the first response page is read for each ID; no
    pagination is followed.

    Args:
        ensembl_ids: Gene IDs to search for.
        delay: Seconds to sleep after each request.
        timeout: Timeout in seconds for each HTTP request.

    Returns:
        A DataFrame with the columns ``From`` (the queried ID) and
        ``UniProt Accession``, one row per accession. The columns are present
        even when nothing was found.
    """
    results = []

    for gene_id in ensembl_ids:
        print(f"Processing {gene_id}...")

        try:
            # Let requests build and encode the query string.
            response = requests.get(
                "https://rest.uniprot.org/uniprotkb/search",
                params={"query": f"gene:{gene_id}", "format": "json"},
                timeout=timeout,
            )
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for {gene_id}: {e}")
            time.sleep(delay)
            continue

        if response.status_code == 200:
            try:
                data = json.loads(response.text)
            except json.JSONDecodeError as e:
                print(f"Error fetching data for {gene_id}: invalid JSON ({e})")
                data = {}

            for item in data.get('results', []):
                # The locus check depends only on the entry, so do it once
                # per entry rather than once per accession.
                if not _has_ordered_locus(item, gene_id):
                    continue

                all_accessions = (
                    [item.get('primaryAccession', '')]
                    + item.get('secondaryAccessions', [])
                )
                results.extend(
                    {"From": gene_id, "UniProt Accession": accession}
                    for accession in all_accessions
                    if accession  # skip a missing primary accession
                )
        else:
            print(f"Error fetching data for {gene_id}: {response.status_code}")

        time.sleep(delay)

    # Without any row the frame would have no columns and could not be
    # concatenated with the ID mapping table, so give it its schema.
    if not results:
        return pl.DataFrame(schema={"From": pl.Utf8, "UniProt Accession": pl.Utf8})

    return pl.DataFrame(results)

In [8]:
# Retry the IDs that the ID mapping step left unmapped, this time through the
# UniProtKB search, and show the accessions that were found.
unmapped_df = fetch_uniprot_data(unmapped_ids)
display(unmapped_df)

Processing Os03g0293000...
Processing Os05g0156500...
Processing Os05g0156401...
Processing Os01g0699400...
Processing Os10g0440500...


From,UniProt Accession
str,str
"""Os03g0293000""","""Q10MW6"""
"""Os03g0293000""","""A0A0P0VWC9"""
"""Os03g0293000""","""Q10MW5"""
"""Os05g0156500""","""B9FHF3"""
"""Os05g0156500""","""Q0DKL2"""
…,…
"""Os05g0156401""","""A0A0P0WI59"""
"""Os01g0699400""","""A0A0P0V6Z9"""
"""Os01g0699400""","""Q0JK36"""
"""Os10g0440500""","""A0A0P0XV47"""


&nbsp;

&nbsp;

## 3. Concatenate the two dataframes

In [9]:
# Stack the ID mapping table and the search-based table into one mapping
# table, then sort it by gene ID in ascending order.
id_mapping_df = pl.concat(
    [
        mapped_df2,
        unmapped_df
    ],
    how="vertical_relaxed"
).sort(
    by="From",
    descending=False
)

display(id_mapping_df)

From,UniProt Accession
str,str
"""Os01g0105800""","""A0A8J8YF34"""
"""Os01g0105800""","""Q657Z2"""
"""Os01g0124000""","""Q5ZCB1"""
"""Os01g0124000""","""Q5ZD53"""
"""Os01g0124100""","""A0A0P0UYB2"""
…,…
"""Os12g0569200""","""Q2QND9"""
"""Os12g0600100""","""Q0IM43"""
"""Os12g0600100""","""Q2QML3"""
"""gene-orf224""","""Q35317"""


&nbsp;

&nbsp;

## 4. Get AlphaFold metadata JSON files

In [10]:
def get_af_json(
    dataframe: pl.DataFrame,
    target_dir: str,
    delay: float = 5.0,
    timeout: float = 30.0,
):
    """Download AlphaFoldDB prediction metadata as JSON files.

    For each row, the first value is used as the gene ID and the second as
    the UniProt accession. The accession is looked up through the AlphaFold
    prediction API and the first element of the returned list is saved as
    ``<gene_id>_<uniprot_id>_info.json`` in ``target_dir``. Files that already
    exist are skipped without a request, so the function can be re-run to
    resume an interrupted download. Failures are printed and do not stop the
    loop.

    Args:
        dataframe: Table whose first two columns are gene ID and UniProt accession.
        target_dir: Output directory; created if it does not exist.
        delay: Seconds to sleep after each request.
        timeout: Timeout in seconds for each HTTP request.
    """
    out_dir = pathlib.Path(target_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for row in dataframe.iter_rows():
        gene_id = row[0]
        uniprot_id = row[1]

        # A missing accession would otherwise be requested as ".../prediction/None".
        if not uniprot_id:
            print(f"Skipping {gene_id}: no UniProt accession")
            continue

        json_file_name = out_dir / f"{gene_id}_{uniprot_id}_info.json"

        if json_file_name.exists():
            print(f"{json_file_name} already exists")
            continue

        request_url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_id}"

        try:
            response = requests.get(request_url, headers={"Accept": "application/json"}, timeout=timeout)
            response.raise_for_status()

            if response.text:
                data = json.loads(response.text)  # parse json
                if isinstance(data, list) and len(data) > 0:
                    print(f"AlphaFold ID {uniprot_id} found in AlphaFold")
                    # Written as UTF-8 because the files are read back as UTF-8.
                    with open(json_file_name, 'w', encoding="utf-8") as f:
                        json.dump(data[0], f, indent=4)
                else:
                    print(f"AlphaFold ID {uniprot_id} not found in AlphaFold")
            else:
                print(f"Empty response for AlphaFold ID {uniprot_id}")
        except requests.exceptions.HTTPError as e:
            print(f"Request failed: {e}")
            # Only a 404 means the accession is absent; other HTTP errors
            # say nothing about whether a prediction exists.
            if e.response is not None and e.response.status_code == 404:
                print(f"AlphaFold ID {uniprot_id} not found in AlphaFold")
        except requests.exceptions.RequestException as e:
            # Timeouts, connection errors, etc.
            print(f"Request failed: {e}")
        except json.JSONDecodeError as e:
            # A malformed body must not abort the remaining downloads.
            print(f"Invalid JSON in response for AlphaFold ID {uniprot_id}: {e}")
        time.sleep(delay)

In [11]:
# Fetch AlphaFold metadata for every (gene, accession) pair into json_dir.
get_af_json(id_mapping_df, json_dir)

Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/A0A8J8YF34
AlphaFold ID A0A8J8YF34 not found in AlphaFold
rice_up_afinfo/Os01g0105800_Q657Z2_info.json already exists
rice_up_afinfo/Os01g0124000_Q5ZCB1_info.json already exists
Request failed: 404 Client Error: Not Found for url: https://alphafold.ebi.ac.uk/api/prediction/Q5ZD53
AlphaFold ID Q5ZD53 not found in AlphaFold
rice_up_afinfo/Os01g0124100_A0A0P0UYB2_info.json already exists
rice_up_afinfo/Os01g0124100_Q0JR27_info.json already exists
rice_up_afinfo/Os01g0124401_Q0JR25_info.json already exists
rice_up_afinfo/Os01g0135800_Q943Q3_info.json already exists
rice_up_afinfo/Os01g0136000_Q943E7_info.json already exists
rice_up_afinfo/Os01g0136000_E5D3J8_info.json already exists
rice_up_afinfo/Os01g0136050_A0A0P0UXN9_info.json already exists
rice_up_afinfo/Os01g0136100_P27777_info.json already exists
rice_up_afinfo/Os01g0136200_Q943E6_info.json already exists
rice_up_afinfo/Os01g0136200_E5D3

&nbsp;

&nbsp;

## 5. Download CIF files

In [12]:
def get_cif_file(json_dir_path: str, output_dir_path: str):
    """
    Download the structure file referenced by each AlphaFoldDB JSON file.

    Every ``*.json`` file in ``json_dir_path`` is parsed and the URL stored
    under the notebook-level ``data_url`` key is downloaded into
    ``output_dir_path``, keeping the file name from the URL. Files that
    already exist are skipped, so the function can be re-run to resume.
    Parse and download errors are printed and do not stop the loop.

    Args:
        json_dir_path: Directory containing AlphaFold JSON metadata files
        output_dir_path: Directory to save downloaded CIF files
    """
    out_dir = pathlib.Path(output_dir_path)
    out_dir.mkdir(parents=True, exist_ok=True)

    for json_file in pathlib.Path(json_dir_path).glob("*.json"):
        # Read and close the metadata file before any network activity so
        # the handle is not held open during the download and the pause.
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print(f"Error parsing JSON file: {json_file}")
            continue

        # Valid JSON that is not an object has no keys to look up.
        cif_url = data.get(data_url) if isinstance(data, dict) else None

        if not cif_url:
            print(f"No {data_url} found in {json_file}")
            continue

        # Extract filename from URL and create output path
        cif_filename = pathlib.Path(cif_url).name
        output_file = out_dir / cif_filename

        # Skip if file already exists
        if output_file.exists():
            print(f"{output_file} already exists")
            continue

        print(f"Downloading {cif_url}")
        try:
            response = requests.get(cif_url, timeout=30)
            response.raise_for_status()

            # Save CIF file
            output_file.write_bytes(response.content)
            print(f"Saved {output_file}")
        except requests.exceptions.RequestException as e:
            print(f"Error downloading CIF file: {e}")

        # Rate limiting: pause only after a request was actually sent
        time.sleep(1)

In [13]:
# Download the structure file referenced by each metadata JSON into structure_dir.
get_cif_file(json_dir, structure_dir)


Downloading https://alphafold.ebi.ac.uk/files/AF-Q7FAD5-F1-model_v4.cif
Saved rice_up_mmcif/AF-Q7FAD5-F1-model_v4.cif
Downloading https://alphafold.ebi.ac.uk/files/AF-Q2R376-F1-model_v4.cif
Saved rice_up_mmcif/AF-Q2R376-F1-model_v4.cif
Downloading https://alphafold.ebi.ac.uk/files/AF-A0A0P0WAG6-F1-model_v4.cif
Saved rice_up_mmcif/AF-A0A0P0WAG6-F1-model_v4.cif
Downloading https://alphafold.ebi.ac.uk/files/AF-Q10CE7-F1-model_v4.cif
Saved rice_up_mmcif/AF-Q10CE7-F1-model_v4.cif
Downloading https://alphafold.ebi.ac.uk/files/AF-Q943E6-F1-model_v4.cif
Saved rice_up_mmcif/AF-Q943E6-F1-model_v4.cif
Downloading https://alphafold.ebi.ac.uk/files/AF-A0A0P0WC16-F1-model_v4.cif
Saved rice_up_mmcif/AF-A0A0P0WC16-F1-model_v4.cif
Downloading https://alphafold.ebi.ac.uk/files/AF-A0A0P0WHW6-F1-model_v4.cif
Saved rice_up_mmcif/AF-A0A0P0WHW6-F1-model_v4.cif
Downloading https://alphafold.ebi.ac.uk/files/AF-Q2R4Z5-F1-model_v4.cif
Saved rice_up_mmcif/AF-Q2R4Z5-F1-model_v4.cif
Downloading https://alphafold.eb

&nbsp;

&nbsp;

## 6. Save all results

In [14]:
# Save the table found via search for the IDs the ID mapping step missed,
# and the combined, sorted mapping table, both as tab-separated files.
unmapped_df.write_csv(unmapped_file, separator="\t")
id_mapping_df.write_csv(id_mapping_all_file, separator="\t")
