In [None]:
import os
from Bio import ExPASy
from Bio import SwissProt
from Bio.PDB import PDBList, PDBParser
import requests
import pandas as pd
from tqdm import tqdm

In [None]:

# Function to fetch the PDB ID with the best resolution for a given UniProt ID
def get_best_pdb_id(uniprot_id):
    url = f"https://www.ebi.ac.uk/pdbe/api/mappings/best_structures/{uniprot_id}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if uniprot_id in data:
            best_structure = sorted(data[uniprot_id], key=lambda x: x['resolution'])[0]
            return best_structure['pdb_id'], best_structure['resolution']
    return None, None

In [None]:
# Function to download a PDB file given a PDB ID
def download_pdb(pdb_id, output_dir):
    pdbl = PDBList()
    pdbl.retrieve_pdb_file(pdb_id, pdir=output_dir, file_format="pdb")

In [None]:
# for MAC
AF_structs = pd.read_csv("/Users/talgalper/Documents/GitHub/PhD-MOC/Druggability_analysis/Fpocket/results_2024.05/fpocket_druggability.csv")
AF_low_conf_structs = pd.read_csv("/Users/talgalper/Documents/GitHub/PhD-MOC/Druggability_analysis/Fpocket/results_2024.05/af_low_conf_struct.csv")

# combine AF_structs and AF_low_conf_structs into a vector of uniprot_ids
uniprot_ids = pd.concat([AF_structs["uniprot_id"], AF_low_conf_structs["uniprot_id"]])
# get distinct uniprot_ids
uniprot_ids = uniprot_ids.drop_duplicates()
# create subset of first 10 IDs
uniprot_ids = uniprot_ids.head(10)


In [7]:
# for Ubuntu
AF_structs = pd.read_csv("/home/ubuntu/Documents/GitHub/PhD-MOC/Druggability_analysis/Fpocket/results_2024.05/fpocket_druggability.csv")
AF_low_conf_structs = pd.read_csv("/home/ubuntu/Documents/GitHub/PhD-MOC/Druggability_analysis/Fpocket/results_2024.05/af_low_conf_struct.csv")

# combine AF_structs and AF_low_conf_structs into a vector of uniprot_ids
uniprot_ids = pd.concat([AF_structs["uniprot_id"], AF_low_conf_structs["uniprot_id"]])
# get distinct uniprot_ids
uniprot_ids = uniprot_ids.drop_duplicates()
# create subset of first 10 IDs to test
#uniprot_ids = uniprot_ids.head(10)

In [8]:
# Main function to process a list of UniProt IDs
def download_structures(uniprot_ids, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    for uniprot_id in tqdm(uniprot_ids, desc="Downloading structures"):
        pdb_id, resolution = get_best_pdb_id(uniprot_id)
        if pdb_id:
            print(f"Downloading PDB {pdb_id} for UniProt ID {uniprot_id} (Resolution: {resolution} Å)")
            download_pdb(pdb_id, output_dir)
        else:
            print(f"No structure found for UniProt ID {uniprot_id}")

if __name__ == "__main__":
    # Example list of UniProt IDs
    #output_dir = "/Users/talgalper/OneDrive - RMIT University/PhD/structures/PDB_query/" # MAC
    output_dir = "/home/ubuntu/Desktop/PDB_structures/" # Ubuntu

    download_structures(uniprot_ids, output_dir)

Downloading structures:   0%|          | 1/20462 [00:01<9:02:18,  1.59s/it]

No structure found for UniProt ID A6NDP7


Downloading structures:   0%|          | 2/20462 [00:02<7:47:17,  1.37s/it]

No structure found for UniProt ID E9PKD4


Downloading structures:   0%|          | 3/20462 [00:04<7:25:26,  1.31s/it]

No structure found for UniProt ID M0QZD8


Downloading structures:   0%|          | 4/20462 [00:05<7:14:13,  1.27s/it]

No structure found for UniProt ID O00116


Downloading structures:   0%|          | 5/20462 [00:06<7:07:48,  1.25s/it]

No structure found for UniProt ID O14609
Downloading PDB 1sz7 for UniProt ID O43617 (Resolution: 1.55 Å)
Downloading PDB structure '1sz7'...


Downloading structures:   0%|          | 7/20462 [00:10<9:10:43,  1.62s/it] 

No structure found for UniProt ID O95490


Downloading structures:   0%|          | 8/20462 [00:11<8:30:01,  1.50s/it]

No structure found for UniProt ID O95992
Downloading PDB 5z62 for UniProt ID P00395 (Resolution: 3.6 Å)
Downloading PDB structure '5z62'...


Downloading structures:   0%|          | 9/20462 [00:14<10:20:49,  1.82s/it]

Downloading PDB 2obd for UniProt ID P11597 (Resolution: 2.1 Å)
Downloading PDB structure '2obd'...


Downloading structures:   0%|          | 10/20462 [00:16<10:58:42,  1.93s/it]

Downloading PDB 4md4 for UniProt ID P16112 (Resolution: 1.95 Å)
Downloading PDB structure '4md4'...


Downloading structures:   0%|          | 11/20462 [00:18<11:50:36,  2.08s/it]

Downloading PDB 5lgd for UniProt ID P16671 (Resolution: 2.07 Å)
Downloading PDB structure '5lgd'...


Downloading structures:   0%|          | 12/20462 [00:21<12:31:51,  2.21s/it]

Downloading PDB 4dvq for UniProt ID P19099 (Resolution: 2.49 Å)
Downloading PDB structure '4dvq'...


Downloading structures:   0%|          | 13/20462 [00:24<13:40:19,  2.41s/it]

Downloading PDB 2z5y for UniProt ID P21397 (Resolution: 2.17 Å)
Downloading PDB structure '2z5y'...


Downloading structures:   0%|          | 15/20462 [00:27<11:22:09,  2.00s/it]

No structure found for UniProt ID P25391
Downloading PDB 1g3m for UniProt ID P49888 (Resolution: 1.7 Å)
Downloading PDB structure '1g3m'...


Downloading structures:   0%|          | 17/20462 [00:31<10:16:26,  1.81s/it]

No structure found for UniProt ID P51790


Downloading structures:   0%|          | 18/20462 [00:32<9:17:20,  1.64s/it] 

No structure found for UniProt ID P58550


Downloading structures:   0%|          | 18/20462 [00:33<10:34:42,  1.86s/it]


TypeError: '<' not supported between instances of 'NoneType' and 'float'