In [71]:
# conda environment: "fepenv", python version=3.12.7

import pandas as pd
import sys
import biotite.database.rcsb as rcsb
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
import urllib.request
import pypdb
import os
import tqdm
import warnings

# Querying PDB based on methodological/protein requirements

In [67]:
# Methodology
experimental_method = "X-ray"
max_resolution = 2.0
deposition_date = "2020-01-01T00:00:00Z"

# Protein characteristics
membrane_annotations = ['PDBTM', 'MemProtMD'] # 'PDBTM', 'OPM', 'MemProtMD', 'mpstruc'
organism = "Homo sapiens"

# Ligand characteristics
min_ligand_molecular_weight = 100
ligand_binding_type = "HAS_NO_COVALENT_LINKAGE"

In [68]:
query_by_experimental_method = rcsb.FieldQuery(
    "rcsb_entry_info.experimental_method",
    exact_match=experimental_method
)

query_by_resolution = rcsb.FieldQuery(
    "rcsb_entry_info.resolution_combined",
    less_or_equal=max_resolution
)

# query_by_deposition_date = rcsb.FieldQuery(
#     "rcsb_accession_info.deposit_date", 
#     greater=deposition_date
# )

query_by_ligand_mw = rcsb.FieldQuery( #broken?
    "chem_comp.formula_weight",
    molecular_definition=True,
    greater=min_ligand_molecular_weight
)

query_by_membrane_annotation = rcsb.FieldQuery(
    "rcsb_polymer_entity_annotation.type",
    is_in=membrane_annotations
)

query_by_organism = rcsb.FieldQuery(
    "rcsb_entity_source_organism.scientific_name",
    exact_match=organism
)

query_by_ligand_binding_type = rcsb.FieldQuery(
    "rcsb_nonpolymer_instance_annotation.type",
    exact_match=ligand_binding_type
)



In [69]:
query = rcsb.CompositeQuery(
    [
        query_by_resolution,
        query_by_experimental_method,
        query_by_membrane_annotation,
        query_by_organism,
        query_by_ligand_mw,
        query_by_ligand_binding_type,
    ],
    "and",
)

pdb_ids = rcsb.search(query)
print(f"{len(pdb_ids)} matches:")
print(pdb_ids)

64 matches:
['1OJA', '2BK5', '2C67', '2UUI', '2V60', '2V61', '2XCG', '2XFN', '2XFP', '3D9S', '3GD8', '3PCV', '3PO7', '4A79', '4A7A', '4AL0', '4AL1', '4EIY', '4N6H', '4WOL', '4YK5', '4YL0', '4YL1', '4YL3', '4ZW9', '5BQG', '5BQH', '5BQI', '5IU4', '5IU7', '5K0I', '5K2C', '5K2D', '5MZJ', '5NM2', '5NM4', '5OLG', '5OLV', '5OLZ', '5OM4', '5SYT', '5T36', '5T37', '5TL9', '5WIU', '5WQC', '6FVZ', '6FW0', '6FWC', '6GT3', '6I6H', '6IGK', '6LPJ', '6LPK', '6LPL', '6PS7', '6QZI', '6RKP', '6VL4', '6WQA', '6WVH', '6YT2', '6ZDR', '8PYV']


# Downloading `.pdb` and `.cif` file for selected PDB entries

In [None]:
def download_pdb_cif_from_pdbid(pdb_id):
    folder_name = "proteins"
    file_path = os.path.join(folder_name, pdb_id)
    os.makedirs(folder_name, exist_ok=True)

    warnings.simplefilter("ignore") # ignore deprecation warning
    cifstring = pypdb.get_pdb_file(pdb_id, filetype='cif', compression=False)
    with open((file_path+".cif"), "w") as file:
        file.write(cifstring)

    warnings.simplefilter("ignore") # ignore deprecationwarning
    pdbstring = pypdb.get_pdb_file(pdb_id, filetype="pdb", compression=False)
    with open((file_path+".pdb"), "w") as file:
        file.write(pdbstring)

In [None]:
for pdb_id in tqdm.tqdm(pdb_ids):
    download_pdb_cif_from_pdbid(pdb_id)

In [82]:
pdb_info = MMCIF2Dict(f"{folder_name}/6lpj.cif")

for pdb_id in pdb_ids:
    pdb_info = MMCIF2Dict(f"{folder_name}/{pdb_id}.cif")
    for key, value in pdb_info.items():
        if "refine.pdbx_overall_ESU_R_Free" in key:
            print(key, value)

_refine.pdbx_overall_ESU_R_Free ['0.094']
_refine.pdbx_overall_ESU_R_Free ['0.112']
_refine.pdbx_overall_ESU_R_Free ['0.094']
_refine.pdbx_overall_ESU_R_Free ['0.120']
_refine.pdbx_overall_ESU_R_Free ['0.146']
_refine.pdbx_overall_ESU_R_Free ['0.089']
_refine.pdbx_overall_ESU_R_Free ['0.109']
_refine.pdbx_overall_ESU_R_Free ['0.073']
_refine.pdbx_overall_ESU_R_Free ['?']
_refine.pdbx_overall_ESU_R_Free ['?']
_refine.pdbx_overall_ESU_R_Free ['0.083']
_refine.pdbx_overall_ESU_R_Free ['0.084']
_refine.pdbx_overall_ESU_R_Free ['0.108']
_refine.pdbx_overall_ESU_R_Free ['0.117']
_refine.pdbx_overall_ESU_R_Free ['0.094']
_refine.pdbx_overall_ESU_R_Free ['0.020']
_refine.pdbx_overall_ESU_R_Free ['0.094']
_refine.pdbx_overall_ESU_R_Free ['0.119']
_refine.pdbx_overall_ESU_R_Free ['0.101']
_refine.pdbx_overall_ESU_R_Free ['?']
_refine.pdbx_overall_ESU_R_Free ['0.042']
_refine.pdbx_overall_ESU_R_Free ['0.0660']
_refine.pdbx_overall_ESU_R_Free ['0.043']
_refine.pdbx_overall_ESU_R_Free ['0.046']
_re

To do:

1. voor PDB entries:
- parse mmcif file, e.g. https://files.rcsb.org/view/4XNV.cif
- extract "Blow_DPI" / free blow DPI?

-> vaak geen Blow DPI in de .cif te vinden. Wel ESU R_Free.
-> zelf coordinate error berekenen?

2. nadenken over datum


3. nadenken over strengheid van resolutiefilter (2.0 - 3.5 Ä)
-> bij cutoff < 2.0 Ä: 64 matches

4. automatisch export naar xlsx / csv