In [2]:
# conda environment: "fepenv", python version=3.12.7

import pandas as pd
import sys
import biotite.database.rcsb as rcsb

In [83]:
# --- Search parameters for the RCSB query (edit here, then re-run the cells below) ---

# Experiment: method label and resolution cutoff used by the entry-level filters.
experimental_method = "X-ray"
max_resolution = 2.0  # applied as an inclusive upper bound; presumably in Å — TODO confirm

# Protein: accepted membrane-annotation sources and the source organism.
# Other annotation sources noted by the author: 'OPM', 'mpstruc'.
membrane_annotations = [
    "PDBTM",
    "MemProtMD",
]
organism = "Homo sapiens"

# Ligand: molecular-weight threshold and covalent-linkage annotation.
min_ligand_molecular_weight = 100  # applied as an exclusive lower bound; presumably in Da — TODO confirm
ligand_binding_type = "HAS_NO_COVALENT_LINKAGE"

In [None]:
# One FieldQuery per search criterion. The threshold/label values come from the
# parameter cell; the individual queries are combined in the following cell.

# Experimental method must match the configured label exactly.
query_by_experimental_method = rcsb.FieldQuery(
    "rcsb_entry_info.experimental_method", exact_match=experimental_method
)

# Resolution must be at or below the configured cutoff.
query_by_resolution = rcsb.FieldQuery(
    "rcsb_entry_info.resolution_combined", less_or_equal=max_resolution
)

# Chemical-component formula weight must exceed the configured minimum.
# molecular_definition=True presumably routes this field to the chemical-component
# (molecular definition) search instead of structure metadata — confirm in the biotite docs.
# NOTE(review): the author flagged this query as possibly broken. It may match on any
# chemical component of an entry rather than only the bound ligand — TODO verify.
query_by_ligand_mw = rcsb.FieldQuery(
    "chem_comp.formula_weight",
    greater=min_ligand_molecular_weight,
    molecular_definition=True,
)

# Polymer entity must carry an annotation from one of the accepted membrane sources.
query_by_membrane_annotation = rcsb.FieldQuery(
    "rcsb_polymer_entity_annotation.type", is_in=membrane_annotations
)

# Source organism must match the configured scientific name exactly.
query_by_organism = rcsb.FieldQuery(
    "rcsb_entity_source_organism.scientific_name", exact_match=organism
)

# Non-polymer instance annotation must match the configured linkage type exactly.
query_by_ligand_binding_type = rcsb.FieldQuery(
    "rcsb_nonpolymer_instance_annotation.type", exact_match=ligand_binding_type
)



In [None]:
# Join all field queries with a logical AND so a hit has to satisfy every criterion.
# NOTE(review): presumably each criterion is evaluated per entry, so the weight and
# linkage filters are not guaranteed to refer to the same ligand — TODO confirm.
query = rcsb.CompositeQuery(
    [query_by_resolution, query_by_experimental_method, query_by_membrane_annotation,
     query_by_organism, query_by_ligand_mw, query_by_ligand_binding_type],
    "and",
)

# Run the search and report the number of hits followed by the returned IDs.
pdb_ids = rcsb.search(query)
print(f"{len(pdb_ids)} matches:", pdb_ids, sep="\n")

64 matches:
['1OJA', '2BK5', '2C67', '2UUI', '2V60', '2V61', '2XCG', '2XFN', '2XFP', '3D9S', '3GD8', '3PCV', '3PO7', '4A79', '4A7A', '4AL0', '4AL1', '4EIY', '4N6H', '4WOL', '4YK5', '4YL0', '4YL1', '4YL3', '4ZW9', '5BQG', '5BQH', '5BQI', '5IU4', '5IU7', '5K0I', '5K2C', '5K2D', '5MZJ', '5NM2', '5NM4', '5OLG', '5OLV', '5OLZ', '5OM4', '5SYT', '5T36', '5T37', '5TL9', '5WIU', '5WQC', '6FVZ', '6FW0', '6FWC', '6GT3', '6I6H', '6IGK', '6LPJ', '6LPK', '6LPL', '6PS7', '6QZI', '6RKP', '6VL4', '6WQA', '6WVH', '6YT2', '6ZDR', '8PYV']


In [88]:
# Sanity check: is entry 4XNV among the search hits? The cell displays the boolean result.
'4XNV' in pdb_ids

False

To do:

1. voor PDB entries:
- parse mmcif file, e.g. https://files.rcsb.org/view/4XNV.cif
- extract "Blow_DPI" / free Blow DPI? (mmCIF items `_refine.pdbx_overall_SU_R_Blow_DPI` / `_refine.pdbx_overall_SU_R_free_Blow_DPI`)

parsing van mmcif files met Bio.PDB 
https://education.molssi.org/python-scripting-biochemistry/chapters/biopython_mmcif.html
https://www.youtube.com/watch?v=dWqvNQuInjA


parsing CIF file van web
https://stackoverflow.com/questions/57586583/problem-to-parse-a-cif-file-using-mmcif2dict


2. extra query op basis van date?
3. nadenken over strengheid van resolutiefilter (2.0 - 3.5 Å)

