# pypdb demos

This is a set of basic examples of the usage and outputs of the various individual functions included in pypdb. There are generally three types of functions.

### Preamble

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from IPython.display import HTML

# Import from local directory
# (uncomment the three lines below to run against a development checkout
# instead of the installed package)
# import sys
# sys.path.insert(0, '../pypdb')
# from pypdb import *

# Import from installed package
# NOTE(review): this wildcard import is presumably what brings Query,
# find_papers, get_pdb_file and get_info into scope for the cells below,
# since no other import provides them.
from pypdb import *

# Reload imported modules automatically before each cell runs, so edits to
# the package source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Search functions that return lists of PDB IDs

#### Get a list of PDBs for a specific search term

In [3]:
# Search for the term "ribosome" and preview the first ten matching PDB IDs.
ribosome_query = Query("ribosome")
found_pdbs = ribosome_query.search()
print(found_pdbs[:10])

['1T1M', '1QI7', '6FSW', '1MRI', '486D', '1MRJ', '1AHC', '1MRH', '1RL0', '2PQG']


#### Search by PubMed ID Number

In [5]:
# Look up the structures linked to a single PubMed ID.
pubmed_id = 27499440
pubmed_query = Query(pubmed_id, "PubmedIdQuery")
found_pdbs = pubmed_query.search()
print(found_pdbs[:10])

['5IMT', '5IMW', '5IMY']


#### Search by source organism using NCBI TaxId

In [6]:
# NCBI TaxID for C elegans
c_elegans_tax_id = "6239"
taxonomy_query = Query(c_elegans_tax_id, "TreeEntityQuery")
found_pdbs = taxonomy_query.search()
print(found_pdbs[:5])

['1D4X', '1DYW', '1E3B', '1E8K', '1EMS']


#### Search by a specific experimental method

In [7]:
# Restrict the search to one experimental method and preview ten hits.
method_query = Query("SOLID-STATE NMR", query_type="ExpTypeQuery")
found_pdbs = method_query.search()
print(found_pdbs[:10])

['1CEK', '1EQ8', '1M8M', '1MAG', '1MP6', '1MZT', '1NH4', '1NYJ', '1PI7', '1PI8']


#### Search by protein structure similarity

In [8]:
# Use entry 2E8D as the reference for a structure-similarity search.
reference_pdb_id = "2E8D"
similarity_query = Query(reference_pdb_id, query_type="structure")
found_pdbs = similarity_query.search()
print(found_pdbs[:10])

['2E8D', '4OBA', '4OGV', '4JVR', '3LBL', '4QO4', '4JWR', '2G1E', '2WS4', '4ERE']


#### Search by Author

In [8]:
# Print every PDB ID returned for the author name "Perutz, M.F.".
author_query = Query("Perutz, M.F.", query_type="AdvancedAuthorQuery")
found_pdbs = author_query.search()
print(found_pdbs)

['1CQ4', '1FDH', '1GDJ', '1HDA', '1PBX', '2DHB', '2GDM', '2HHB', '2MHB', '3HHB', '4HHB']


#### Search by organism

In [9]:
# Search by organism name and preview the first ten hits.
q = Query("Dictyostelium", query_type="OrganismQuery")
organism_hits = q.search()
print(organism_hits[:10])

['2H84', '3MNQ', '4AE3', '8OHY', '5AN9', '6QKL', '2VM9', '2VMC', '2VMD', '2VME']


#### Search by Uniprot ID

In [6]:
# Find the PDB entries associated with a UniProt accession.
uniprot_query = Query("A0A023GPI8", query_type="uniprot")
uniprot_info = uniprot_query.search()
print(uniprot_info[:5])

['4K1Y', '4K1Z', '4K20', '4K21']


#### Search by PFAM number

In [5]:
# Find the PDB entries associated with a PFAM family identifier.
pfam_query = Query("PF00008", query_type="pfam")
pfam_info = pfam_query.search()
print(pfam_info[:5])

['1A3P', '1APO', '1AUT', '1BF9', '1CCF']


# Information Search functions
While the basic functions described in the previous section are useful for looking up and manipulating individual unique entries, these functions are intended to be more user-facing: they take search keywords and return lists of authors or dates.

#### Find papers for a given keyword

In [10]:
# Look up papers for a keyword, requesting at most ten results.
keyword = "crispr"
matching_papers = find_papers(keyword, max_results=10)
paper_preview = list(matching_papers)[:10]
print(paper_preview)

['Structures of the Cmr-beta Complex Reveal the Regulation of the Immunity Mechanism of Type III-B CRISPR-Cas.', 'Structures of the Cmr-beta Complex Reveal the Regulation of the Immunity Mechanism of Type III-B CRISPR-Cas', 'Crystal structure of the CRISPR-Cas RNA silencing Cmr complex bound to a target analog.', 'Structures of an active type III-A CRISPR effector complex.']


# Functions that return information about single PDB IDs

#### Get the full PDB file

In [11]:
# Fetch the uncompressed CIF file for entry 4lza and show only its
# first 400 characters.
preview_length = 400
pdb_file = get_pdb_file("4lza", filetype="cif", compression=False)
print(pdb_file[:preview_length])



Sending GET request to https://files.rcsb.org/download/4lza.cif to fetch 4lza's cif file as a string.
data_4LZA
# 
_entry.id   4LZA 
# 
_audit_conform.dict_name       mmcif_pdbx.dic 
_audit_conform.dict_version    5.281 
_audit_conform.dict_location   http://mmcif.pdb.org/dictionaries/ascii/mmcif_pdbx.dic 
# 
loop_
_database_2.database_id 
_database_2.database_code 
PDB   4LZA         
RCSB  RCSB081269   
WWPDB D_1000081269 
# 
_pdbx_database_related.db_name        TargetTrack 
_pdbx_database_rela


#### Get a general description of the entry's metadata

In [12]:
# Retrieve the metadata for entry 4LZA and list its top-level keys.
all_info = get_info("4LZA")
top_level_keys = list(all_info.keys())
print(top_level_keys)

['audit_author', 'cell', 'citation', 'diffrn', 'diffrn_detector', 'diffrn_radiation', 'diffrn_source', 'entry', 'exptl', 'exptl_crystal', 'exptl_crystal_grow', 'pdbx_sgproject', 'pdbx_audit_revision_details', 'pdbx_audit_revision_history', 'pdbx_database_related', 'pdbx_database_status', 'rcsb_accession_info', 'rcsb_entry_container_identifiers', 'rcsb_entry_info', 'rcsb_primary_citation', 'refine', 'refine_hist', 'refine_ls_restr', 'reflns', 'reflns_shell', 'software', 'struct', 'struct_keywords', 'symmetry', 'rcsb_id']


#### Run a Sequence search

Formerly using BLAST, this method now uses MMseqs2.

In [13]:
# Sequence search that returns polymer entities; the raw result is printed.
query_sequence = "VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTAVAHVDDMPNAL"
q = Query(query_sequence, query_type="sequence", return_type="polymer_entity")

sequence_hits = q.search()
print(sequence_hits)

{'query_id': '5ec265f7-2164-4e35-8d07-08df2a964532', 'result_type': 'polymer_entity', 'total_count': 846, 'result_set': [{'identifier': '1A00_1', 'score': 1.0, 'services': [{'service_type': 'sequence', 'nodes': [{'node_id': 20656, 'original_score': 164.0, 'norm_score': 1.0, 'match_context': [{'sequence_identity': 0.987, 'evalue': 2.933e-47, 'bitscore': 164, 'alignment_length': 80, 'mismatches': 0, 'gaps_opened': 1, 'query_beg': 1, 'query_end': 79, 'subject_beg': 1, 'subject_end': 80, 'query_length': 79, 'subject_length': 141, 'query_aligned_seq': 'VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALT-AVAHVDDMPNAL', 'subject_aligned_seq': 'VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNAL'}]}]}]}, {'identifier': '1A01_1', 'score': 1.0, 'services': [{'service_type': 'sequence', 'nodes': [{'node_id': 20656, 'original_score': 164.0, 'norm_score': 1.0, 'match_context': [{'sequence_identity': 0.987, 'evalue': 2.933e-47, 'bitscore': 164, 'alignment

# New API for advanced search

The old API will gradually migrate to use these functions

In [15]:
from pypdb.clients.search.search_client import perform_search
from pypdb.clients.search.search_client import ReturnType
from pypdb.clients.search.operators import text_operators

## Search for all entries that mention the word 'ribosome'

In [16]:
# Search for "ribosome" with the default operator and return entry IDs.
ribosome_operator = text_operators.DefaultOperator(value="ribosome")
results = perform_search(ribosome_operator, ReturnType.ENTRY)

print(results[:10])

Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "full_text", "parameters": {"value": "ribosome"}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 

['486D', '1T1M', '1QI7', '1MRI', '1AHC', '1MOM', '1MRH', '1WIH', '1MRJ', '7AFQ']


## Search for polymers from 'Mus musculus'

In [17]:
# Exact match on the source-organism lineage name, returned as polymer entities.
organism_attribute = "rcsb_entity_source_organism.taxonomy_lineage.name"
mouse_operator = text_operators.ExactMatchOperator(
    attribute=organism_attribute,
    value="Mus musculus",
)
results = perform_search(mouse_operator, ReturnType.POLYMER_ENTITY)

print(results[:5])

Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"attribute": "rcsb_entity_source_organism.taxonomy_lineage.name", "operator": "exact_match", "value": "Mus musculus"}}, "request_options": {"return_all_hits": true}, "return_type": "polymer_entity"} 

['12E8_1', '12E8_2', '15C8_1', '15C8_2', '1914_1']


## Search for non-polymers from 'Mus musculus' or 'Homo sapiens'

In [None]:
# Match either organism name and return non-polymer entities.
organism_names = ["Mus musculus", "Homo sapiens"]
mouse_or_human_operator = text_operators.InOperator(
    attribute="rcsb_entity_source_organism.taxonomy_lineage.name",
    values=organism_names,
)

results = perform_search(mouse_or_human_operator, ReturnType.NON_POLYMER_ENTITY)
print(results[:5])

## Search for polymer instances whose titles contain "actin" or "binding" or "protein"

In [None]:
# Word-level match against the structure title, returned as polymer instances.
title_words_operator = text_operators.ContainsWordsOperator(
    attribute="struct.title",
    value="actin-binding protein",
)
results = perform_search(title_words_operator, ReturnType.POLYMER_INSTANCE)

print(results[:5])

## Search for assemblies that contain the words "actin binding protein"
(must be in that order).

For example, "actin-binding protein" and "actin binding protein" will match,
but "protein binding actin" will not.

In [None]:
# Phrase-level match against the structure title, returned as assemblies.
title_phrase_operator = text_operators.ContainsPhraseOperator(
    attribute="struct.title",
    value="actin-binding protein",
)
results = perform_search(title_phrase_operator, ReturnType.ASSEMBLY, verbosity=True)

print(results[:5])

## Search for entries released in 2019 or later

In [None]:
# Entries whose initial release date is greater than the start of 2019.
release_cutoff = "2019-01-01T00:00:00Z"
released_after_operator = text_operators.ComparisonOperator(
    attribute="rcsb_accession_info.initial_release_date",
    comparison_type=text_operators.ComparisonType.GREATER,
    value=release_cutoff,
)
results = perform_search(released_after_operator, ReturnType.ENTRY)

print(results[:5])

## Search for entries released only in 2019

In [None]:
# Half-open date range: the start of 2019 is included, the start of 2020 is not.
range_start = "2019-01-01T00:00:00Z"
range_end = "2020-01-01T00:00:00Z"
released_in_2019_operator = text_operators.RangeOperator(
    attribute="rcsb_accession_info.initial_release_date",
    from_value=range_start,
    to_value=range_end,
    include_lower=True,
    include_upper=False,
)
results = perform_search(released_in_2019_operator, ReturnType.ENTRY)

print(results[:5])

## Search by cell length and suppress query output

In [4]:
from pypdb.clients.search.search_client import perform_search_with_graph, SearchService, ReturnType
from pypdb.clients.search.operators import text_operators

# Range filter on the unit-cell `a` length: 80 to 84, upper bound included.
cell_a_operator = text_operators.RangeOperator(
    from_value=80,
    to_value=84,
    include_upper=True,
    attribute='cell.length_a',
)

# verbosity=False suppresses the query printout that other cells show.
results = perform_search_with_graph(
    verbosity=False,
    return_type=ReturnType.ENTRY,
    query_object=cell_a_operator,
)

print(results[:5])

['190D', '1A0L', '1A0Z', '1A18', '1A2D']


## Search for structures under 4 angstroms of resolution

In [5]:
# Entries whose combined resolution value is less than 4.
max_resolution = 4
high_resolution_operator = text_operators.ComparisonOperator(
    attribute="rcsb_entry_info.resolution_combined",
    comparison_type=text_operators.ComparisonType.LESS,
    value=max_resolution,
)
results = perform_search(high_resolution_operator, ReturnType.ENTRY)

print(results[:5])

Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"operator": "less", "attribute": "rcsb_entry_info.resolution_combined", "value": 4}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 

['118D', '129D', '142L', '153D', '169L']


## Search for structures with a given attribute.

(Admittedly every structure has a release date, but the same logic would
 apply for a more sparse RCSB attribute).


In [None]:
# Entries for which the release-date attribute exists.
release_date_attribute = "rcsb_accession_info.initial_release_date"
has_release_date_operator = text_operators.ExistsOperator(attribute=release_date_attribute)
results = perform_search(has_release_date_operator, ReturnType.ENTRY)

print(results[:5])

## Search for 'Mus musculus' or 'Homo sapiens' structures with under 4 angstroms of resolution using graph search


In [None]:
from pypdb.clients.search.search_client import perform_search_with_graph
from pypdb.clients.search.search_client import ReturnType
from pypdb.clients.search.search_client import QueryGroup, LogicalOperator
from pypdb.clients.search.operators import text_operators

# SearchOperator associated with structures with under 4 Angstroms of resolution.
# "Under 4" requires ComparisonType.LESS; GREATER would select the opposite
# set (resolution values above 4) and contradict the operator's name.
under_4A_resolution_operator = text_operators.ComparisonOperator(
       value=4,
       attribute="rcsb_entry_info.resolution_combined",
       comparison_type=text_operators.ComparisonType.LESS)

# SearchOperator associated with entities containing 'Mus musculus' lineage
is_mus_operator = text_operators.ExactMatchOperator(
            value="Mus musculus",
            attribute="rcsb_entity_source_organism.taxonomy_lineage.name")

# SearchOperator associated with entities containing 'Homo sapiens' lineage
is_human_operator = text_operators.ExactMatchOperator(
            value="Homo sapiens",
            attribute="rcsb_entity_source_organism.taxonomy_lineage.name")

# QueryGroup associated with being either human or `Mus musculus`
is_human_or_mus_group = QueryGroup(
    queries = [is_mus_operator, is_human_operator],
    logical_operator = LogicalOperator.OR
)

# QueryGroup associated with being ((Human OR Mus) AND (Under 4 Angstroms))
is_under_4A_and_human_or_mus_group = QueryGroup(
    queries = [is_human_or_mus_group, under_4A_resolution_operator],
    logical_operator = LogicalOperator.AND
)

return_type = ReturnType.ENTRY

# Run the nested (graph) query and preview the first ten matching entry IDs.
results = perform_search_with_graph(
  query_object=is_under_4A_and_human_or_mus_group,
  return_type=return_type)
print("\n", results[:10]) # Huzzah