# pypdb demos

This is a set of basic examples of the usage and outputs of the various individual functions included in pypdb. There are generally three types of functions.

### Preamble

In [1]:
%matplotlib inline
from IPython.display import HTML

# Import from local directory
# import sys
# sys.path.insert(0, '../pypdb')
# from pypdb import *

# Import from installed package
from pypdb import *

%load_ext autoreload
%autoreload 2

# Search functions that return lists of PDB IDs

#### Get a list of PDBs for a specific search term

In [2]:
# Build a Query for the term "ribosome" without naming a query type, run it,
# and show the first ten results.
found_pdbs = Query("ribosome").search()
# NOTE(review): the recorded run shows search() can return None, in which case
# this slice raises TypeError.
print(found_pdbs[:10])



TypeError: 'NoneType' object is not subscriptable

#### Search by PubMed ID Number

In [21]:
# Display the request parameters that Query builds for a PubMed ID lookup;
# .search() is not called in this cell.
Query(27499440, "PubmedIdQuery").scan_params

{'query': {'type': 'terminal',
  'service': 'text',
  'parameters': {'operator': 'in',
   'negation': False,
   'value': [27499440],
   'attribute': 'rcsb_pubmed_container_identifiers.pubmed_id'}},
 'return_type': 'entry',
 'request_options': {'return_all_hits': True}}

In [22]:
# Run the PubMed ID query (same arguments as the scan_params cell) and show
# the first ten results.
found_pdbs = Query(27499440, "PubmedIdQuery").search()
print(found_pdbs[:10])

TypeError: 'NoneType' object is not subscriptable

#### Search by source organism using NCBI TaxId

In [4]:
# Query by NCBI taxonomy ID (passed as a string) using the 'TreeEntityQuery'
# query type, then show the first five results.
found_pdbs = Query('6239', 'TreeEntityQuery').search() #TaxID for C elegans
print(found_pdbs[:5])

TypeError: 'NoneType' object is not subscriptable

#### Search by a specific experimental method

In [5]:
# Query by experimental method name using the 'ExpTypeQuery' query type,
# then show the first ten results.
found_pdbs = Query('SOLID-STATE NMR', query_type='ExpTypeQuery').search()
print(found_pdbs[:10])

TypeError: 'NoneType' object is not subscriptable

#### Search by protein structure similarity

In [6]:
# Use the PDB ID '2E8D' as the search term for a "structure" query, then show
# the first ten results.
found_pdbs = Query('2E8D', query_type="structure").search()
print(found_pdbs[:10])

TypeError: 'NoneType' object is not subscriptable

#### Search by Author

In [7]:
# Query by author name using the 'AdvancedAuthorQuery' query type.
found_pdbs = Query('Perutz, M.F.', query_type='AdvancedAuthorQuery').search()
# The whole result is printed (no slicing); the recorded run printed None.
print(found_pdbs)

None


#### Search by organism

In [8]:
# Build the organism query first, then run it and show the first ten results.
q = Query("Dictyostelium", query_type="OrganismQuery")
# NOTE(review): slicing the return value directly raises TypeError when
# search() returns None, as in the recorded run.
print(q.search()[:10])

TypeError: 'NoneType' object is not subscriptable

#### Search by Uniprot ID

In [9]:
# Query by UniProt accession using the "uniprot" query type, then show the
# first five results.
uniprot_info = Query("A0A023GPI8", query_type="uniprot").search()
print(uniprot_info[:5])

TypeError: 'NoneType' object is not subscriptable

#### Search by PFAM number

In [10]:
# Query by PFAM identifier using the "pfam" query type, then show the first
# five results.
pfam_info = Query("PF00008", query_type="pfam").search()
print(pfam_info[:5])

TypeError: 'NoneType' object is not subscriptable

# Information Search functions
While the basic functions described in the previous section are useful for looking up and manipulating individual unique entries, these functions are intended to be more user-facing: they take search keywords and return lists of authors or dates.

#### Find papers for a given keyword

In [11]:
# Look up papers for the keyword 'crispr', asking for at most ten results.
matching_papers = find_papers('crispr', max_results=10)
# list() materialises the return value before slicing.
# NOTE(review): the recorded run ended in a TypeError about a NoneType value.
print(list(matching_papers)[:10])

TypeError: 'NoneType' object is not subscriptable

# Functions that return information about single PDB IDs

#### Get the full PDB file

In [12]:
# Fetch entry 4lza in 'cif' format with compression turned off. The recorded
# log line reports the file is fetched as a string.
pdb_file = get_pdb_file('4lza', filetype='cif', compression=False)
# Show only the first 400 characters to keep the output short.
print(pdb_file[:400])



Sending GET request to https://files.rcsb.org/download/4lza.cif to fetch 4lza's cif file as a string.
data_4LZA
# 
_entry.id   4LZA 
# 
_audit_conform.dict_name       mmcif_pdbx.dic 
_audit_conform.dict_version    5.382 
_audit_conform.dict_location   http://mmcif.pdb.org/dictionaries/ascii/mmcif_pdbx.dic 
# 
loop_
_database_2.database_id 
_database_2.database_code 
_database_2.pdbx_database_accession 
_database_2.pdbx_DOI 
PDB   4LZA         pdb_00004lza 10.2210/pdb4lza/pdb 
RCSB  RCSB081269   ? 


#### Get a general description of the entry's metadata

In [13]:
# Retrieve the metadata for entry 4LZA; the result exposes .keys().
all_info = get_info('4LZA')
# Print just the top-level section names rather than the full contents.
print(list(all_info.keys()))

['audit_author', 'cell', 'citation', 'diffrn', 'diffrn_detector', 'diffrn_radiation', 'diffrn_source', 'entry', 'exptl', 'exptl_crystal', 'exptl_crystal_grow', 'pdbx_sgproject', 'pdbx_audit_revision_category', 'pdbx_audit_revision_details', 'pdbx_audit_revision_group', 'pdbx_audit_revision_history', 'pdbx_audit_revision_item', 'pdbx_database_related', 'pdbx_database_status', 'pdbx_initial_refinement_model', 'rcsb_accession_info', 'rcsb_entry_container_identifiers', 'rcsb_entry_info', 'rcsb_primary_citation', 'refine', 'refine_hist', 'refine_ls_restr', 'reflns', 'reflns_shell', 'software', 'struct', 'struct_keywords', 'symmetry', 'rcsb_id']


#### Run a Sequence search

Formerly using BLAST, this method now uses MMseqs2

In [14]:
# Sequence query: the search term is an amino-acid sequence, and results are
# requested at the "polymer_entity" level.
q = Query("VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTAVAHVDDMPNAL", 
          query_type="sequence", 
          return_type="polymer_entity")

# The full result is printed without slicing; the recorded run printed None.
print(q.search())

None


# New API for advanced search

The old API will gradually migrate to use these functions

In [15]:
from pypdb.clients.search.search_client import perform_search
from pypdb.clients.search.search_client import ReturnType
from pypdb.clients.search.operators import text_operators

## Search for all entries that mention the word 'ribosome'

In [16]:
# DefaultOperator carries only a value (no attribute); the logged request in
# the recorded output uses the "full_text" service for it.
search_operator = text_operators.DefaultOperator(value="ribosome")
# Return matches as whole entries.
return_type = ReturnType.ENTRY

results = perform_search(search_operator, return_type)

# Show the first ten results.
print(results[:10])

Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "full_text", "parameters": {"value": "ribosome"}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 





HTTPError: 415 Client Error:  for url: https://search.rcsb.org/rcsbsearch/v2/query

## Search for polymers from 'Mus musculus'

In [17]:
# Exact match of "Mus musculus" against the source-organism taxonomy lineage
# name attribute.
search_operator = text_operators.ExactMatchOperator(value="Mus musculus",
                                                    attribute="rcsb_entity_source_organism.taxonomy_lineage.name")
# Return matches at the polymer-entity level.
return_type = ReturnType.POLYMER_ENTITY

results = perform_search(search_operator, return_type)

# Show the first five results.
print(results[:5])

Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"attribute": "rcsb_entity_source_organism.taxonomy_lineage.name", "operator": "exact_match", "value": "Mus musculus"}}, "request_options": {"return_all_hits": true}, "return_type": "polymer_entity"} 





HTTPError: 415 Client Error:  for url: https://search.rcsb.org/rcsbsearch/v2/query

## Search for non-polymers from 'Mus musculus' or 'Homo sapiens'

In [18]:
# InOperator takes a list of accepted values for one attribute; the logged
# request in the recorded output uses the "in" operator.
search_operator = text_operators.InOperator(values=["Mus musculus", "Homo sapiens"],
                                            attribute="rcsb_entity_source_organism.taxonomy_lineage.name")
# Return matches at the non-polymer-entity level.
return_type = ReturnType.NON_POLYMER_ENTITY

results = perform_search(search_operator, return_type)
# Show the first five results.
print(results[:5])

Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"attribute": "rcsb_entity_source_organism.taxonomy_lineage.name", "operator": "in", "value": ["Mus musculus", "Homo sapiens"]}}, "request_options": {"return_all_hits": true}, "return_type": "non_polymer_entity"} 





HTTPError: 415 Client Error:  for url: https://search.rcsb.org/rcsbsearch/v2/query

## Search for polymer instances whose titles contain "actin" or "binding" or "protein"

In [19]:
# ContainsWordsOperator on the structure title; the logged request in the
# recorded output uses the "contains_words" operator.
search_operator = text_operators.ContainsWordsOperator(value="actin-binding protein",
                                            attribute="struct.title")
# Return matches at the polymer-instance level.
return_type = ReturnType.POLYMER_INSTANCE

results = perform_search(search_operator, return_type)

# Show the first five results.
print(results[:5])

Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"attribute": "struct.title", "operator": "contains_words", "value": "actin-binding protein"}}, "request_options": {"return_all_hits": true}, "return_type": "polymer_instance"} 





HTTPError: 415 Client Error:  for url: https://search.rcsb.org/rcsbsearch/v2/query

## Search for assemblies that contain the words "actin binding protein"
(must be in that order).

For example, "actin-binding protein" and "actin binding protein" will match,
but "protein binding actin" will not.

In [None]:
# ContainsPhraseOperator on the structure title, using the same value as the
# contains-words example.
search_operator = text_operators.ContainsPhraseOperator(value="actin-binding protein",
                                            attribute="struct.title")
# Return matches at the assembly level.
return_type = ReturnType.ASSEMBLY

# verbosity is passed explicitly here, unlike the other perform_search calls.
results = perform_search(search_operator, return_type, verbosity=True)

# Show the first five results.
print(results[:5])

## Search for entries released in 2019 or later

In [None]:
# Compare the initial release date against the timestamp 2019-01-01T00:00:00Z
# using the GREATER comparison type.
search_operator = text_operators.ComparisonOperator(
       value="2019-01-01T00:00:00Z",
       attribute="rcsb_accession_info.initial_release_date",
       comparison_type=text_operators.ComparisonType.GREATER)
# Return matches as whole entries.
return_type = ReturnType.ENTRY

results = perform_search(search_operator, return_type)

# Show the first five results.
print(results[:5])

## Search for entries released only in 2019

In [None]:
# Range on the initial release date: lower bound 2019-01-01 included, upper
# bound 2020-01-01 excluded, per the include_lower / include_upper flags.
search_operator = text_operators.RangeOperator(
    from_value="2019-01-01T00:00:00Z",
    to_value="2020-01-01T00:00:00Z",
    include_lower=True,
    include_upper=False,
    attribute="rcsb_accession_info.initial_release_date")
# Return matches as whole entries.
return_type = ReturnType.ENTRY

results = perform_search(search_operator, return_type)

# Show the first five results.
print(results[:5])

## Search by cell length and suppress query output

In [4]:
# NOTE(review): SearchService is imported here but not referenced in this cell.
from pypdb.clients.search.search_client import perform_search_with_graph, SearchService, ReturnType
from pypdb.clients.search.operators import text_operators

# Numeric range on the unit-cell 'a' length: from 80 to 84, with the upper
# bound explicitly included. include_lower is left at its default.
cell_a_operator = text_operators.RangeOperator(
    attribute='cell.length_a',
    from_value=80,
    to_value=84,
    include_upper=True
)

# verbosity=False is what suppresses the query log that other cells print.
results = perform_search_with_graph(
    query_object=cell_a_operator,
    return_type=ReturnType.ENTRY,
    verbosity=False,
)

# Show the first five results.
print(results[:5])

['190D', '1A0L', '1A0Z', '1A18', '1A2D']


## Search for structures under 4 angstroms of resolution

In [6]:
# Compare the combined resolution attribute against 4 using the LESS
# comparison type.
search_operator = text_operators.ComparisonOperator(
           value=4,
           attribute="rcsb_entry_info.resolution_combined",
           comparison_type=text_operators.ComparisonType.LESS)
# Return matches as whole entries.
return_type = ReturnType.ENTRY

results = perform_search(search_operator, return_type)

# Show the first five results.
print(results[:5])

Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"operator": "less", "attribute": "rcsb_entry_info.resolution_combined", "value": 4}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 





HTTPError: 415 Client Error:  for url: https://search.rcsb.org/rcsbsearch/v2/query

## Search for structures with a given attribute.

(Admittedly every structure has a release date, but the same logic would
 apply for a more sparse RCSB attribute).


In [5]:
# ExistsOperator takes only an attribute name; the logged request in the
# recorded output uses the "exists" operator with no value.
search_operator = text_operators.ExistsOperator(
    attribute="rcsb_accession_info.initial_release_date")
# Return matches as whole entries.
return_type = ReturnType.ENTRY

results = perform_search(search_operator, return_type)

# Show the first five results.
print(results[:5])

Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"operator": "exists", "attribute": "rcsb_accession_info.initial_release_date"}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 





HTTPError: 415 Client Error:  for url: https://search.rcsb.org/rcsbsearch/v2/query

## Search for 'Mus musculus' or 'Homo sapiens' structures under 4 angstroms of resolution using graph search


In [None]:
from pypdb.clients.search.search_client import perform_search_with_graph
from pypdb.clients.search.search_client import ReturnType
from pypdb.clients.search.search_client import QueryGroup, LogicalOperator
from pypdb.clients.search.operators import text_operators

# SearchOperator associated with structures with under 4 Angstroms of resolution.
# "Under 4" means a value smaller than 4, so the comparison type must be LESS
# (GREATER would select the opposite set: structures above 4 Angstroms).
under_4A_resolution_operator = text_operators.ComparisonOperator(
       value=4,
       attribute="rcsb_entry_info.resolution_combined",
       comparison_type=text_operators.ComparisonType.LESS)

# SearchOperator associated with entities containing 'Mus musculus' lineage
is_mus_operator = text_operators.ExactMatchOperator(
            value="Mus musculus",
            attribute="rcsb_entity_source_organism.taxonomy_lineage.name")

# SearchOperator associated with entities containing 'Homo sapiens' lineage
is_human_operator = text_operators.ExactMatchOperator(
            value="Homo sapiens",
            attribute="rcsb_entity_source_organism.taxonomy_lineage.name")

# QueryGroup associated with being either human or `Mus musculus`
is_human_or_mus_group = QueryGroup(
    queries = [is_mus_operator, is_human_operator],
    logical_operator = LogicalOperator.OR
)

# QueryGroup associated with being ((Human OR Mus) AND (Under 4 Angstroms)).
# Groups nest: the OR group above is one member of this AND group.
is_under_4A_and_human_or_mus_group = QueryGroup(
    queries = [is_human_or_mus_group, under_4A_resolution_operator],
    logical_operator = LogicalOperator.AND
)

# Return matches as whole entries.
return_type = ReturnType.ENTRY

results = perform_search_with_graph(
  query_object=is_under_4A_and_human_or_mus_group,
  return_type=return_type)
# Show the first ten results.
print("\n", results[:10]) # Huzzah