# WS_ch08B.ipynb
### WESmith 04/18/23

## INTRODUCING BIO.PDB
#### (see book code in Chapter08/PDB.py)

### WS created this notebook to follow along with code from the book
### 'Bioinformatics with Python Cookbook' by Tiago Antao
#### Each recipe will have its own notebook, suffixed by A, B, etc.


In [1]:
from Bio import PDB
from Bio.SeqIO import PdbIO, FastaIO
import os

In [2]:
# WS: relative path of the directory holding this chapter's PDB / FASTA files
data_dir = 'data/ch08_data'

In [3]:
# WS: PDBList object; its retrieve_pdb_file() method fetches structure files from the PDB server
repository = PDB.PDBList()

In [4]:
# WS: IPython introspection ('?') -- show the signature and docstring of retrieve_pdb_file
repository.retrieve_pdb_file?

[0;31mSignature:[0m
[0mrepository[0m[0;34m.[0m[0mretrieve_pdb_file[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpdb_code[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mobsolete[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpdir[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfile_format[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moverwrite[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Fetch PDB structure file from PDB server, and store it locally.

The PDB structure's file name is returned as a single string.
If obsolete ``==`` True, the file will be saved in a special file tree.

NOTE. The default download format has changed from PDB to PDBx/mmCif

:param pdb_code: 4-symbols structure Id from PDB (e.g. 3J92).
:type pdb_code: string

:param file_format:
    File format. Available options:

    * "mmCif" (default, PDBx/

In [5]:
# WS this downloads the pdb data to a local file suffixed with .ent
# (file_format='pdb' is passed explicitly because the default download format is mmCif;
#  the path returned by the last call is what the cell displays as its output)
repository.retrieve_pdb_file('1TUP', pdir=data_dir, file_format='pdb')
repository.retrieve_pdb_file('1OLG', pdir=data_dir, file_format='pdb')
repository.retrieve_pdb_file('1YCQ', pdir=data_dir, file_format='pdb')

Structure exists: 'data/ch08_data/pdb1tup.ent' 
Structure exists: 'data/ch08_data/pdb1olg.ent' 
Structure exists: 'data/ch08_data/pdb1ycq.ent' 


'data/ch08_data/pdb1ycq.ent'

In [6]:
# WS: parser used below to read the downloaded .ent (PDB-format) files into structure objects
parser = PDB.PDBParser()

In [7]:
# WS: IPython introspection ('?') -- show the signature and docstring of get_structure
parser.get_structure?

[0;31mSignature:[0m [0mparser[0m[0;34m.[0m[0mget_structure[0m[0;34m([0m[0mid[0m[0;34m,[0m [0mfile[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return the structure.

Arguments:
 - id - string, the id that will be used for the structure
 - file - name of the PDB file OR an open filehandle
[0;31mFile:[0m      ~/.venv_220526/lib/python3.10/site-packages/Bio/PDB/PDBParser.py
[0;31mType:[0m      method


# WS: local paths of the three downloaded PDB files (inside data_dir)
pdb1tup_file, pdb1olg_file, pdb1ycq_file = (
    os.path.join(data_dir, ent_name)
    for ent_name in ('pdb1tup.ent', 'pdb1olg.ent', 'pdb1ycq.ent')
)

In [9]:
# Parse each file into a structure object; the first argument is the id
# string stored on the structure, the second is the file to read.
p53_1tup, p53_1olg, p53_1ycq = (
    parser.get_structure(structure_id, pdb_path)
    for structure_id, pdb_path in (
        ('P 53 - DNA Binding', pdb1tup_file),
        ('P 53 - Tetramerization', pdb1olg_file),
        ('P 53 - Transactivation', pdb1ycq_file),
    )
)



In [10]:
def print_pdb_headers(headers, indent=0):
    """Pretty-print a (possibly nested) PDB header mapping to stdout.

    Each key is right-aligned in a 20-character column, shifted right by
    ``indent`` spaces.  Values are rendered according to their type:

    * mapping (``dict`` or any subclass) -- the key is printed on its own
      line, preceded by a blank line, and the nested mapping is printed
      recursively with the indent increased by 4, followed by a blank line;
    * ``list`` (or any subclass) -- the key is printed on its own line and
      each element on a following line, prefixed with ``->``;
    * anything else -- printed as ``key: value`` on a single line.

    :param headers: mapping of header names to values.
    :param indent: number of leading spaces for this nesting level.
    :returns: None; the function only prints.
    """
    ind_text = ' ' * indent
    for header, content in headers.items():
        # isinstance (rather than an exact type comparison) so that dict/list
        # subclasses such as OrderedDict are expanded instead of being dumped
        # as a single raw repr.
        if isinstance(content, dict):
            print('\n%s%20s:' % (ind_text, header))
            print_pdb_headers(content, indent + 4)
            print()
        elif isinstance(content, list):
            print('%s%20s:' % (ind_text, header))
            for elem in content:
                print('%s%21s %s' % (ind_text, '->', elem))
        else:
            print('%s%20s: %s' % (ind_text, header, content))

In [11]:
# WS: list the top-level field names of the parsed header dictionary
print(p53_1tup.header.keys()) # WS

dict_keys(['name', 'head', 'idcode', 'deposition_date', 'release_date', 'structure_method', 'resolution', 'structure_reference', 'journal_reference', 'author', 'compound', 'source', 'has_missing_residues', 'missing_residues', 'keywords', 'journal'])


In [12]:
# WS: pretty-print the whole 1TUP header (nested dicts are indented, list items are marked with '->')
print_pdb_headers(p53_1tup.header)

                name: tumor suppressor p53 complexed with dna
                head: antitumor protein/dna
              idcode: 1TUP
     deposition_date: 1995-07-11
        release_date: 1995-07-11
    structure_method: x-ray diffraction
          resolution: 2.2
 structure_reference:
                   -> n.p.pavletich,k.a.chambers,c.o.pabo the dna-binding domain of p53 contains the four conserved regions and the major mutation hot spots genes dev. v. 7 2556 1993 issn 0890-9369 
                   -> b.vogelstein,k.w.kinzler p53 function and dysfunction cell(cambridge,mass.) v. 70 523 1992 issn 0092-8674 
   journal_reference: y.cho,s.gorina,p.d.jeffrey,n.p.pavletich crystal structure of a p53 tumor suppressor-dna complex: understanding tumorigenic mutations. science v. 265 346 1994 issn 0036-8075 8023157 
              author: Y.Cho,S.Gorina,P.D.Jeffrey,N.P.Pavletich

            compound:

                       1:
                        misc: 
                    molecule: dna (5

In [13]:
# Raw dict dump of the 'compound' header entry of each structure, one per line
print(
    *(parsed.header['compound'] for parsed in (p53_1tup, p53_1olg, p53_1ycq)),
    sep='\n'
)

{'1': {'misc': '', 'molecule': "dna (5'-d(*tp*tp*tp*cp*cp*tp*ap*gp*ap*cp*tp*tp*gp*cp*cp*cp*a p*ap*tp*tp*a)-3') ", 'chain': 'e', 'engineered': 'yes'}, '2': {'misc': '', 'molecule': "dna (5'-d(*ap*tp*ap*ap*tp*tp*gp*gp*gp*cp*ap*ap*gp*tp*cp*tp*a p*gp*gp*ap*a)-3') ", 'chain': 'f', 'engineered': 'yes'}, '3': {'misc': '', 'molecule': 'protein (p53 tumor suppressor )', 'chain': 'a, b, c', 'engineered': 'yes'}}
{'1': {'misc': '', 'molecule': 'tumor suppressor p53 (oligomerization domain)', 'chain': 'a, b, c, d', 'engineered': 'yes'}}
{'1': {'misc': '', 'molecule': 'mdm2', 'chain': 'a', 'synonym': 'mdm2', 'engineered': 'yes'}, '2': {'misc': '', 'molecule': 'p53', 'chain': 'b', 'fragment': 'residues 13 - 29', 'engineered': 'yes'}}


In [14]:
print_pdb_headers(p53_1tup.header['compound'])  # WS: 1TUP 'compound' entry, one field per line instead of a raw dict dump


                   1:
                    misc: 
                molecule: dna (5'-d(*tp*tp*tp*cp*cp*tp*ap*gp*ap*cp*tp*tp*gp*cp*cp*cp*a p*ap*tp*tp*a)-3') 
                   chain: e
              engineered: yes


                   2:
                    misc: 
                molecule: dna (5'-d(*ap*tp*ap*ap*tp*tp*gp*gp*gp*cp*ap*ap*gp*tp*cp*tp*a p*gp*gp*ap*a)-3') 
                   chain: f
              engineered: yes


                   3:
                    misc: 
                molecule: protein (p53 tumor suppressor )
                   chain: a, b, c
              engineered: yes



In [15]:
print_pdb_headers(p53_1olg.header['compound'])  # WS: 1OLG 'compound' entry, one field per line instead of a raw dict dump


                   1:
                    misc: 
                molecule: tumor suppressor p53 (oligomerization domain)
                   chain: a, b, c, d
              engineered: yes



In [16]:
print_pdb_headers(p53_1ycq.header['compound'])  # WS: 1YCQ 'compound' entry, one field per line instead of a raw dict dump


                   1:
                    misc: 
                molecule: mdm2
                   chain: a
                 synonym: mdm2
              engineered: yes


                   2:
                    misc: 
                molecule: p53
                   chain: b
                fragment: residues 13 - 29
              engineered: yes



In [17]:
def describe_model(name, pdb):
    """Print a one-line summary for every chain of every model in ``pdb``.

    A blank line is printed first; then, for each chain, the label ``name``,
    the chain id, the number of residues (``len(chain)``) and the number of
    atoms (counted from ``chain.get_atoms()``).

    :param name: label printed at the start of each line.
    :param pdb: iterable of models, each an iterable of chain objects.
    :returns: None; the function only prints.
    """
    line_template = '%s - Chain: %s. Number of residues: %d. Number of atoms: %d.'
    print()
    for model in pdb:
        for chain in model:
            residue_count = len(chain)
            atom_count = len(list(chain.get_atoms()))
            print(line_template % (name, chain.id, residue_count, atom_count))

In [18]:
# Per-chain residue / atom counts for each of the three structures
for pdb_code, parsed in (('1TUP', p53_1tup), ('1OLG', p53_1olg), ('1YCQ', p53_1ycq)):
    describe_model(pdb_code, parsed)


1TUP - Chain: E. Number of residues: 43. Number of atoms: 442.
1TUP - Chain: F. Number of residues: 35. Number of atoms: 449.
1TUP - Chain: A. Number of residues: 395. Number of atoms: 1734.
1TUP - Chain: B. Number of residues: 265. Number of atoms: 1593.
1TUP - Chain: C. Number of residues: 276. Number of atoms: 1610.

1OLG - Chain: A. Number of residues: 42. Number of atoms: 698.
1OLG - Chain: B. Number of residues: 42. Number of atoms: 698.
1OLG - Chain: C. Number of residues: 42. Number of atoms: 698.
1OLG - Chain: D. Number of residues: 42. Number of atoms: 698.

1YCQ - Chain: A. Number of residues: 123. Number of atoms: 741.
1YCQ - Chain: B. Number of residues: 16. Number of atoms: 100.


In [19]:
# WS to explore chain object
dd = []
for model in p53_1tup:
    for chain in model:
        dd.append(chain)
ee = list(dd[0].get_atoms())

In [20]:
# Show only the residues whose id starts with something other than ' ' or 'W'
# (in Bio.PDB a blank hetero flag marks a standard residue and 'W' a water);
# for 1TUP what remains is the three zinc entries.
for residue in p53_1tup.get_residues():
    hetero_flag = residue.id[0]
    if hetero_flag not in (' ', 'W'):
        print(residue.id)

('H_ZN', 951, ' ')
('H_ZN', 952, ' ')
('H_ZN', 953, ' ')


In [21]:
# WS: get_residues() hands back a generator (see output), so wrap it in list() to index it
type(p53_1tup.get_residues())

generator

# WS to explore residue object: materialize the residue generator into a list,
# then print the id tuple of every residue in the structure
gg = list(p53_1tup.get_residues())
for k in gg: 
    print(k.id)

In [23]:
# WS: first residue of chain 'A' in model 0 (next() takes the first item from the generator)
res = next(p53_1tup[0]['A'].get_residues())
print(res)

<Residue SER het=  resseq=94 icode= >


In [24]:
# WS: iterating over a residue yields its atoms; show each atom with its serial number and element
for atom in res:
    print(atom, atom.serial_number, atom.element)

<Atom N> 858 N
<Atom CA> 859 C
<Atom C> 860 C
<Atom O> 861 O
<Atom CB> 862 C
<Atom OG> 863 O


In [25]:
# WS: index down the hierarchy -- structure[model id][chain id][residue number][atom name]
print(p53_1tup[0]['A'][94]['CA'])  # WS model, chain, residue, atom

<Atom CA>


In [26]:
# WS: keep the 'CA' atom of residue 94 (chain 'A', model 0) for inspection in the next cell
ff = p53_1tup[0]['A'][94]['CA']

In [27]:
# WS: full_id is the path to the atom (structure id, model, chain, residue id, atom id);
# coord is its 3-element float32 coordinate array (see output)
ff.full_id, ff.coord  # WS

(('P 53 - DNA Binding', 0, 'A', (' ', 94, ' '), ('CA', ' ')),
 array([75.562, 21.797, 80.653], dtype=float32))

In [28]:
def get_fasta(pdb_file, fasta_file, transfer_ids=None):
    """Write sequences read from a PDB file to a FASTA file.

    Records are read with ``PdbIO.PdbSeqresIterator``.  Records with an
    empty sequence are skipped; when ``transfer_ids`` is given, only records
    whose ``id`` is in it are kept.  Each kept record is echoed to stdout
    (id, sequence, length) and written through a ``FastaIO.FastaWriter``.

    :param pdb_file: PDB input, passed straight to ``PdbSeqresIterator``
        (the calls in this notebook pass an open file handle).
    :param fasta_file: output target handed to ``FastaWriter``
        (the calls in this notebook pass a handle opened for writing).
    :param transfer_ids: optional collection of record ids to keep;
        ``None`` keeps every non-empty record.
    :returns: None.  Neither handle is closed here.
    """
    writer = FastaIO.FastaWriter(fasta_file)
    writer.write_header()
    for record in PdbIO.PdbSeqresIterator(pdb_file):
        if len(record.seq) == 0:
            continue
        if transfer_ids is None or record.id in transfer_ids:
            print(record.id, record.seq, len(record.seq))
            writer.write_record(record)

In [29]:
# WS
fasta_1tup_file = os.path.join(data_dir, '1tup.fasta')
fasta_1olg_file = os.path.join(data_dir, '1olg.fasta')
fasta_1ycq_file = os.path.join(data_dir, '1ycq.fasta')

In [30]:
# Export chain B of each structure to its own FASTA file.
# The handles are opened in a `with` block so that both the PDB input and the
# FASTA output are closed (and the output flushed to disk) as soon as each
# file has been written, instead of being left open for the kernel's lifetime.
for pdb_path, fasta_path, chain_id in (
    (pdb1tup_file, fasta_1tup_file, '1TUP:B'),
    (pdb1olg_file, fasta_1olg_file, '1OLG:B'),
    (pdb1ycq_file, fasta_1ycq_file, '1YCQ:B'),
):
    with open(pdb_path) as pdb_handle, open(fasta_path, 'w') as fasta_handle:
        get_fasta(pdb_handle, fasta_handle, transfer_ids=[chain_id])

1TUP:B SSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNT 219
1OLG:B KKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPG 42
1YCQ:B PLSQETFSDLWKLLPEN 17
