# WS_ch08C.ipynb
### WESmith 04/18/23

## EXTRACTING MORE INFO FROM A PDB FILE
#### (see book code in Chapter08/Stats.py)

### WS created this notebook to follow along with code from the book
### 'Bioinformatics with Python Cookbook' by Tiago Antao
#### Each recipe will have its own notebook, suffixed by A, B, etc.


In [None]:
from collections import defaultdict
import sys
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
# #%matplotlib inline
from Bio import PDB
import os

In [None]:
import utils as ws

In [None]:
# WS relative directory that holds the downloaded PDB file (used as pdir and when building the file path)
data_dir  = 'data'

In [None]:
repository = PDB.PDBList()    # WS used below to download the PDB entry
parser     = PDB.PDBParser()  # WS used below to parse the downloaded file into a structure

In [None]:
# WS download entry 1TUP in 'pdb' format into data_dir
repository.retrieve_pdb_file('1TUP', pdir=data_dir, file_format='pdb')

In [None]:
# WS path of the 1TUP file inside data_dir
#    NOTE(review): presumably 'pdb1tup.ent' is the name retrieve_pdb_file gives the download -- confirm
pdb1tup_file = os.path.join(data_dir, 'pdb1tup.ent')

In [None]:
# WS parse the PDB file into a structure object; 'P 53' is the id passed for the structure
p53_1tup = parser.get_structure('P 53', pdb1tup_file)

In [None]:
ws.attrs(p53_1tup) # WS examine the structure object with the attrs() helper from the local utils module

In [None]:
# WS three tallies built in one pass over every atom of the structure:
#    atom_chain     : atoms per chain id (water included)
#    atom_res_types : atoms per residue name (water included)
#    atom_cnt       : atoms per element, skipping atoms that belong to water ('HOH')
atom_cnt = defaultdict(int)
atom_chain = defaultdict(int)
atom_res_types = defaultdict(int)

for atom in p53_1tup.get_atoms():
    my_residue = atom.parent
    my_chain = my_residue.parent
    res_name = my_residue.resname
    atom_chain[my_chain.id] += 1
    atom_res_types[res_name] += 1
    if res_name == 'HOH':
        # water atoms are left out of the per-element count only
        continue
    atom_cnt[atom.element] += 1

In [None]:
# test of print_dict(): sample input mixing str, int and float keys,
# with dicts nested inside dicts
example_dict = {
    'abcde': 'hello',
    12345: 'whereami',
    'overhere': {98.4: 'temp'},
    'keyzee': {
        'subdict': {
            'keykey': 'valuevalue',
            'keybee': -54,
            3.1415: 109876,
        },
        'lastkey': 'lastval',
    },
}

In [None]:
ws.print_dict(example_dict) # WS defaults: no optional formatting arguments passed

In [None]:
# WS same dict again, this time with explicit leader, keywidth, indent and sep arguments
ws.print_dict(example_dict, leader=';-)', keywidth=9, indent=2, sep=1)

In [None]:
# WS print the three atom tallies, with a blank line between tables
ws.print_dict(atom_res_types, leader='residuals', sep=4)
print()
ws.print_dict(atom_chain, leader='chain', sep=2)
print()
ws.print_dict(atom_cnt, leader='count', sep=3)

In [None]:
list(p53_1tup.get_residues()) # WS explore structure: show every residue as a list

In [None]:
# WS check whether an object is a dict.
#    The original cell tested `tt`, which is not defined in any visible cell
#    (NameError on a fresh run); example_dict is used here as a stand-in.
#    isinstance() is used instead of `type(x) == dict` so that dict subclasses
#    (e.g. defaultdict) are also recognized.
isinstance(example_dict, dict)

In [None]:
# WS two tallies built in one pass over every residue of the structure:
#    res_types     : number of residues per residue name (water included)
#    res_per_chain : number of residues per chain id, water excluded
res_types = defaultdict(int)
res_per_chain = defaultdict(int)

for residue in p53_1tup.get_residues():
    res_name = residue.resname
    res_types[res_name] += 1
    # WS water is kept out of res_per_chain so that nucleotides add up with
    #    the res_types count and the per-chain count is pure residues
    #    (eg amino acids only)
    if res_name == 'HOH':
        continue
    res_per_chain[residue.parent.id] += 1

In [None]:
# note: 30 CYS + 18 MET = 48 sulfur atoms (CYS and MET each have 1 sulfur atom),
#       consistent with the sulfur count above
# WS the defaultdicts are converted to plain dicts before printing
ws.print_dict(dict(res_types), leader='residue counts')
print()
ws.print_dict(dict(res_per_chain), leader='res per chain')

In [None]:
# WS scratch check: list repetition gives a 3-element list, each entry sys.maxsize
[sys.maxsize] * 3

In [None]:
def get_bounds(my_atoms):
    """Return the per-axis minimum and maximum coordinates of some atoms.

    Args:
        my_atoms: iterable of objects with a ``coord`` attribute that yields
            three comparable numbers (presumably x, y, z).

    Returns:
        A tuple ``(my_min, my_max)`` of two 3-element lists holding the
        smallest and largest value seen on each axis. For empty input the
        lists stay at ``[inf] * 3`` and ``[-inf] * 3``.
    """
    # +/-inf sentinels instead of +/-sys.maxsize: every finite coordinate,
    # however large in magnitude, compares inside them and so replaces them.
    my_min = [float('inf')] * 3
    my_max = [float('-inf')] * 3
    for atom in my_atoms:
        for i, coord in enumerate(atom.coord):
            if coord < my_min[i]:
                my_min[i] = coord
            if coord > my_max[i]:
                my_max[i] = coord
    return my_min, my_max

In [None]:
# WS bounding box of every chain, printed and stored by chain id
chain_bounds = {}
for chain in p53_1tup.get_chains():
    # compute once and reuse: each get_bounds() call walks all atoms of the chain
    bounds = get_bounds(chain.get_atoms())
    print(chain.id, bounds)
    chain_bounds[chain.id] = bounds

In [None]:
print(get_bounds(p53_1tup.get_atoms())) # show (min, max) bounds over all atoms of the structure

In [None]:
# Figure layout: one full-size 3-D scatter, plus three small 2-D projection
# panels placed on top of it in the left column of a 3x3 grid.
# WS a nice feature: can add subplots on top of subplots
fig = plt.figure(figsize=(16, 9))
ax3d = fig.add_subplot(111, projection='3d')
ax_xy, ax_xz, ax_zy = (fig.add_subplot(slot) for slot in (331, 334, 337))
for panel, title in ((ax_xy, 'X/Y'), (ax_xz, 'X/Z'), (ax_zy, 'Z/Y')):
    panel.set_title(title)

# WS one color per chain id; DNA strands colored cyan, magenta instead of grays
color = dict(A='r', B='g', C='b', E='c', F='m')

zx, zy, zz = [], [], []  # zinc positions, pooled over all chains
for chain in p53_1tup.get_chains():
    xs, ys, zs = [], [], []  # positions plotted for this chain
    for residue in chain.get_residues():
        # WS only the FIRST atom of each residue is plotted;
        #    get_atoms() (clearer than get_iterator(), works the same)
        #    gives an iterator, so next() returns that first atom
        ref_atom = next(residue.get_atoms())
        x, y, z = ref_atom.coord
        # zinc goes to the shared zinc lists, everything else to the chain lists
        dest = (zx, zy, zz) if ref_atom.element == 'ZN' else (xs, ys, zs)
        for bucket, value in zip(dest, (x, y, z)):
            bucket.append(value)
    chain_color = color[chain.id]
    ax3d.scatter(xs, ys, zs, color=chain_color)
    for panel, horiz, vert in ((ax_xy, xs, ys), (ax_xz, xs, zs), (ax_zy, zs, ys)):
        panel.scatter(horiz, vert, marker='.', color=chain_color)

ax3d.set_xlabel('X')
ax3d.set_ylabel('Y')
ax3d.set_zlabel('Z')
ax3d.scatter(zx, zy, zz, color='k', marker='v', s=300)
# WS zinc markers on the three projections
for panel, horiz, vert in ((ax_xy, zx, zy), (ax_xz, zx, zz), (ax_zy, zz, zy)):
    panel.scatter(horiz, vert, color='k', marker='v', s=80)
# hide both axes of the small projection panels
for ax in (ax_xy, ax_xz, ax_zy):
    ax.get_yaxis().set_visible(False)
    ax.get_xaxis().set_visible(False)

In [None]:
# WS explore the object structures
ws.attrs(p53_1tup.get_chains(), skip=False)
# WS the object returned by get_chains() IS an iterator: it has a __next__

In [None]:
# WS collect the chains into a list so they can be indexed below
dd = list(p53_1tup.get_chains())
dd

In [None]:
# WS chain object is NOT an iterator
#    NOTE(review): originally noted as "NOT an iterable"; presumably the point is a missing __next__ -- confirm in the output
ws.attrs(dd[2], skip=False)

In [None]:
ws.attrs(dd[2].get_residues(), skip=False)  # WS the object returned by get_residues() IS an iterator (ie generator)

In [None]:
# WS collect the residues of chain dd[2] into a list and inspect the first one
ee = list(dd[2].get_residues())
ws.attrs(ee[0], skip=False)
# WS  residue object is NOT an iterator: it is missing a __next__:
#     get_iterator() or get_atoms() returns an iterator over it (see plotting code above)

In [None]:
ee[0]  # WS display the first residue of chain dd[2]

In [None]:
# WS collect everything get_iterator() yields for the first residue
hh = list(ee[0].get_iterator())
hh

In [None]:
# WS use get_atoms() instead of get_iterator(): same result, but clearer
list(ee[0].get_atoms())

In [None]:
# WS list() iterates the residue object directly, so the residue IS iterable;
#    presumably this yields the same items as get_iterator() -- compare the outputs to confirm
list(ee[0])

In [None]:
# WS inspect the first item yielded by the residue's get_iterator()
ws.attrs(hh[0], skip=False) # WS atom object not an iterable