In [1]:
from Bio import Align
import mdtraj as md
import nglview as nv
import numpy as np
import pandas as pd

from itertools import combinations
import os
import requests



### Download the file for Estrogen receptor with pdb id 1QKU

In [2]:
def fetch_pdb(pdb_id, download_path="./"):
        if len(pdb_id) != 4:
            raise ValueError("Invalid pdb id")
        url = 'http://files.rcsb.org/download/{}.pdb'.format(pdb_id)
        try:
            res = requests.get(url, allow_redirects=True)
        except:
            print("Could not fetch pdb from {}".format(url))
            return 
        
        file_path = os.path.join(download_path, pdb_id + ".pdb")
        with open(file_path, "wb") as f:
            f.write(res.content)

In [3]:
fetch_pdb("1QKU")

<div class="alert alert-success">
<strong>Well Done!</strong> Let me show you a couple of things that could improve your function
</div>

In [4]:
import re

# Have look to https://bioportal.bioontology.org/ontologies/EDAM?p=classes&conceptid=data_1127
pattern= re.compile('[0-9][a-zA-Z_0-9]{3}')

def fetch_pdb(pdb_id, download_path="./"):
    
    if not pattern.match(pdb_id):
        raise ValueError("Invalid pdb id")
        
    url = 'http://files.rcsb.org/download/{}.pdb'.format(pdb_id)
    res = requests.get(url, allow_redirects=True)
    
    # I am not sure if a failed request raise an exception. The failing code is stored
    # in the attribute 'status_code'. For instance, if everything works properly:
    # res.status_code == 200
    
    if res.status_code != requests.codes.ok:
        print("Could not fetch pdb from {}".format(url))
        return 
        
    file_path = os.path.join(download_path, pdb_id + ".pdb")
    with open(file_path, "wb") as f:
        f.write(res.content)

### Visualizing Estrogen receptor

In [5]:
view = nv.show_structure_file("./1QKU.pdb")
view

NGLWidget()

## Inspecting the topology of estrogen receptor

In [6]:
traj = md.load("1QKU.pdb")
topology = traj.topology
print(traj)
print(topology)

<mdtraj.Trajectory with 1 frames, 6596 atoms, 1343 residues, and unitcells>
<mdtraj.Topology with 9 chains, 1343 residues, 6596 atoms, 6123 bonds>


<div class="alert alert-warning">
<strong>What?</strong> Where are the 9 chains in the above nglview representation? And why if you pass the mouse by every molecule, the id label unfolded says that there is only a chain named "A". 
</div>

In [7]:
view = nv.show_mdtraj(traj)
view

NGLWidget()

# Select just protein.

In [8]:
protein_inxs = topology.select("protein")
protein_traj = traj.atom_slice(protein_inxs)
protein_topo = protein_traj.topology
print(protein_traj)
print(protein_topo)

<mdtraj.Trajectory with 1 frames, 5940 atoms, 744 residues, and unitcells>
<mdtraj.Topology with 3 chains, 744 residues, 5940 atoms, 6054 bonds>


In [9]:
# Split protein into its 3 chains
protein_1 = protein_traj.atom_slice(protein_topo.select("chainid == 0"))
protein_2 = protein_traj.atom_slice(protein_topo.select("chainid == 1"))
protein_3 = protein_traj.atom_slice(protein_topo.select("chainid == 2"))
print(protein_1)
print(protein_2)
print(protein_3)

<mdtraj.Trajectory with 1 frames, 1990 atoms, 250 residues, and unitcells>
<mdtraj.Trajectory with 1 frames, 1975 atoms, 247 residues, and unitcells>
<mdtraj.Trajectory with 1 frames, 1975 atoms, 247 residues, and unitcells>


In [10]:
# Save receptor to file
protein_1.save_pdb("./receptor_1.pdb")

In [11]:
view = nv.show_mdtraj(protein_1)
view

NGLWidget()

In [12]:
view = nv.show_mdtraj(protein_traj.atom_slice(protein_topo.select("chainid 1 to 2")))
view

NGLWidget()

<div class="alert alert-info">
<strong>Note:</strong> Apparently this is not the whole receptor. This is a segment known as the 'binding domain'. And the protein is found as homodimer under certain circunstances. I didn't read much yet to about its function and the effect of the ligands. Some of them might inhibit the dimerization. There must probably be some conformational changes induced by some ligands when they bind. <a href='https://www.uniprot.org/uniprot/P03372'>Have a look to the uniprot database (the uniprot id can be found in the RCSB PDB web page: P03372)</a>
</div>

## Compare the sequence of each protein

In [13]:
prot_1_sequence = [res.name for res in protein_1.topology.residues]
prot_2_sequence = [res.name for res in protein_2.topology.residues]
prot_3_sequence = [res.name for res in protein_3.topology.residues]

# Protein 2 and 3 have 247 resiudes. Protein 1 has 250. List must be of same length
# for pandas dataframe

prot_2_sequence += ["", "", ""]
prot_3_sequence += ["", "", ""]

sequences = {
    "Protein_1": prot_1_sequence,
    "Protein_2": prot_2_sequence,
    "Protein_3": prot_3_sequence,
}

sequences = pd.DataFrame.from_dict(sequences)

In [14]:
sequences.head(10)

Unnamed: 0,Protein_1,Protein_2,Protein_3
0,SER,ASN,ASN
1,LYS,SER,SER
2,LYS,LEU,LEU
3,ASN,ALA,ALA
4,SER,LEU,LEU
5,LEU,SER,SER
6,ALA,LEU,LEU
7,LEU,THR,THR
8,SER,ALA,ALA
9,LEU,ASP,ASP


In [15]:
# Transform aminoacids to one letter code
aa_code = {
    "ALA": "A","ARG": "R","ASN": "N","ASP": "D",
    "CYS": "C","GLU": "E","GLN": "Q","GLY": "G",
    "HIS": "H","ILE": "I","LEU": "L","LYS": "K",
    "MET": "M","PHE": "F","PRO": "P","SER": "S",
    "THR": "T","TRP": "W","TYR": "Y","VAL": "V"
}

prot_1_sequence = [aa_code[res.name] for res in protein_1.topology.residues]
prot_2_sequence = [aa_code[res.name] for res in protein_2.topology.residues]
prot_3_sequence = [aa_code[res.name] for res in protein_3.topology.residues]


sequences = {
    "Protein_1": prot_1_sequence,
    "Protein_2": prot_2_sequence + [""] * 3,
    "Protein_3": prot_3_sequence + [""] * 3,
}

sequences = pd.DataFrame.from_dict(sequences)
sequences.head(10)

Unnamed: 0,Protein_1,Protein_2,Protein_3
0,S,N,N
1,K,S,S
2,K,L,L
3,N,A,A
4,S,L,L
5,L,S,S
6,A,L,L
7,L,T,T
8,S,A,A
9,L,D,D


In [16]:
# Convert sequences to string
prot_1_sequence = "".join(prot_1_sequence)
prot_2_sequence = "".join(prot_2_sequence)
prot_3_sequence = "".join(prot_3_sequence)
print(prot_2_sequence)

NSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHQRLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLH


## Sequence Alignment

In [44]:
sequences = [prot_1_sequence, prot_2_sequence, prot_3_sequence]
aligner = Align.PairwiseAligner()

for i, j in combinations([1, 2, 3], 2):
    seq_i = sequences[i - 1] 
    seq_j = sequences[j - 1]
    alignments = aligner.align(seq_i, seq_j)
    print("Alignment between protein {} and protein {}".format(i, j))
    print("Number of alignments: {}".format(len(alignments)))
    for align in alignments:
        print("Score = {:.1f}\n".format(align.score))
        print(align)

Alignment between protein 1 and protein 2
Number of alignments: 1
Score = 247.0

SKKNSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHQRLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLH
---|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
---NSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWLEILMIGLVWRSMEHPGKLLFAPNLLLDRNQGKCVEGMVEIFDMLLATSSRFRMMNLQGEEFVCLKSIILLNSGVYTFLSSTLKSLEEKDHIHRVLDKITDTLIHLMAKAGLTLQQQHQRLAQLLLILSHIRHMSNKGMEHLYSMKCKNVVPLYDLLLEMLDAHRLH

Alignment between protein 1 and protein 3
Number of alignments: 1
Score = 247.0

SKKNSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNLADRELVHMINWAKRVPGFVDLTLHDQVHLLECAWL

<div class="alert alert-danger">
<strong>What's a sequence alignment?</strong> Performing a sequence alignment between two or more aminoacids' sequences is not straightforward, there is not a unique way to do it. The sequences must be aligned matching common segments, but be aware of the following. A sequence can be modified with mutations, insertions and deletions. As I said, it is not a trivial task. And multiple sequence alignment is a problem far from being solved. Try to make a sequence alignment of these 3 sequences (one to one) with the help of <a href='https://biopython.org/'>Biopython</a>. Check the <a href='https://biopython.org/docs/1.75/api/Bio.Align.html#Bio.Align.PairwiseAligner'>PairWiseAligner</a> of Biopython and try it.
</div>

## Structural Alignment

In [45]:
proteins = [protein_1, protein_2, protein_3]
aligner = Align.PairwiseAligner()
n_proteins = len(proteins)

rows = []

for ii in range(n_proteins):
    seq_i = sequences[ii]
    row = []
    for jj in range(n_proteins):
        seq_j = sequences[jj]
        # Align sequences
        alignments = aligner.align(seq_i, seq_j)
        align = alignments[0]
        # Get indices of common resiudes
        indices = align.aligned
        start_i, end_i = indices[0][0]
        start_j, end_j = indices[1][0]        
        prot_i_indices = np.arange(start_i, end_i + 1)
        prot_j_indices = np.arange(start_j, end_j + 1)
        # Select common residues from each protein
        prot_i_backbone = proteins[ii].atom_slice(prot_i_indices)
        prot_j_backbone = proteins[jj].atom_slice(prot_j_indices)
        # Compute rmsd
        rmsd = md.rmsd(prot_i_backbone, prot_j_backbone, 0)
        row.append(rmsd[0])
    rows.append(row)
    
print("RMSD between common segments")
columns = ["protein_1", "protein_2", "protein_3"]
RMSD = pd.DataFrame(rows, index=columns, columns=columns)
RMSD

RMSD between common segments


Unnamed: 0,protein_1,protein_2,protein_3
protein_1,0.000697,0.58874,0.588521
protein_2,0.588743,0.001215,0.138954
protein_3,0.588524,0.138945,0.001569


<div class="alert alert-danger">
<strong>What's a structural alignment?</strong> What you were trying to do here is a structural alignment. Sometimes we want to overlap structures to check their "geometric" similarity, and these structures are not equal. A measure of "geometric" similarity between objects composed by distinguishable elements is the root mean square deviation (rmsd). The problem of the structural alignment again is far from trivial, there are different approaches. But in this case it is simple: with the previous sequence alignment you know the common segments, you can then compute the rmsd of the backbone atoms belonging to those common segments. Again, do this by pairs... A vs B and A vs C, for instance.
</div>

In [21]:
# help(mdtraj.core.topology.Residue)