# Training set analysis and visualization

> 1. Visualize structures with `py3dmol`
> 2. Visualize clusters obtained in the `clustering` module
> 3. Show distribution of target values

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, dendrogram

from prody import parsePDB

from jclinic.pairwise_rmsd import make_rmsds_matrix

from matplotlib import pyplot as plt

import py3Dmol
from ipywidgets import interact


# Colour palettes as integer (R, G, B) rows with channel values in 0-255.
# Note that each palette carries one more row than its name suggests:
# the final row is a neutral grey (all three channels equal).

# Six colours followed by a grey row -> 7 rows in total.
COLOR_PALETTE_RGB_6 = np.array(
    [
        (64, 83, 211),
        (221, 179, 16),
        (181, 29, 20),
        (0, 190, 255),
        (251, 73, 176),
        (0, 178, 93),
        (202, 202, 202),
    ]
)


# Twelve colours followed by a grey row -> 13 rows in total.
COLOR_PALETTE_RGB_12 = np.array(
    [
        (235, 172, 35),
        (184, 0, 88),
        (0, 140, 249),
        (0, 110, 0),
        (0, 187, 173),
        (209, 99, 230),
        (178, 69, 2),
        (255, 146, 135),
        (89, 84, 214),
        (0, 198, 248),
        (135, 133, 0),
        (0, 167, 108),
        (189, 189, 189),
    ]
)

## Gather and parse all PDB structures with `ProDy`

**NOTE**: We sort the collection of structure names alphanumerically.

In [None]:
def collect_structure_paths(directory, suffix=".pdb"):
    """Map structure name -> file path for every ``*<suffix>`` file in *directory*.

    The structure name is the file name with *suffix* stripped. The returned
    dict is ordered by structure name (plain string sort), NOT by file path:
    sorting paths would compare ``"a-b.pdb"`` with ``"a.pdb"`` and put
    ``"a-b"`` first (``"-" < "."``), which disagrees with the string-sorted
    order of the names themselves.

    Parameters
    ----------
    directory : str or Path
        Directory to scan (non-recursively).
    suffix : str
        File suffix to match and strip from the names.

    Returns
    -------
    dict[str, Path]
        Structure names mapped to their paths, in sorted-name order.
    """
    directory = Path(directory)
    by_name = {
        path.name.removesuffix(suffix): path
        for path in directory.glob(f"*{suffix}")
    }
    return {name: by_name[name] for name in sorted(by_name)}


structures_dir = Path("../data/structures_fixed")
structures_paths = collect_structure_paths(structures_dir)

In [None]:
# Parse every PDB file once with ProDy's `parsePDB`, keeping the key order of
# `structures_paths`. The path is passed as a plain string.
parsed_structures_prody = {
    pdb_name: parsePDB(str(pdb_file))
    for pdb_name, pdb_file in structures_paths.items()
}

### Check that all structures have only one coordinate set (model)

In [None]:
# `warn` is not among the names imported in the notebook's import cell, so it
# is brought into scope here; without it the first multi-model structure would
# raise NameError instead of emitting a warning.
from warnings import warn

# Flag every parsed structure that reports more than one coordinate set (model).
for name, structure in parsed_structures_prody.items():
    if structure.numCoordsets() > 1:
        warn(f"More than one coordinate set present in {name}")

## 1. Visualize all structures with an interactive widget, coloring by chain

In [None]:
def show_py3dmol_view(name):
    """Display structure *name* in a py3Dmol viewer, one cartoon colour per chain.

    The raw PDB text is read from ``structures_paths[name]`` and handed to
    py3Dmol; the chain IDs are taken from the matching ProDy object in
    ``parsed_structures_prody`` and printed before rendering.

    Parameters
    ----------
    name : str
        Key into ``structures_paths`` / ``parsed_structures_prody``.
    """
    # Read the whole file in one go and close it before any rendering happens.
    with open(structures_paths[name]) as ifile:
        system = ifile.read()

    view = py3Dmol.view(width=800, height=600)
    view.addModelsAsFrames(system)

    structure = parsed_structures_prody[name]
    unique_chain_ids = np.unique(structure.getChids())
    print(f"Chain IDs: {unique_chain_ids}")

    n_colors = len(COLOR_PALETTE_RGB_12)
    for i, chain_id in enumerate(unique_chain_ids):
        # Wrap around the palette so structures with more chains than palette
        # rows reuse colours instead of raising IndexError.
        # `.tolist()` yields plain Python ints: formatting NumPy scalars via
        # `tuple(...)` would produce "rgb(np.int64(235), ...)" on NumPy >= 2.
        red, green, blue = COLOR_PALETTE_RGB_12[i % n_colors].tolist()
        view.setStyle({"chain": chain_id},
                      {"cartoon": {"color": f"rgb({red}, {green}, {blue})"}})
    view.zoomTo()
    view.show()

interact(show_py3dmol_view, name=structures_paths.keys());

## 2. Compute pairwise RMSD matrix and show hierarchical clustering dendrogram

In [None]:
# Pairwise RMSD matrix over all parsed structures.
# NOTE(review): presumably a square pandas DataFrame indexed by structure name
# (the following cell calls `.isna()` and `.index` on it), with `show=True`
# also displaying the matrix — confirm against `jclinic.pairwise_rmsd`.
pairwise_rmsds = make_rmsds_matrix(
    parsed_structures_prody,
    show=True,
)

In [None]:
# Fill NaN values with twice the maximum non-NaN value for visualization purposes.
# The global maximum is computed with NumPy: `DataFrame.max(axis=None)` only
# reduces over both axes on pandas >= 2.0 (older versions return one value per
# column), so this keeps the fill value a scalar on every pandas version.
max_rmsd = np.nanmax(pairwise_rmsds.to_numpy(dtype=float))
pairwise_rmsds[pairwise_rmsds.isna()] = 2 * max_rmsd

# `squareform` turns the square matrix into the condensed vector `linkage`
# expects; `checks=False` skips its symmetry / zero-diagonal validation.
Z = linkage(squareform(pairwise_rmsds, checks=False))

plt.subplots(figsize=(10, 4))
dendrogram(Z, labels=pairwise_rmsds.index);

## 3. Read target values as a `Pandas` dataframe and show hierarchical clustering dendrogram

**NOTE**: Again, we sort structure names alphanumerically.

Several structures belong to the same PDB ID and have identical or very similar target values.

In [None]:
labels_path = Path("../data/labels.txt")

# The labels file is read as two space-separated, header-less columns:
# the structure name (used as the index) and its target value.
# Rows are then ordered by structure name.
df = pd.read_csv(
    labels_path,
    sep=" ",
    header=None,
    names=["Structure", "Value"],
    index_col="Structure",
).sort_index()

# The label index must match the ordered keys of `structures_paths` exactly
# (same names, same order).
assert df.index.tolist() == list(structures_paths)

In [None]:
# Hierarchical clustering of the target values. `df` is passed as an
# observation matrix; the method and metric are SciPy's defaults, spelled out.
Z = linkage(df, method="single", metric="euclidean")

plt.subplots(figsize=(10, 4))
dendrogram(Z, labels=df.index);