In [1]:
import os
import matplotlib

import umap
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA


# Threshold on the number of accessions linked to a superfamily; it is
# compared against the per-superfamily counts when picking which
# superfamilies get their own colour in the plots.
MIN_SUPERFAMILY_SIZE = 128

# Input files (paths are resolved relative to the notebook's working directory).
CDD_SUPERFAMILY_PATH = os.path.abspath(
    '../data/interim/CDD_metadata/family_superfamily_links'
)
CDD_MASTER_PROCESSED_ALGN_FEAT_PATH = os.path.abspath(
    '../data/processed/CDD_alignment/cdd_master_alignment_feature.csv'
)

# Output directories for the rendered embedding images.
CDD_TSNE_IMAGE_DIR_PATH = os.path.abspath('../docs/images/cdd_tsne')
CDD_UMAP_IMAGE_DIR_PATH = os.path.abspath('../docs/images/cdd_umap')


In [2]:
# Family -> superfamily link table: tab-separated, no header row.
# The first column ('accession') becomes the index, so the index is
# named 'accession' and the remaining three names label the columns.
cdd_superfamily_df = pd.read_csv(
    CDD_SUPERFAMILY_PATH,
    sep='\t',
    names=[
        'accession',
        'pssm_id',
        'superfamily_accession',
        'superfamily_pssm_id'
    ],
    header=None,
    index_col=0,
)

# Pre-computed alignment feature matrix; the first CSV column is used as
# the row index.
cdd_master_seq_algn_feat_df = pd.read_csv(
    CDD_MASTER_PROCESSED_ALGN_FEAT_PATH,
    index_col=0,
)


In [3]:
# Number of rows (accessions) linked to each superfamily.
cdd_superfamily_value_counts = (
    cdd_superfamily_df['superfamily_accession'].value_counts()
)

# Keep superfamilies whose count is strictly greater than the threshold
# (a superfamily with exactly MIN_SUPERFAMILY_SIZE rows is not selected).
selected_cdd_superfamilies = (
    cdd_superfamily_value_counts
    .loc[cdd_superfamily_value_counts > MIN_SUPERFAMILY_SIZE]
    .index
    .tolist()
)

# Rows of the link table that belong to one of the selected superfamilies.
selected_cdd_superfamily_df = cdd_superfamily_df.loc[
    cdd_superfamily_df['superfamily_accession'].isin(selected_cdd_superfamilies)
]

print(selected_cdd_superfamily_df.head(10).to_markdown())


| accession   |   pssm_id | superfamily_accession   |   superfamily_pssm_id |
|:------------|----------:|:------------------------|----------------------:|
| PRK12559    |     79035 | cl00388                 |                381987 |
| cd00009     |     99707 | cl38936                 |                393306 |
| cd00378     |     99733 | cl18945                 |                388495 |
| cd00609     |     99734 | cl18945                 |                388495 |
| cd00610     |     99735 | cl18945                 |                388495 |
| cd00611     |     99736 | cl18945                 |                388495 |
| cd00613     |     99737 | cl18945                 |                388495 |
| cd00614     |     99738 | cl18945                 |                388495 |
| cd00615     |     99739 | cl18945                 |                388495 |
| cd00616     |     99740 | cl18945                 |                388495 |


In [4]:
def visualize_cdd_with_tsne(pca_n_components, random_state=None):
    """Embed the CDD alignment features with PCA followed by t-SNE and save a plot.

    Reads the module-level ``cdd_master_seq_algn_feat_df`` (feature matrix)
    and ``selected_cdd_superfamily_df`` (accession -> superfamily labels),
    and writes ``pca_<pca_n_components>_tsne_cdd.png`` into
    ``CDD_TSNE_IMAGE_DIR_PATH``.

    Args:
        pca_n_components: number of principal components kept before the
            features are handed to t-SNE.
        random_state: optional seed forwarded to both PCA and t-SNE so the
            embedding can be reproduced; ``None`` keeps the libraries'
            default (unseeded) behaviour.
    """
    # Reduce dimensionality with PCA first, then project down to 2-D.
    pca = PCA(n_components=pca_n_components, random_state=random_state)
    reduced_feat_arr = pca.fit_transform(cdd_master_seq_algn_feat_df)

    tsne = TSNE(n_components=2, random_state=random_state)
    coord_df = pd.DataFrame(
        tsne.fit_transform(reduced_feat_arr),
        index=cdd_master_seq_algn_feat_df.index,
        columns=['x', 'y'],
    )
    # Name the index so it can be used as the merge key below.
    coord_df.index.name = 'accession'

    # Left merge: every embedded point is kept exactly once per label row,
    # and label rows without coordinates are not dragged into the plot
    # (an outer merge would add them with missing x/y).
    coord_superfamily_df = pd.merge(
        coord_df,
        selected_cdd_superfamily_df[['superfamily_accession']],
        how='left',
        on=['accession'],
    )
    # Only the label column is filled; filling the whole frame could put
    # the string 'others' into the numeric coordinate columns.
    coord_superfamily_df['superfamily_accession'] = \
        coord_superfamily_df['superfamily_accession'].fillna('others')

    # Map colours by label rather than by position, so gray is always the
    # 'others' bucket regardless of which label appears first in the data.
    labels = coord_superfamily_df['superfamily_accession'].unique()
    superfamilies = sorted(label for label in labels if label != 'others')
    palette = dict(zip(
        superfamilies,
        sns.color_palette('hls', len(superfamilies)),
    ))
    palette['others'] = '#DDDDDD'
    hue_order = (['others'] if 'others' in labels else []) + superfamilies

    fig, ax = plt.subplots(figsize=(24, 16))
    sns.scatterplot(
        data=coord_superfamily_df,
        x='x', y='y',
        hue='superfamily_accession',
        hue_order=hue_order,
        palette=palette,
        legend='brief',
        ax=ax,
    )
    ax.legend(loc='upper right', ncol=3)
    ax.set_title(f't-SNE visualization of conserved domains with PCA (n_component={pca_n_components})')
    fig.tight_layout()

    # savefig does not create missing directories.
    os.makedirs(CDD_TSNE_IMAGE_DIR_PATH, exist_ok=True)
    fig.savefig(os.path.join(
        CDD_TSNE_IMAGE_DIR_PATH,
        f'pca_{pca_n_components}_tsne_cdd.png',
    ))



In [5]:
def visualize_cdd_with_umap(umap_n_neighbors, umap_min_dist, random_state=None):
    """Embed the CDD alignment features with UMAP and save a scatter plot.

    Reads the module-level ``cdd_master_seq_algn_feat_df`` (feature matrix)
    and ``selected_cdd_superfamily_df`` (accession -> superfamily labels),
    and writes ``umap_<umap_n_neighbors>_<umap_min_dist>_cdd.png`` into
    ``CDD_UMAP_IMAGE_DIR_PATH``.

    Args:
        umap_n_neighbors: value passed to UMAP as ``n_neighbors``.
        umap_min_dist: value passed to UMAP as ``min_dist``.
        random_state: optional seed forwarded to UMAP so the embedding can
            be reproduced; ``None`` keeps the library default.
    """
    # The estimator must not be bound to the name ``umap``: assigning to
    # that name inside the function makes it local, so ``umap.UMAP`` would
    # raise UnboundLocalError before the module is ever reached.
    reducer = umap.UMAP(
        n_neighbors=umap_n_neighbors,
        min_dist=umap_min_dist,
        random_state=random_state,
    )
    coord_df = pd.DataFrame(
        reducer.fit_transform(cdd_master_seq_algn_feat_df),
        index=cdd_master_seq_algn_feat_df.index,
        columns=['x', 'y'],
    )
    # Name the index so it can be used as the merge key below.
    coord_df.index.name = 'accession'

    # Left merge: every embedded point is kept exactly once per label row,
    # and label rows without coordinates are not dragged into the plot
    # (an outer merge would add them with missing x/y).
    coord_superfamily_df = pd.merge(
        coord_df,
        selected_cdd_superfamily_df[['superfamily_accession']],
        how='left',
        on=['accession'],
    )
    # Only the label column is filled; filling the whole frame could put
    # the string 'others' into the numeric coordinate columns.
    coord_superfamily_df['superfamily_accession'] = \
        coord_superfamily_df['superfamily_accession'].fillna('others')

    # Map colours by label rather than by position, so gray is always the
    # 'others' bucket regardless of which label appears first in the data.
    labels = coord_superfamily_df['superfamily_accession'].unique()
    superfamilies = sorted(label for label in labels if label != 'others')
    palette = dict(zip(
        superfamilies,
        sns.color_palette('hls', len(superfamilies)),
    ))
    palette['others'] = '#DDDDDD'
    hue_order = (['others'] if 'others' in labels else []) + superfamilies

    fig, ax = plt.subplots(figsize=(24, 16))
    sns.scatterplot(
        data=coord_superfamily_df,
        x='x', y='y',
        hue='superfamily_accession',
        hue_order=hue_order,
        palette=palette,
        legend='brief',
        ax=ax,
    )
    ax.legend(loc='upper right', ncol=3)
    ax.set_title(f'UMAP (n_neighbors={umap_n_neighbors}, '
                 f'min_dist={umap_min_dist}) '
                 f'visualization of conserved domains')
    fig.tight_layout()

    # UMAP images belong in the UMAP directory (not the t-SNE one);
    # savefig does not create missing directories.
    os.makedirs(CDD_UMAP_IMAGE_DIR_PATH, exist_ok=True)
    fig.savefig(os.path.join(
        CDD_UMAP_IMAGE_DIR_PATH,
        f'umap_{umap_n_neighbors}_{umap_min_dist}_cdd.png',
    ))


In [None]:
# t-SNE sweep: one image per PCA dimensionality from 2 through 50.
for _pca_n_components in range(2, 51):
    visualize_cdd_with_tsne(_pca_n_components)
    # The image is already on disk; release the 24x16 in figure so that
    # the sweep does not keep every figure alive until the cell finishes.
    plt.close('all')

# UMAP sweep: n_neighbors takes the powers of two 2**1, 2**2, ... with the
# exponent bounded by ceil(log2(number of feature rows / 4)) (exclusive).
_umap_max_exponent = int(np.ceil(np.log2(len(cdd_master_seq_algn_feat_df) / 4)))
# min_dist grid 0.09, 0.19, ..., 0.99; rounded so binary floating-point
# noise from np.arange does not leak into titles and file names.
_umap_min_dists = np.round(np.arange(0.09, 1, 0.1), 2).tolist()

for _umap_exponent in range(1, _umap_max_exponent):
    _umap_n_neighbors = 2 ** _umap_exponent
    for _umap_min_dist in _umap_min_dists:
        visualize_cdd_with_umap(_umap_n_neighbors, _umap_min_dist)
        plt.close('all')
