In this notebook, I will create several UMAP projections of the data from Lenselink et al. and check whether I can find physicochemical properties that are or are not captured by canonical SMILES.

In [16]:
import pandas as pd
import numpy as np
import plotly.express as px
import umap

In [17]:
# Load the Lenselink sample table; the relative path is resolved against the
# notebook's working directory.
lenselink_csv_path = '../datasets/Lenselink_et_al/Lenselink_1_Molecular_Notation_Transformation_150samples.csv'
df = pd.read_csv(lenselink_csv_path)

### Data preparation

In [18]:
# Build the (n_samples, n_bits) int8 matrix that is fed to UMAP.
# NOTE(review): the column at 0-based position 17 presumably holds a bit
# fingerprint stored as a string of '0'/'1' characters -- confirm against the
# CSV header and prefer selecting it by name.
FINGERPRINT_COLUMN_POSITION = 17
FINGERPRINT_LENGTH = 2048

# Select the column once instead of doing a scalar `.iloc` lookup per row.
fingerprint_strings = df.iloc[:, FINGERPRINT_COLUMN_POSITION]

umap_input = np.zeros([len(df), FINGERPRINT_LENGTH], dtype=np.int8)

for row_number, bits in enumerate(fingerprint_strings):
    # Fail loudly on a malformed row: a length-1 value would otherwise be
    # silently broadcast across the whole row by the assignment below.
    if len(bits) != FINGERPRINT_LENGTH:
        raise ValueError(
            f"Row {row_number}: expected {FINGERPRINT_LENGTH} fingerprint "
            f"characters, got {len(bits)}"
        )
    umap_input[row_number] = np.asarray(list(bits), dtype=np.int8)

In [4]:
#umap_input

In [19]:
# function from Chris

def umap_reduce(embeddings, **kwargs):
    """Reduce ``embeddings`` with UMAP and return the transformed coordinates.

    Parameters
    ----------
    embeddings
        Data passed unchanged to ``umap.UMAP(...).fit_transform``.
    **kwargs
        Keyword arguments for the ``umap.UMAP`` constructor. Values given
        here override the defaults listed below. Arguments that have no
        default here (for example ``n_epochs``) are forwarded as well instead
        of being silently ignored.

    Returns
    -------
    Whatever ``fit_transform`` returns for the configured reducer.
    """
    # Notebook-wide defaults; a fixed random_state keeps runs repeatable.
    umap_params = {
        'n_components': 3,
        'min_dist': .6,
        'random_state': 420,
        'n_neighbors': 15,
        'verbose': 1,
        'metric': 'cosine',
    }
    # Caller-supplied settings win over the defaults and nothing is dropped.
    umap_params.update(kwargs)

    transformed_embeddings = umap.UMAP(**umap_params).fit_transform(embeddings)

    return transformed_embeddings

In [20]:
# Project the fingerprint matrix using the default settings of `umap_reduce`.
transformed_embeddings = umap_reduce(embeddings=umap_input)

UMAP(a=None, angular_rp_forest=True, b=None,
     force_approximation_algorithm=False, init='spectral', learning_rate=1.0,
     local_connectivity=1.0, low_memory=False, metric='cosine',
     metric_kwds=None, min_dist=0.6, n_components=3, n_epochs=None,
     n_neighbors=15, negative_sample_rate=5, output_metric='euclidean',
     output_metric_kwds=None, random_state=420, repulsion_strength=1.0,
     set_op_mix_ratio=1.0, spread=1.0, target_metric='categorical',
     target_metric_kwds=None, target_n_neighbors=-1, target_weight=0.5,
     transform_queue_size=4.0, transform_seed=42, unique=False, verbose=1)
Construct fuzzy simplicial set
Mon Jun 29 13:17:24 2020 Finding Nearest Neighbors
Mon Jun 29 13:17:24 2020 Finished Nearest Neighbor Search
Mon Jun 29 13:17:24 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /

In [None]:
#transformed_embeddings

### UMAP base

In [21]:
from pandas import DataFrame

def render_3D_scatter_plotly(embeddings_dataframe: DataFrame):
    """Draw the rows of ``embeddings_dataframe`` as a 3D scatter plot.

    The frame must provide the columns ``x``, ``y`` and ``z``. When a
    ``label`` column is present, points are colored by it and it is added to
    the hover data. The frame's index is used as the hover name.

    Returns the figure produced by ``px.scatter_3d`` after the axis
    decorations have been removed.
    """
    scatter_options = dict(
        template='ggplot2',
        x='x',
        y='y',
        z='z',
        hover_name=embeddings_dataframe.index,
    )
    if 'label' in embeddings_dataframe.columns:
        scatter_options.update(
            color='label',
            # symbol='label',
            hover_data=["label"],
        )

    figure = px.scatter_3d(embeddings_dataframe, **scatter_options)

    # Remove axes ticks and labels as they are usually not informative
    bare_axis = dict(showticklabels=False, showspikes=False, title="")
    figure.update_layout(
        scene={axis: dict(bare_axis) for axis in ('xaxis', 'yaxis', 'zaxis')},
    )

    return figure

In [22]:
# Wrap the UMAP output (NumPy ndarray) in a DataFrame whose three columns
# carry the axis names expected by the plotting helper.
temp_plot_input = pd.DataFrame(transformed_embeddings, columns=['x', 'y', 'z'])

### UMAP colored by physicochemical property

#### Molecular weight

In [23]:
# `assign` returns a new frame, so the shared coordinate frame is not mutated
# and this cell's labels cannot be overwritten by other cells.
molecular_weight = temp_plot_input.assign(label=df['CMP_MOLECULAR_WEIGHT'])
molecular_weight_plot = render_3D_scatter_plotly(molecular_weight)
molecular_weight_plot

#### Solubility

In [24]:
# `assign` returns a new frame, so the shared coordinate frame is not mutated
# and this cell's labels cannot be overwritten by other cells.
solubility = temp_plot_input.assign(label=df['CMP_SOLUBILITY'])
solubility_plot = render_3D_scatter_plotly(solubility)
solubility_plot

#### Surface area

In [25]:
# `assign` returns a new frame, so the shared coordinate frame is not mutated
# and this cell's labels cannot be overwritten by other cells.
surfacearea = temp_plot_input.assign(label=df['CMP_MOLECULAR_SURFACEAREA'])
surfacearea_plot = render_3D_scatter_plotly(surfacearea)
surfacearea_plot

#### Number of aromatic rings

In [26]:
# `assign` returns a new frame, so the shared coordinate frame is not mutated
# and this cell's labels cannot be overwritten by other cells.
aromatic_rings = temp_plot_input.assign(label=df['CMP_NUM_AROMATICRINGS'])
aromatic_rings_plot = render_3D_scatter_plotly(aromatic_rings)
aromatic_rings_plot

#### LogP

In [28]:
# `assign` returns a new frame, so the shared coordinate frame is not mutated
# and this cell's labels cannot be overwritten by other cells.
LogP = temp_plot_input.assign(label=df['CMP_LOGP'])
LogP_plot = render_3D_scatter_plotly(LogP)
LogP_plot

#### Share of positively charged atoms

In [30]:
# `assign` returns a new frame, so the shared coordinate frame is not mutated
# and this cell's labels cannot be overwritten by other cells.
pos_charge = temp_plot_input.assign(label=df['CMP_ATOMS_POSITIVE_FRAC'])
pos_charge_plot = render_3D_scatter_plotly(pos_charge)
pos_charge_plot

#### Share of negatively charged atoms

In [31]:
# `assign` returns a new frame, so the shared coordinate frame is not mutated
# and this cell's labels cannot be overwritten by other cells.
neg_charge = temp_plot_input.assign(label=df['CMP_ATOMS_NEGATIVE_FRAC'])
neg_charge_plot = render_3D_scatter_plotly(neg_charge)
neg_charge_plot

In [32]:
#df.head()