In this notebook, I create several UMAP projections of the Lenselink data and check whether I can find physicochemical properties that are or are not captured by canonical SMILES.

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import umap


No direct replacement for 'numba.targets' available. Visit https://gitter.im/numba/numba-dev to request help. Thanks!



In [2]:
# Load the Lenselink et al. CSV (the 150-sample file, going by its name) into `df`;
# the cells below read their input columns from this frame.
df = pd.read_csv('../datasets/Lenselink_et_al/Lenselink_1_Molecular_Notation_Transformation_150samples.csv')

### Data preparation

In [3]:
# One row per record in `df`, 2048 int8 columns per row.
# NOTE(review): position 17 and width 2048 are fixed by the CSV layout —
# presumably column 17 holds a 2048-character string of digits; confirm
# against the file header.
umap_input = np.zeros((len(df), 2048), dtype=np.int8)

for row_idx, raw_bits in enumerate(df.iloc[:, 17]):
    # Split the cell into its individual elements and parse each one as int8.
    umap_input[row_idx] = np.asarray(list(raw_bits), dtype=np.int8)

In [11]:
np.random.seed(42)    # Possibly redundant: umap_reduce already passes an explicit random_state to UMAP.
# Not verified whether the global NumPy seed has any effect here; it is set anyway as a cheap safeguard.

[Guide](https://umap-learn.readthedocs.io/en/latest/parameters.html#) for parameter tuning

In [70]:
def umap_reduce(embeddings, **kwargs):
    """Reduce `embeddings` with UMAP and return the transformed coordinates.

    Parameters
    ----------
    embeddings
        Input handed unchanged to ``umap.UMAP(...).fit_transform``.
    **kwargs
        Keyword arguments for ``umap.UMAP``. A value given here overrides the
        notebook default of the same name. Options that have no default below
        (for example ``n_epochs`` or ``spread``) are forwarded as well instead
        of being silently discarded.

    Returns
    -------
    The result of ``fit_transform`` for `embeddings`.
    """
    umap_params = {
        'n_components': 2,     # dimensionality of the output
        'random_state': 420,   # fixed seed handed to UMAP
        'verbose': 1,
        # ADAPT:
        'min_dist': .8,
        'n_neighbors': 5,
        'metric': 'cosine',
    }
    # Caller-supplied values take precedence over the defaults above; keys
    # without a default are passed straight through to the UMAP constructor.
    umap_params.update(kwargs)

    return umap.UMAP(**umap_params).fit_transform(embeddings)

In [71]:
# Run UMAP on the prepared input matrix, using the defaults defined in umap_reduce.
transformed_embeddings = umap_reduce(umap_input)

UMAP(a=None, angular_rp_forest=True, b=None,
     force_approximation_algorithm=False, init='spectral', learning_rate=1.0,
     local_connectivity=1.0, low_memory=False, metric='cosine',
     metric_kwds=None, min_dist=0.8, n_components=2, n_epochs=None,
     n_neighbors=5, negative_sample_rate=5, output_metric='euclidean',
     output_metric_kwds=None, random_state=420, repulsion_strength=1.0,
     set_op_mix_ratio=1.0, spread=1.0, target_metric='categorical',
     target_metric_kwds=None, target_n_neighbors=-1, target_weight=0.5,
     transform_queue_size=4.0, transform_seed=42, unique=False, verbose=1)
Construct fuzzy simplicial set
Sun Jul  5 21:21:43 2020 Finding Nearest Neighbors
Sun Jul  5 21:21:43 2020 Finished Nearest Neighbor Search
Sun Jul  5 21:21:43 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  / 

In [None]:
#transformed_embeddings

### UMAP base

In [36]:
from pandas import DataFrame

def render_2D_scatter_plotly(embeddings_dataframe: DataFrame, title=''):
    """Build a 600x600 Plotly scatter plot of a 2-D embedding.

    Parameters
    ----------
    embeddings_dataframe
        Frame whose ``x`` and ``y`` columns give the point positions and whose
        ``label`` column drives the point colour. The frame's index is used as
        the hover name of each point.
    title
        Text shown as the figure title.

    Returns
    -------
    The figure, with the colour bar and the tick labels of both axes hidden.
    """
    scatter_options = dict(
        template='ggplot2',
        x='x',
        y='y',
        color='label',
        hover_name=embeddings_dataframe.index,
        title=title,
        width=600,
        height=600,
        color_continuous_scale='Bluered_r',  # alternative tried: 'Viridis'
    )
    # TODO:
    # remove axis ticks + numbers
    # remove legend
    # remove axis labels
    # make the points bigger
    # (options noted but not enabled: showticklabels=False, showspikes=False, marker_size=3)
    figure = px.scatter(embeddings_dataframe, **scatter_options)

    # Hide the colour bar, then the tick labels on the x and y axes.
    figure.layout.coloraxis.showscale = False
    for hide_tick_labels in (figure.update_xaxes, figure.update_yaxes):
        hide_tick_labels(showticklabels=False)

    return figure

In [72]:
# Wrap the UMAP output (NumPy ndarray) in a DataFrame with named coordinate columns.
temp_plot_input = pd.DataFrame(transformed_embeddings, columns=['x', 'y'])

In [60]:
#temp_plot_input

### UMAP colored by physicochemical property

#### Molecular weight

In [73]:
# Build a separate frame for this plot. Plain assignment would only alias
# temp_plot_input, so adding the colour column would modify the shared
# coordinates frame as well; assign() returns a new frame instead.
molecular_weight = temp_plot_input.assign(label=df['CMP_MOLECULAR_WEIGHT'])
molecular_weight_plot = render_2D_scatter_plotly(molecular_weight, title='Molecular weight')
molecular_weight_plot

#### Solubility

In [None]:
# Build a separate frame for this plot. Plain assignment would only alias
# temp_plot_input, so adding the colour column would modify the shared
# coordinates frame as well; assign() returns a new frame instead.
solubility = temp_plot_input.assign(label=df['CMP_SOLUBILITY'])
solubility_plot = render_2D_scatter_plotly(solubility, title='Solubility')
solubility_plot

#### Surface area

In [None]:
# Build a separate frame for this plot. Plain assignment would only alias
# temp_plot_input, so adding the colour column would modify the shared
# coordinates frame as well; assign() returns a new frame instead.
surfacearea = temp_plot_input.assign(label=df['CMP_MOLECULAR_SURFACEAREA'])
surfacearea_plot = render_2D_scatter_plotly(surfacearea, title='Molecular surface area')
surfacearea_plot

#### Number of aromatic rings

In [None]:
# Build a separate frame for this plot. Plain assignment would only alias
# temp_plot_input, so adding the colour column would modify the shared
# coordinates frame as well; assign() returns a new frame instead.
aromatic_rings = temp_plot_input.assign(label=df['CMP_NUM_AROMATICRINGS'])
aromatic_rings_plot = render_2D_scatter_plotly(aromatic_rings, title='Number of aromatic rings')
aromatic_rings_plot

#### LogP

In [None]:
# Build a separate frame for this plot. Plain assignment would only alias
# temp_plot_input, so adding the colour column would modify the shared
# coordinates frame as well; assign() returns a new frame instead.
LogP = temp_plot_input.assign(label=df['CMP_LOGP'])
LogP_plot = render_2D_scatter_plotly(LogP, title='LogP')
LogP_plot

#### Share of positively charged atoms

In [None]:
# Build a separate frame for this plot. Plain assignment would only alias
# temp_plot_input, so adding the colour column would modify the shared
# coordinates frame as well; assign() returns a new frame instead.
pos_charge = temp_plot_input.assign(label=df['CMP_ATOMS_POSITIVE_FRAC'])
pos_charge_plot = render_2D_scatter_plotly(pos_charge, title='Share of positively charged atoms')
pos_charge_plot

#### Share of negatively charged atoms

In [None]:
# Build a separate frame for this plot. Plain assignment would only alias
# temp_plot_input, so adding the colour column would modify the shared
# coordinates frame as well; assign() returns a new frame instead.
neg_charge = temp_plot_input.assign(label=df['CMP_ATOMS_NEGATIVE_FRAC'])
neg_charge_plot = render_2D_scatter_plotly(neg_charge, title='Share of negatively charged atoms')
neg_charge_plot

In [None]:
# Show the first rows of the loaded data to check which columns are available.
df.head()

In [None]:
# Summary statistics of the loaded data, useful for judging the range of each property.
df.describe()