Dataset Inputs

In [None]:
###############
### Imports ###
###############

import ast
import csv

import altair as alt
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import preprocessing
from ugtm import eGTM, eGTC, eGTR, pcaPreprocess, runGTM, transform
from vega_datasets import data

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.PandasTools import ChangeMoleculeRendering

from IPython.display import SVG

from mhfp.encoder import MHFPEncoder


##############
### Inputs ###
##############

# CSV holding the training molecules. It is read by Generate_Fingerprinted_df
# when calc_fp is True, otherwise by pca_import.
Training_Datafile='ChEMBL_10K.csv'

# CSV holding the molecules that are projected onto the trained map.
Testing_Datafile='Notebook Outputs/Combined_DELs.csv' 

# True  -> compute ECFP6 fingerprints from the SMILES in both files.
# other -> load precomputed PCA coordinates from both files via pca_import.
calc_fp=True

#################
### Functions ###
#################

def Generate_Fingerprinted_df(datafile, descriptor, nBits=512):
    """Fingerprint every molecule listed in a CSV file.

    Parameters
    ----------
    datafile : str or path-like
        CSV file with a 'Smiles' column. All other columns are passed through untouched.
    descriptor : str
        One of 'descriptors', 'MXFP', 'MHFP6', 'ECFP6' or 'MACCS'.
    nBits : int, default 512
        Fingerprint length for 'ECFP6' and number of permutations for 'MHFP6'.
        Ignored by the other descriptor types.

    Returns
    -------
    (fp_array, Pproc_df) : tuple
        fp_array is a numpy array built from the per-molecule fingerprints.
        Pproc_df is the input table with an added 'ROMol' column and with 'Smiles'
        rewritten as kekulized, non-isomeric SMILES.

    Raises
    ------
    ValueError
        If `descriptor` is not a supported name (checked before the file is read),
        or if any SMILES string cannot be parsed by RDKit.
    """
    supported = ('descriptors', 'MXFP', 'MHFP6', 'ECFP6', 'MACCS')
    # Fail fast: without this check an unknown name fell through every branch and
    # crashed at the return statement, after all molecules had already been parsed.
    if descriptor not in supported:
        raise ValueError(
            "Unknown descriptor {!r}; expected one of {}".format(descriptor, ", ".join(supported))
        )

    Pproc_df = pd.read_csv(datafile) #Create pre-processing df from datafile
    Pproc_df['ROMol'] = Pproc_df.Smiles.apply(Chem.MolFromSmiles) #Create mols

    # MolFromSmiles returns None for SMILES it cannot parse; report those rows
    # explicitly instead of letting MolToSmiles fail on a None argument.
    unparsed = Pproc_df['ROMol'].isna()
    if unparsed.any():
        bad_rows = Pproc_df.index[unparsed].tolist()
        raise ValueError(
            "{} SMILES in {!r} could not be parsed by RDKit (first rows: {})".format(
                len(bad_rows), datafile, bad_rows[:10]
            )
        )

    Pproc_df['Smiles'] = Pproc_df.ROMol.apply(
        lambda x: Chem.MolToSmiles(x, kekuleSmiles=True, isomericSmiles=False)
    ) #Cleanup the smiles

    if descriptor == 'descriptors':
        # NOTE(review): `from_smiles` is not imported in the visible part of this
        # notebook - confirm where it comes from before using this branch.
        # NOTE(review): the loop stops after 20 molecules, so fp_array can have fewer
        # rows than Pproc_df. Looks like a leftover debugging limit - confirm.
        descr_list = []
        for smiles in Pproc_df['Smiles']:
            descr_list.append(list((from_smiles(smiles, descriptors=False, fingerprints=True)).values()))
            if len(descr_list) >= 20:
                break
        fp_array = np.array(descr_list)

    elif descriptor == 'MXFP':
        # NOTE(review): `mxfp` is not imported in the visible part of this notebook.
        MXFP = mxfp.MXFPCalculator(dimensionality='2D')
        fp_array = np.array([MXFP.mxfp_from_mol(x) for x in Pproc_df['ROMol']])

    elif descriptor == 'MHFP6':
        MHFP6 = MHFPEncoder(n_permutations=nBits)
        fp_array = np.array([MHFP6.encode(x) for x in Pproc_df['Smiles']])

    elif descriptor == 'ECFP6':
        ECFP6 = [AllChem.GetMorganFingerprintAsBitVect(x, radius=3, nBits=nBits) for x in Pproc_df['ROMol']]
        fp_array = np.array([list(bits) for bits in ECFP6])

    else:  # descriptor == 'MACCS'
        MACCS = [rdMolDescriptors.GetMACCSKeysFingerprint(x) for x in Pproc_df['ROMol']]
        # Expand each bit string ("0101...") into a list of 0/1 integers.
        fp_array = np.array([[int(bit) for bit in fp.ToBitString()] for fp in MACCS])

    return fp_array, Pproc_df


def _prepareMol(mol,kekulize):
    """Return a drawable copy of `mol` for use in the moltosvg function.

    The input molecule is never modified: all work happens on a copy rebuilt from
    its binary representation. When `kekulize` is truthy the copy is kekulized;
    if that fails, a fresh un-kekulized copy is used instead. 2D coordinates are
    computed when the copy has no conformers.
    """
    mc = Chem.Mol(mol.ToBinary())
    if kekulize:
        try:
            Chem.Kekulize(mc)
        except Exception:
            # Best-effort: fall back to a fresh copy when kekulization fails.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt and
            # SystemExit propagate instead of being silently swallowed.
            mc = Chem.Mol(mol.ToBinary())
    if not mc.GetNumConformers():
        rdDepictor.Compute2DCoords(mc)
    return mc
 

def moltosvg(mol,molSize=(225,100),kekulize=True,drawer=None,**kwargs):
    """Render a molecule as an SVG image for use in interactive plot visualizations.

    `mol` is first passed through _prepareMol. If no `drawer` is supplied, an SVG
    drawer of size `molSize` (width, height) is created. Extra keyword arguments
    are forwarded to the drawer's DrawMolecule call. Returns an IPython SVG object
    with the 'svg:' prefix stripped from the drawing text.
    """
    prepared = _prepareMol(mol, kekulize)

    canvas = drawer
    if canvas is None:
        width, height = molSize[0], molSize[1]
        canvas = rdMolDraw2D.MolDraw2DSVG(width, height)

    canvas.DrawMolecule(prepared, **kwargs)
    canvas.FinishDrawing()

    drawing_text = canvas.GetDrawingText()
    return SVG(drawing_text.replace('svg:',''))

def pca_import(Dataset):
    """Load precomputed PCA coordinates and molecule metadata from a csv.

    The file must provide the columns 'DEL', 'ID', 'Smiles', 'pc1' and 'pc2'.
    Returns a tuple (coords, metadata): coords is a numpy array built from the
    'pc1'/'pc2' columns (one row per record) and metadata is a DataFrame holding
    the 'DEL', 'ID' and 'Smiles' columns.
    """
    table = pd.read_csv(Dataset)

    metadata = table[['DEL', 'ID', 'Smiles']]
    coord_rows = table[['pc1', 'pc2']].values.tolist()

    return np.array(coord_rows), metadata

def pca_processing(Dataset, n_components=200):
    """Perform PCA using a txt file generated using the Fingerprinting_Utilities notebook. Needed only for large datasets.

    Parameters
    ----------
    Dataset : str or path-like
        CSV file with an 'FP' column in which each fingerprint is stored as
        ';'-separated numeric literals (e.g. "0;1;1;0").
    n_components : int, default 200
        Forwarded to pcaPreprocess. Use n_components=-1 for maximal variance.

    Returns
    -------
    (pca_coords, raw_df) : tuple
        The output of pcaPreprocess and the DataFrame read from `Dataset`.

    Raises
    ------
    ValueError or SyntaxError
        If a fingerprint entry is not a plain Python literal.
    """
    raw_df = pd.read_csv(Dataset)

    #Process Fingerprints into ndarray
    # ast.literal_eval only accepts literals, so arbitrary expressions stored in
    # the file are rejected rather than executed (the previous eval() ran them).
    full_list = [
        [ast.literal_eval(token.strip()) for token in fp.split(';')]
        for fp in raw_df['FP']
    ]

    # Note that PCA input rows must all be the same size, otherwise np.stack fails.
    stack_arr = np.stack(full_list, axis=0)

    #Perform PCA on Fingerprints
    pca_coords = pcaPreprocess(stack_arr, doPCA=True, n_components=n_components)
    return pca_coords, raw_df




Preprocess and compile the np arrays for the Training set and Testing set

In [None]:
######################
### Pre-Processing ###
######################

#Fingerprint all the molecules using ecfp6
#Create the training and testing dataframes used as the input GTM datasets

# Load the GTM inputs: precomputed PCA coordinates unless calc_fp is exactly True,
# in which case 1024-bit ECFP6 fingerprints are computed from the SMILES.
if calc_fp is not True:
    Training_fp, Training_df = pca_import(Training_Datafile)
    Testing_fp, Testing_df = pca_import(Testing_Datafile)
else:
    Training_fp, Training_df = Generate_Fingerprinted_df(Training_Datafile, descriptor='ECFP6', nBits=1024)
    Testing_fp, Testing_df = Generate_Fingerprinted_df(Testing_Datafile, descriptor='ECFP6', nBits=1024)

# Feature matrices used for GTM
X_train, X_test = np.array(Training_fp), np.array(Testing_fp)

# Per-molecule metadata, kept as arrays in the same row order as the features
Train_labels, Test_labels = np.array(Training_df['DEL']), np.array(Testing_df['DEL'])
Train_IDs, Test_IDs = np.array(Training_df['ID']), np.array(Testing_df['ID'])
Train_smiles, Test_smiles = np.array(Training_df['Smiles']), np.array(Testing_df['Smiles'])

# Additional Scaling (optional): the scaler is fitted on the training set only
# and the same transformation is then applied to both sets.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

Perform GTM using Discrete Responsibility Patterns (RP)<br>
NOTE: This calculation should take between 20-60 minutes depending on your machine specs

In [None]:
#############
### G T M ###
#############

#Create a gtm training map
trn_gtm = runGTM(X_train, k=50, m=22)

#Project the test data onto the training map
tst_gtm = transform(optimizedModel=trn_gtm, train=X_train, test=X_test)

#Create dataframe for the fit training array using column names "x1" and "x2"
trn_chart_df = pd.DataFrame(trn_gtm.matMeans, columns=["x1", "x2"])
trn_chart_df['ID'] = Train_IDs
trn_chart_df['DEL'] = Train_labels
trn_chart_df['Smiles'] = Train_smiles

#Create dataframe for the projected array using column names "x1" and "x2"
tst_chart_df = pd.DataFrame(tst_gtm.matMeans, columns=["x1", "x2"])
tst_chart_df['ID'] = Test_IDs
tst_chart_df['DEL'] = Test_labels
tst_chart_df['Smiles'] = Test_smiles

#Create dataframe combining the test rows followed by the training rows.
# pd.concat keeps each column's dtype (x1/x2 stay numeric); np.concatenate on
# the two frames coerced every column to object. ignore_index=True gives the
# same fresh 0..N-1 index as before.
Overlay_df = pd.concat([tst_chart_df, trn_chart_df], axis=0, ignore_index=True)[
    ['x1', 'x2', 'ID', 'DEL', 'Smiles']
]


============================================<br>
Reference GTM completion times (Desktop PC, 12th Gen Intel CPU): <br>
============================================<br>
Training 50K ChEMBL  |  Testing 50K DEL2 50K DEL6  |   ECFP6 2048bits |  k=41 m=18  |  178 min<br>
Training 50K ChEMBL  |  Testing 50K DEL2 50K DEL6  |   ECFP6 1024bits |  k=41 m=18  |  58 min <br>
Training 50K ChEMBL  |  Testing 50K DEL2 50K DEL6  |   ECFP6 1024bits |  k=30 m=12  |  26 min<br>
Training 50K ChEMBL  |  Testing 50K DEL2 50K DEL6  |   ECFP6 1024bits |  k=50 m=22  |  108 min<br>
Training 10K ChEMBL  |  Testing 50K DEL2 50K DEL6  |   ECFP6 1024bits |  k=50 m=22  |  18 min<br>

Output training and test data to csv (optional)

In [None]:
# Write the combined GTM coordinates and metadata (Overlay_df) to CSV.
# No `index` argument is passed, so pandas also writes the row index as the
# first column. NOTE(review): the 'Notebook Outputs' directory presumably has
# to exist already - confirm before running on a fresh checkout.
Overlay_df.to_csv('Notebook Outputs/gtm_output.csv')

Prepare the Plot Data

In [None]:
################
### Plotting ###
################

# Pieces shared by several charts, defined once so the charts stay in sync
_del_color = alt.Color("DEL:N", legend=alt.Legend(title="Type"))
_point_size = alt.value(10)
_hover_fields = ["x1", "x2", "DEL:N", 'ID:N']
_panel_size = dict(width=300, height=300)
_density_encoding = (
    alt.X('x1:Q', bin=alt.Bin(maxbins=200)),
    alt.Y('x2:Q', bin=alt.Bin(maxbins=200)),
    alt.Color('count():Q', scale=alt.Scale(scheme='turbo', domain=[0, 10])),
)

# Plot of training set colored by labels
trn_chart = (
    alt.Chart(trn_chart_df)
    .mark_circle()
    .encode(x='x1', y='x2', size=_point_size, color=_del_color, tooltip=_hover_fields)
    .properties(title="Training Set", **_panel_size)
    .interactive()
)

# Projection of test set colored by label
tst_chart = (
    alt.Chart(tst_chart_df)
    .mark_circle(size=5)
    .encode(x='x1', y='x2', color=_del_color, size=_point_size, tooltip=_hover_fields)
    .properties(title="Projected Sets", **_panel_size)
    .interactive()
)

# Separate the projected points into different plots based on "DEL" classification
Individual_charts = (
    alt.Chart(tst_chart_df)
    .mark_circle()
    .encode(x='x1', y='x2', color=_del_color, size=_point_size, tooltip=_hover_fields)
    .properties(title="Projected Set", **_panel_size)
    .facet(column='DEL:N')
    .interactive()
)

# Create an overlay of the training and test charts
overlay_chart = trn_chart + tst_chart

# Plot the training set heatmap (binned point counts)
Trained_heatmap = (
    alt.Chart(trn_chart_df)
    .mark_rect()
    .encode(*_density_encoding)
    .properties(title="Training Set", **_panel_size)
    .interactive()
)

# Plot the projected heatmaps separately (faceted by "DEL")
Projected_heatmaps = (
    alt.Chart(tst_chart_df)
    .mark_rect()
    .encode(*_density_encoding)
    .properties(title="Projected Set", **_panel_size)
    .facet(column='DEL:N')
    .interactive()
)

Visualize the GTM Plots

In [None]:
# Lift Altair's row limit so the full datasets can be embedded in the charts
alt.data_transformers.disable_max_rows()

# Top row: scatter plots; bottom row: density heatmaps
plots = alt.hconcat(trn_chart, Individual_charts)
heatmaps = alt.hconcat(Trained_heatmap, Projected_heatmaps)

# Stack the two rows; as the cell's last expression the result is displayed
plots & heatmaps
