This notebook calculates pairwise distances and generates violin plots showing the corresponding distance distributions. <br>
The input csv file must contain, at minimum, the specified column names:<br>
---For Tanimoto: Columns titled "DEL" and "Smiles"<br>
---For Euclidean, Manhattan, Cosine, etc: Columns titled "DEL", "x1", and "x2"<br>

In [None]:
#Import Dependencies
import numpy as np 
import pandas as pd

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit import DataStructs
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns

from tqdm import tqdm
tqdm.pandas()


#Path of the csv file to analyse (see the notebook header for the required columns)
inputfile='Notebook Outputs/gtm_output.csv'

#Load the input csv into the module-level dataframe that the functions below read from
data_df = pd.read_csv(inputfile)

#Collect the unique values found in the "DEL" column (one entry per library)
DEL_list=data_df.DEL.unique()

#Create a list for holding the pairwise distances
#NOTE(review): not referenced by any code visible in this notebook -- confirm before removing
distance_list=[]


def Construct_PairWise_DistanceList(metric,n):
    '''Sample n rows per DEL from the module-level data_df and calculate pairwise distances on their ('x1', 'x2') coordinates.

    Parameters:
    ==========
    metric: The distance metric to use with sklearn. Examples of valid values:'cityblock', 'cosine', 'euclidean', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard'
    n: The number of random rows to sample from each DEL in the input file.

    Returns:
    ==========
    A dataframe with one column per DEL. Each column holds the flattened (row-major)
    n x n distance matrix of the sampled points, i.e. n*n values. The matrix is kept
    whole, so it contains the self-distances on the diagonal and every pair twice.

    Raises:
    ==========
    ValueError: raised by pandas when a DEL has fewer than n rows to sample from.
    '''
    dist_df=pd.DataFrame() #create an empty df for holding the distances

    #For each library in the csv "DEL" column (named del_name so the builtin `type` is not shadowed):
    for del_name in DEL_list:
        sampled_df=data_df.loc[data_df['DEL'] == del_name].sample(n=n) #n random rows of the current DEL
        xy_array=np.stack([sampled_df['x1'].values, sampled_df['x2'].values],axis=-1) #(x, y) coordinate array, one row per sampled compound
        distances=pairwise_distances(xy_array,xy_array,metric=metric) #calculate the pairwise distance matrix
        dist_df[del_name]=distances.ravel() #flatten row by row straight from the ndarray, no list round trip needed
    return dist_df

# Defines a function to calculate Tanimoto similarities among the molecules
def pairwise_similarity_list(fingerprints_list):
    """Calculate the Tanimoto similarity of every unique pair of fingerprints.

    Parameters:
    ==========
    fingerprints_list: List of RDkit fingerprint objects

    Returns:
    ==========
    Flat list of pairwise Tanimoto similarities (the lower triangle of the
    similarity matrix, without the diagonal). Empty when fewer than two
    fingerprints are supplied.

    Side effects:
    ==========
    Rebinds the module-level name `similarity_list` to the per-row results
    (row i holds the similarities of fingerprint i to fingerprints 0..i-1).

    """
    #Kept as a module-level name because the original implementation published it there
    global similarity_list
    #Work on the argument itself rather than on globals set by the caller,
    #so the function can be used with any list of fingerprints
    similarity_list = [
        DataStructs.BulkTanimotoSimilarity(fingerprints_list[i], fingerprints_list[:i])
        for i in range(1, len(fingerprints_list))
    ]
    #np.concatenate raises on an empty sequence; zero or one fingerprint simply has no pairs
    if not similarity_list:
        return []
    return list(np.concatenate(similarity_list).flat) #Flatten the list of lists into 1D list

# Defines a function to calculate ECFP fingerprints
def Get_ECFP(smiles: str, radius: int, nbits: int):
    """Build a hashed Morgan (ECFP-style) fingerprint from a SMILES string.

    Parameters:
    ==========
    smiles: smiles string to be fingerprinted
    radius: the circular morgan radius
    nbits: the number of bits to use for hashing

    Returns:
    ==========
    hashed ECFP RDkit fingerprint object, as returned by AllChem.GetHashedMorganFingerprint

    Raises:
    ==========
    ValueError: if RDKit cannot parse the smiles string

    """
    mol = Chem.MolFromSmiles(smiles)
    #MolFromSmiles signals a parse failure by returning None; fail here with the
    #offending string instead of with an opaque error inside the fingerprint call
    if mol is None:
        raise ValueError("Could not parse SMILES string: {!r}".format(smiles))
    return AllChem.GetHashedMorganFingerprint(mol, radius, nbits)

# Defines a function to create a dataframe containing ecfp tanimoto distances
def Get_ECFP6_Tanimoto_DistanceList(samples):
    '''Fingerprint a random sample of each DEL and collect the pairwise Tanimoto values per DEL.

    Parameters:
    ==========
    samples: The number of random compounds drawn from each DEL in the input file.

    Returns:
    ==========
    A dataframe with one column per DEL, filled with the list returned by
    pairwise_similarity_list for that DEL's fingerprints.

    Side effects:
    ==========
    Rebinds the module-level names fp_list and fp_len on every iteration and
    prints one progress line per DEL.
    '''
    global fp_list
    global fp_len

    result_df = pd.DataFrame()

    #Walk through every library named in the csv "DEL" column
    for library in DEL_list:
        belongs_to_library = data_df['DEL'] == library
        sampled_rows = data_df.loc[belongs_to_library].sample(n=samples)  #random subset of this library

        #Radius 3 / 1024 bits fingerprint for each sampled SMILES
        fingerprints = []
        for smi in sampled_rows['Smiles']:
            fingerprints.append(Get_ECFP(smiles=smi, radius=3, nbits=1024))

        #Publish the fingerprints and their count under the module-level names
        fp_list = fingerprints
        fp_len = len(fp_list)
        print(library, fp_len, 'cpds Fingerprinted')

        #Pairwise Tanimoto values for this library become one dataframe column
        result_df[library] = pairwise_similarity_list(fp_list)

    return result_df




Pairwise Tanimoto Distribution

In [None]:
#Configure the seaborn theme BEFORE drawing: the figure size, style and font settings
#are picked up when the figure and axes are created, so applying them after the plot
#(as this cell previously did) only affected figures drawn later.
sns.set(rc={'figure.figsize':(50, 25)})
#NOTE(review): the sns.set(font_scale=4) call below re-applies the default plotting context,
#so these "paper" overrides appear to have no lasting effect -- confirm before relying on them
sns.set_context("paper", rc={"font.size":5,"axes.titlesize":8,"axes.labelsize":5})
sns.set(font_scale=4)
sns.set_style('ticks')

#Sample 1000 compounds per DEL and collect their pairwise Tanimoto values
#NOTE(review): the values come from BulkTanimotoSimilarity, i.e. they look like similarities
#rather than distances (1 - similarity) -- confirm the wording of the title and file name
df_distances=Get_ECFP6_Tanimoto_DistanceList(1000)

#One violin per DEL column
ax = sns.violinplot(data=df_distances, scale='count', palette=sns.color_palette('tab10'))
#ax.set(ylim=(0.4, 1))  #optional: uncomment to zoom the y axis
plt.title( "Pairwise Tanimoto Distance Distributions" , size =64 )
sns.despine()
plt.savefig('Notebook Outputs/distance_distributions_Tanimoto_AllDELs.svg', bbox_inches='tight')

Pairwise Euclidean Distances <br><br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;see the sklearn documentation for more metrics and information:<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html<br>
NOTE: the input file must contain a set of Cartesian coordinates to use these distance metrics.<br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;The output csv from the "GTM_Analysis" or "PCA_Analysis" notebook contains (x,y) coordinates. Obtain the GTM or PCA output first and change the inputfile path accordingly.

In [None]:
#Configure the seaborn theme BEFORE drawing: the figure size, style and font settings
#are picked up when the figure and axes are created, so applying them after the plot
#(as this cell previously did) only affected figures drawn later.
sns.set(rc={'figure.figsize':(40, 25)})
#NOTE(review): the sns.set(font_scale=4) call below re-applies the default plotting context,
#so these "paper" overrides appear to have no lasting effect -- confirm before relying on them
sns.set_context("paper", rc={"font.size":8,"axes.titlesize":8,"axes.labelsize":5})
sns.set(font_scale=4)
sns.set_style('ticks')

#Sample 1000 rows per DEL and calculate their pairwise Euclidean distances
df_distances=Construct_PairWise_DistanceList(metric='euclidean',n=1000)

#One violin per DEL column, with a narrow kernel bandwidth and an inner box plot
ax = sns.violinplot(data=df_distances, scale='count', palette=sns.color_palette('tab10'),bw=0.1, inner='box')
ax.set(ylim=(0, 2))
plt.title( "Pairwise Euclidean Distance Distributions" , size =64 )
sns.despine()
plt.savefig('Notebook Outputs/distance_distributions_Euclidean_AllDELs.svg', bbox_inches='tight')

Pairwise Manhattan Distance

In [None]:
#Configure the seaborn theme BEFORE drawing: the figure size, style and font settings
#are picked up when the figure and axes are created, so applying them after the plot
#(as this cell previously did) only affected figures drawn later.
sns.set(rc={'figure.figsize':(40, 25)})
#NOTE(review): the sns.set(font_scale=4) call below re-applies the default plotting context,
#so these "paper" overrides appear to have no lasting effect -- confirm before relying on them
sns.set_context("paper", rc={"font.size":8,"axes.titlesize":8,"axes.labelsize":5})
sns.set(font_scale=4)
sns.set_style('ticks')

#Sample 1000 rows per DEL and calculate their pairwise Manhattan distances
df_distances=Construct_PairWise_DistanceList(metric='manhattan',n=1000)

#One violin per DEL column
ax = sns.violinplot(data=df_distances, scale='count', palette=sns.color_palette('tab10'))
plt.title( "Pairwise Manhattan Distance Distributions" , size =64 )
sns.despine()
plt.savefig('Notebook Outputs/distance_distributions_Manhattan_AllDELs.svg', bbox_inches='tight')

Pairwise Cosine Similarity

In [None]:
#Configure the seaborn theme BEFORE drawing: the figure size, style and font settings
#are picked up when the figure and axes are created, so applying them after the plot
#(as this cell previously did) only affected figures drawn later.
sns.set(rc={'figure.figsize':(40, 25)})
#NOTE(review): the sns.set(font_scale=4) call below re-applies the default plotting context,
#so these "paper" overrides appear to have no lasting effect -- confirm before relying on them
sns.set_context("paper", rc={"font.size":8,"axes.titlesize":8,"axes.labelsize":5})
sns.set(font_scale=4)
sns.set_style('ticks')

#Sample 999 rows per DEL and calculate their pairwise values with the 'cosine' metric
#NOTE(review): the other cells sample 1000 rows -- confirm that 999 is intentional
#NOTE(review): sklearn's pairwise_distances is documented as returning distances, so the
#'cosine' values are presumably cosine distances (1 - similarity) although the title says
#"Similarity" -- confirm which wording is intended
df_distances=Construct_PairWise_DistanceList(metric='cosine',n=999)

#One violin per DEL column
ax = sns.violinplot(data=df_distances, scale='count', palette=sns.color_palette('tab10'))
plt.title( "Pairwise Cosine Similarity Distributions" , size =64 )
sns.despine()
plt.savefig('Notebook Outputs/distance_distributions_Cosine_AllDELs.svg', bbox_inches='tight')