-Generates violin plots showing the distribution of properties for a dataset. <br>
-Performs Lipinski-Veber parameter analysis. <br>

Input csv files must be structured using one compound per row<br>

FOR GENERAL INPUTS <br>
---csv file must contain the following columns (names are case-sensitive): "DEL", "ID", "Smiles"<br>

FOR PRE-CALCULATED INPUTS<br>
---csv file must contain a column called "DEL" <br>
---csv file must contain at least one column with numerical property values<br>
<br>
GENERAL USE<br>
-The first cell in this notebook must be run to import the csv data. It does not need to be re-run unless you have made changes to the csv file.<br>
---By default, this notebook uses the data files created by the GenerateLibrary notebook, so be sure to run that notebook first.<br>
-Separate violin plots will be drawn for each unique entry in the "DEL" column <br>
-This notebook uses RDKit to calculate the following properties:  <br>
---MW, cLogP, H-Donors, H-Acceptors, Rotatable Bonds, Polar Surface Area, Atom Count, Heavy Atom Count, Ring Count<br>
-Additional properties can be calculated in DataWarrior, pasted into a csv file, and used as an input dataset<br>
-Feel free to add additional properties to suit your needs using the "property" variable at the top of each cell

In [None]:
import sys
sys.path.append('./Modules')
import numpy as np 
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import rdMolDescriptors



def Property_Dataframe(property): 
    '''Build a plot-ready dataframe for one property column of the input csv.

    Reads the module-level ``data_df`` and ``DEL_list``. For every entry in
    ``DEL_list`` the rows of ``data_df`` whose "DEL" value matches are
    selected and their ``property`` values become one output column named
    after that entry.

    Parameters
    ----------
    property : name of the column in ``data_df`` holding the values to plot.

    Returns
    -------
    pandas.DataFrame with one column per entry of ``DEL_list``. Groups of
    different size are padded with NaN at the bottom.
    '''
    columns = {}
    for Del in DEL_list:
        #Group on the "DEL" column, the same column DEL_list is built from.
        #Comparing against the raw value (not str(Del)) also keeps numeric
        #library labels working.
        group_values = data_df.loc[data_df['DEL'] == Del, str(property)]
        #Drop the original row labels so every group starts at row 0 and the
        #columns line up side by side instead of being scattered by index
        columns[Del] = group_values.reset_index(drop=True)
    return pd.DataFrame(columns)

def Check_Lipinski_Verber_Params(df, limits=None):
    '''Count how many compounds satisfy each upper-bound cut-off.

    Parameters
    ----------
    df : pandas.DataFrame with the numeric columns 'mol_wt', 'logp',
        'h_donors', 'h_acceptors', 'rotatable_bonds' and
        'polar_surface_area', plus 'DEL', 'ID' and 'smiles' which are copied
        to the output for compounds passing every cut-off.
    limits : optional dict mapping any of the six property names to a
        replacement cut-off. Omitted properties keep the defaults below.

    Returns
    -------
    (Lipinski_params, druglike_df)
        Lipinski_params : dict with the number of rows whose value is <= the
            cut-off, per property, plus 'All' for rows passing all six.
        druglike_df : DataFrame with columns 'DEL', 'ID', 'Smiles' holding
            the rows counted under 'All', in input order.

    Raises
    ------
    ValueError if ``limits`` names a property that is not checked.
    '''
    cutoffs = {
        'mol_wt': 500,
        'logp': 5,
        'h_donors': 5,
        'h_acceptors': 10,
        #NOTE(review): Veber et al. describe <= 10 rotatable bonds; 5 is kept
        #here so existing results do not change. Pass
        #limits={'rotatable_bonds': 10} for the published criterion - confirm
        #which one is intended.
        'rotatable_bonds': 5,
        'polar_surface_area': 140,
    }
    if limits is not None:
        unknown = sorted(set(limits) - set(cutoffs))
        if unknown:
            raise ValueError('Unknown property name(s) in limits: ' + ', '.join(map(str, unknown)))
        cutoffs.update(limits)

    Lipinski_params = {name: 0 for name in cutoffs}
    Lipinski_params['All'] = 0
    output_columns = ['DEL', 'ID', 'Smiles']

    #Nothing to evaluate: report zero counts and an empty result table
    if len(df) == 0:
        return(Lipinski_params, pd.DataFrame(columns=output_columns))

    #One boolean mask per property; a missing value (NaN) compares False and
    #therefore counts as a failure
    all_pass = pd.Series(True, index=df.index)
    for name, cutoff in cutoffs.items():
        within = df[name] <= cutoff
        Lipinski_params[name] = int(within.sum())
        all_pass &= within
    Lipinski_params['All'] = int(all_pass.sum())

    if Lipinski_params['All'] == 0:
        return(Lipinski_params, pd.DataFrame(columns=output_columns))

    druglike_df = (
        df.loc[all_pass, ['DEL', 'ID', 'smiles']]
        .rename(columns={'smiles': 'Smiles'})
        .reset_index(drop=True)
    )
    return(Lipinski_params, druglike_df)

def Create_Property_df():
    '''Calculate RDKit descriptors for every compound in ``data_df``.

    Reads the "Smiles", "DEL" and "ID" columns of the module-level
    ``data_df``, parses each SMILES with RDKit and records nine descriptors
    per compound. The resulting table is also written to 'Properties.csv' in
    the current working directory.

    Returns
    -------
    pandas.DataFrame with the columns 'DEL', 'ID', 'smiles', 'mol_wt',
    'logp', 'h_donors', 'h_acceptors', 'rotatable_bonds',
    'polar_surface_area', 'atoms', 'heavy_atoms' and 'rings'.

    Raises
    ------
    ValueError if a SMILES string cannot be parsed by RDKit.
    '''
    smiles=data_df.Smiles.to_list()
    property_dict={
        'DEL':data_df.DEL.to_list(),
        'ID':data_df.ID.to_list(),
        'smiles':smiles,
        'mol_wt':[], 
        'logp':[], 
        'h_donors':[],
        'h_acceptors':[],
        'rotatable_bonds':[],
        'polar_surface_area':[],
        'atoms':[],
        'heavy_atoms':[],
        'rings':[] }
    
    for position, smi in enumerate(smiles):
        molecule=Chem.MolFromSmiles(smi)
        #MolFromSmiles signals a parse failure by returning None; stop with a
        #message that identifies the compound instead of failing inside RDKit
        if molecule is None:
            raise ValueError('Could not parse SMILES at position '+str(position)+': '+repr(smi))
        
        #NOTE(review): ExactMolWt is the monoisotopic mass; Descriptors.MolWt
        #gives the average molecular weight - confirm which one is intended
        property_dict['mol_wt'].append(Descriptors.ExactMolWt(molecule))
        property_dict['logp'].append(Descriptors.MolLogP(molecule))
        property_dict['h_donors'].append(Descriptors.NumHDonors(molecule))
        property_dict['h_acceptors'].append(Descriptors.NumHAcceptors(molecule))
        property_dict['rotatable_bonds'].append(Descriptors.NumRotatableBonds(molecule))
        #Topological polar surface area, called directly rather than through
        #Chem.QED.properties(), which needs rdkit.Chem.QED to be imported
        property_dict['polar_surface_area'].append(rdMolDescriptors.CalcTPSA(molecule))
        #NOTE(review): hydrogens are normally implicit after MolFromSmiles, so
        #'atoms' will usually equal 'heavy_atoms' - confirm this is intended
        property_dict['atoms'].append(molecule.GetNumAtoms())
        property_dict['heavy_atoms'].append(molecule.GetNumHeavyAtoms())
        property_dict['rings'].append(rdMolDescriptors.CalcNumRings(molecule))

    Property_df=pd.DataFrame(property_dict)
    Property_df.to_csv('Properties.csv')
    return(Property_df)

def assemble_plot_df(property):
    '''Arrange one calculated property into one column per library.

    Reads the module-level ``prop_df`` and ``DEL_list``. For every entry in
    ``DEL_list`` the rows of ``prop_df`` whose "DEL" value matches are
    selected and their ``property`` values become one output column named
    after that entry.

    Parameters
    ----------
    property : name of the column in ``prop_df`` holding the values to plot.

    Returns
    -------
    pandas.DataFrame with one column per entry of ``DEL_list``. Libraries
    of different size are padded with NaN at the bottom.
    '''
    columns = {}
    for del_name in DEL_list:
        group_values = prop_df.loc[prop_df['DEL'] == del_name, property]
        #Renumber from 0 so groups align row by row. Building the frame from
        #Series lets pandas pad shorter groups with NaN; assigning raw arrays
        #of unequal length as columns raises a ValueError.
        columns[del_name] = group_values.reset_index(drop=True)
    return pd.DataFrame(columns)

Preprocessing

In [None]:
#Use for a csv file containing the SMILES you wish to calculate the properties for
inputfile='Notebook Outputs/Combined_DELs.csv' 

#Use for a csv file already containing calculated properties
#inputfile='Notebook Outputs/DEL_Properties.csv' 


#Load the input data
data_df = pd.read_csv(inputfile)

#A file with pre-calculated properties only needs a "DEL" column, so the
#SMILES handling is skipped when there is no "Smiles" column
if 'Smiles' in data_df.columns:
    data_df['ROMol'] = data_df.Smiles.apply(Chem.MolFromSmiles)

    #MolFromSmiles returns None for input it cannot parse; report the
    #offending rows here rather than failing later inside MolToSmiles
    unparsed_rows = [idx for idx, mol in zip(data_df.index, data_df['ROMol']) if mol is None]
    if unparsed_rows:
        raise ValueError('Could not parse the SMILES in row(s): '+str(unparsed_rows))

    #Rewrite every SMILES in one consistent form (kekulized, no stereochemistry)
    data_df['Smiles'] = data_df.ROMol.apply(lambda x: Chem.MolToSmiles(x, kekuleSmiles=True, isomericSmiles=False))

dataset='Example Libraries' #Name for the dataset (used for creation of the plot titles)

#Create a list of the unique members in the "DEL" column
DEL_list=data_df.DEL.unique()

Calculate the properties for the smiles in the input file

In [None]:
#Calculate the descriptor table for the loaded compounds
prop_df = Create_Property_df()

Perform Lipinski-Veber Analysis (optional)

In [None]:
#Count the compounds meeting each cut-off and collect those meeting all of them
Lipinski_dict, all_pass_df = Check_Lipinski_Verber_Params(prop_df)
total = len(prop_df)
print('Compounds Analyzed:', total)
for k, v in Lipinski_dict.items():
    #An empty dataset would otherwise divide by zero; report 0.0% instead
    percent = round((v / total) * 100, 1) if total else 0.0
    print(k, v, str(percent) + '%')

Create a csv output for the calculated properties (optional)

In [None]:
#Write the calculated property table (including its index column) to disk
prop_df.to_csv(path_or_buf='Notebook Outputs/CalulatedProperties.csv')

Create an output csv of the compounds that pass all Lipinski-Veber parameters (optional)

In [None]:
#Write the compounds that passed every cut-off (including the index column) to disk
all_pass_df.to_csv(path_or_buf='Notebook Outputs/LipinskiCompounds.csv')

Create violin plots for molecular weight distributions

In [None]:
property='mol_wt'

#Figure size, font scale and tick style have to be set BEFORE the plot is
#drawn; set afterwards they only take effect on the next figure. A single
#call is enough: a later sns.set() resets the context, so a separate
#set_context("paper", ...) call in front of it has no lasting effect.
sns.set(style='ticks', font_scale=4, rc={'figure.figsize':(40, 25)})

plot_df=assemble_plot_df(property)

#Uncomment this line (and comment out the one above) if you are importing from a csv with pre-calculated values
#plot_df=Property_Dataframe(property)

#NOTE(review): newer seaborn releases rename scale -> density_norm; confirm against the installed version
ax = sns.violinplot(data=plot_df, scale='area', palette=sns.color_palette('tab10'), inner='box')
ax.set_title(dataset+' ('+property+')', size=64)

sns.despine()
plt.savefig("Notebook Outputs/"+dataset+'_'+property+'.svg', bbox_inches='tight')

Create violin plots for logP distributions

In [None]:
property='logp'

#Figure size, font scale and tick style have to be set BEFORE the plot is
#drawn; set afterwards they only take effect on the next figure. A single
#call is enough: a later sns.set() resets the context, so a separate
#set_context("paper", ...) call in front of it has no lasting effect.
sns.set(style='ticks', font_scale=4, rc={'figure.figsize':(40, 25)})

plot_df=assemble_plot_df(property)

#Uncomment this line (and comment out the one above) if you are importing from a csv with pre-calculated values
#plot_df=Property_Dataframe(property)

#NOTE(review): newer seaborn releases rename scale -> density_norm; confirm against the installed version
ax = sns.violinplot(data=plot_df, scale='count', palette=sns.color_palette('tab10'), inner='box')
ax.set_title(dataset+' ('+property+')', size=64)

sns.despine()
#Saved next to the other outputs, matching the molecular-weight plot
plt.savefig("Notebook Outputs/"+dataset+'_'+property+'.svg', bbox_inches='tight')

Create violin plots for H-donor distributions

In [None]:
property='h_donors'

#Figure size, font scale and tick style have to be set BEFORE the plot is
#drawn; set afterwards they only take effect on the next figure. A single
#call is enough: a later sns.set() resets the context, so a separate
#set_context("paper", ...) call in front of it has no lasting effect.
sns.set(style='ticks', font_scale=4, rc={'figure.figsize':(40, 25)})

plot_df=assemble_plot_df(property)

#Uncomment this line (and comment out the one above) if you are importing from a csv with pre-calculated values
#plot_df=Property_Dataframe(property)

#NOTE(review): newer seaborn releases rename scale -> density_norm and bw -> bw_method; confirm against the installed version
ax = sns.violinplot(data=plot_df, scale='count', palette=sns.color_palette('tab10'), inner='box', bw=1)
ax.set_title(dataset+' ('+property+')', size=64)

sns.despine()
#Saved next to the other outputs, matching the molecular-weight plot
plt.savefig("Notebook Outputs/"+dataset+'_'+property+'.svg', bbox_inches='tight')

Create violin plots for H-acceptor distributions

In [None]:
property='h_acceptors'

#Figure size, font scale and tick style have to be set BEFORE the plot is
#drawn; set afterwards they only take effect on the next figure. A single
#call is enough: a later sns.set() resets the context, so a separate
#set_context("paper", ...) call in front of it has no lasting effect.
sns.set(style='ticks', font_scale=4, rc={'figure.figsize':(40, 25)})

plot_df=assemble_plot_df(property)

#Uncomment this line (and comment out the one above) if you are importing from a csv with pre-calculated values
#plot_df=Property_Dataframe(property)

#NOTE(review): newer seaborn releases rename scale -> density_norm and bw -> bw_method; confirm against the installed version
ax = sns.violinplot(data=plot_df, scale='count', palette=sns.color_palette('tab10'), inner='box', bw=1)
ax.set_title(dataset+' ('+property+')', size=64)

sns.despine()
#Saved next to the other outputs, matching the molecular-weight plot
plt.savefig("Notebook Outputs/"+dataset+'_'+property+'.svg', bbox_inches='tight')

Create violin plots for rotatable bond distributions

In [None]:
property='rotatable_bonds'

#Figure size, font scale and tick style have to be set BEFORE the plot is
#drawn; set afterwards they only take effect on the next figure. A single
#call is enough: a later sns.set() resets the context, so a separate
#set_context("paper", ...) call in front of it has no lasting effect.
sns.set(style='ticks', font_scale=4, rc={'figure.figsize':(40, 25)})

plot_df=assemble_plot_df(property)

#Uncomment this line (and comment out the one above) if you are importing from a csv with pre-calculated values
#plot_df=Property_Dataframe(property)

#NOTE(review): newer seaborn releases rename scale -> density_norm and bw -> bw_method; confirm against the installed version
ax = sns.violinplot(data=plot_df, scale='count', palette=sns.color_palette('tab10'), inner='box', bw=1)
ax.set_title(dataset+' ('+property+')', size=64)

sns.despine()
#Saved next to the other outputs, matching the molecular-weight plot
plt.savefig("Notebook Outputs/"+dataset+'_'+property+'.svg', bbox_inches='tight')

Create violin plots for polar surface area distributions

In [None]:
property='polar_surface_area'

#Figure size, font scale and tick style have to be set BEFORE the plot is
#drawn; set afterwards they only take effect on the next figure. A single
#call is enough: a later sns.set() resets the context, so a separate
#set_context("paper", ...) call in front of it has no lasting effect.
sns.set(style='ticks', font_scale=4, rc={'figure.figsize':(40, 25)})

plot_df=assemble_plot_df(property)

#Uncomment this line (and comment out the one above) if you are importing from a csv with pre-calculated values
#plot_df=Property_Dataframe(property)

#NOTE(review): newer seaborn releases rename scale -> density_norm and bw -> bw_method; confirm against the installed version
ax = sns.violinplot(data=plot_df, scale='count', palette=sns.color_palette('tab10'), inner='box', bw=1)
ax.set_title(dataset+' ('+property+')', size=64)

sns.despine()
#Saved next to the other outputs, matching the molecular-weight plot
plt.savefig("Notebook Outputs/"+dataset+'_'+property+'.svg', bbox_inches='tight')