First, we check whether different SMILES representations of the same molecule yield the same Morgan fingerprint.

In [60]:
import os
import subprocess
import argparse
import warnings

import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import mols2grid
import useful_rdkit_utils as uru
from rdkit import Chem
from rdkit.Chem import PandasTools
import MDAnalysis as mda
import prolif as plf
from rdkit.Chem.rdFMCS import FindMCS
from rdkit.Chem import AllChem

In [61]:
# Read the query fragment: the first line of the .smi file.
# The context manager closes the handle even if the read raises, and rstrip()
# removes only the line terminator. The previous `[:-1]` slice dropped a real
# character when the file had no trailing newline and left a stray '\r' on
# CRLF-terminated files.
with open("data/fragments.smi") as f:
    input_mol = f.readline().rstrip("\r\n")

In [62]:
# Echo the SMILES string read from data/fragments.smi.
input_mol

'C1c2cncnc2NC1'

In [63]:
# Second representation of the fragment: read the ligand structure file and
# let RDKit write it back out as SMILES, so the atom ordering differs from the
# string stored in the .smi file.
# NOTE(review): MolFromMolFile presumably returns None when the file cannot be
# parsed, which would make MolToSmiles fail — confirm the file is always valid.
alt_representation = Chem.MolFromMolFile("data/docking/5s18_ligand.sdf")
alt_smiles = Chem.MolToSmiles(alt_representation)
alt_smiles

'c1ncc2c(n1)NCC2'

In [64]:
# Parse both SMILES spellings (fragment file first, ligand file second) into
# RDKit molecules.
mols = list(map(Chem.MolFromSmiles, (input_mol, alt_smiles)))

In [65]:
# Show the two parsed molecule objects (both should be non-None).
mols

[<rdkit.Chem.rdchem.Mol at 0x79674b2a0190>,
 <rdkit.Chem.rdchem.Mol at 0x79674b2a0a50>]

In [66]:
# Draw both molecules side by side for a visual comparison.
mols2grid.display(mols)

MolGridWidget()

In [67]:
# Morgan fingerprint generator reused for every fingerprint computed below.
# NOTE(review): all parameters are left at RDKit's defaults (presumably
# radius 3, 2048 bits, chirality ignored) — confirm against the installed
# RDKit version, since these choices decide which molecules share a fingerprint.
fpgen = AllChem.GetMorganGenerator()

In [68]:
# One row per molecule: expand each Morgan bit vector into a plain list of
# bits so the two fingerprints can be compared element-wise with NumPy.
fingerprints = np.array(
    [fpgen.GetFingerprint(parsed).ToList() for parsed in mols]
)

In [69]:
# True when both SMILES spellings produce the same fingerprint, bit for bit.
# np.array_equal states the intent directly; taking np.min of a boolean array
# only works as an "all equal" test by accident and raises on empty input.
np.array_equal(fingerprints[0], fingerprints[1])

True

In [70]:
# Table for the prior that is labelled 'Medium Similarity' in the next cell.
# NOTE(review): every prior in this notebook is read from this same path, yet
# the recorded row counts differ between priors — confirm that each cell
# points at its own file.
medium_similarity = pd.read_csv("data/dataframe.csv")

In [71]:
# Tag every row with the prior it came from so rows stay identifiable after
# the tables are concatenated.
medium_similarity = medium_similarity.assign(Prior='Medium Similarity')

In [75]:
# Peek at the SMILES stored under index label 0.
smiles = medium_similarity.loc[0, 'SMILES']
smiles

'c1ncc2c(n1)CCNC2'

In [83]:
# Parse every SMILES in the table, then fingerprint the parsed molecules in
# the same row order.
molecules = list(map(Chem.MolFromSmiles, medium_similarity['SMILES'].values))
fingerprints = list(map(fpgen.GetFingerprint, molecules))

In [84]:
# Store the fingerprint objects next to their SMILES; the list was built in
# row order, so positions line up with the table.
medium_similarity['Fingerprint'] = fingerprints

In [85]:
# Inspect the labelled, fingerprinted table.
medium_similarity

Unnamed: 0.1,Unnamed: 0,SMILES,Input_SMILES,Tanimoto,Prior,Fingerprint
0,0,c1ncc2c(n1)CCNC2,c1ncc2c(n1)NCC2,0.500000,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,CC1CNc2ncncc21,c1ncc2c(n1)NCC2,0.435897,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,5,c1ncc2[nH]cnc2n1,c1ncc2c(n1)NCC2,0.317073,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,6,Cc1ncc2c(n1)NCC2,c1ncc2c(n1)NCC2,0.513514,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,8,c1ncc2c(n1)NCC2,c1ncc2c(n1)NCC2,1.000000,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
229,452,O=S(=O)(c1ncc2c(n1)NCC2)C1CC1,c1ncc2c(n1)NCC2,0.240000,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
230,454,c1ncc2c(n1)NC[C@@H]2CCCN1CCCCC1,c1ncc2c(n1)NCC2,0.309091,Medium Similarity,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
231,456,c1ncc2c(n1)NC(CC1CCCCC1)C2,c1ncc2c(n1)NCC2,0.512821,Medium Similarity,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
232,458,C(#Cc1ncc2c(n1)NCC2)Cc1cccnc1,c1ncc2c(n1)NCC2,0.333333,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ..."


In [106]:
# 'High Similarity' prior: load its table, label the rows, and attach one
# Morgan fingerprint per SMILES. The last line reports the row count.
# NOTE(review): the path is the same one used for the other priors — verify.
high_similarity = pd.read_csv("data/dataframe.csv").assign(Prior='High Similarity')
molecules = list(map(Chem.MolFromSmiles, high_similarity['SMILES'].values))
fingerprints = list(map(fpgen.GetFingerprint, molecules))
high_similarity = high_similarity.assign(Fingerprint=fingerprints)

len(high_similarity)

26

In [107]:
# 'Similarity' prior: load its table, label the rows, and attach one Morgan
# fingerprint per SMILES. The last line reports the row count.
# NOTE(review): the path is the same one used for the other priors — verify.
similarity = pd.read_csv("data/dataframe.csv").assign(Prior='Similarity')
molecules = list(map(Chem.MolFromSmiles, similarity['SMILES'].values))
fingerprints = list(map(fpgen.GetFingerprint, molecules))
similarity = similarity.assign(Fingerprint=fingerprints)

len(similarity)

218

In [99]:
# 'MMP' prior: load its table, label the rows, and attach one Morgan
# fingerprint per SMILES. The last line reports the row count.
# NOTE(review): the path is the same one used for the other priors — verify.
mmp = pd.read_csv("data/dataframe.csv").assign(Prior='MMP')
molecules = list(map(Chem.MolFromSmiles, mmp['SMILES'].values))
fingerprints = list(map(fpgen.GetFingerprint, molecules))
mmp = mmp.assign(Fingerprint=fingerprints)

len(mmp)

234

In [100]:
# 'scaffold_generic' prior: load its table, label the rows, and attach one
# Morgan fingerprint per SMILES. The last line reports the row count.
# NOTE(review): the path is the same one used for the other priors — verify.
scaffold_generic = pd.read_csv("data/dataframe.csv").assign(Prior='scaffold_generic')
molecules = list(map(Chem.MolFromSmiles, scaffold_generic['SMILES'].values))
fingerprints = list(map(fpgen.GetFingerprint, molecules))
scaffold_generic = scaffold_generic.assign(Fingerprint=fingerprints)

len(scaffold_generic)

336

In [101]:
# 'scaffold' prior: load its table, label the rows, and attach one Morgan
# fingerprint per SMILES. The last line reports the row count.
# NOTE(review): the path is the same one used for the other priors — verify.
scaffold = pd.read_csv("data/dataframe.csv").assign(Prior='scaffold')
molecules = list(map(Chem.MolFromSmiles, scaffold['SMILES'].values))
fingerprints = list(map(fpgen.GetFingerprint, molecules))
scaffold = scaffold.assign(Fingerprint=fingerprints)

len(scaffold)

21

In [119]:
# Stack the six per-prior tables into one frame; the 'Prior' column records
# which table each row came from.
# NOTE(review): concat keeps each table's own index labels, so labels repeat
# across priors — pass ignore_index=True if unique row labels are needed.
df = pd.concat([medium_similarity, high_similarity, similarity, mmp, scaffold, scaffold_generic])

In [120]:
# Drop the leftover CSV index column.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once the column is
# already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [122]:
# Inspect the combined table of all priors.
df

Unnamed: 0,SMILES,Input_SMILES,Tanimoto,Prior,Fingerprint
0,c1ncc2c(n1)CCNC2,c1ncc2c(n1)NCC2,0.500000,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,CC1CNc2ncncc21,c1ncc2c(n1)NCC2,0.435897,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,c1ncc2[nH]cnc2n1,c1ncc2c(n1)NCC2,0.317073,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Cc1ncc2c(n1)NCC2,c1ncc2c(n1)NCC2,0.513514,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,c1ncc2c(n1)NCC2,c1ncc2c(n1)NCC2,1.000000,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
331,CC1Cc2ncc([N+](=O)[O-])cc2CN1,c1ncc2c(n1)NCC2,0.234043,MMP,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
332,O=[N+]([O-])c1ccc2c(c1)CNC(CF)C2,c1ncc2c(n1)NCC2,0.150000,MMP,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
333,O=[N+]([O-])c1ccc2c(c1)CN(CCO)CC2,c1ncc2c(n1)NCC2,0.270833,MMP,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
334,CC(C)c1nc2c(C(F)(F)F)ccnc2[nH]1,c1ncc2c(n1)NCC2,0.235294,MMP,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [162]:
def _merge_on_fingerprint(left, right):
    """Inner-join two prior tables on the *content* of their fingerprints.

    Merging directly on the 'Fingerprint' column makes pandas match the stored
    Python objects through their ``__hash__``/``__eq__``. The fingerprints are
    wrapped RDKit objects that presumably hash by identity (TODO confirm), in
    which case equal fingerprints held by different objects are not reliably
    matched. Joining on the bit string compares them by value instead.

    Parameters
    ----------
    left, right : pandas.DataFrame
        Tables with a 'Fingerprint' column holding the objects returned by
        ``fpgen.GetFingerprint``.

    Returns
    -------
    pandas.DataFrame
        Same column layout as
        ``pd.merge(left, right, how='inner', on='Fingerprint')``: shared
        columns get the ``_x`` / ``_y`` suffixes and a single 'Fingerprint'
        column (taken from ``left``) is kept.
    """
    key = '_fingerprint_bits'
    left_keyed = left.assign(
        **{key: [fp.ToBitString() for fp in left['Fingerprint']]}
    )
    # Drop the right-hand fingerprint objects so the result keeps exactly one
    # unsuffixed 'Fingerprint' column, as the original merge did.
    right_keyed = right.drop(columns='Fingerprint').assign(
        **{key: [fp.ToBitString() for fp in right['Fingerprint']]}
    )
    return pd.merge(left_keyed, right_keyed, how='inner', on=key).drop(columns=key)


# Every unordered pair of priors: rows whose fingerprints are identical.
medium_vs_high = _merge_on_fingerprint(medium_similarity, high_similarity)
medium_vs_sim = _merge_on_fingerprint(medium_similarity, similarity)
medium_vs_scaffold = _merge_on_fingerprint(medium_similarity, scaffold)
medium_vs_mmp = _merge_on_fingerprint(medium_similarity, mmp)
medium_vs_generic = _merge_on_fingerprint(medium_similarity, scaffold_generic)

high_vs_sim = _merge_on_fingerprint(high_similarity, similarity)
high_vs_scaffold = _merge_on_fingerprint(high_similarity, scaffold)
high_vs_mmp = _merge_on_fingerprint(high_similarity, mmp)
high_vs_generic = _merge_on_fingerprint(high_similarity, scaffold_generic)

sim_vs_scaffold = _merge_on_fingerprint(similarity, scaffold)
sim_vs_mmp = _merge_on_fingerprint(similarity, mmp)
sim_vs_generic = _merge_on_fingerprint(similarity, scaffold_generic)

scaffold_vs_mmp = _merge_on_fingerprint(scaffold, mmp)
scaffold_vs_generic = _merge_on_fingerprint(scaffold, scaffold_generic)

mmp_vs_generic = _merge_on_fingerprint(mmp, scaffold_generic)

In [163]:
# Report how many fingerprint matches each pair of priors shares, one line per
# pair, in the same order as the merges were computed.
overlap_tables = {
    "Medium vs high": medium_vs_high,
    "Medium vs similarity": medium_vs_sim,
    "Medium vs scaffold": medium_vs_scaffold,
    "Medium vs mmp": medium_vs_mmp,
    "Medium vs generic scaffold": medium_vs_generic,
    "High vs sim": high_vs_sim,
    "High vs scaffold": high_vs_scaffold,
    "High vs mmp": high_vs_mmp,
    "High vs generic scaffold": high_vs_generic,
    "Sim vs scaffold": sim_vs_scaffold,
    "Sim vs mmp": sim_vs_mmp,
    "Sim vs generic scaffold": sim_vs_generic,
    "Scaffold vs mmp": scaffold_vs_mmp,
    "Scaffold vs generic scaffold": scaffold_vs_generic,
    "mmp vs generic scaffold": mmp_vs_generic,
}

for pair_label, shared_rows in overlap_tables.items():
    print(f"{pair_label}: {len(shared_rows)}")

Medium vs high: 0
Medium vs similarity: 0
Medium vs scaffold: 0
Medium vs mmp: 1
Medium vs generic scaffold: 0
High vs sim: 0
High vs scaffold: 1
High vs mmp: 0
High vs generic scaffold: 0
Sim vs scaffold: 0
Sim vs mmp: 0
Sim vs generic scaffold: 1
Scaffold vs mmp: 0
Scaffold vs generic scaffold: 0
mmp vs generic scaffold: 0


In [141]:
# Inspect the rows shared by the 'Medium Similarity' and 'MMP' tables.
medium_vs_mmp

Unnamed: 0,Unnamed: 0_x,SMILES_x,Input_SMILES_x,Tanimoto_x,Prior_x,Fingerprint,Unnamed: 0_y,SMILES_y,Input_SMILES_y,Tanimoto_y,Prior_y
0,404,OC[C@H]1Cc2cncnc2N1,c1ncc2c(n1)NCC2,0.439024,Medium Similarity,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",62,OCC1Cc2cncnc2N1,c1ncc2c(n1)NCC2,0.439024,MMP


In [144]:
# SMILES (left-hand side of the merge) of the row with index label 0.
new_smi = medium_vs_mmp.loc[0, 'SMILES_x']

In [145]:
# Echo the SMILES picked from the medium-vs-MMP overlap.
new_smi

'OC[C@H]1Cc2cncnc2N1'

In [146]:
# Pair the input fragment with the overlapping molecule for drawing.
mols = list(map(Chem.MolFromSmiles, (input_mol, new_smi)))

In [148]:
# Draw the input fragment next to the molecule shared by both priors.
mols2grid.display(mols)

MolGridWidget()

In [153]:
# SMILES (left-hand side of the merge) of the high-vs-scaffold overlap row
# with index label 0.
new_smi_2 = high_vs_scaffold.loc[0, 'SMILES_x']
new_smi_2

'CNc1ncc2c(n1)NCC2'

In [155]:
# Draw the input fragment next to the high-vs-scaffold overlap molecule.
mols = list(map(Chem.MolFromSmiles, (input_mol, new_smi_2)))
mols2grid.display(mols)

MolGridWidget()

In [158]:
# SMILES (left-hand side of the merge) of the similarity-vs-generic-scaffold
# overlap row with index label 0.
new_smi_3 = sim_vs_generic.loc[0, 'SMILES_x']
new_smi_3

'c1ncc2c(n1)CNCC2'

In [159]:
# Draw the input fragment next to the similarity-vs-generic-scaffold overlap
# molecule.
mols = list(map(Chem.MolFromSmiles, (input_mol, new_smi_3)))
mols2grid.display(mols)

MolGridWidget()