<a href="https://colab.research.google.com/github/ullahsamee/tanimoto-similarity/blob/main/Tanimoto_similarity_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
!pip install rdkit
!pip install pillow
!pip install rdkit
!pip install numpy
!pip install matplotlib

# Import required modules
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from google.colab import files
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from rdkit.Chem import Draw
from IPython.display import display

# Upload the file containing SMILES strings (read line by line in the next cell)
uploaded = files.upload()

# Fail early with a clear message: the next cell opens the file named by `fn`,
# which would otherwise be undefined (or left over from an earlier run) when
# nothing was uploaded.
if not uploaded:
    raise ValueError("No file was uploaded; please upload a file of SMILES strings.")

# NOTE: `fn` is intentionally left bound after the loop. If several files are
# uploaded, the last one listed here is the one the next cell reads.
for fn, content in uploaded.items():
    print(f"User uploaded file '{fn}' with length {len(content)} bytes")

In [None]:


# Read the SMILES strings from the uploaded file, one candidate per line.
# Surrounding whitespace (including the newline) is removed and blank lines,
# which carry no compound, are dropped.
with open(fn, 'r', encoding='utf-8') as f:
    raw_smiles = [line.strip() for line in f if line.strip()]

# Generate fingerprints and molecules for each compound using RDKit.
# smiles_list, fp_list and mol_list are filled in lockstep so that an index
# into one of them always refers to the same compound in the other two;
# entries RDKit cannot parse are left out of all three lists.
smiles_list = []
fp_list = []
mol_list = []
for smiles in raw_smiles:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Report the skipped entry instead of dropping it silently.
        print(f"Skipping entry that could not be parsed as SMILES: {smiles!r}")
        continue
    # Hydrogens are added before fingerprinting, so the fingerprint is
    # computed on the hydrogen-explicit molecule.
    mol = Chem.AddHs(mol)
    # Morgan fingerprint with radius 2, as a bit vector.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
    smiles_list.append(smiles)
    fp_list.append(fp)
    mol_list.append(mol)

# A pair can only be formed from at least two fingerprints.
if len(fp_list) < 2:
    raise ValueError(
        f"Need at least two valid molecules to compare, got {len(fp_list)}."
    )

# Initialize the maximum similarity and the indices of the most similar
# compounds. The starting score is below the smallest Tanimoto value (0.0), so
# the first pair is always recorded and i_max/j_max never end up pointing at
# the same compound, even when no pair shares any bits.
max_sim = -1.0
i_max = 0
j_max = 1

# Loop over all unordered pairs (i < j) of fingerprints and calculate the
# Tanimoto similarity of each pair.
for i in range(len(fp_list)):
    for j in range(i + 1, len(fp_list)):
        sim = DataStructs.TanimotoSimilarity(fp_list[i], fp_list[j])
        # Strict comparison: on ties the first pair encountered is kept.
        if sim > max_sim:
            max_sim, i_max, j_max = sim, i, j

# Print the results. The SMILES are stripped once and reused so that a trailing
# newline kept from the input file does not add blank lines to the output, and
# so the printed text matches the legends drawn below.
smiles_i = smiles_list[i_max].strip()
smiles_j = smiles_list[j_max].strip()

print("The two most similar compounds are:")
print(smiles_i)
print(smiles_j)
print("The Tanimoto similarity score is:")
print(max_sim)

# Draw 2D structures of the most similar compounds side by side, each labelled
# with its SMILES string.
img = Draw.MolsToGridImage(
    [mol_list[i_max], mol_list[j_max]],
    molsPerRow=2,
    subImgSize=(400, 400),
    legends=[smiles_i, smiles_j],
    useSVG=True,
)
display(img)

# Calculate 3D coordinates of the most similar compounds
mol_i = mol_list[i_max]
mol_j = mol_list[j_max]

for compound_label, molecule in (("first", mol_i), ("second", mol_j)):
    # EmbedMolecule returns the new conformer id, or -1 when no 3D embedding
    # could be generated. Stop here with a clear message rather than failing
    # later when the missing conformer is requested.
    if AllChem.EmbedMolecule(molecule) < 0:
        raise ValueError(
            f"Could not generate 3D coordinates for the {compound_label} compound."
        )
    # MMFFOptimizeMolecule returns -1 when the force field cannot be set up
    # for the molecule; the embedded (unoptimized) geometry is kept then.
    if AllChem.MMFFOptimizeMolecule(molecule) == -1:
        print(
            f"Warning: MMFF optimization was not possible for the "
            f"{compound_label} compound; using unoptimized coordinates."
        )

# Atom positions of the conformer of each molecule
coords_i = mol_i.GetConformer().GetPositions()
coords_j = mol_j.GetConformer().GetPositions()

# Scatter-plot the atom positions of both compounds in a single 3D axes
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

# Marker size shared by both series; one colour per compound
marker_size = 50
colors = ['blue', 'red']

for number, (color, positions) in enumerate(zip(colors, (coords_i, coords_j)), start=1):
    # Split the (x, y, z) triples into one list per axis
    xs, ys, zs = ([point[axis] for point in positions] for axis in range(3))
    ax.scatter(xs, ys, zs, c=color, s=marker_size, label=f"Compound {number}")

# Axis labels, title and legend
ax.set(xlabel='X', ylabel='Y', zlabel='Z', title='3D Coordinates')
ax.legend()

# Customize plot style: no grid, and unfilled panes with white edges
ax.grid(False)
for axis_obj in (ax.xaxis, ax.yaxis, ax.zaxis):
    axis_obj.pane.fill = False
for axis_obj in (ax.xaxis, ax.yaxis, ax.zaxis):
    axis_obj.pane.set_edgecolor('w')

plt.show()
