In [10]:
!pip install rdkit-pypi --quiet

import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import matplotlib.pyplot as plt

# 1. Load the compound library (semicolon-separated CSV).
file_path = "./compounds.csv"
df = pd.read_csv(file_path, sep=";")

# Columns used downstream: the SMILES strings and the compound names.
smiles_list, compound_names = df["smiles"], df["name"]

# Reference compound: Levodopa, encoded as a Morgan fingerprint bit vector
# (radius 2) that every library compound is compared against.
levodopa_smiles = "C1=CC(=C(C=C1CC(C(=O)O)N)O)O"
levodopa_mol = Chem.MolFromSmiles(levodopa_smiles)
levodopa_fingerprint = AllChem.GetMorganFingerprintAsBitVect(levodopa_mol, 2)

# 2 & 3. Fingerprint every library compound and score it against Levodopa.
# `similarities` stays aligned with `smiles_list`: one entry per row, with
# None for any row that could not be turned into a molecule.
similarities = []
for smiles in smiles_list:
    # A blank CSV cell is read by pandas as NaN (a float). RDKit raises on
    # non-string input instead of returning None, so screen those out here
    # rather than letting one empty cell abort the whole scan.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Missing or unparseable SMILES: keep the row, leave the score empty.
        similarities.append(None)
        continue

    # 2. Same fingerprint settings as the reference (Morgan, radius 2) so the
    #    two bit vectors are directly comparable.
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2)
    # 3. Name the metric explicitly so the value provably matches the
    #    "Tanimoto Similarity" column it is stored in.
    similarity = DataStructs.FingerprintSimilarity(
        levodopa_fingerprint, fp, metric=DataStructs.TanimotoSimilarity
    )
    similarities.append(similarity)

# 4. Attach the scores to the table as a new column.
df["Tanimoto Similarity"] = similarities

# Rank the whole library from most to least similar, then keep the best three.
ranked = df.sort_values(by="Tanimoto Similarity", ascending=False)
df_sorted = ranked.head(3)

print("Top 3 hits :")
# Last expression of the cell, so the notebook renders it as a table.
df_sorted[["name", "smiles", "Tanimoto Similarity"]]

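# The first hit is almost identical to Levodopa by Tanimoto similarity (0.964): it is the sodium salt of L-DOPA, so only the extra [Na] fragment keeps the score below 1.
# The second hit has a methoxy group (-OCH3) in place of one of the two ring hydroxy (-OH) groups; even this small change lowers the similarity noticeably (0.667).
# The third hit has the same similarity as the second, which suggests the same kind of structural change: it is the positional isomer, with the methoxy group on the other ring oxygen.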


Top 3 hits :


Unnamed: 0,name,smiles,Tanimoto Similarity
5,L-DOPA (sodium),C1=CC(=C(C=C1CC(C(=O)O)N)O)O.[Na],0.964286
6,"Tyrosine, 3-hydroxy-O-methyl-",COC1=C(C=C(C=C1)CC(C(=O)O)N)O,0.666667
7,3-Methoxytyrosine,COC1=C(C=CC(=C1)CC(C(=O)O)N)O,0.666667
