In [12]:
import pandas as pd

# Read the compound table from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

# Pull out the "SMILES" column; the later cells iterate over and index into it.
smiles_column = df.loc[:, "SMILES"]


In [13]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Morgan fingerprint settings, named so they can be tuned in one place.
FP_RADIUS = 2
FP_NBITS = 1024

# Parse every SMILES string. Non-string entries (e.g. NaN from empty CSV cells)
# are mapped to None so they are reported together with unparseable strings,
# for which Chem.MolFromSmiles itself returns None.
mols = [
    Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    for smi in smiles_column
]

# Fail early with a readable message instead of letting a None molecule reach
# the fingerprint call. Rows are not dropped, so `mols` and `fingerprints`
# stay positionally aligned with `smiles_column`.
invalid_positions = [pos for pos, mol in enumerate(mols) if mol is None]
if invalid_positions:
    raise ValueError(
        "Could not parse SMILES at row position(s): {}{}".format(
            invalid_positions[:10],
            " ..." if len(invalid_positions) > 10 else "",
        )
    )

# One bit-vector fingerprint per molecule, in the same order as `mols`.
fingerprints = [
    AllChem.GetMorganFingerprintAsBitVect(mol, FP_RADIUS, nBits=FP_NBITS)
    for mol in mols
]

In [14]:
from rdkit import DataStructs

def calculate_similarity(fp1, fp2):
    """Return the Tanimoto similarity between two fingerprints.

    Thin wrapper that delegates to ``DataStructs.TanimotoSimilarity``.

    Args:
        fp1: First fingerprint.
        fp2: Second fingerprint.

    Returns:
        Whatever ``DataStructs.TanimotoSimilarity(fp1, fp2)`` returns.
    """
    tanimoto = DataStructs.TanimotoSimilarity(fp1, fp2)
    return tanimoto

# The first fingerprint is taken as the reference (query) compound.
reference_compound_fp = fingerprints[0]

# Score every fingerprint against the reference, the reference itself included,
# so `similarities` lines up position-for-position with `fingerprints`.
similarities = []
for candidate_fp in fingerprints:
    similarities.append(calculate_similarity(reference_compound_fp, candidate_fp))

In [18]:
# Number of hits to report.
TOP_N = 3
# Position of the reference compound; must match the index used when the
# reference fingerprint was selected from `fingerprints`.
REFERENCE_INDEX = 0

# Positions of all compounds, most similar first. sorted() is stable, so
# compounds with equal scores keep their original dataset order.
sorted_indices = sorted(
    range(len(similarities)), key=lambda i: similarities[i], reverse=True
)

# Drop the reference explicitly rather than assuming it sorts into slot 0,
# then keep the best TOP_N of what remains.
top_hits_indices = [i for i in sorted_indices if i != REFERENCE_INDEX][:TOP_N]

# The indices are positions, so use .iloc; plain [] on a Series is a label
# lookup and only coincides with position while the index is the default one.
top_hits = [(smiles_column.iloc[i], similarities[i]) for i in top_hits_indices]

print("Top {} Hits:".format(TOP_N))
for smiles, score in top_hits:
    print("Compound:", smiles)
    print("Similarity Score:", score)
    print()

Top 3 Hits:
Compound: C1C(CNCC=C1)C2=CC(=C(C=C2)Cl)Cl
Similarity Score: 0.5526315789473685

Compound: C1[C@@H](CNCC=C1)C2=CC(=C(C=C2)Cl)Cl
Similarity Score: 0.5526315789473685

Compound: C1[C@H](CNCC=C1)C2=CC(=C(C=C2)Cl)Cl
Similarity Score: 0.5526315789473685

