In [8]:
import pandas as pd
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from tqdm import tqdm

In [4]:
def get_morgan_fingerprint(smiles, radius=2, nBits=2048):
    """Convert a SMILES string into a Morgan fingerprint bit vector.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Morgan radius forwarded to RDKit (default 2).
        nBits: Length of the returned bit vector (default 2048).

    Returns:
        The bit vector produced by ``AllChem.GetMorganFingerprintAsBitVect``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``, or if the parsed
            molecule contains no atoms (e.g. an empty string).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES: {smiles}")
    # RDKit parses an empty string to a molecule with zero atoms instead of
    # returning None; fingerprinting it would silently give an all-zero vector
    # that looks like a valid result to every downstream similarity call.
    if mol.GetNumAtoms() == 0:
        raise ValueError(f"Invalid SMILES: {smiles!r} contains no atoms")
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)

def compute_dice_similarity(smiles1, smiles2):
    """Return the Dice similarity of the Morgan fingerprints of two SMILES.

    Both inputs are fingerprinted with ``get_morgan_fingerprint`` using its
    default settings; any error raised there propagates to the caller.
    """
    # Fingerprints are built in argument order: smiles1 first, then smiles2.
    first_fp, second_fp = [get_morgan_fingerprint(s) for s in (smiles1, smiles2)]
    return DataStructs.DiceSimilarity(first_fp, second_fp)

# Quick sanity check of the helpers on two small alcohols:
# ethanol ('CCO') versus propanol ('CCCO').
smiles_1, smiles_2 = 'CCO', 'CCCO'

similarity_score = compute_dice_similarity(smiles1=smiles_1, smiles2=smiles_2)
print(f"Dice Similarity: {similarity_score:.4f}")

Dice Similarity: 0.7143


In [5]:
# Location of the herb-compound table, given relative to the user's home directory.
file_path = "~//mini_project_full//datasets//tcm-suite-data//herb-compound.csv"

# Read the whole CSV into memory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(5)

Unnamed: 0,herbId,id,cid,latinName,molecularFormula,toxicity,omimName,omimMolecularFormla,xbMolId,xbMoleculeName,xbMw,xbAlogp,xbHdon,xbHacc,xbOb,xbCaco2,xbBbb,xbDl,xbFasa,xbHl
0,1,16615,CID000073205,sigmoidin B,CC(=CCc1cc(cc(c1O)O)C1CC(=O)c2c(O1)cc(cc2O)O)C,,sigmoidin B,CC(=CCc1cc(cc(c1O)O)C1CC(=O)c2c(O1)cc(cc2O)O)C,,,,,,,,,,,,
1,1,16616,CID000073207,xanthomicrol,COc1c(OC)c(OC)c(c2c1oc(cc2=O)c1ccc(cc1)O)O,,xanthomicrol,COc1c(OC)c(OC)c(c2c1oc(cc2=O)c1ccc(cc1)O)O,,,,,,,,,,,,
2,1,24208,CID000126542,caffeidine acid,CNC(=O)N(c1ncn(c1C(=O)O)C)C,,caffeidine acid,CNC(=O)N(c1ncn(c1C(=O)O)C)C,,,,,,,,,,,,
3,1,31868,CID000195184,fructose 2-phosphorothioate 6-phosphate,OCC1(OC(C(C1O)O)COP(=O)(O)O)OP(=S)(O)O,,fructose 2-phosphorothioate 6-phosphate,OCC1(OC(C(C1O)O)COP(=O)(O)O)OP(=S)(O)O,,,,,,,,,,,,
4,1,37349,CID000442153,abyssinone V,CC(=CCc1cc(cc(c1O)CC=C(C)C)C1CC(=O)c2c(O1)cc(c...,,abyssinone V,CC(=CCc1cc(cc(c1O)CC=C(C)C)C1CC(=O)c2c(O1)cc(c...,,,,,,,,,,,,


In [9]:
# Columns of `df` consumed by the fingerprinting step.
names_col = 'latinName'
smiles_col = 'omimMolecularFormla'      # column holding the SMILES strings

# === Clean, Convert to Fingerprints ===
# Four independent accumulators, filled in parallel by the fingerprint loop:
# the accepted SMILES, their names, their fingerprints, and the rejected rows.
valid_smiles, valid_names, fingerprints, invalid_smiles = [], [], [], []

In [10]:
print("[INFO] Cleaning and generating fingerprints...")

# Start from empty containers so that re-running this cell does not append a
# second copy of every row to lists left over from a previous execution.
valid_smiles = []
valid_names = []
fingerprints = []
invalid_smiles = []

# `total` lets tqdm draw a real progress bar; zip() alone has no length.
for name, smi in tqdm(zip(df[names_col], df[smiles_col]), total=len(df)):
    smi = str(smi).strip()
    # Keep only the first dot-separated component of a multi-fragment SMILES.
    # NOTE(review): the first component is not necessarily the largest
    # fragment (e.g. for salts) -- confirm this is the intended choice.
    cleaned = smi.split('.')[0]
    try:
        if not cleaned:
            # An empty string parses to an atom-less molecule instead of
            # failing, which would add an all-zero fingerprint to the results.
            raise ValueError("Empty SMILES")
        fp = get_morgan_fingerprint(cleaned, radius=2, nBits=2048)
    except Exception:
        # Best effort: log the row and move on. `Exception` (rather than a
        # bare `except:`) leaves KeyboardInterrupt alone, so the kernel can
        # still be interrupted while this loop is running.
        invalid_smiles.append((name, smi))
    else:
        fingerprints.append(fp)
        valid_names.append(name)
        valid_smiles.append(cleaned)

[INFO] Cleaning and generating fingerprints...


0it [00:00, ?it/s]

[09:51:34] SMILES Parse Error: syntax error while parsing: Cn1c(=Cc2ccn+(c3c2cccc3)CCCN+(CCCN+(CCCn+2ccc(c3c2cccc3)C=c2oc3c(n2C)cccc3)(C)C)(C)C)oc2c1cccc2
[09:51:34] SMILES Parse Error: Failed parsing SMILES 'Cn1c(=Cc2ccn+(c3c2cccc3)CCCN+(CCCN+(CCCn+2ccc(c3c2cccc3)C=c2oc3c(n2C)cccc3)(C)C)(C)C)oc2c1cccc2' for input: 'Cn1c(=Cc2ccn+(c3c2cccc3)CCCN+(CCCN+(CCCn+2ccc(c3c2cccc3)C=c2oc3c(n2C)cccc3)(C)C)(C)C)oc2c1cccc2'
[09:51:34] SMILES Parse Error: syntax error while parsing: COc1cc2CCn+3c(c2cc1O)cc1c(c3)c(OC)c(cc1)OC
[09:51:34] SMILES Parse Error: Failed parsing SMILES 'COc1cc2CCn+3c(c2cc1O)cc1c(c3)c(OC)c(cc1)OC' for input: 'COc1cc2CCn+3c(c2cc1O)cc1c(c3)c(OC)c(cc1)OC'
[09:51:34] SMILES Parse Error: syntax error while parsing: O-W(=O)(=O)O-
[09:51:34] SMILES Parse Error: Failed parsing SMILES 'O-W(=O)(=O)O-' for input: 'O-W(=O)(=O)O-'
[09:51:34] SMILES Parse Error: syntax error while parsing: OC(=O)CCC(C(=O)O)NC(=O)c1ccc(cc1)CCC1CNc2c(C1)c(=O)nc(nH2)N
[09:51:34] SMILES Parse Error: Failed par

In [11]:
# Report how many rows were rejected while building the fingerprints.
skipped_count = len(invalid_smiles)
print(f"[⚠️] Skipped {skipped_count} invalid SMILES.")

[⚠️] Skipped 3203 invalid SMILES.


In [12]:
# Persist the rejected (name, SMILES) pairs for later inspection; nothing is
# written when every row was accepted.
if len(invalid_smiles) > 0:
    invalid_log = pd.DataFrame(invalid_smiles, columns=['Name', 'Invalid_SMILES'])
    invalid_log.to_csv('invalid_smiles_log.csv', index=False)

In [13]:
# One row and one column per fingerprint; every entry starts at zero and is
# filled in by the pairwise similarity loop.
n = len(fingerprints)
similarity_matrix = np.zeros(shape=(n, n))
n

22273

In [14]:
# Fill the symmetric Dice similarity matrix one row at a time.
# BulkDiceSimilarity compares fingerprint i against fingerprints i..n-1 in a
# single call, which replaces roughly n*n/2 separate Python-level
# DiceSimilarity calls while producing the same values.
for i in tqdm(range(n)):
    row_tail = DataStructs.BulkDiceSimilarity(fingerprints[i], fingerprints[i:])
    similarity_matrix[i, i:] = row_tail   # upper triangle, diagonal included
    similarity_matrix[i:, i] = row_tail   # mirror into the lower triangle

In [15]:
# Label both axes with the compound names, then write the full matrix to CSV.
sim_df = pd.DataFrame(
    data=similarity_matrix,
    index=valid_names,
    columns=valid_names,
)
sim_df.to_csv(path_or_buf='herbal_component_similarity_matrix.csv')