Here, we will prepare and clean the data used to train the Conformal Prediction and Random Forest models.

In [22]:
import pandas as pd 
# Load the full dataset, then tally how many rows each species contributes.
df = pd.read_csv('./dataset.csv')

species_counts = df.loc[:, "species"].value_counts()
# value_counts() sorts in descending order, so the first three entries are
# the three most frequent species.
print(species_counts.iloc[:3])

species
Caenorhabditis elegans     1559
Drosophila melanogaster     926
Mus musculus                277
Name: count, dtype: int64


Let's restrict our dataset to only the rows for the Caenorhabditis elegans species, since it has the most data points.

In [23]:
# Keep only the C. elegans rows and renumber the index from zero.
is_c_elegans = df["species"].eq("Caenorhabditis elegans")
df = df.loc[is_c_elegans].reset_index(drop=True)
df.head(1000)

Unnamed: 0,id,compound_name,species,strain,dosage,age_at_initiation,treatment_duration,avg_lifespan_change_percent,avg_lifespan_significance,max_lifespan_change_percent,max_lifespan_significance,gender_new,weight_change_percent,weight_change_significance,ITP,pubmed_id,notes,last_modified
0,226,EUK-8,Caenorhabditis elegans,N2,0.05 mM,,,-12.90,S,,,Unknown,,,No,12521609,Lifespan assay conducted on solid medium. EUK-...,2024-10-07 15:17:17
1,223,EUK-8,Caenorhabditis elegans,N2,0.5 mM,,,-33.10,S,,,Unknown,,,No,12521609,Lifespan assay conducted on solid medium. EUK-...,2024-10-07 15:17:07
2,1558,Ursolic acid,Caenorhabditis elegans,N2,25 µM,,,32.48,S,,,Unknown,,,No,27773812,The lifespan assay was performed at 20 °C. Wor...,2024-06-20 17:07:48
3,1785,Ursolic acid,Caenorhabditis elegans,N2,25 µM,,,31.30,S,,,Unknown,,,No,28673026,,2024-06-20 17:07:42
4,1976,Tetracycline HCL,Caenorhabditis elegans,N2,100 µM,,,13.70,S,,,Unknown,,,No,33008901,,2024-06-20 17:04:06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1221,Rifampicin,Caenorhabditis elegans,N2,50 µM,,,45.40,S,,,Unknown,,,No,25720500,Treatment started on the fifth day.,2024-06-17 08:57:16
996,1220,Rifampicin,Caenorhabditis elegans,N2,50 µM,,,55.80,S,,,Unknown,,,No,25720500,Value is the average of all the reported lifes...,2024-06-17 08:57:16
997,1219,Rifampicin,Caenorhabditis elegans,N2,20 µM,,,50.80,S,,,Unknown,,,No,25720500,,2024-06-17 08:57:16
998,1218,Rifampicin,Caenorhabditis elegans,N2,10 µM,,,45.30,S,,,Unknown,,,No,25720500,,2024-06-17 08:57:16


Let's take the dose (for a specific organism and compound) that performs the best (highest lifespan extension)

In [24]:
# Rows lacking a compound name or an average lifespan change cannot be ranked,
# so discard them first.
required_columns = ["compound_name", "avg_lifespan_change_percent"]
df = df.dropna(subset=required_columns)

# For every compound, find the index label of the row with the largest
# average lifespan change, then keep just those rows and the columns we need.
best_row_labels = df.groupby('compound_name')['avg_lifespan_change_percent'].idxmax()
kept_columns = ['compound_name', 'dosage', 'species', 'avg_lifespan_change_percent']
df = df.loc[best_row_labels, kept_columns].reset_index(drop=True)

df.head()


Unnamed: 0,compound_name,dosage,species,avg_lifespan_change_percent
0,(Iso)lappaol A,100 µM,Caenorhabditis elegans,11.2
1,(R)-2-hydroxyglutarate,8 mM,Caenorhabditis elegans,42.9
2,"(R,R)-cis-Diethyl tetrahydro-2,8-chrysenediol",33 µM,Caenorhabditis elegans,7.0
3,(S)-2-hydroxyglutarate,8 mM,Caenorhabditis elegans,31.8
4,"1,2,3-Triazolyl ester of ketorolac",50 nM,Caenorhabditis elegans,15.0


We can convert all the compounds tested on this most common species into molecular fingerprints. In addition, let's drop all rows in which either the SMILES or the ECFP is unavailable.

Let's first define a few functions to convert the compounds to molecular fingerprints. We only have to do this for rows that involve our specific species.

In [25]:
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
import ssl

# Disable SSL verification (only if you have certificate issues).
# WARNING: this swaps the ssl module's default HTTPS context factory for the
# unverified one. The assignment is module-level, so it stays in effect for the
# rest of this kernel session and applies to any code that relies on that
# default factory -- server certificates will not be checked. Remove this line
# once the local certificate store is fixed.
ssl._create_default_https_context = ssl._create_unverified_context


def get_smiles_from_pubchem(compound_name, retries=3, delay=1.0):
    """Fetch canonical SMILES for a compound name from PubChem.

    PubChem occasionally answers with a transient status such as
    ``PUGREST.ServerBusy``. Without a retry, such a hiccup makes the caller
    treat the compound as unknown and drop it from the dataset. Transient
    failures are therefore retried with an exponentially growing pause.

    Args:
        compound_name: Name to look up (PubChem 'name' namespace).
        retries: Maximum number of extra attempts after a transient failure.
        delay: Pause in seconds before the first retry; doubled on each
            subsequent retry.

    Returns:
        The canonical SMILES string of the first matching compound, or None
        when there is no match or the lookup ultimately fails. Failures are
        reported with a printed message rather than raised.
    """
    import time  # local import so the function does not depend on another cell

    # Substrings identifying errors worth retrying (server overload / timeout).
    transient_markers = ("ServerBusy", "Timeout")

    for attempt in range(retries + 1):
        try:
            compound = pcp.get_compounds(compound_name, 'name')
            if compound:
                return compound[0].canonical_smiles
            # Empty result: the name is unknown to PubChem; retrying won't help.
            return None
        except Exception as e:
            is_transient = any(marker in str(e) for marker in transient_markers)
            if is_transient and attempt < retries:
                time.sleep(delay * (2 ** attempt))
                continue
            print(f"Error fetching SMILES for {compound_name}: {e}")
            return None
    return None

def smiles_to_ecfp(smiles, radius=2, n_bits=2048):
    """Turn a SMILES string into an ECFP (Morgan) fingerprint.

    Args:
        smiles: SMILES string describing the molecule; may be None or empty.
        radius: Radius passed to the Morgan fingerprint generator.
        n_bits: Length of the fingerprint bit vector.

    Returns:
        The fingerprint as a list of 0/1 bits, or None when the SMILES is
        missing, cannot be parsed, or fingerprint generation raises. Each
        failure case prints a short diagnostic.
    """
    try:
        # Guard: nothing to convert.
        if not smiles:
            print("SMILES is None, skipping...")
            return None

        molecule = Chem.MolFromSmiles(smiles)
        # Guard: the SMILES could not be parsed into a molecule.
        if not molecule:
            print(f"Invalid SMILES: {smiles}")
            return None

        morgan = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
        bit_vector = morgan.GetFingerprint(molecule)
        return list(bit_vector)
    except Exception as e:
        print(f"Error generating ECFP for SMILES {smiles}: {e}")
        return None

# Show (rows, columns) of the per-compound table before fingerprints are added.
df.shape

(688, 4)

In [27]:

# Convert each compound in df to SMILES, then to an ECFP bit list.
# Rows for which no fingerprint can be produced (no SMILES, unparsable SMILES,
# or any unexpected error) are dropped afterwards.
rows_to_drop = []             # index labels of rows without a usable fingerprint
compound_to_fingerprint = {}  # compound_name -> list of 0/1 fingerprint bits

for index, compound_name in df["compound_name"].items():
    ecfp = None
    try:
        smiles = get_smiles_from_pubchem(compound_name)
        if smiles:
            ecfp = smiles_to_ecfp(smiles)
    except Exception as e:
        # Report the actual error; the row is dropped below because ecfp
        # stays None, so it cannot survive with a NaN fingerprint.
        print(f"Error at row {index} (compound: {compound_name}): {e}")

    if ecfp:
        compound_to_fingerprint[compound_name] = ecfp
    else:
        rows_to_drop.append(index)

df = df.drop(index=rows_to_drop).reset_index(drop=True)
df['fingerprint'] = df['compound_name'].map(compound_to_fingerprint)

df.head()



Error fetching SMILES for Cryptotanshinone: 'PUGREST.ServerBusy'


Unnamed: 0,compound_name,dosage,species,avg_lifespan_change_percent,fingerprint
0,(R)-2-hydroxyglutarate,8 mM,Caenorhabditis elegans,42.9,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"(R,R)-cis-Diethyl tetrahydro-2,8-chrysenediol",33 µM,Caenorhabditis elegans,7.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,(S)-2-hydroxyglutarate,8 mM,Caenorhabditis elegans,31.8,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"1,2,4-triazolo[1,5-a]pyridine",500 nM,Caenorhabditis elegans,12.0,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4,"1,7-dimethylxanthine",50 µg/mL,Caenorhabditis elegans,14.75,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [32]:
# Keep only the modelling columns, with the target column placed last.
model_columns = ['compound_name', 'dosage', 'fingerprint', 'avg_lifespan_change_percent']
df = df.loc[:, model_columns]

df.head(5)

Unnamed: 0,compound_name,dosage,fingerprint,avg_lifespan_change_percent
0,(R)-2-hydroxyglutarate,8 mM,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",42.9
1,"(R,R)-cis-Diethyl tetrahydro-2,8-chrysenediol",33 µM,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7.0
2,(S)-2-hydroxyglutarate,8 mM,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",31.8
3,"1,2,4-triazolo[1,5-a]pyridine",500 nM,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",12.0
4,"1,7-dimethylxanthine",50 µg/mL,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",14.75
