In [2]:
import pandas as pd 

In [10]:
df = pd.read_csv('dataset.csv')

species_counts = df["species"].value_counts()
print(species_counts.head(3))

species
Caenorhabditis elegans     1559
Drosophila melanogaster     926
Mus musculus                277
Name: count, dtype: int64


Let's isolate our dataset to only rows that with the Canorhabiditis elegans species, since it has the most data points 

In [21]:
df = df[df["species"] == "Caenorhabditis elegans"].reset_index(drop=True)
df.head(5) 

Unnamed: 0,id,compound_name,species,strain,dosage,age_at_initiation,treatment_duration,avg_lifespan_change_percent,avg_lifespan_significance,max_lifespan_change_percent,max_lifespan_significance,gender_new,weight_change_percent,weight_change_significance,ITP,pubmed_id,notes,last_modified
0,226,EUK-8,Caenorhabditis elegans,N2,0.05 mM,,,-12.9,S,,,Unknown,,,No,12521609,Lifespan assay conducted on solid medium. EUK-...,2024-10-07 15:17:17
1,223,EUK-8,Caenorhabditis elegans,N2,0.5 mM,,,-33.1,S,,,Unknown,,,No,12521609,Lifespan assay conducted on solid medium. EUK-...,2024-10-07 15:17:07
2,1558,Ursolic acid,Caenorhabditis elegans,N2,25 µM,,,32.48,S,,,Unknown,,,No,27773812,The lifespan assay was performed at 20 °C. Wor...,2024-06-20 17:07:48
3,1785,Ursolic acid,Caenorhabditis elegans,N2,25 µM,,,31.3,S,,,Unknown,,,No,28673026,,2024-06-20 17:07:42
4,1976,Tetracycline HCL,Caenorhabditis elegans,N2,100 µM,,,13.7,S,,,Unknown,,,No,33008901,,2024-06-20 17:04:06


We can now group by strain, then aggregate the data in each of these groups 

In [33]:
grouped_strain = df.groupby('strain').agg(avg_lifespan_change_percent = ('avg_lifespan_change_percent', 'mean'),
                                   avg_lifespan_change_percent_std = ('avg_lifespan_change_percent', 'std'),
                                   avg_lifespan_change_percent_sem = ('avg_lifespan_change_percent', 'sem'),
                                   avg_lifespan_change_percent_median = ('avg_lifespan_change_percent', 'median'),
                                   avg_lifespan_change_percent_min = ('avg_lifespan_change_percent', 'min'),
                                   avg_lifespan_change_percent_max = ('avg_lifespan_change_percent', 'max')).reset_index()

grouped_strain.head()

Unnamed: 0,strain,avg_lifespan_change_percent,avg_lifespan_change_percent_std,avg_lifespan_change_percent_sem,avg_lifespan_change_percent_median,avg_lifespan_change_percent_min,avg_lifespan_change_percent_max
0,CB5586,7.5,3.535534,2.5,7.5,5.0,10.0
1,CF512,12.706667,3.799596,2.193698,14.24,8.38,15.5
2,JK1107,0.1,,,0.1,0.1,0.1
3,JK1107 (glp-1),11.0,,,11.0,11.0,11.0
4,JU775,31.073333,6.547857,3.780407,34.4,23.53,35.29


Let's now group by dosage, then aggregate the data in each of these groups. For now, we will have this just for observation. Since we are setting this up for 

In [32]:
grouped_dosage = df.groupby('dosage').agg(avg_lifespan_change_percent = ('avg_lifespan_change_percent', 'mean'),
                                   avg_lifespan_change_percent_std = ('avg_lifespan_change_percent', 'std'),
                                   avg_lifespan_change_percent_sem = ('avg_lifespan_change_percent', 'sem'),
                                   avg_lifespan_change_percent_median = ('avg_lifespan_change_percent', 'median'),
                                   avg_lifespan_change_percent_min = ('avg_lifespan_change_percent', 'min'),
                                   avg_lifespan_change_percent_max = ('avg_lifespan_change_percent', 'max'),).reset_index()

grouped_dosage.head()

Unnamed: 0,dosage,avg_lifespan_change_percent,avg_lifespan_change_percent_std,avg_lifespan_change_percent_sem,avg_lifespan_change_percent_median,avg_lifespan_change_percent_min,avg_lifespan_change_percent_max
0,0.0001,41.0,,,41.0,41.0,41.0
1,0.001,36.7,,,36.7,36.7,36.7
2,0.0016 mM,9.09,,,9.09,9.09,9.09
3,0.005,29.0,,,29.0,29.0,29.0
4,0.0075,33.0,,,33.0,33.0,33.0


Now, lets define a few functions to convert the compounds to molecular fingerprints. We only have to do this for rows taht involve our specific species. 

In [34]:
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
import ssl

# Disable SSL verification (only if you have certificate issues)
ssl._create_default_https_context = ssl._create_unverified_context


def get_smiles_from_pubchem(compound_name):
    """Fetch canonical SMILES for a compound name from PubChem."""
    try:
        compound = pcp.get_compounds(compound_name, 'name')
        if compound:
            return compound[0].canonical_smiles
    except Exception as e:
        print(f"Error fetching SMILES for {compound_name}: {e}")
    return None 

def smiles_to_ecfp(smiles, radius=2, n_bits=2048):
    """Generate ECFP (Morgan) fingerprint as a list of bits (0/1) from a SMILES string."""
    try:
        if smiles:
            mol = Chem.MolFromSmiles(smiles)
            if mol:
                generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
                fp = generator.GetFingerprint(mol)
                return list(fp)
            else:
                print(f"Invalid SMILES: {smiles}")
        else:
            print("SMILES is None, skipping...")
    except Exception as e:
        print(f"Error generating ECFP for SMILES {smiles}: {e}")
    return None  

In [35]:
df.head() 

Unnamed: 0,id,compound_name,species,strain,dosage,age_at_initiation,treatment_duration,avg_lifespan_change_percent,avg_lifespan_significance,max_lifespan_change_percent,max_lifespan_significance,gender_new,weight_change_percent,weight_change_significance,ITP,pubmed_id,notes,last_modified
0,226,EUK-8,Caenorhabditis elegans,N2,0.05 mM,,,-12.9,S,,,Unknown,,,No,12521609,Lifespan assay conducted on solid medium. EUK-...,2024-10-07 15:17:17
1,223,EUK-8,Caenorhabditis elegans,N2,0.5 mM,,,-33.1,S,,,Unknown,,,No,12521609,Lifespan assay conducted on solid medium. EUK-...,2024-10-07 15:17:07
2,1558,Ursolic acid,Caenorhabditis elegans,N2,25 µM,,,32.48,S,,,Unknown,,,No,27773812,The lifespan assay was performed at 20 °C. Wor...,2024-06-20 17:07:48
3,1785,Ursolic acid,Caenorhabditis elegans,N2,25 µM,,,31.3,S,,,Unknown,,,No,28673026,,2024-06-20 17:07:42
4,1976,Tetracycline HCL,Caenorhabditis elegans,N2,100 µM,,,13.7,S,,,Unknown,,,No,33008901,,2024-06-20 17:04:06


Now, we can convert all the compounds being tested on this most common species into molecular fingerprints

In [36]:
# First get unique compounds and their fingerprints
unique_compounds = df['compound_name'].unique()
compound_to_fingerprint = {}

print("Converting compounds to fingerprints...")
for compound in unique_compounds:
    print(f"Processing {compound}")
    smiles = get_smiles_from_pubchem(compound)
    if smiles:
        fingerprint = smiles_to_ecfp(smiles)
        if fingerprint:
            compound_to_fingerprint[compound] = fingerprint
            print(f"Successfully generated fingerprint for {compound}")
        else:
            print(f"Failed to generate fingerprint for {compound}")
    else:
        print(f"Failed to get SMILES for {compound}")

# Create a new dataframe with compound fingerprints
fingerprint_df = pd.DataFrame.from_dict(compound_to_fingerprint, orient='index')
fingerprint_df.index.name = 'compound_name'
fingerprint_df = fingerprint_df.reset_index()

# Now merge the fingerprints back with the original dataframe
df_with_fingerprints = df.merge(fingerprint_df, on='compound_name', how='left')

# Show the first few rows of the merged dataframe
print("\nShape of dataframe with fingerprints:", df_with_fingerprints.shape)
df_with_fingerprints.head()

Converting compounds to fingerprints...
Processing EUK-8
Successfully generated fingerprint for EUK-8
Processing Ursolic  acid
Failed to get SMILES for Ursolic  acid
Processing Tetracycline HCL
Successfully generated fingerprint for Tetracycline HCL
Processing Sesame extract
Failed to get SMILES for Sesame extract
Processing Royal Jelly
Failed to get SMILES for Royal Jelly
Processing Cannabidiol
Successfully generated fingerprint for Cannabidiol
Processing Berberine
Successfully generated fingerprint for Berberine
Processing Astragalus extract
Failed to get SMILES for Astragalus extract
Processing Rhodiola rosea extract
Failed to get SMILES for Rhodiola rosea extract
Processing Apple extract
Failed to get SMILES for Apple extract
Processing Blueberry extract
Failed to get SMILES for Blueberry extract
Processing Cardamonin
Successfully generated fingerprint for Cardamonin
Processing Alpha-ketoglutarate
Successfully generated fingerprint for Alpha-ketoglutarate
Processing Inositol
Succes

Unnamed: 0,id,compound_name,species,strain,dosage,age_at_initiation,treatment_duration,avg_lifespan_change_percent,avg_lifespan_significance,max_lifespan_change_percent,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,226,EUK-8,Caenorhabditis elegans,N2,0.05 mM,,,-12.9,S,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,223,EUK-8,Caenorhabditis elegans,N2,0.5 mM,,,-33.1,S,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1558,Ursolic acid,Caenorhabditis elegans,N2,25 µM,,,32.48,S,,...,,,,,,,,,,
3,1785,Ursolic acid,Caenorhabditis elegans,N2,25 µM,,,31.3,S,,...,,,,,,,,,,
4,1976,Tetracycline HCL,Caenorhabditis elegans,N2,100 µM,,,13.7,S,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We should probably drop the columns with no ECFP available

Now, with our ECFPs, aggregated features, and grouped species, we can construct a new dataframe in which we can start training and testing our model on 