In [25]:
import pandas as pd 
import numpy as np
import pubchempy as pcp
import ssl
import re 

from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.datasets import make_regression

In [26]:
ssl._create_default_https_context = ssl._create_unverified_context

# Define Helper Functions
We define two helper functions:
1. `get_smiles_from_pubchem(compound_name)`: Fetches the **SMILES representation** of a compound from PubChem.
2. `smiles_to_ecfp(smiles)`: Converts a **SMILES string** into an **ECFP (Extended-Connectivity Fingerprint)**.
These functions will help us convert chemical names into numerical representations suitable for machine learning.
3. `extract_numeric_value(dosage_str)`: Extracts a numeric value from a dosage string.


In [27]:
def get_smiles_from_pubchem(compound_name):
    """Fetch canonical SMILES for a compound name from PubChem."""
    try:
        compound = pcp.get_compounds(compound_name, 'name')
        if compound:
            return compound[0].canonical_smiles
    except Exception as e:
        print(f"Error fetching SMILES for {compound_name}: {e}")
    return None

def smiles_to_ecfp(smiles, radius=2, n_bits=2048):
    """Generate ECFP (Morgan) fingerprint as a list of bits (0/1) from a SMILES string."""
    try:
        if smiles:
            mol = Chem.MolFromSmiles(smiles)
            if mol:
                generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
                fp = generator.GetFingerprint(mol)
                return list(fp)
            else:
                print(f"Invalid SMILES: {smiles}")
        else:
            print("SMILES is None, skipping...")
    except Exception as e:
        print(f"Error generating ECFP for SMILES {smiles}: {e}")
    return None

def get_dosage_value(dosage_str):
    """Extract numeric value from dosage string"""
    if pd.isna(dosage_str):
        return None
    match = re.search(r'([\d.]+)', str(dosage_str))
    return float(match.group(1)) if match else None

# something I could try is to make unitl death like 80 years old or something arbiturarily 
def get_treatment_duration(treatment_duration):
    # this column is a little bit messy because it has many 
    # values of "until death" which cannot be quantified numerically
    """Extract numeric value from treatment duration string"""
    if pd.isna(treatment_duration):
        return None
    match = re.search(r'(\d+)', str(treatment_duration))
    return int(match.group(1)) if match else None #im also returning None on cases "until death"

''' only about a quarter of hte data has this column filled'''
# def get_age_at_start(age_at_initiation): 
#     # this column is mesaured in MONTHS ! 
#     """Extract numeric value from age at initiation string"""
#     if pd.isna(age_at_initiation):
#         return None
#     match = re.search(r'(\d+)', str(age_at_initiation))
#     return int(match.group(1)) if match else None

def get_gender(gender_new):
    # this column is a little bit messy because it has many 
    # values of "until death" which cannot be quantified numerically
    """Extract numeric value from gender string"""
    if pd.isna(gender_new):
        return None
    return 1 if str(gender_new).lower() == 'male' else 0 if str(gender_new).lower() == 'female' else -1
    


# Load Dataset
We load the dataset containing information about different chemical compounds.
We'll take a look at the first few rows to understand its structure.

In [28]:
# Load the dataset
df = pd.read_csv('dataset.csv')

# Show the first 5 rows
df.head()

Unnamed: 0,id,compound_name,species,strain,dosage,age_at_initiation,treatment_duration,avg_lifespan_change_percent,avg_lifespan_significance,max_lifespan_change_percent,max_lifespan_significance,gender_new,weight_change_percent,weight_change_significance,ITP,pubmed_id,notes,last_modified
0,1731,Glucomannan Hydrolysate,Drosophila melanogaster,B18,0.25% w/v,,,20.2,S,,,Male,,,No,30252027,,2024-10-12 09:39:44
1,1732,Glucomannan Hydrolysate,Drosophila melanogaster,B18,0.25% w/v,,,14.88,S,,,Female,,,No,30252027,,2024-10-12 09:39:34
2,1734,Glucomannan Hydrolysate,Drosophila melanogaster,Canton-S,0.25% w/v,,,17.66,S,,,Female,,,No,30252027,,2024-10-12 09:39:22
3,1733,Glucomannan Hydrolysate,Drosophila melanogaster,Oregon-R,0.25% w/v,,,11.7,S,,,Female,,,No,30252027,,2024-10-12 09:39:06
4,1735,Glucomannan Hydrolysate,Drosophila melanogaster,DGRP-21,0.25% w/v,,,13.4,S,,,Female,,,No,30252027,,2024-10-12 09:38:47


# Convert Compound Names to SMILES and Generate ECFP
We iterate through each compound name in our dataset, retrieve its SMILES notation using PubChem, and convert it into an ECFP fingerprint using RDKit. In addition we also fill in the other columns with additional features. We will drop the target column in the later cells. Here, we focus on inserting the numeric representation of additional feature data. 

In [None]:
# Take a small subset of 2000 rows for testing
df = df.copy()

# Initialize new columns for SMILES and ECFP fingerprints
df['smiles'] = None
df['ecfp'] = None

# Convert compound names to SMILES and ECFP fingerprints
for idx, row in df.iterrows():
    compound_name = row['compound_name']
    smiles = get_smiles_from_pubchem(compound_name)
    df.at[idx, 'smiles'] = smiles
    
    if smiles:
        ecfp_bits = smiles_to_ecfp(smiles)
        df.at[idx, 'ecfp'] = ecfp_bits

    df.at[idx, 'dosage'] = get_dosage_value(row['dosage'])
    df.at[idx, 'gender_new'] = get_gender(row['gender_new'])
    df.at[idx, 'treatment_duration'] = get_treatment_duration(row['treatment_duration'])

        
# Show a preview of the updated data
df.head()

Error fetching SMILES for Glucomannan Hydrolysate: 'PUGREST.ServerBusy'


# Prepare Features (X) and Target (y)
We extract:
- **Features (`X`)**: The 2048-bit ECFP fingerprint for each compound.
- **Target (`y`)**: The predicted lifespan change percentage.


In [31]:
feature_columns = ['ecfp', 'dosage', 'gender_new', 'treatment_duration']
target_column = 'avg_lifespan_change_percent'

X = df[feature_columns].to_numpy() 
y = df[target_column].to_numpy()

# Print dataset shapes
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (3423, 4)
y shape: (3423,)


# Train & Evaluate Random Forest Model
We train a **Random Forest Regressor** using **2-fold cross-validation**.
We evaluate the model using **Mean Absolute Error (MAE)**.


In [30]:
# Define model
model = RandomForestRegressor(n_estimators=10, random_state=42)

# Perform cross-validation, train on 2/3 and test on 1/3
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# Cross-validation with negative mean absolute error
scores_mae = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_absolute_error')

# Convert negative values to positive
mae_scores = -scores_mae

# Print evaluation results
print("MAE (CV folds):", mae_scores)
print("Mean MAE:", mae_scores.mean())


MAE (CV folds): [14.53814832 15.93296255 14.168551  ]
Mean MAE: 14.879887290498393


Let's test train a Gradinet Boosting model instead 

In [None]:
# Define the Gradient Boosting model
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Perform cross-validation
scores_gbr = cross_val_score(gbr_model, X, y, cv=kfold, scoring='neg_mean_absolute_error')

# Convert negative scores to positive
mae_scores_gbr = -scores_gbr

# Print results
print("Gradient Boosting MAE (K-Fold 2/3 Train, 1/3 Test):", mae_scores_gbr)
print("Mean MAE:", mae_scores_gbr.mean())