In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from chemprop.cli import train, predict
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [3]:
# Load the dataset
df = pd.read_csv('../dataset.csv')

# Filter for Caenorhabditis elegans species
df = df[df["species"] == "Caenorhabditis elegans"].reset_index(drop=True)

# Drop rows with missing target values
df = df.dropna(subset=['avg_lifespan_change_percent'])

In [4]:
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
import ssl

# Disable SSL verification (only if you have certificate issues)
ssl._create_default_https_context = ssl._create_unverified_context

def get_smiles_from_pubchem(compound_name):
    """Fetch canonical SMILES for a compound name from PubChem."""
    try:
        compound = pcp.get_compounds(compound_name, 'name')
        if compound:
            return compound[0].canonical_smiles
    except Exception as e:
        print(f"Error fetching SMILES for {compound_name}: {e}")
    return None

# Add SMILES to the DataFrame
df['smiles'] = df['compound_name'].apply(get_smiles_from_pubchem)
df = df.dropna(subset=['smiles'])  # Drop rows where SMILES is not available

Error fetching SMILES for Metformin: 'PUGREST.ServerBusy'


In [39]:
import sys
from chemprop.cli.main import main
import pandas as pd

# Prepare your test data
test_data = df[['smiles']]
test_data.to_csv('test_data.csv', index=False)

# Simulate command-line arguments by overriding sys.argv
sys.argv = [
    'chemprop', 'predict',
    '--test_path', 'test_data.csv',
    '--checkpoint_dir', 'chemprop_model',
    '--preds_path', 'predictions.csv'
]

# Call the main function
main()

# # Load the predictions
# predictions = pd.read_csv('predictions.csv')
# y_pred_chemprop = predictions['target']


usage: chemprop predict [-h] [--logfile [LOGFILE]] [-v] [-q]
                        [-s SMILES_COLUMNS [SMILES_COLUMNS ...]]
                        [-r REACTION_COLUMNS [REACTION_COLUMNS ...]]
                        [--no-header-row] [-n NUM_WORKERS] [-b BATCH_SIZE]
                        [--accelerator ACCELERATOR] [--devices DEVICES]
                        [--rxn-mode {REAC_PROD,REAC_PROD_BALANCE,REAC_DIFF,REAC_DIFF_BALANCE,PROD_DIFF,PROD_DIFF_BALANCE}]
                        [--multi-hot-atom-featurizer-mode {V1,V2,ORGANIC,RIGR}]
                        [--keep-h] [--add-h]
                        [--molecule-featurizers {morgan_binary,morgan_count,rdkit_2d,v1_rdkit_2d,v1_rdkit_2d_normalized,charge} [{morgan_binary,morgan_count,rdkit_2d,v1_rdkit_2d,v1_rdkit_2d_normalized,charge} ...]]
                        [--descriptors-path DESCRIPTORS_PATH]
                        [--no-descriptor-scaling] [--no-atom-feature-scaling]
                        [--no-atom-descriptor-scaling]


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [2]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Use the same split as before for consistency
X = df[['smiles']]
y = df['avg_lifespan_change_percent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate R² score
r2_chemprop = r2_score(y_test, y_pred_chemprop)
print(f"Chemprop R² Score: {r2_chemprop:.3f}")

# Plot the results
plt.figure(figsize=(6, 6))
sns.regplot(x=y_test, y=y_pred_chemprop, scatter_kws={'s': 10}, line_kws={'color': 'red'})
plt.xlabel("Actual Lifespan Change (%)")
plt.ylabel("Predicted Lifespan Change (%)")
plt.title("Test Set Correlation: Actual vs. Predicted Lifespan Change (Chemprop)")
plt.show()

NameError: name 'df' is not defined

In [None]:
# Original Random Forest R² Score
print(f"Random Forest R² Score: {r2:.3f}")

# Chemprop R² Score
print(f"Chemprop R² Score: {r2_chemprop:.3f}")