In [2]:
import pandas as pd
from rdkit import RDLogger
import warnings
import subprocess
import os
import tempfile

# Suppress RDKit logging
RDLogger.DisableLog('rdApp.*')
warnings.filterwarnings('ignore')

# Create a small dataset with a single SMILES
smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"  # This is aspirin as an example
df = pd.DataFrame({'SMILES': [smiles]})

# Specify the path to the trained model
model_path = 'chemprop_model_pca.pt'  # Update this path to your model file

# Create a temporary file for input CSV
with tempfile.NamedTemporaryFile(delete=False, mode='w', newline='') as input_tempfile:
    df.to_csv(input_tempfile, index=False)
    input_path = input_tempfile.name

# Run chemprop predict using subprocess and capture the output
command = f"chemprop_predict --test_path {input_path} --checkpoint_path {model_path} --preds_path -"

# Run the command and capture the output directly
result = subprocess.run(command, shell=True, capture_output=True, text=True)

# Clean up the temporary input file
os.remove(input_path)

# Check for errors during prediction
if result.returncode != 0:
    print("Error during prediction:", result.stderr)
else:
    # Print the raw output to examine it
    print("Raw output from chemprop_predict:")
    print(result.stdout)  # This will show the actual content returned by the command

    try:
        # Attempt to parse the output as JSON
        predictions = pd.read_csv(pd.compat.StringIO(result.stdout))  # This assumes CSV output

        # Display results
        print("\nResults:")
        print(predictions)
        print(predictions['pIC50'][0])  # Assuming 'pIC50' is the prediction column
    except Exception as e:
        print("Error parsing prediction output:", e)
        print("The output is not in expected format.")




Raw output from chemprop_predict:
Loading training args
Setting molecule featurization parameters to default.
Loading data
Validating SMILES
Test size = 1
Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".
Saving predictions to -
Elapsed time = 0:00:05

Error parsing prediction output: module 'pandas.compat' has no attribute 'StringIO'
The output is not in expected format.


In [3]:
import pandas as pd
from rdkit import RDLogger
import warnings
import subprocess
import io

# Suppress RDKit logging
RDLogger.DisableLog('rdApp.*')
warnings.filterwarnings('ignore')

# Create a small dataset with single SMILES
smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"  # This is aspirin as an example
df = pd.DataFrame({'SMILES': [smiles]})

# Specify the path to the trained model
model_path = 'chemprop_model_pca.pt'  # Update this path to your model file

# Prepare the input for ChemProp predict
input_csv = 'input.csv'
df.to_csv(input_csv, index=False)

# Run chemprop predict using subprocess and capture the output
command = f"chemprop_predict --test_path {input_csv} --checkpoint_path {model_path} --preds_path predictions.csv"
result = subprocess.run(command, shell=True, capture_output=True, text=True)

# Check for errors during prediction
if result.returncode != 0:
    print("Error during prediction:", result.stderr)
else:
    # Read predictions directly from the output (using the predictions.csv file created)
    with open('predictions.csv', 'r') as f:
        predictions = pd.read_csv(f)
    
    # Display results
    print("\nResults:")
    print(predictions)
    print(predictions['pIC50'][0])  # Assuming 'pIC50' is the prediction column



Results:
                     SMILES     pIC50
0  CC(=O)OC1=CC=CC=C1C(=O)O  5.999985
5.999985482158474


In [10]:
import pandas as pd
from rdkit import RDLogger
import warnings
import os

# Suppress RDKit logging
RDLogger.DisableLog('rdApp.*')
warnings.filterwarnings('ignore')

# Create a small dataset with single SMILES
smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"  # This is aspirin as an example
df = pd.DataFrame({'SMILES': [smiles]})
df.to_csv('input.csv', index=False)

# Specify the path to the trained model
model_path = 'chemprop_model_pca.pt'  # Update this path to your model file
 
# Run chemprop predict using command line
os.system(f"chemprop_predict --test_path {smiles} --checkpoint_path {model_path} --preds_path {pred}")
print("predic",pred)
# Read and display results
if os.path.exists('chemprop_predictions.csv'):
    predictions = pd.read_csv('chemprop_predictions.csv')
    print("\nResults:")
    print(predictions)
    print(predictions['pIC50'][0])
else:
    print("Prediction failed to generate output file")


NameError: name 'pred' is not defined

In [2]:
import torch
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator

# Define the same descriptors used during training
required_descriptors = [
    'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v',
    'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'EState_VSA1', 'EState_VSA10', 'EState_VSA11', 'EState_VSA2',
    'EState_VSA3', 'EState_VSA4', 'EState_VSA5', 'EState_VSA6', 'EState_VSA7', 'EState_VSA8', 'EState_VSA9',
    'ExactMolWt', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'FractionCSP3', 'HallKierAlpha',
    'HeavyAtomCount', 'HeavyAtomMolWt', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'MaxAbsEStateIndex',
    'MaxAbsPartialCharge', 'MaxEStateIndex', 'MaxPartialCharge', 'MinAbsEStateIndex', 'MinAbsPartialCharge',
    'MinEStateIndex', 'MinPartialCharge', 'MolLogP', 'MolMR', 'MolWt', 'NHOHCount', 'NOCount', 'NumAliphaticCarbocycles',
    'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', 'NumAromaticHeterocycles',
    'NumAromaticRings', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms', 'NumRadicalElectrons',
    'NumRotatableBonds', 'NumSaturatedCarbocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings',
    'NumValenceElectrons', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14',
    'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9',
    'RingCount', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7',
    'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'SlogP_VSA12', 'SlogP_VSA2', 'SlogP_VSA3',
    'SlogP_VSA4', 'SlogP_VSA5', 'SlogP_VSA6', 'SlogP_VSA7', 'SlogP_VSA8', 'SlogP_VSA9', 'TPSA', 'VSA_EState1',
    'VSA_EState10', 'VSA_EState2', 'VSA_EState3', 'VSA_EState4', 'VSA_EState5', 'VSA_EState6', 'VSA_EState7',
    'VSA_EState8', 'VSA_EState9', 'fr_Al_COO', 'fr_Al_OH', 'fr_Al_OH_noTert', 'fr_ArN', 'fr_Ar_COO', 'fr_Ar_N',
    'fr_Ar_NH', 'fr_Ar_OH', 'fr_COO', 'fr_COO2', 'fr_C_O', 'fr_C_O_noCOO', 'fr_C_S', 'fr_HOCCN', 'fr_Imine',
    'fr_NH0', 'fr_NH1', 'fr_NH2', 'fr_N_O', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_Nhpyrrole', 'fr_SH',
    'fr_aldehyde', 'fr_alkyl_carbamate', 'fr_alkyl_halide', 'fr_allylic_oxid', 'fr_amide', 'fr_amidine', 'fr_aniline',
    'fr_aryl_methyl', 'fr_azide', 'fr_azo', 'fr_barbitur', 'fr_benzene', 'fr_benzodiazepine', 'fr_bicyclic', 'fr_diazo',
    'fr_dihydropyridine', 'fr_epoxide', 'fr_ester', 'fr_ether', 'fr_furan', 'fr_guanido', 'fr_halogen', 'fr_hdrzine',
    'fr_hdrzone', 'fr_imidazole', 'fr_imide', 'fr_isocyan', 'fr_isothiocyan', 'fr_ketone', 'fr_ketone_Topliss',
    'fr_lactam', 'fr_lactone', 'fr_methoxy', 'fr_morpholine', 'fr_nitrile', 'fr_nitro', 'fr_nitro_arom',
    'fr_nitro_arom_nonortho', 'fr_nitroso', 'fr_oxazole', 'fr_oxime', 'fr_para_hydroxylation', 'fr_phenol',
    'fr_phenol_noOrthoHbond', 'fr_phos_acid', 'fr_phos_ester', 'fr_piperdine', 'fr_piperzine', 'fr_priamide',
    'fr_prisulfonamd', 'fr_pyridine', 'fr_quatN', 'fr_sulfide', 'fr_sulfonamd', 'fr_sulfone', 'fr_term_acetylene',
    'fr_tetrazole', 'fr_thiazole', 'fr_thiocyan', 'fr_thiophene', 'fr_unbrch_alkane', 'fr_urea', 'qed'
]

# Create a descriptor calculator
descriptor_calculator = MolecularDescriptorCalculator(required_descriptors)

# Function to calculate descriptors for a single SMILES
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES: {smiles}")
    return np.array(descriptor_calculator.CalcDescriptors(mol))

# Load the saved PyTorch model
model = YourModelClass(*args)  # Pass any required arguments for initialization

# Step 2: Load the state_dict from the saved dictionary
checkpoint = torch.load('chemprop_model_pca.pt', map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['state_dict'])  # Adjust the key if needed

# Step 3: Set the model to evaluation mode
model.eval()

# Predict function
def predict_single_smiles(smiles):
    try:
        # Calculate descriptors
        descriptors = calculate_descriptors(smiles)
        # Convert to tensor and add batch dimension
        input_tensor = torch.tensor(descriptors, dtype=torch.float32).unsqueeze(0)
        # Predict
        with torch.no_grad():
            prediction = model(input_tensor)
        return prediction.item()  # Convert tensor to scalar
    except Exception as e:
        print(f"Error during prediction: {e}")
        return None

# Example usage
smiles_example = "CCO"  # Replace with your SMILES string
predicted_pIC50 = predict_single_smiles(smiles_example)
print(f"Predicted pIC50 for SMILES '{smiles_example}': {predicted_pIC50}")


NameError: name 'YourModelClass' is not defined