In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pandas rdkit-pypi scikit-learn

Collecting rdkit-pypi
  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2022.9.5


In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [4]:
file_path = '/content/drive/My Drive/bioML/acetylcholinesterase2.csv'  # Update with your file path
df2 = pd.read_csv(file_path)


In [5]:
df2

Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,CHEMBL2311603,Cl.NC1CC[Si]2(CCCCC2)CC1,inactive,183.371,2.7402,1.0,1.0,4.504456
1,CHEMBL2311602,Cl.NC1CC[Si]2(CCCC2)CC1,inactive,169.344,2.3501,1.0,1.0,4.016374
2,CHEMBL2041317,Cl.NC1CCC2(CCCCC2)CC1,inactive,167.296,2.8382,1.0,1.0,4.071092
3,CHEMBL2311604,Cl.NC1CCC2(CCCC2)CC1,inactive,153.269,2.4481,1.0,1.0,3.496754
4,CHEMBL2324172,Oc1ccc(CNC23CC4CC(CC(C4)C2)C3)c(O)c1,inactive,273.376,3.1562,3.0,3.0,4.453457
5,CHEMBL2324191,Oc1ccc(CNC23CC4CC(CC(C4)C2)C3)cc1,inactive,257.377,3.4506,2.0,2.0,3.779892
6,CHEMBL959,CC(N)C12CC3CC(CC(C3)C1)C2,inactive,179.307,2.55,1.0,1.0,4.966576
7,CHEMBL660,NC12CC3CC(CC(C3)C1)C2,inactive,151.253,1.9139,1.0,1.0,3.30103
8,CHEMBL3088159,NC1CCC2(CC1)C1CC3CC(C1)CC2C3,inactive,219.372,3.3302,1.0,1.0,4.0
9,CHEMBL3088160,c1csc(-c2cc(CNC34CC5CC(CC(C5)C3)C4)no2)c1,inactive,314.454,4.4615,1.0,4.0,4.0


In [6]:
# Drop any rows with NaN values
df2.dropna(inplace=True)

# Function to calculate molecular descriptors
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * 4  # Return None if the molecule is invalid
    return [
        Descriptors.MolWt(mol),              # Molecular Weight
        Descriptors.MolLogP(mol),            # LogP
        Descriptors.NumHDonors(mol),         # Number of Hydrogen Donors
        Descriptors.NumHAcceptors(mol),      # Number of Hydrogen Acceptors
    ]

# Apply the descriptor calculation to the 'canonical_smiles' column
descriptors = df2['canonical_smiles'].apply(calculate_descriptors)

# Create a DataFrame for descriptors
descriptor_df = pd.DataFrame(descriptors.tolist(), columns=['MW', 'LogP', 'HDonors', 'HAcceptors'])

# Combine the descriptors with the original DataFrame
df = pd.concat([df2.reset_index(drop=True), descriptor_df], axis=1)

# Select features and target
X = df[['MW', 'LogP', 'HDonors', 'HAcceptors']]
y = df['pIC50']  # Assuming you're predicting pIC50

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse:.2f}')

Root Mean Squared Error: 0.41


In [7]:
predictions = pd.DataFrame({
    'SMILES': df2['canonical_smiles'].iloc[X_test.index].reset_index(drop=True),  # Original SMILES from the test set
    'Predicted pIC50': y_pred  # Predictions made by the model
})

# Display the predictions DataFrame
print(predictions)

                                      SMILES  Predicted pIC50
0       N=C(N)N1CCC(C=C2C3CC4CC(C3)CC2C4)CC1         5.268790
1            Cl.N=C(N)N1CC2C3CCC(C4CC34)C2C1         5.468847
2            Cl.NC1CCC2(CC1)C1CC3CC(C1)CC2C3         4.387015
3                   C1CC2C1C1CCC2C2CNCC12.Cl         5.317896
4               NC1CCC2(CC1)C1CC3CC(C1)CC2C3         4.515401
5  c1csc(-c2cc(CNC34CC5CC(CC(C5)C3)C4)no2)c1         4.356819
6             CCC(N)(CC)C12CC3CC(CC(C3)C1)C2         4.431682


In [8]:
predictions.to_csv('predicted_smiles.csv', index=False)

In [9]:
file_path = '/content/drive/My Drive/bioML/predicted_smiles2.csv'  # Change the path as necessary

# Move the file to Google Drive
!cp predicted_smiles.csv "{file_path}"