In [None]:
# Install the ChEMBL web-service client and tqdm with pip (-q: quiet output).
!pip install -q chembl_webresource_client tqdm
# Install RDKit from the conda-forge channel with mamba (-y: skip the confirmation prompt).
!mamba install -c conda-forge rdkit -y


Looking for: ['rdkit']

conda-forge/linux-64                                        Using cache
[?25l[2K[0G[+] 0.0s
[2K[1A[2K[0Gconda-forge/noarch                                            No change
[?25h
Pinned packages:
  - python 3.11.*
  - python 3.11.*
  - python_abi 3.11.* *cp311*
  - cuda-version 12.*


Transaction

  Prefix: /usr/local

  All requested packages already installed

[?25l[2K[0G[?25h

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors

In [None]:
# Handles for the ChEMBL "target" and "activity" endpoints of the web-service client.
target = new_client.target
activity = new_client.activity

# Search targets for "ASK1" and keep the first hit.
# NOTE(review): the first search result is assumed to be the intended target —
# confirm the printed ChEMBL ID before relying on the downloaded data.
ask1_target = target.search("ASK1")[0]
target_chembl_id = ask1_target['target_chembl_id']
print(f"Target ChEMBL ID: {target_chembl_id}")

# Fetch the IC50 activity records for that target and load them into a DataFrame
# (one row per activity record).
activities = activity.filter(target_chembl_id=target_chembl_id, standard_type="IC50")
df = pd.DataFrame(activities)
print(f"Total raw activities: {len(df)}")


Target ChEMBL ID: CHEMBL5285
Total raw activities: 2485


In [None]:
# Curate the raw activity table: keep the modelling columns, coerce the potency
# to a number, convert it to pIC50 and derive a binary activity label.

# The pIC50 conversion below assumes standard_value is expressed in nM, so
# records reported in any other unit must not be converted with the nM factor.
# The column check keeps the cell re-runnable on a frame from which the units
# column has already been dropped.
if 'standard_units' in df.columns:
    df = df[df['standard_units'] == 'nM']

# .copy() yields an independent frame, so the column assignments below never
# write into a slice of the raw table.
df = df[['molecule_chembl_id', 'canonical_smiles', 'standard_value']].dropna().copy()

# Convert standard_value to numeric; unparseable entries become NaN
df['standard_value'] = pd.to_numeric(df['standard_value'], errors='coerce')

# Drop unparseable and non-positive values (log10 is undefined for them)
df = df.dropna(subset=['standard_value'])
df = df[df['standard_value'] > 0].copy()

# Convert IC50 (nM) → pIC50 = -log10(IC50 in mol/L)
df['pIC50'] = -np.log10(df['standard_value'] * 1e-9)

# Add binary Label (1 if pIC50 ≥ 6), computed column-wise instead of per row
df['Label'] = (df['pIC50'] >= 6).astype('int64')

print(f"Curated dataset shape: {df.shape}")
df.head()

Curated dataset shape: (2473, 5)


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,pIC50,Label
0,CHEMBL471375,Cn1cc(/C=C2\C(=O)Nc3ccccc32)c2ccccc21,100000.0,4.0,0
1,CHEMBL471375,Cn1cc(/C=C2\C(=O)Nc3ccccc32)c2ccccc21,82700.0,4.082494,0
2,CHEMBL388978,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,1000.0,6.0,1
3,CHEMBL361708,NC1=NC(=O)/C(=C2\CCNC(=O)c3[nH]c(Br)cc32)N1,7000.0,5.154902,0
4,CHEMBL388978,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,37.0,7.431798,1


In [None]:
def calc_rdkit_descriptors(smiles):
    """Compute six RDKit descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule. Any non-string value
        (e.g. ``None`` or a float NaN from a missing table cell) is treated
        as an unparseable molecule.

    Returns
    -------
    list
        ``[MW, LogP, HBD, HBA, TPSA, RotB]`` in that order. If the SMILES is
        missing or RDKit cannot parse it, a list of ``np.nan`` of the same
        length is returned so the row count of the caller's table is kept.
    """
    # Attribute names on rdkit.Chem.Descriptors, in output-column order:
    # MW, LogP, HBD, HBA, TPSA, RotB.
    descriptor_names = (
        "MolWt",
        "MolLogP",
        "NumHDonors",
        "NumHAcceptors",
        "TPSA",
        "NumRotatableBonds",
    )

    # Only hand real strings to the RDKit parser; a missing value is mapped to
    # the same "no molecule" path as an unparseable SMILES instead of raising.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Length is derived from the descriptor list so the two cannot drift apart.
        return [np.nan] * len(descriptor_names)

    return [getattr(Descriptors, name)(mol) for name in descriptor_names]

# Names of the descriptor columns, in the order calc_rdkit_descriptors returns them.
desc_cols = ['MW','LogP','HBD','HBA','TPSA','RotB']

# One descriptor row per SMILES; tqdm renders the progress bar.
desc_list = [
    calc_rdkit_descriptors(smi)
    for smi in tqdm(df['canonical_smiles'], desc="Calculating RDKit descriptors")
]

desc_df = pd.DataFrame(desc_list, columns=desc_cols)

# Drop descriptor columns left by a previous run of this cell so that re-running
# it replaces them instead of appending a duplicate set of columns.
# reset_index aligns df positionally with desc_df, which has a fresh 0..n-1 index.
df = pd.concat(
    [df.drop(columns=desc_cols, errors='ignore').reset_index(drop=True), desc_df],
    axis=1,
)

Calculating RDKit descriptors: 100%|██████████| 2473/2473 [00:02<00:00, 1021.48it/s]


In [None]:
# Column order of the exported table: identifiers and potency first, then the
# six descriptors, with the class label last.
final_cols = (
    ['molecule_chembl_id', 'canonical_smiles', 'standard_value', 'pIC50']
    + ['MW', 'LogP', 'HBD', 'HBA', 'TPSA', 'RotB']
    + ['Label']
)

# Select those columns and write them to CSV without the positional index.
final_df = df[final_cols]
final_df.to_csv("ASK1_dataset.csv", index=False)
print("Dataset saved as ASK1_dataset.csv")
final_df.head()

Dataset saved as ASK1_dataset.csv


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,pIC50,MW,LogP,HBD,HBA,TPSA,RotB,Label
0,CHEMBL471375,Cn1cc(/C=C2\C(=O)Nc3ccccc32)c2ccccc21,100000.0,4.0,274.323,3.671,1,2,34.03,1,0
1,CHEMBL471375,Cn1cc(/C=C2\C(=O)Nc3ccccc32)c2ccccc21,82700.0,4.082494,274.323,3.671,1,2,34.03,1,0
2,CHEMBL388978,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,1000.0,6.0,466.541,4.354,2,6,69.45,2,1
3,CHEMBL361708,NC1=NC(=O)/C(=C2\CCNC(=O)c3[nH]c(Br)cc32)N1,7000.0,5.154902,324.138,0.0663,4,4,112.37,0,0
4,CHEMBL388978,CN[C@@H]1C[C@H]2O[C@@](C)([C@@H]1OC)n1c3ccccc3...,37.0,7.431798,466.541,4.354,2,6,69.45,2,1


In [None]:
# train_test_split stays imported in case other cells use it; the split below
# uses GroupShuffleSplit instead.
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Feature matrix (six descriptors) and binary target.
X = df[['MW','LogP','HBD','HBA','TPSA','RotB']].values
y = df['Label'].values

# The table holds several measurements of the same compound, and rows sharing a
# SMILES have identical descriptor vectors. A row-wise random split can place
# the same molecule in both sets, which leaks test information into training.
# Grouping on the SMILES keeps every compound entirely on one side.
# Note: test_size is therefore the fraction of distinct compounds, so the row
# counts are only approximately 80/20.
groups = df['canonical_smiles'].values
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=groups))

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (1978, 6), Test shape: (495, 6)
