In [2]:
# Importing Essential libraries
import pandas as pd
import numpy as np
from chembl_webresource_client.new_client import new_client
import time
from tqdm.notebook import tqdm  # For progress bars

#### Let's first search for our target protein in ChEMBL

In [3]:
# Initialize ChEMBL client
target = new_client.target

# Search for Mycobacterium tuberculosis targets
targets = target.search('Mycobacterium tuberculosis')
targets_df = pd.DataFrame.from_dict(targets)

# Display first few rows of our targets
print("Number of targets found:", len(targets_df))
targets_df[['target_chembl_id', 'organism', 'target_type', 'pref_name']].head()

Number of targets found: 116


Unnamed: 0,target_chembl_id,organism,target_type,pref_name
0,CHEMBL360,Mycobacterium tuberculosis,ORGANISM,Mycobacterium tuberculosis
1,CHEMBL2111188,Mycobacterium tuberculosis H37Rv,ORGANISM,Mycobacterium tuberculosis H37Rv
2,CHEMBL613086,Mycobacterium tuberculosis variant bovis,ORGANISM,Mycobacterium tuberculosis variant bovis
3,CHEMBL612960,Mycobacterium tuberculosis variant microti,ORGANISM,Mycobacterium tuberculosis variant microti
4,CHEMBL615052,Mycobacterium tuberculosis variant bovis BCG,ORGANISM,Mycobacterium tuberculosis variant bovis BCG


### Get bioactivity data for a specific target

In [7]:
def get_bioactivity_data(target_chembl_id):
    """
    Retrieve bioactivity data for a specific target from ChEMBL
    """
    # Initialize activity client
    activity = new_client.activity
    
    # Get activities for target
    activities = activity.filter(
        target_chembl_id=target_chembl_id,
        standard_type="IC50" 
    )
    
    # Convert to DataFrame
    df = pd.DataFrame.from_dict(activities)
    
    return df

In [8]:
df = get_bioactivity_data("CHEMBL360")

In [9]:
def clean_bioactivity_data(df):
    """
    Clean and process bioactivity data
    
    Parameters:
    df (pandas.DataFrame): Raw bioactivity data
    
    Returns:
    pandas.DataFrame: Cleaned bioactivity data
    """
    # Select relevant columns
    selected_columns = [
        'molecule_chembl_id',
        'canonical_smiles',
        'standard_type',
        'standard_value',
        'standard_units',
        'standard_relation',
        'assay_type',
        'assay_description'
    ]
    
    df_clean = df[selected_columns].copy()
    
    # Remove entries without SMILES or IC50 values
    df_clean = df_clean.dropna(subset=['canonical_smiles', 'standard_value'])
    
    # Convert IC50 to nM if in other units
    mask_micromolar = df_clean['standard_units'] == 'uM'
    df_clean.loc[mask_micromolar, 'standard_value'] *= 1000
    
    # Add pIC50 column (-log10(IC50[M]))
    df_clean['pIC50'] = -np.log10(df_clean['standard_value'] * 1e-9)  # Convert nM to M
    
    return df_clean

In [10]:
def prepare_final_dataset(chembl_data, save_path='tb_dataset.csv'):
    """
    Prepare final dataset combining ChEMBL data with additional filters
    
    Parameters:
    chembl_data (pandas.DataFrame): Cleaned ChEMBL bioactivity data
    save_path (str): Path to save the final dataset
    
    Returns:
    pandas.DataFrame: Final processed dataset
    """
    # Additional filtering criteria
    df_final = chembl_data.copy()
    
    # Remove duplicates, keeping entry with median pIC50 if multiple exists
    df_final = df_final.groupby('canonical_smiles').agg({
        'pIC50': 'median',
        'molecule_chembl_id': 'first',
        'standard_relation': 'first',
        'assay_type': 'first',
        'assay_description': 'first'
    }).reset_index()
    
    # Save to CSV
    df_final.to_csv(save_path, index=False)
    
    return df_final

In [11]:
def main():
    # 1. Choose your target (example with InhA, a known TB drug target)
    target_chembl_id = 'CHEMBL2366516'  # Replace with your chosen target
    
    # 2. Get bioactivity data
    print("Fetching bioactivity data...")
    bioactivity_data = get_bioactivity_data(target_chembl_id)
    print(f"Retrieved {len(bioactivity_data)} bioactivity entries")
    
    # 3. Clean the data
    print("\nCleaning and processing data...")
    cleaned_data = clean_bioactivity_data(bioactivity_data)
    print(f"Cleaned dataset contains {len(cleaned_data)} entries")
    
    # 4. Prepare final dataset
    print("\nPreparing final dataset...")
    final_dataset = prepare_final_dataset(cleaned_data)
    print(f"Final dataset contains {len(final_dataset)} unique compounds")
    
    # 5. Basic statistics
    print("\nDataset statistics:")
    print(f"pIC50 range: {final_dataset['pIC50'].min():.2f} - {final_dataset['pIC50'].max():.2f}")
    print(f"pIC50 mean ± std: {final_dataset['pIC50'].mean():.2f} ± {final_dataset['pIC50'].std():.2f}")

if __name__ == "__main__":
    main()

Fetching bioactivity data...
Retrieved 0 bioactivity entries

Cleaning and processing data...


KeyError: "None of [Index(['molecule_chembl_id', 'canonical_smiles', 'standard_type',\n       'standard_value', 'standard_units', 'standard_relation', 'assay_type',\n       'assay_description'],\n      dtype='object')] are in the [columns]"