# Imports

In [9]:
# Standard library imports
import os

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.Scaffolds import MurckoScaffold

# Local imports

# Look at ChEMBL Data

In [10]:
# Locate the data file relative to the notebook's working directory.
# '__file__' is deliberately a quoted string: realpath() resolves that relative
# name against the current working directory, and dirname() strips it again.
placeholder_path = os.path.realpath('__file__')
cur_dir = os.path.dirname(placeholder_path)
json_path = f'{cur_dir}/data/chembl_data.json'

# Read the JSON file into a dataframe and preview its first rows
df = pd.read_json(json_path)
preview = df.head()
print(preview)

                target  target_id      assay_id  \
0  Cytochrome P450 3A4  CHEMBL340  CHEMBL883800   
1  Cytochrome P450 3A4  CHEMBL340  CHEMBL883800   
2  Cytochrome P450 3A4  CHEMBL340  CHEMBL883800   
3  Cytochrome P450 3A4  CHEMBL340  CHEMBL618439   
4  Cytochrome P450 3A4  CHEMBL340  CHEMBL659093   

                                              smiles  type relation    value  \
0  CC(=O)N1CCN(c2ccc(OCC3COC(Cn4ccnc4)(c4ccc(Cl)c...  IC50        =   1260.0   
1  CC(=O)N1CCN(c2ccc(OC[C@@H]3CO[C@](Cn4ccnc4)(c4...  IC50        =    897.0   
2  CC(=O)N1CCN(c2ccc(OCC3COC(Cn4ccnc4)(c4ccc(Cl)c...  IC50        =    786.0   
3  CC(=O)N1CCN(c2ccc(OC[C@H]3CO[C@@](Cn4ccnc4)(c4...  IC50        =    570.0   
4      Cc1ccc(-c2ncc(Cl)cc2-c2ccc(S(C)(=O)=O)cc2)cn1  IC50        >  50000.0   

  unit  molregno  doc_id               journal                            doi  \
0   nM    255904   11347            J Med Chem            10.1021/jm00093a015   
1   nM    156650   11347            J Med Chem    

In [11]:
# Keep only the entries that have a DOI, i.e. a supporting document
has_doi = df['doi'].notna()
chembl_df = df.loc[has_doi].reset_index(drop=True)
print(f'\nNumber of entries with a document attached: {len(chembl_df)} / {len(df)}')

# Distinct journals behind those documents (suggesting similar criteria for
# acceptance of the reported data)
journal_names = chembl_df['journal'].dropna()
journals = journal_names.unique()
print(f'\nNumber of journals within supporting documents: {len(journals)}')
print(f'Journal names: {journals}')


Number of entries with a document attached: 9804 / 10969

Number of journals within supporting documents: 12
Journal names: ['J Med Chem' 'Bioorg Med Chem Lett' 'Bioorg Med Chem' 'J Nat Prod'
 'Eur J Med Chem' 'Antimicrob Agents Chemother' 'ACS Med Chem Lett'
 'Chem Res Toxicol' 'Med Chem Res' 'Medchemcomm' 'Drug Metab Dispos'
 'RSC Med Chem']


In [12]:
# Get the molecular weight from a smiles string
def calculate_mol_weight(smiles):
    """Return the exact molecular weight for a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    float or None
        ``Descriptors.ExactMolWt`` of the parsed molecule, or ``None`` when
        ``smiles`` is not a string or RDKit cannot parse it.
    """
    # Missing values (None / NaN) are not valid input for MolFromSmiles
    if not isinstance(smiles, str):
        return None

    # MolFromSmiles reports a parse failure by returning None, not by raising
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # NOTE(review): ExactMolWt is the monoisotopic mass; for mass -> molar
    # concentration conversions the average weight (Descriptors.MolWt) is
    # usually what is wanted - confirm before switching, it changes the values.
    return Descriptors.ExactMolWt(mol)

# Known unit conversion factors
unit_dict = {'uM': 1, 'mM': 1e3, 'nM': 1e-3}

# Mass-concentration units: factor to uM is <scale> / molecular weight
mass_unit_dict = {'ug ml-1': 1e3, 'mg/ml': 1e6}

# Function to get unit conversion factor
def get_conversion_factor(unit, mol_weight=None):
    """Return the factor that converts a value in ``unit`` to micromolar (uM).

    Parameters
    ----------
    unit : str
        Unit label of the value to convert.
    mol_weight : float, optional
        Molecular weight of the compound; only needed for the
        mass-concentration units ('ug ml-1', 'mg/ml').

    Returns
    -------
    float or None
        The multiplicative conversion factor, or ``None`` when the unit is
        unknown or when a mass-concentration unit is given without a usable
        (positive, non-NaN) molecular weight.
    """
    mass_scale = mass_unit_dict.get(unit)
    if mass_scale is None:
        # Molar units (or unknown units, which yield None)
        return unit_dict.get(unit)

    # `not mol_weight > 0` also rejects NaN, which would otherwise propagate
    # as a truthy NaN factor; None / 0 would raise on the division below
    if mol_weight is None or not mol_weight > 0:
        return None

    return mass_scale / mol_weight

# Function to convert units
def convert_units(row):
    """Convert one activity row's ``value`` to micromolar (uM) when possible.

    Parameters
    ----------
    row : pandas.Series
        A row providing the 'smiles', 'value' and 'unit' entries.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with 'value' rescaled and 'unit' set to 'uM' when a
        conversion factor is available; otherwise ``row`` itself, unchanged.
    """
    unit = row['unit']

    # Molar units need no molecular weight, so skip the costly SMILES parse
    if unit in unit_dict:
        mol_weight = None
    else:
        mol_weight = calculate_mol_weight(row['smiles'])

    conversion_factor = get_conversion_factor(unit, mol_weight)
    if not conversion_factor:
        return row

    # Work on a copy so the caller's row is never modified in place
    converted = row.copy()
    converted['value'] = row['value'] * conversion_factor
    converted['unit'] = 'uM'
    return converted

# Allowed unit types
allowed_units = ['uM', 'nM']
invalid_dois = [
    '10.1016/j.ejmech.2007.10.034', '10.1016/j.bmcl.2012.08.044', 
    '10.1021/acsmedchemlett.8b00220', '10.1016/j.ejmech.2008.12.004',
    '10.1016/j.bmcl.2015.01.005'
]

# Correct discovered incorrect unit type for doi
# NOTE: these corrections modify chembl_df in place and chembl_df is
# reassigned below, so this cell is meant to be run once per kernel session;
# re-running it would divide any remaining '10.1021/jm900521k' values by 1e6
# a second time.
chembl_df.loc[chembl_df['doi'] == '10.1021/jm049696n', 'unit'] = 'nM'
chembl_df.loc[chembl_df['doi'] == '10.1021/jm900521k', 'value'] /= 1e6
chembl_df.loc[chembl_df['doi'] == '10.1021/jm900521k', 'unit'] = 'uM'

# Convert ic50 value to correct units
chembl_df = chembl_df.apply(convert_units, axis=1)

# Remove rows with data not of interest
# (value must be strictly positive: log10 is taken below, and a negative
# value would silently become NaN there)
chembl_df = chembl_df[
    (chembl_df['type'] == 'IC50') &
    (chembl_df['value'].notna()) &
    (chembl_df['value'] > 0) &
    (chembl_df['unit'].notna()) &
    (chembl_df['unit'].isin(allowed_units)) &
    (chembl_df['relation'].notna()) &
    (chembl_df['relation'] == '=') &
    ~(chembl_df['doi'].isin(invalid_dois)) &
    (chembl_df['description'].notna())
].sort_values(by='value', ascending=False)

# Add log value column directly after 'value'.
# The position is looked up by name and any existing 'log_value' column is
# dropped first, so no other column can be lost if the column already exists.
log_values = chembl_df['value'].apply(np.log10)
chembl_df = chembl_df.drop(columns='log_value', errors='ignore')
chembl_df.insert(chembl_df.columns.get_loc('value') + 1, 'log_value', log_values)

print(f'Total number of activities for target: {len(chembl_df)}')

Total number of activities for target: 5184


In [None]:
# Add scaffold smiles column
def get_scaffold(smiles):
    """Return the Murcko scaffold SMILES for a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    str or None
        SMILES of the molecule's Murcko scaffold, or ``None`` when RDKit
        cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # MurckoScaffoldSmiles takes `smiles` as its first positional parameter,
    # so the already-parsed molecule has to be passed by keyword
    return MurckoScaffold.MurckoScaffoldSmiles(mol=mol)

chembl_df['scaffold_smiles'] = chembl_df['smiles'].apply(get_scaffold)

# Move column to after smiles
# (position looked up by name so re-running the cell cannot drop or
# duplicate a column)
cols = [col for col in chembl_df.columns if col != 'scaffold_smiles']
cols.insert(cols.index('smiles') + 1, 'scaffold_smiles')
chembl_df = chembl_df.reindex(columns=cols)

In [13]:
# Save data for regression task
chembl_df.to_csv(f'{cur_dir}/data/regression.csv')
chembl_df.to_json(f'{cur_dir}/data/regression.json', orient='records')

# info() prints its summary itself and returns None, so it is called bare;
# wrapping it in print() appended a stray "None" line to the output
chembl_df.info()
chembl_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 5184 entries, 6352 to 2676
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   target                  5184 non-null   object 
 1   target_id               5184 non-null   object 
 2   assay_id                5184 non-null   object 
 3   smiles                  5184 non-null   object 
 4   type                    5184 non-null   object 
 5   relation                5184 non-null   object 
 6   value                   5184 non-null   float64
 7   log_value               5184 non-null   float64
 8   unit                    5184 non-null   object 
 9   molregno                5184 non-null   int64  
 10  doc_id                  5184 non-null   int64  
 11  journal                 5092 non-null   object 
 12  doi                     5184 non-null   object 
 13  description             5184 non-null   object 
 14  assay_type              5184 non-null   ob

Unnamed: 0,target,target_id,assay_id,smiles,type,relation,value,log_value,unit,molregno,...,journal,doi,description,assay_type,assay_test_type,assay_organism,assay_tissue,assay_cell_type,assay_confidence_score,data_validity_comment
6352,Cytochrome P450 3A4,CHEMBL340,CHEMBL3817398,NCCc1cccnc1,IC50,=,5000.0,3.69897,uM,704986,...,J Med Chem,10.1021/acs.jmedchem.5b01146,Inhibition of C-terminal four-histidine tagged...,A,,Homo sapiens,,,9,Outside typical range
6353,Cytochrome P450 3A4,CHEMBL340,CHEMBL3817398,c1ccncc1,IC50,=,4000.0,3.60206,uM,13349,...,J Med Chem,10.1021/acs.jmedchem.5b01146,Inhibition of C-terminal four-histidine tagged...,A,,Homo sapiens,,,9,Outside typical range
6354,Cytochrome P450 3A4,CHEMBL340,CHEMBL3817398,NCCCCCC(=O)NCc1cccnc1,IC50,=,1000.0,3.0,uM,2088882,...,J Med Chem,10.1021/acs.jmedchem.5b01146,Inhibition of C-terminal four-histidine tagged...,A,,Homo sapiens,,,9,Outside typical range
229,Cytochrome P450 3A4,CHEMBL340,CHEMBL664808,CC(C)C[C@H](C=O)NC(=O)[C@@H](NS(=O)(=O)c1ccc(F...,IC50,=,531.0,2.725095,uM,282942,...,J Med Chem,10.1021/jm0201924,Inhibition of Cytochrome P450 3A4 as BQ substrate,A,,,,,8,Outside typical range
8766,Cytochrome P450 3A4,CHEMBL340,CHEMBL4717445,O=C(Nc1cccc(Cl)c1)N1CCOc2ccc(-c3ccncc3)cc2C1,IC50,=,325.0,2.511883,uM,2524229,...,Bioorg Med Chem Lett,10.1016/j.bmcl.2016.04.018,Inhibition of human liver microsomes CYP3A4 us...,A,,Homo sapiens,Liver,,9,Outside typical range


In [14]:
# Sanity-check the filtered data: distinct values of the identifying columns
for label, column in (
    ('Unique targets: ', 'target'),
    ('Unique target ids: ', 'target_id'),
    ('Unique types: ', 'type'),
):
    print(label, chembl_df[column].unique())

print('Length of data: ', len(chembl_df))

# Number of distinct entries in the key columns
for label, column in (
    ('Number of unique smiles: ', 'smiles'),
    ('Number of unique values: ', 'value'),
    ('Number of unique dois: ', 'doi'),
):
    print(label, len(chembl_df[column].unique()))

Unique targets:  ['Cytochrome P450 3A4']
Unique target ids:  ['CHEMBL340']
Unique types:  ['IC50']
Length of data:  5184
Number of unique smiles:  4318
Number of unique values:  1122
Number of unique dois:  1042


In [15]:
# Count how often each data_validity_comment value occurs
validity_counts = chembl_df['data_validity_comment'].value_counts()
print(validity_counts)

data_validity_comment
Outside typical range            38
Potential transcription error     5
Name: count, dtype: int64


In [16]:
# Show the values of any rows whose doi or smiles is an empty string
for column in ('doi', 'smiles'):
    is_blank = chembl_df[column] == ''
    print(chembl_df.loc[is_blank, 'value'])

Series([], Name: value, dtype: float64)
Series([], Name: value, dtype: float64)
