# Imports

In [67]:
# Standard library imports
import os

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Descriptors

# Local imports

# Look at ChEMBL Data

In [68]:
# Locate the notebook's working directory: '__file__' is a literal file name here,
# so realpath resolves it against the current working directory
cur_dir = os.path.split(os.path.realpath('__file__'))[0]
json_path = f'{cur_dir}/data/chembl_data.json'

# Read the JSON records into a dataframe and preview the first five rows
df = pd.read_json(json_path)
print(df.head(5))

                target  target_id      assay_id  \
0  Cytochrome P450 3A4  CHEMBL340  CHEMBL883800   
1  Cytochrome P450 3A4  CHEMBL340  CHEMBL883800   
2  Cytochrome P450 3A4  CHEMBL340  CHEMBL883800   
3  Cytochrome P450 3A4  CHEMBL340  CHEMBL618439   
4  Cytochrome P450 3A4  CHEMBL340  CHEMBL659093   

                                              smiles  type relation    value  \
0  CC(=O)N1CCN(c2ccc(OCC3COC(Cn4ccnc4)(c4ccc(Cl)c...  IC50        =   1260.0   
1  CC(=O)N1CCN(c2ccc(OC[C@@H]3CO[C@](Cn4ccnc4)(c4...  IC50        =    897.0   
2  CC(=O)N1CCN(c2ccc(OCC3COC(Cn4ccnc4)(c4ccc(Cl)c...  IC50        =    786.0   
3  CC(=O)N1CCN(c2ccc(OC[C@H]3CO[C@@](Cn4ccnc4)(c4...  IC50        =    570.0   
4      Cc1ccc(-c2ncc(Cl)cc2-c2ccc(S(C)(=O)=O)cc2)cn1  IC50        >  50000.0   

  unit  molregno  doc_id               journal                            doi  
0   nM    255904   11347            J Med Chem            10.1021/jm00093a015  
1   nM    156650   11347            J Med Chem      

In [69]:
# Keep only the entries that reference a document (non-null DOI)
chembl_df = df.dropna(subset=['doi']).reset_index(drop=True)
print(f'\nNumber of entries with a document attached: {len(chembl_df)} / {len(df)}')

# Distinct journals behind those documents (suggesting similar criteria for acceptance)
journals = pd.unique(chembl_df['journal'].dropna())
print(f'\nNumber of journals within supporting documents: {len(journals)}')
print(f'Journal names: {journals}')


Number of entries with a document attached: 9804 / 10969

Number of journals within supporting documents: 12
Journal names: ['J Med Chem' 'Bioorg Med Chem Lett' 'Bioorg Med Chem' 'J Nat Prod'
 'Eur J Med Chem' 'Antimicrob Agents Chemother' 'ACS Med Chem Lett'
 'Chem Res Toxicol' 'Med Chem Res' 'Medchemcomm' 'Drug Metab Dispos'
 'RSC Med Chem']


In [80]:
# Get the molecular weight from a smiles string
def calculate_mol_weight(smiles):
    """Return the average molecular weight (g/mol) of a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    float or None
        Average molecular weight, or None when `smiles` is missing, not a
        string, or cannot be parsed by RDKit.
    """
    # Missing values (None / NaN) and empty strings carry no structure to weigh
    if not isinstance(smiles, str) or not smiles:
        return None

    # MolFromSmiles returns None (rather than raising) for unparsable input
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # Mass <-> molar concentration conversions need the average molecular weight;
    # ExactMolWt is the monoisotopic mass and is systematically lower
    return Descriptors.MolWt(mol)

# Known unit conversion factors (multiply a value in the keyed unit to obtain uM)
unit_dict = {'mM': 1e3, 'nM': 1e-3}

# Function to get unit conversion factor
def get_conversion_factor(unit, mol_weight=None):
    """Return the factor that converts a value expressed in `unit` to uM.

    Parameters
    ----------
    unit : str
        Unit label of the measurement.
    mol_weight : float, optional
        Molecular weight in g/mol; only needed for the mass-per-volume
        units 'ug ml-1' and 'mg/ml'.

    Returns
    -------
    float or None
        Multiplicative conversion factor, or None when the unit is unknown
        or a mass-based unit is given without a usable molecular weight.
    """
    if unit in ('ug ml-1', 'mg/ml'):
        # Without a positive molecular weight the molar concentration is
        # undefined; report "no factor" instead of raising on None / zero
        if not mol_weight:
            return None
        # 1 ug/ml = 1e3 / MW uM, 1 mg/ml = 1e6 / MW uM
        scale = 1e3 if unit == 'ug ml-1' else 1e6
        return scale / mol_weight

    return unit_dict.get(unit)

# Unit labels that already denote micromolar and need no numeric conversion.
# NOTE(review): this name was referenced but never defined in the notebook
# (it presumably leaked from a deleted cell); assumed to be just 'uM' - confirm.
desired_units = ['uM']

# Units expressed as mass per volume, the only ones that need a molecular weight
_mass_units = ('ug ml-1', 'mg/ml')

# Function to convert units
def convert_units(row):
    """Return a copy of `row` with 'value' and 'unit' expressed in uM.

    Parameters
    ----------
    row : pandas.Series
        One record providing 'smiles', 'value' and 'unit' entries.

    Returns
    -------
    pandas.Series
        Copy of the row. When the unit is recognised, 'value' is rescaled and
        'unit' is set to 'uM'; otherwise the row is returned unchanged.
    """
    # Work on a copy so the object handed in by DataFrame.apply is not mutated
    row = row.copy()
    unit = row['unit']

    if unit in desired_units:
        row['unit'] = 'uM'
        return row

    # Parsing the SMILES is only worthwhile for mass-based units
    mol_weight = calculate_mol_weight(row['smiles']) if unit in _mass_units else None
    conversion_factor = get_conversion_factor(unit, mol_weight)
    if conversion_factor:
        row['value'] = row['value'] * conversion_factor
        row['unit'] = 'uM'

    return row

# Units accepted by the filter below
allowed_units = ['uM', 'nM']

# Rescale each measurement row by row
chembl_df = chembl_df.apply(convert_units, axis=1)

# Keep exact ('=') IC50 records with a non-zero value of at most 100
# in one of the allowed units
chembl_df = chembl_df.loc[
    chembl_df['type'].eq('IC50')
    & chembl_df['value'].notna()
    & chembl_df['value'].ne(0)
    & chembl_df['value'].le(100)
    & chembl_df['unit'].notna()
    & chembl_df['unit'].isin(allowed_units)
    & chembl_df['relation'].notna()
    & chembl_df['relation'].eq('=')
]

print(f'Total number of activities for target: {len(chembl_df)}')

Total number of activities for target: 5172


In [92]:
# Save data for regression task
df = chembl_df.copy()

# Place log10(value) directly after the 'value' column, located by name
# rather than by a hard-coded position
df.insert(df.columns.get_loc('value') + 1, 'log_value', np.log10(df['value']))

# Write the regression table in both CSV and JSON form
df.to_csv(f'{cur_dir}/data/regression.csv')
df.to_json(f'{cur_dir}/data/regression.json', orient='records')

# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of being wrapped in print()
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 5172 entries, 0 to 9803
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   target     5172 non-null   object 
 1   target_id  5172 non-null   object 
 2   assay_id   5172 non-null   object 
 3   smiles     5172 non-null   object 
 4   type       5172 non-null   object 
 5   relation   5172 non-null   object 
 6   value      5172 non-null   float64
 7   log_value  5172 non-null   float64
 8   unit       5172 non-null   object 
 9   molregno   5172 non-null   int64  
 10  doc_id     5172 non-null   int64  
 11  journal    5080 non-null   object 
 12  doi        5172 non-null   object 
dtypes: float64(2), int64(2), object(9)
memory usage: 694.7+ KB
None


Unnamed: 0,target,target_id,assay_id,smiles,type,relation,value,log_value,unit,molregno,doc_id,journal,doi
0,Cytochrome P450 3A4,CHEMBL340,CHEMBL883800,CC(=O)N1CCN(c2ccc(OCC3COC(Cn4ccnc4)(c4ccc(Cl)c...,IC50,=,1.26,0.100371,uM,255904,11347,J Med Chem,10.1021/jm00093a015
1,Cytochrome P450 3A4,CHEMBL340,CHEMBL883800,CC(=O)N1CCN(c2ccc(OC[C@@H]3CO[C@](Cn4ccnc4)(c4...,IC50,=,0.897,-0.047208,uM,156650,11347,J Med Chem,10.1021/jm00093a015
2,Cytochrome P450 3A4,CHEMBL340,CHEMBL883800,CC(=O)N1CCN(c2ccc(OCC3COC(Cn4ccnc4)(c4ccc(Cl)c...,IC50,=,0.786,-0.104577,uM,255904,11347,J Med Chem,10.1021/jm00093a015
3,Cytochrome P450 3A4,CHEMBL340,CHEMBL618439,CC(=O)N1CCN(c2ccc(OC[C@H]3CO[C@@](Cn4ccnc4)(c4...,IC50,=,0.57,-0.244125,uM,156572,11347,J Med Chem,10.1021/jm00093a015
7,Cytochrome P450 3A4,CHEMBL340,CHEMBL659099,Cn1cc(Cc2cn(CC(=O)N(CC(=O)O)Cc3ccc(-c4ccc(C(F)...,IC50,=,31.0,1.491362,uM,140912,5045,Bioorg Med Chem Lett,10.1016/s0960-894x(02)00473-0


In [93]:
# Report how many of the retained entries come from each journal
print('Number of entries for journal:')
for journal in journals:
    print(f'\t{journal}: {int(df["journal"].eq(journal).sum())}')

Number of entries for journal:
	J Med Chem: 1931
	Bioorg Med Chem Lett: 2241
	Bioorg Med Chem: 251
	J Nat Prod: 60
	Eur J Med Chem: 204
	Antimicrob Agents Chemother: 2
	ACS Med Chem Lett: 287
	Chem Res Toxicol: 0
	Med Chem Res: 0
	Medchemcomm: 24
	Drug Metab Dispos: 74
	RSC Med Chem: 6
