# Imports

In [33]:
# Standard library imports
import os

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Descriptors

# Local imports

# Look at ChemBL Data

In [34]:
# Get json file path
cur_dir = os.path.dirname(os.path.realpath('__file__'))
json_path = f'{cur_dir}/data/chembl340_all_activities_annotated.json'

# Load into DataFrame
chembl_df = pd.read_json(json_path)
print(chembl_df.head())

     assay               target  target_id  num_activities  \
0   774749  Cytochrome P450 3A4      17045             871   
1  1528855  Cytochrome P450 3A4      17045             190   
2  1640603  Cytochrome P450 3A4      17045             165   
3  1919651  Cytochrome P450 3A4      17045             106   
4  1637154  Cytochrome P450 3A4      17045              85   

                                          activities document  
0  [{'molregno': 97, 'relation': None, 'value': N...      NaN  
1  [{'molregno': 2032608, 'relation': '=', 'value...      NaN  
2  [{'molregno': 2099593, 'relation': '>', 'value...      NaN  
3  [{'molregno': 115, 'relation': '=', 'value': 1...      NaN  
4  [{'molregno': 2099838, 'relation': '=', 'value...      NaN  


In [35]:
# Investigate targets
unique_targets = chembl_df.target.unique()
print(
    f'Number of unique targets: {len(unique_targets)}',
    f'\nNumber of total rows: {len(chembl_df)}'
)

# We see only one target
print(f'\nTarget: {unique_targets[0]}')

# Investigate number of entries with a document listed
chembl_with_doc_df = chembl_df[
    chembl_df.document.notnull()
]
print(f'\nNumber of entries with a document attached: {len(chembl_with_doc_df)}')

Number of unique targets: 1 
Number of total rows: 2394

Target: Cytochrome P450 3A4

Number of entries with a document attached: 2344


In [41]:
activities_df = pd.json_normalize(chembl_with_doc_df['activities'])
docs_df = pd.json_normalize(chembl_with_doc_df['document'])
combined_df = pd.concat(
    [chembl_with_doc_df.drop(columns=['activities', 'document']), 
     activities_df,
     docs_df
], axis=1)
print(combined_df.head())
combined_df.to_csv('combined_df.csv')

        assay               target  target_id  num_activities  \
8    828928.0  Cytochrome P450 3A4    17045.0            52.0   
9    993439.0  Cytochrome P450 3A4    17045.0            53.0   
10   823567.0  Cytochrome P450 3A4    17045.0            48.0   
11  1707938.0  Cytochrome P450 3A4    17045.0            48.0   
12  2246941.0  Cytochrome P450 3A4    17045.0            48.0   

                                                    0  \
8   {'molregno': 3965, 'relation': '>', 'value': 2...   
9   {'molregno': 398473, 'relation': '=', 'value':...   
10  {'molregno': 452343, 'relation': '=', 'value':...   
11  {'molregno': 1280306, 'relation': '=', 'value'...   
12  {'molregno': 88100, 'relation': '=', 'value': ...   

                                                    1  \
8   {'molregno': 38898, 'relation': '>', 'value': ...   
9   {'molregno': 398475, 'relation': '=', 'value':...   
10  {'molregno': 576498, 'relation': None, 'value'...   
11  {'molregno': 1280307, 'relation': 

In [20]:
# Look at the supporting documents
docs = chembl_with_doc_df.document
docs_df = pd.json_normalize(docs)
print(f'Information provided by document: {docs_df.columns.tolist()}')

# Journal information (suggesting similar criteria for acceptance)
journals = docs_df.journal.dropna().unique()
print(f'\nNumber of journals within supporting documents: {len(journals)}')
print(f'Journal names: {journals}')

Information provided by document: ['doi', 'pmid', 'journal', 'abstract', 'stupid_response']

Number of journals within supporting documents: 11
Journal names: ['Bioorg Med Chem Lett' 'J Med Chem' 'Bioorg Med Chem' 'ACS Med Chem Lett'
 'Eur J Med Chem' 'J Nat Prod' 'Drug Metab Dispos' 'Medchemcomm'
 'Antimicrob Agents Chemother' 'RSC Med Chem' 'Med Chem Res']


In [21]:
# Look at all datasets for activities
activities = chembl_with_doc_df.activities.to_numpy().ravel().tolist()
activities = [item for sublist in activities for item in sublist]

# Create activities df
activities_df = pd.json_normalize(activities)
print(f'Total number of activities for target: {len(activities_df)}')
print(activities_df.head())

Total number of activities for target: 10080
   molregno relation  value units standard_type  \
0   1353266        =  0.084    uM          IC50   
1   1353267        =  0.200    uM          IC50   
2   1353268        =  0.180    uM          IC50   
3   1353269        =  0.041    uM          IC50   
4   1353270        =  0.028    uM          IC50   

                                       compound_name  \
0  Thiazol-5-ylmethyl(2S,3R)-3-hydroxy-4-(N-methy...   
1  thiazol-5-ylmethyl(2S,3R)-4-(2-(3-(dimethylami...   
2  thiazol-5-ylmethyl(2S,3R)-4-(2-((2-(dimethylam...   
3  Thiazol-5-ylmethyl(2S,3R)-3-hydroxy-4-(N-isobu...   
4  Thiazol-5-ylmethyl(2S,3R)-4-(2-(ethylamino)-N-...   

                                     compound_smiles  
0  CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1cncs1)C...  
1  CC(C)CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1cn...  
2  CC(C)CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1cn...  
3  CNc1nc2ccc(C(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc3cc...  
4  CCNc1nc2ccc(C(=O)N(CC(C)C)C[C@@H]

In [22]:
# Get the molecular weight from a smiles string
def calculate_mol_weight(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return Descriptors.ExactMolWt(mol)

# Known unit conversion factors
unit_dict = {
    'mM': 1e3,
    'nM': 1e-3
}

# Function to get unit conversion factor
def get_conversion_factor(unit, mol_weight=None):
    if unit == 'ug ml-1':
        return 1e3 / mol_weight
    elif unit == 'mg/ml':
        return 1e6 / mol_weight
    else:
        return unit_dict.get(unit)

# Units to convert to
desired_units = ['uM', '10^-6 mol/L', 'microM', 'umol/L']

# Function to convert units
def convert_units(row):
    smiles, value, unit = row['smiles'], row['value'], row['unit']
    
    if unit in desired_units:
        row['unit'] = desired_units[0]
        return row
    
    mol_weight = calculate_mol_weight(smiles)
    conversion_factor = get_conversion_factor(unit, mol_weight)
    if conversion_factor:
        row['value'] = value * conversion_factor
        row['unit'] = desired_units[0]
    
    return row

In [23]:
# Fix activities data
activities_df.columns = [
    'molregno', 'relation', 'value', 'unit', 'standard_type',
    'compound_name', 'smiles'
]

# Allowed unit types
allowed_units = [
    'uM', '10^-6 mol/L', 'microM', 'umol/L', 
    'nM', 'mM', 'ug ml-1', # 'mg/ml'
]

# Remove incomplete rows
activities_df = activities_df[
    (activities_df['value'].notna()) &
    (activities_df['value'] != 0) &
    (activities_df['unit'].notna()) &
    (activities_df['unit'].isin(allowed_units)) &
    (activities_df['relation'].notna())
]

# Convert ic50 value to correct units
activities_df = activities_df.apply(convert_units, axis=1)

# Sort values
activities_df = activities_df.sort_values(by='value', ascending=True)

In [24]:
# Combine data into an overall dataframe
combined_df = pd.concat([
    chembl_with_doc_df.drop(columns=['document', 'activities']), 
    activities_df,
    docs_df
], axis=1)
print(f'Columns: {combined_df.columns.tolist()}\n')
print(combined_df.head())

Columns: ['assay', 'target', 'target_id', 'num_activities', 'molregno', 'relation', 'value', 'unit', 'standard_type', 'compound_name', 'smiles', 'doi', 'pmid', 'journal', 'abstract', 'stupid_response']

        assay               target  target_id  num_activities   molregno  \
8    828928.0  Cytochrome P450 3A4    17045.0            52.0  1353274.0   
9    993439.0  Cytochrome P450 3A4    17045.0            53.0  1353275.0   
10   823567.0  Cytochrome P450 3A4    17045.0            48.0  1353276.0   
11  1707938.0  Cytochrome P450 3A4    17045.0            48.0  1353277.0   
12  2246941.0  Cytochrome P450 3A4    17045.0            48.0        NaN   

   relation  value unit standard_type  \
8         =  0.026   uM          IC50   
9         =  0.026   uM          IC50   
10        =  0.040   uM          IC50   
11        =  0.140   uM          IC50   
12      NaN    NaN  NaN           NaN   

                                        compound_name  \
8   Thiazol-5-ylmethyl(2S,3R)-4-(N-(

In [25]:
# Investigate dois for relational data with < or >
greater_than_df = combined_df[combined_df['relation'] == '>']
less_than_df = combined_df[combined_df['relation'] == '<']

# Compare these values to combined_df
print(greater_than_df.value.describe())
print(less_than_df.value.describe())

count     3984.000000
mean        84.232968
std       1151.385925
min          0.005000
25%         10.000000
50%         25.000000
75%         40.250000
max      50000.000000
Name: value, dtype: float64
count    156.000000
mean       8.428385
std       28.243117
min        0.000100
25%        0.100000
50%        1.000000
75%       10.000000
max      200.000000
Name: value, dtype: float64


In [27]:
# Separate relational data based on previous findings
df = combined_df.copy()
df = df[
    (df['relation'] == '=')
]
df = df.sort_values(by='value', ascending=True).reset_index(drop=True)

# Save data for regression task
df = df[['smiles', 'value', 'doi']]
df = df[df['value'] < 1000]
df['log_value'] = df['value'].apply(np.log10)
df.to_csv(f'{cur_dir}/data/regression.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 5118 entries, 0 to 5117
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   smiles     5118 non-null   object 
 1   value      5118 non-null   float64
 2   doi        1346 non-null   object 
 3   log_value  5118 non-null   float64
dtypes: float64(2), object(2)
memory usage: 199.9+ KB


Unnamed: 0,smiles,value,doi,log_value
0,CC(C)[C@@H]1COC[C@H](C2(OC(=O)N3CCNCC3)CC2)N1S...,0.0003,10.1021/acs.jmedchem.5b00963,-3.522879
1,Cc1cc(N2CCC(O)CC2)cc2[nH]c(-c3c(NC[C@@H](O)c4c...,0.0004,10.1021/jm401430e,-3.39794
2,O=C(NC1CCC(=Cc2cccc(Oc3ccc(C(F)(F)F)cn3)c2)CC1...,0.0004,,-3.39794
3,COC(=O)NC1CCN(c2cc(C)c3nc(-c4c(NC[C@@H](O)c5cc...,0.0005,10.1016/j.bmc.2015.03.038,-3.30103
4,Cc1cc(N2CCC(N3C[C@@H]4C[C@@H]3CO4)CC2)cc2[nH]c...,0.0005,10.1021/acs.jmedchem.5b00132,-3.30103
