# Imports

In [36]:
# Standard library imports
import os

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Descriptors

# Local imports

# Look at ChEMBL Data

In [37]:
# Locate the input file. Note that '__file__' is a quoted string here, not the
# module attribute, so realpath() resolves it against the current working
# directory and dirname() then yields that working directory.
cur_dir = os.path.dirname(os.path.realpath('__file__'))
json_path = f'{cur_dir}/data/chembl340_all_activities_annotated.json'

# Read the annotated export into a DataFrame
chembl_df = pd.read_json(json_path)

# Report how many distinct targets and rows the export contains
unique_targets = chembl_df['target'].unique()
print(
    f'Number of unique targets: {len(unique_targets)}',
    f'\nNumber of total rows: {len(chembl_df)}'
)

# Everything below works with the first target only
print(f'\nTarget: {unique_targets[0]}')

# Keep the rows of that target which have a document attached
has_document = chembl_df['document'].notnull()
is_first_target = chembl_df['target'] == unique_targets[0]
chembl_with_doc_df = chembl_df[has_document & is_first_target].reset_index(drop=True)
# Positional key; the activities are joined back on it further down the notebook
chembl_with_doc_df['original_index'] = np.arange(len(chembl_with_doc_df))
print(f'\nNumber of entries with a document attached: {len(chembl_with_doc_df)}')
chembl_with_doc_df.to_json(f'{cur_dir}/data/chembl_CYP3A4_with_doi.json', orient='records')

# Expand the nested document records into one column per field
docs = chembl_with_doc_df['document']
docs_df = pd.json_normalize(docs)
print(f'Information provided by document: {docs_df.columns.tolist()}')

# Journal information (suggesting similar criteria for acceptance)
journals = docs_df['journal'].dropna().unique()
print(f'\nNumber of journals within supporting documents: {len(journals)}')
print(f'Journal names: {journals}')

Number of unique targets: 1 
Number of total rows: 2394

Target: Cytochrome P450 3A4

Number of entries with a document attached: 2344
Information provided by document: ['doi', 'pmid', 'journal', 'abstract', 'stupid_response']

Number of journals within supporting documents: 11
Journal names: ['Bioorg Med Chem Lett' 'J Med Chem' 'Bioorg Med Chem' 'ACS Med Chem Lett'
 'Eur J Med Chem' 'J Nat Prod' 'Drug Metab Dispos' 'Medchemcomm'
 'Antimicrob Agents Chemother' 'RSC Med Chem' 'Med Chem Res']


In [40]:
# One list of activity records per row of chembl_with_doc_df
activities = chembl_with_doc_df['activities'].tolist()

# Flatten to one entry per activity. Alongside each record keep the position of
# its parent row (original_index) and its position inside that row's list
# (activity_index) so every activity can be traced back.
flattened_activities = [
    item for sublist in activities for item in sublist
]
original_indices = [
    row_pos for row_pos, sublist in enumerate(activities) for _ in sublist
]
activity_indices = [
    item_pos for sublist in activities for item_pos, _ in enumerate(sublist)
]

activities_df = pd.DataFrame(flattened_activities)
activities_df['original_index'] = original_indices
activities_df['activity_index'] = activity_indices

# Get the molecular weight from a smiles string
def calculate_mol_weight(smiles):
    """Return the average molecular weight of the molecule given as SMILES.

    The average weight (``Descriptors.MolWt``) is used instead of the
    monoisotopic mass (``Descriptors.ExactMolWt``) because the result feeds the
    conversion of mass concentrations (e.g. 'ug ml-1') into molar ones, which
    is defined in terms of the average molar mass.

    Args:
        smiles: SMILES string describing the molecule.

    Returns:
        The average molecular weight reported by RDKit for the parsed molecule.

    Raises:
        ValueError: If ``smiles`` is not a string or RDKit cannot parse it.
    """
    if not isinstance(smiles, str):
        raise ValueError(f'Expected a SMILES string, got {smiles!r}')

    # MolFromSmiles signals a parse failure by returning None, not by raising
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f'Could not parse SMILES: {smiles!r}')
    return Descriptors.MolWt(mol)

# Multiplicative factors for molar units that need no molecular weight
unit_dict = dict(mM=1e3, nM=1e-3)


# Look up the multiplicative factor for a given unit
def get_conversion_factor(unit, mol_weight=None):
    """Return the factor a value in ``unit`` is multiplied by during conversion.

    Args:
        unit: Unit label of the value to convert.
        mol_weight: Molecular weight of the compound. It is only used for the
            mass-concentration units 'ug ml-1' and 'mg/ml', where it must be
            supplied (dividing by the default ``None`` raises ``TypeError``).

    Returns:
        The conversion factor, or ``None`` when ``unit`` is not recognised.
    """
    # Mass concentrations: the factor is a fixed numerator over the weight
    mass_unit_numerators = {'ug ml-1': 1e3, 'mg/ml': 1e6}
    numerator = mass_unit_numerators.get(unit)
    if numerator is not None:
        return numerator / mol_weight

    # Molar units: plain table lookup, None for anything unknown
    return unit_dict.get(unit)

# Units to convert to
desired_units = ['uM', '10^-6 mol/L', 'microM', 'umol/L']

# Function to convert units
def convert_units(row):
    """Return a copy of ``row`` with its value expressed in ``desired_units[0]``.

    Args:
        row: Record with 'smiles', 'value' and 'unit' entries (one row of the
            activities frame when used with ``DataFrame.apply(axis=1)``).

    Returns:
        A copy of the row. Units listed in ``desired_units`` are only
        relabelled to ``desired_units[0]``. Other units with a known conversion
        factor have 'value' rescaled and 'unit' relabelled. Rows whose unit has
        no conversion factor are returned unchanged.
    """
    # Work on a copy: pandas does not support mutating the object passed to
    # the function given to DataFrame.apply.
    row = row.copy()
    unit = row['unit']

    if unit in desired_units:
        row['unit'] = desired_units[0]
        return row

    # Only mass-concentration units need a molecular weight. Skipping the RDKit
    # parse otherwise avoids wasted work and keeps rows in molar units from
    # failing on a missing or unparseable SMILES they never needed.
    mass_units = ('ug ml-1', 'mg/ml')
    mol_weight = calculate_mol_weight(row['smiles']) if unit in mass_units else None

    conversion_factor = get_conversion_factor(unit, mol_weight)
    # get_conversion_factor returns None for units it does not know
    if conversion_factor:
        row['value'] = row['value'] * conversion_factor
        row['unit'] = desired_units[0]

    return row

# Give the activity columns readable names.
# NOTE: the assignment is positional, so it relies on the column order of
# activities_df as it was built above.
activities_df.columns = [
    'molregno', 'relation', 'value', 'unit',
    'standard_type', 'compound_name', 'smiles',
    'original_index', 'activity_index'
]

# Units that can be handled by convert_units
allowed_units = [
    'uM', '10^-6 mol/L', 'microM', 'umol/L',
    'nM', 'mM', 'ug ml-1', # 'mg/ml'
]

# Build one mask per requirement, then keep the rows satisfying all of them
has_nonzero_value = activities_df['value'].notna() & (activities_df['value'] != 0)
has_allowed_unit = activities_df['unit'].notna() & activities_df['unit'].isin(allowed_units)
is_exact_measurement = activities_df['relation'].notna() & (activities_df['relation'] == '=')
activities_df = activities_df[has_nonzero_value & has_allowed_unit & is_exact_measurement]

# Bring every remaining value to the common unit, row by row
activities_df = activities_df.apply(convert_units, axis=1)

print(f'Total number of activities for target: {len(activities_df)}')

Total number of activities for target: 5126


In [41]:
# Put the per-row metadata next to the expanded document fields
document_level_df = pd.concat(
    [chembl_with_doc_df.drop(columns=['document', 'activities']), docs_df],
    axis=1,
).reset_index(drop=True)

# Attach that metadata to every activity through the shared 'original_index' key
combined_df = activities_df.merge(document_level_df, how='left', on='original_index')

print(f'Columns: {combined_df.columns.tolist()}\n')
print(combined_df.head())

Columns: ['molregno', 'relation', 'value', 'unit', 'standard_type', 'compound_name', 'smiles', 'original_index', 'activity_index', 'assay', 'target', 'target_id', 'num_activities', 'doi', 'pmid', 'journal', 'abstract', 'stupid_response']

   molregno relation  value unit standard_type  \
0   1353266        =  0.084   uM          IC50   
1   1353267        =  0.200   uM          IC50   
2   1353268        =  0.180   uM          IC50   
3   1353269        =  0.041   uM          IC50   
4   1353270        =  0.028   uM          IC50   

                                       compound_name  \
0  Thiazol-5-ylmethyl(2S,3R)-3-hydroxy-4-(N-methy...   
1  thiazol-5-ylmethyl(2S,3R)-4-(2-(3-(dimethylami...   
2  thiazol-5-ylmethyl(2S,3R)-4-(2-((2-(dimethylam...   
3  Thiazol-5-ylmethyl(2S,3R)-3-hydroxy-4-(N-isobu...   
4  Thiazol-5-ylmethyl(2S,3R)-4-(2-(ethylamino)-N-...   

                                              smiles  original_index  \
0  CN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)OCc1cncs1)C..

In [62]:
# Get df to save
# Keep only strictly positive values below the cutoff. log10 is undefined for
# values <= 0, and the earlier cleaning step only removed exact zeros, so
# negative entries would otherwise end up as NaN regression targets.
max_value = 1000  # exclusive upper cutoff applied to the 'value' column
in_range = (combined_df['value'] > 0) & (combined_df['value'] < max_value)
# .copy() gives an independent frame, so adding a column below does not write
# into a slice of combined_df (avoids pandas' SettingWithCopyWarning).
df = combined_df[in_range].copy()
df['log_value'] = df['value'].apply(np.log10)

# Save data for regression task
df.to_csv(f'{cur_dir}/data/regression.csv')
df.to_json(f'{cur_dir}/data/regression.json', orient='records')

In [63]:
# Count the saved entries per journal once, then report every known journal
# (journals with no remaining entries are reported as 0)
print('Number of entries for journal:')
journal_counts = df['journal'].value_counts()
for journal in journals:
    print(f'\t{journal}: {journal_counts.get(journal, 0)}')

Number of entries for journal:
	Bioorg Med Chem Lett: 2220
	J Med Chem: 1917
	Bioorg Med Chem: 247
	ACS Med Chem Lett: 282
	Eur J Med Chem: 189
	J Nat Prod: 61
	Drug Metab Dispos: 80
	Medchemcomm: 24
	Antimicrob Agents Chemother: 2
	RSC Med Chem: 6
	Med Chem Res: 0


In [65]:
# Show the columns of the saved regression frame (now including 'log_value')
print(df.columns)

Index(['molregno', 'relation', 'value', 'unit', 'standard_type',
       'compound_name', 'smiles', 'original_index', 'activity_index', 'assay',
       'target', 'target_id', 'num_activities', 'doi', 'pmid', 'journal',
       'abstract', 'stupid_response', 'log_value'],
      dtype='object')


In [81]:
# Flag rows whose (compound_name, smiles) pair already appeared earlier in df.
# With the default keep='first', the first occurrence of each pair is NOT flagged.
is_repeat = df.duplicated(subset=['compound_name', 'smiles'])
duplicates_df = df.loc[is_repeat].sort_values('compound_name')
duplicates_df.to_csv('duplicates.csv')

In [None]:
# Scratch note kept as a bare string literal: a small tab-separated table of
# five DOIs. It has no effect on the analysis.
# NOTE(review): its purpose is not evident from the notebook — confirm whether
# this cell is still needed.
'''
	doi
1	10.1016/s0960-894x(98)00653-2
2	10.1016/j.bmc.2020.115349
3	10.1021/jm400288z
4	10.1016/j.bmcl.2012.03.070
5	10.1021/jm021012t
'''