# ChEMBL molecule details

Get more details on molecules from the recombined library that were found in ChEMBL:
- Get targets that these molecules are active on
- Get classification for these targets (e.g. are these targets kinases?)

In [1]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
import requests
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

## Load ChEMBL molecules

These are the molecules from the recombined library that were found in ChEMBL.

In [2]:
# Read one InChI per line. splitlines()/strip() remove line endings without
# cutting off a real character when the last line has no trailing newline.
with open('novel_chembl_mols.txt', 'r') as f:
    lines = f.read().splitlines()

# Skip blank lines so that no empty string is handed to RDKit.
molecules = [line.strip() for line in lines if line.strip()]
# The ChEMBL lookup further down is done with InChIKeys, so convert each InChI.
molecules = [Chem.InchiToInchiKey(inchi) for inchi in molecules]

In [3]:
len(molecules)

121

## Get molecules details from ChEMBL

In [4]:
similarity = new_client.similarity
compound = new_client.molecule
activity = new_client.activity
target = new_client.target

### Get molecule ChEMBL IDs

In [None]:
# Query the ChEMBL `molecule` endpoint once per InChIKey (lazily, in input order)
# and keep only the fields that are used in the rest of the notebook.
chembl_entries = (compound.get(inchikey) for inchikey in molecules)

details = pd.DataFrame([
    {
        'smiles': entry['molecule_structures']['canonical_smiles'],
        'inchi': entry['molecule_structures']['standard_inchi'],
        'atc_classification': entry['atc_classifications'],
        'first_approval': entry['first_approval'],
        'chembl_id': entry['molecule_chembl_id'],
    }
    for entry in chembl_entries
])

In [None]:
print(f'Number of queries: {len(molecules)}')
print(f'Number of query results: {details.shape[0]}')

In [None]:
details.head()

### Get measured activities for molecules

In [None]:
# Restrict the ChEMBL `activity` endpoint to exact ('=') IC50 measurements
# of assay type 'B' for the molecules found above ...
ic50_measurements = activity.filter(
    molecule_chembl_id__in=list(details.chembl_id),
    type='IC50',
    relation='=',
    assay_type='B',
)

# ... and request only the fields that are needed downstream.
activities = ic50_measurements.only(
    'activity_id',
    'assay_chembl_id',
    'assay_description',
    'assay_type',
    'molecule_chembl_id',
    'type',
    'units',
    'relation',
    'value',
    'target_chembl_id',
    'target_organism',
    'target_pref_name',
)

In [None]:
len(activities)

In [None]:
bioact_df = pd.DataFrame.from_records(activities)
bioact_df.shape

In [None]:
print(f'Number of unique molecules: {len(bioact_df.molecule_chembl_id.unique())}')

In [None]:
bioact_df.units.unique()

In [None]:
def convert_to_nM(unit, bioactivity):
    """
    Convert a bioactivity value reported in `unit` to nanomolar (nM).

    Parameters
    ----------
    unit : str
        Unit label of the measurement (must be a key of the conversion table below).
    bioactivity : str or float
        Measured value in `unit`; it is passed through `float`.

    Returns
    -------
    float or None
        The value in nM. None is returned (after printing a message) if the unit
        is not in the conversion table or the value cannot be converted to a float.
    """

    # Factor by which a value in the given unit is multiplied to obtain nM.
    conversion_factors = {
        "pM": 1e-3,
        "10'-11M": 1e-2,
        "10'-10M": 1e-1,
        "nM": 1e+0,
        "10'-8M": 1e+1,
        "10'-1microM": 1e+2,
        "10'-7M": 1e+2,
        "uM": 1e+3,
        "/uM": 1e+3,
        "10'-6M": 1e+3,
        "10'1 uM": 1e+4,
        "10'2 uM": 1e+5,
        "mM": 1e+6,
        "mmol/L": 1e+6,
        "M": 1e+9
    }

    factor = conversion_factors.get(unit)
    if factor is None:
        print(f'Unit not recognized: {unit}')
        return None

    # A missing (None) or non-numeric value is reported like an unknown unit
    # instead of aborting the conversion of the whole table.
    try:
        return float(bioactivity) * factor
    except (TypeError, ValueError):
        print(f'Value not convertible to float: {bioactivity}')
        return None

In [None]:
bioact_df.rename(columns={"units": "units_original", "value": "value_original"}, inplace=True)

In [None]:
# Express every measurement in nM; the original value/unit stay available in
# the `*_original` columns.
bioact_df['value'] = [
    convert_to_nM(original_unit, original_value)
    for original_unit, original_value in zip(
        bioact_df['units_original'], bioact_df['value_original']
    )
]
bioact_df['units'] = 'nM'
bioact_df.shape

In [None]:
bioact_df.head()

In [None]:
bioact_df = bioact_df.astype({'value': 'float'})

### Get all active molecules

In [None]:
activity_threshold = 500

In [None]:
bioact_df_active = bioact_df[bioact_df.value <= activity_threshold].copy()  # try out with 500, 50, 5

In [None]:
bioact_df_active.shape

In [None]:
len(bioact_df_active.molecule_chembl_id.unique())

In [None]:
bioact_df_active.head()

#### Check number of unique target names

In [None]:
bioact_df_active.target_pref_name.value_counts()

In [None]:
len(bioact_df_active.target_pref_name)

In [None]:
len(bioact_df_active.target_pref_name.value_counts())

#### Check number of unique ChEMBL target IDs

In [None]:
bioact_df_active.target_chembl_id.value_counts()

In [None]:
len(bioact_df_active.target_chembl_id)

In [None]:
len(bioact_df_active.target_chembl_id.value_counts())

### Multiple assay tests per molecule

In [None]:
bioact_df_active.groupby(by='molecule_chembl_id')['value'].describe()

In [None]:
len(bioact_df_active.molecule_chembl_id.unique())

### Get all ATC classified molecules

In [None]:
# Rows whose first_approval field is filled, i.e. not missing
details[details.first_approval.notna()]

In [None]:
# Rows with at least one ATC classification entry. `> 0` (rather than `!= 0`)
# also excludes rows where the length is missing (NaN), which would otherwise
# be reported as "not empty".
details[details.atc_classification.str.len() > 0]

### Get all approved drugs

In [None]:
approved_drugs = compound.filter(molecule_chembl_id__in = list(details.chembl_id), max_phase=4)

In [None]:
len(approved_drugs)

In [None]:
approved_mol = approved_drugs[0]

In [None]:
chembl_id = approved_mol['molecule_chembl_id']
chembl_id

In [None]:
# IC50 measurements of assay type 'B' for the selected approved molecule,
# limited to the fields shown in the table below.
activities = (
    activity
    .filter(molecule_chembl_id=chembl_id)
    .filter(type='IC50')
    .filter(assay_type='B')
    .only(
        'activity_id',
        'assay_chembl_id',
        'assay_description',
        'assay_type',
        'molecule_chembl_id',
        'type',
        'units',
        'relation',
        'value',
        'target_chembl_id',
        'target_organism',
    )
)

In [None]:
act = pd.DataFrame(list(activities))
act

### Get ChEMBL target details: Protein target classification

In order to access the protein target classification, we follow the instructions given here:

https://github.com/chembl/chembl_webservices_2/issues/95

In [None]:
target_chembl_ids = list(bioact_df_active.target_chembl_id.unique())
print(len(target_chembl_ids))
target_chembl_ids

#### Define request functions for `target`, `target_components` and `protein_class` ChEMBL endpoints

In [None]:
def extract_component_id_from_target(target_chembl_id, timeout=30):
    """
    Go to `target` endpoint and extract `component_id`

    Parameters
    ----------
    target_chembl_id : str
        Target ChEMBL ID that is inserted into the request URL.
    timeout : float
        Seconds to wait for the server; without a timeout `requests` would
        wait indefinitely on a stalled connection.

    Returns
    -------
    list
        The `component_id` of every entry in the response's `target_components`.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within `timeout` seconds.
    """

    target_url = f'https://www.ebi.ac.uk/chembl/api/data/target/{target_chembl_id}.json'

    response = requests.get(target_url, timeout=timeout)
    response.raise_for_status()  # this line checks for potential errors
    result = response.json()

    component_ids = [i['component_id'] for i in result['target_components']]
    return component_ids


In [None]:
def extract_protein_classification_id_from_target_components(component_id, timeout=30):
    """
    Go to `target_components` endpoint and extract `protein_classification_id`

    Parameters
    ----------
    component_id : int or str
        Component ID that is inserted into the request URL.
    timeout : float
        Seconds to wait for the server; without a timeout `requests` would
        wait indefinitely on a stalled connection.

    Returns
    -------
    list
        The `protein_classification_id` of every entry in the response's
        `protein_classifications`.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within `timeout` seconds.
    """

    target_components_url = f'https://www.ebi.ac.uk/chembl/api/data/target_component/{component_id}.json'

    response = requests.get(target_components_url, timeout=timeout)
    response.raise_for_status()  # this line checks for potential errors
    result = response.json()

    protein_classification_ids = [i['protein_classification_id'] for i in result['protein_classifications']]

    return protein_classification_ids

In [None]:
def get_protein_target_classification_from_protein_class(protein_classification_id, timeout=30):
    """
    Go to `protein_class` endpoint and extract protein target classification.

    Parameters
    ----------
    protein_classification_id : int or str
        Protein classification ID that is inserted into the request URL.
    timeout : float
        Seconds to wait for the server; without a timeout `requests` would
        wait indefinitely on a stalled connection.

    Returns
    -------
    pandas.Series
        The JSON response of the endpoint, one entry per response field.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within `timeout` seconds.
    """

    protein_class_url = f'https://www.ebi.ac.uk/chembl/api/data/protein_class/{protein_classification_id}.json'

    response = requests.get(protein_class_url, timeout=timeout)
    response.raise_for_status()  # this line checks for potential errors
    result = response.json()

    return pd.Series(result)

In [None]:
def get_protein_target_classifications(target_chembl_ids):
    """
    Collect the protein target classifications for a list of target ChEMBL IDs.

    For every target the chain `target` -> `target_components` -> `protein_class`
    is followed. Each classification that is found becomes one row of the returned
    DataFrame, extended by the target ChEMBL ID, component ID and protein
    classification ID it was reached through. A message is printed whenever a
    target does not have exactly one component, or a component does not have
    exactly one protein classification.
    """

    def _classification_rows():
        # Yield one annotated classification per (target, component, classification).
        for target_id in target_chembl_ids:

            # `target` endpoint: components of this target
            comp_ids = extract_component_id_from_target(target_id)
            n_components = len(comp_ids)
            if n_components != 1:
                print(f'{target_id}: {n_components} component IDs for target ChEMBL ID.')

            for comp_id in comp_ids:

                # `target_components` endpoint: classifications of this component
                class_ids = extract_protein_classification_id_from_target_components(comp_id)
                n_classes = len(class_ids)
                if n_classes != 1:
                    print(f'{target_id}: {n_classes} protein classification IDs for target ChEMBL ID.\n')

                for class_id in class_ids:

                    # `protein_class` endpoint: the classification itself
                    row = get_protein_target_classification_from_protein_class(class_id)

                    # Remember which IDs led to this classification
                    row['target_chembl_id'] = target_id
                    row['component_id'] = comp_id
                    row['protein_classification_id'] = class_id

                    yield row

    return pd.DataFrame(list(_classification_rows()))

#### Get protein target classification for target ChEMBL IDs

For each query, check how many protein target classification entries are available.

In [None]:
len(target_chembl_ids)

In [None]:
protein_target_classifications = get_protein_target_classifications(target_chembl_ids)

In [None]:
protein_target_classifications.shape

In [None]:
len(protein_target_classifications.target_chembl_id.unique())

#### Look at protein target classifications

In [None]:
protein_target_classifications

#### Per target ChEMBL ID, drop duplicated content (manually!)

Get indices that we want to drop - this is a manual step because we need to decide which entry per target ChEMBL ID we would like to keep/toss.

In [None]:
# Show only targets with more than one classification entry. `GroupBy.filter`
# keeps the rows of the groups that pass the test together with their original
# row labels, which are the labels needed to choose the rows to drop.
# The stable sort places the entries of one target next to each other.
protein_target_classifications.groupby(by='target_chembl_id').filter(lambda x: x.shape[0] > 1).sort_values(by='target_chembl_id', kind='mergesort')

In [None]:
# Manual step!!
# Row labels (index values of `protein_target_classifications`) of the entries
# to discard; they are passed to `DataFrame.drop` below.
# NOTE(review): these labels presumably depend on the content and order of the
# ChEMBL responses at query time - re-check them whenever the queries are re-run.
drop_class_ix = [6, 11, 17, 19]

In [None]:
protein_target_classifications.shape

In [None]:
protein_target_classifications.drop(drop_class_ix, inplace=True)

In [None]:
protein_target_classifications.shape

#### Show enzyme group distribution

In [None]:
protein_target_classifications.groupby(by='l2').size()

### Now combine molecule with target information!

In [None]:
protein_target_classifications.shape

In [None]:
bioact_df_active.shape

In [None]:
molecules_targets_details = pd.merge(bioact_df_active, protein_target_classifications, on='target_chembl_id', how='left')

In [None]:
molecules_targets_details.sort_values(by='molecule_chembl_id')

In [None]:
# Columns kept in the combined molecule/target table. Every name is listed once:
# a repeated name (previously `l4`) yields a duplicated column when the list is
# used for column selection.
columns_of_interest = (
    'molecule_chembl_id target_chembl_id assay_chembl_id target_organism target_pref_name '
    'l2 l3 l4 l5 l6 value units assay_description'
).split()

In [None]:
molecules_of_interest = molecules_targets_details[columns_of_interest]
molecules_of_interest.shape

In [None]:
molecules_of_interest

#### Filter molecules of interest

- Human targets
- Kinases

In [None]:
# Keep the measurements on human targets only; afterwards the organism column
# holds a single value and is removed.
is_human_target = molecules_of_interest['target_organism'] == 'Homo sapiens'
molecules_selected = molecules_of_interest.loc[is_human_target].drop(columns=['target_organism'])
molecules_selected.shape

In [None]:
molecules_selected

In [None]:
# Keep the measurements on targets classified as 'Kinase' on level `l2`;
# afterwards that column holds a single value and is removed.
is_kinase_target = molecules_selected['l2'] == 'Kinase'
molecules_selected = molecules_selected.loc[is_kinase_target].drop(columns=['l2'])
molecules_selected.shape

In [None]:
molecules_selected.sort_values(by=['molecule_chembl_id', 'value'])

In [None]:
len(molecules_selected.molecule_chembl_id.unique())

In [None]:
# Mean of the numeric columns per molecule, sorted by ascending mean value.
# `numeric_only=True` is required because the table also holds text columns
# (IDs, names, descriptions): pandas >= 2.0 raises a TypeError for them instead
# of silently leaving them out.
molecules_selected.groupby(by='molecule_chembl_id').mean(numeric_only=True).sort_values('value')

In [None]:
molecules_selected