**Notebook - Preparing MOA Label Data**

This notebook loads compound mechanism of action information obtained from ChEMBL and the Drug Repurposing Hub and alters it to align with the information from the JUMP dataset metadata.

# Imports:

In [11]:
import pandas as pd
import numpy as np
import rdkit.Chem.inchi as inchi

# ChEMBL MOA Data:

See foot of https://github.com/jump-cellpainting/compound-annotator for data creation steps.

## ChEMBL annotation file

In [3]:
# Load the ChEMBL annotation table (.csv.gz) into a DataFrame:
chembl_df = pd.read_csv(
    filepath_or_buffer="../data/metadata/chembl/chembl_annotation.csv.gz"
)

In [4]:
# Columns removed from the ChEMBL annotation table; after the drop the remaining
# rows are de-duplicated and the index is rebuilt.
drop_cols = ['assay_chembl_id','target_chembl_id','assay_type','pchembl_value',
             'confidence_score', 'pref_name']
# errors="ignore" keeps this cell re-runnable: `chembl_df` is overwritten here, so on a
# second execution the columns are already gone and a plain drop would raise KeyError.
chembl_df = (
    chembl_df
    .drop(columns=drop_cols, errors="ignore")
    .drop_duplicates()
    .reset_index(drop=True)
)
print(chembl_df.shape)
chembl_df.head(2)

(556272, 2)


Unnamed: 0,molecule_chembl_id,standard_inchi_key
0,CHEMBL4107559,UVVXRMZCPKQLAO-OAHLLOKOSA-N
1,CHEMBL4108338,OZBMIGDQBBMIRA-CQSZACIVSA-N


## ChEMBL Kinase Mechanism Data:
Source: https://www.ebi.ac.uk/chembl/g/#browse/mechanisms_of_action

In [6]:
# Load the ChEMBL kinase mechanism-of-action table:
chembl_kinase = pd.read_csv(
    filepath_or_buffer='../data/metadata/chembl/chembl_kinase_moa.csv'
)
print(chembl_kinase.shape)
chembl_kinase.head(2)

(1022, 12)


Unnamed: 0,molecule_chembl_id,Parent Molecule Name,Parent Molecule Type,Max Phase,Mechanism of Action,Target Name,moa,Smiles,target_chembl_id,Action Type,Target Type,Target Organism
0,CHEMBL4650485,PIRTOBRUTINIB,Small molecule,3,Tyrosine-protein kinase BTK inhibitor,Tyrosine-protein kinase BTK,BTK inhibitor,COc1ccc(F)cc1C(=O)NCc1ccc(-c2nn([C@@H](C)C(F)(...,CHEMBL5251,INHIBITOR,SINGLE PROTEIN,Homo sapiens
1,CHEMBL3991932,PEXMETINIB,Small molecule,1,Tyrosine-protein kinase TIE-2 inhibitor,Tyrosine-protein kinase TIE-2,TIE inhibitor,Cc1ccc(-n2nc(C(C)(C)C)cc2NC(=O)NCc2cc(F)ccc2Oc...,CHEMBL4128,INHIBITOR,SINGLE PROTEIN,Homo sapiens


## Combine MOA with inchi_key data:
'standard_inchi_key' in ChEMBL can be matched to 'Metadata_InChIKey' in the JUMP compound metadata, see here for more details: https://github.com/jump-cellpainting/compound-annotator

In [7]:
# Attach the annotation columns to the kinase MOA rows. The inner join keeps only
# molecules whose 'molecule_chembl_id' appears in both tables.
chembl_data = chembl_kinase.merge(
    right=chembl_df,
    how='inner',
    on='molecule_chembl_id',
)
print(chembl_data.shape)
chembl_data.head(2)

(672, 13)


Unnamed: 0,molecule_chembl_id,Parent Molecule Name,Parent Molecule Type,Max Phase,Mechanism of Action,Target Name,moa,Smiles,target_chembl_id,Action Type,Target Type,Target Organism,standard_inchi_key
0,CHEMBL3991932,PEXMETINIB,Small molecule,1,Tyrosine-protein kinase TIE-2 inhibitor,Tyrosine-protein kinase TIE-2,TIE inhibitor,Cc1ccc(-n2nc(C(C)(C)C)cc2NC(=O)NCc2cc(F)ccc2Oc...,CHEMBL4128,INHIBITOR,SINGLE PROTEIN,Homo sapiens,LNMRSSIMGCDUTP-UHFFFAOYSA-N
1,CHEMBL3991932,PEXMETINIB,Small molecule,1,MAP kinase p38 alpha inhibitor,MAP kinase p38 alpha,p38 MAPK inhibitor,Cc1ccc(-n2nc(C(C)(C)C)cc2NC(=O)NCc2cc(F)ccc2Oc...,CHEMBL260,INHIBITOR,SINGLE PROTEIN,Homo sapiens,LNMRSSIMGCDUTP-UHFFFAOYSA-N


In [None]:
# Harmonise column names with the Drug Repurposing Hub data, tag every row with its
# source, and keep a single row per (Metadata_InChIKey, moa) pair:
chembl_data = (
    chembl_data
    .rename(
        columns={
            "standard_inchi_key": "Metadata_InChIKey",
            "Smiles": "smiles",
            "Target Name": "target",
            "Max Phase": "clinical_phase",
        }
    )
    .assign(moa_src='chembl')
    .drop_duplicates(subset=['Metadata_InChIKey', 'moa'])
)

# Drug Repurposing Hub MOA Data:
Sourced from https://clue.io/repurposing#download-data
and https://github.com/jump-cellpainting/compound-annotator/blob/main/data/repurposing_samples_20200324_cleaned.txt.gz

## Drug Dataframe
Contains MOA labels.

In [8]:
# Drug-level Repurposing Hub table: tab-separated, ISO-8859-1 encoded,
# with text following "!" treated as a comment by the parser.
drug_df = pd.read_csv(
    filepath_or_buffer='../data/metadata/clue_io/repurposing_drugs_20200324_cleaned.txt',
    sep="\t",
    comment="!",
    encoding="ISO-8859-1",
)

print(drug_df.shape)
drug_df.head(2)

(6798, 6)


Unnamed: 0,pert_iname,clinical_phase,moa,target,disease_area,indication
0,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease
1,(R)-(-)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,,


## Sample Dataframe
Contains 'InChIKey', used to match to JUMP data.

In [9]:
# Sample-level Repurposing Hub table: tab-separated, ISO-8859-1 encoded,
# with text following "!" treated as a comment by the parser.
sample_df = pd.read_csv(
    filepath_or_buffer='../data/metadata/clue_io/repurposing_samples_20200324_cleaned.txt',
    sep="\t",
    comment="!",
    encoding="ISO-8859-1",
)

print(sample_df.shape)
sample_df.head(2)

(13553, 12)


Unnamed: 0,broad_id,pert_iname,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,InChIKey,pubchem_cid,deprecated_broad_id
0,BRD-K76022557-003-28-9,(R)-(-)-apomorphine,0,98.9,MedChemEx,HY-12723A,Apomorphine (hydrochloride hemihydrate),267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,
1,BRD-K76022557-003-02-7,(R)-(-)-apomorphine,0,97.34,Tocris,2073,(R)-(-)-Apomorphine hydrochloride,267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,


## Merge the Drug and Sample Dataframes:
See https://github.com/broadinstitute/lincs-cell-painting/tree/master/metadata/moa for previous usage of this approach.

In [12]:
# Inner-join the MOA annotations (drug_df) with the per-sample identifiers (sample_df)
# on the shared compound name, then rebuild the index.
drug_samp = drug_df.merge(sample_df, on="pert_iname", how="inner").reset_index(drop=True)

# Move broad_id to first column
col_order = drug_samp.columns.tolist()
col_order.insert(0, col_order.pop(col_order.index("broad_id")))

# InChIKey14 holds the first 14 characters of each key. Values that start with "InChI"
# are treated as full InChI strings and converted to a key first. The isinstance guards
# let missing values (NaN/None) pass through unchanged instead of raising AttributeError
# on `.startswith`, and avoid storing the literal text "nan"/"None" as a key prefix.
# The source column is then renamed to Metadata_InChIKey; its values are left as loaded.
drug_samp = (
    drug_samp.loc[:, col_order]
    .assign(
        InChIKey14=lambda frame: frame["InChIKey"]
        .apply(
            lambda x: inchi.InchiToInchiKey(x)
            if isinstance(x, str) and x.startswith("InChI")
            else x
        )
        .apply(lambda x: x[:14] if isinstance(x, str) else x)
    )
    .rename(columns={"InChIKey": "Metadata_InChIKey"})
)

# Export is currently disabled (the to_csv call below is commented out);
# the output path is kept so it can be re-enabled.
output_file = "../data/metadata/clue_io/repurposing_moa"
# drug_samp.to_csv(f"{output_file}.csv", index=False)

print(drug_samp.shape)
drug_samp.head(2)

(13553, 18)


Unnamed: 0,broad_id,pert_iname,clinical_phase,moa,target,disease_area,indication,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,Metadata_InChIKey,pubchem_cid,deprecated_broad_id,InChIKey14
0,BRD-K76022557-003-28-9,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,0,98.9,MedChemEx,HY-12723A,Apomorphine (hydrochloride hemihydrate),267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,,VMWNQDUVQKEIOC
1,BRD-K76022557-003-02-7,(R)-(-)-apomorphine,Launched,dopamine receptor agonist,ADRA2A|ADRA2B|ADRA2C|CALY|DRD1|DRD2|DRD3|DRD4|...,neurology/psychiatry,Parkinson's Disease,0,97.34,Tocris,2073,(R)-(-)-Apomorphine hydrochloride,267.126,CN1CCc2cccc-3c2[C@H]1Cc1ccc(O)c(O)c-31,VMWNQDUVQKEIOC-CYBMUJFWSA-N,6005.0,,VMWNQDUVQKEIOC


## Clean the DR Hub Data:

In [None]:
# Keep only rows that have both a compound key and an MOA label, label missing
# targets as 'Not listed', and collapse rows repeating the same
# (Metadata_InChIKey, moa) pair (i.e. just a different vendor etc.):
drug_samp = (
    drug_samp
    .dropna(subset=['Metadata_InChIKey', 'moa'])
    .assign(target=lambda frame: frame['target'].fillna('Not listed'))
    .drop_duplicates(subset=['Metadata_InChIKey', 'moa'])
)

# Restrict to rows whose MOA label contains "inhibitor", rebuild the index,
# and record the source of the label:
dhub_inhibitors = (
    drug_samp
    .loc[lambda frame: frame['moa'].str.contains("inhibitor")]
    .reset_index(drop=True)
    .assign(moa_src='dr_hub')
)

# Manual changes to long-form MOA IDs:
Detailed below are the manual changes made to the above databases:

- Replaced "glycogen synthase kinase inhibitor" with GSK inhibitor.
- Replaced "Aurora kinase inhibitor" with AURK inhibitor.
- Replaced "rho associated kinase inhibitor" with ROCK inhibitor.
- Replaced "ALK tyrosine kinase receptor inhibitor" with ALK inhibitor.
- Replaced "Bruton's tyrosine kinase (BTK) inhibitor" with BTK inhibitor.
- Replaced "PDGFR tyrosine kinase receptor inhibitor" with PDGFR inhibitor.
- Altered any numbered CDK inhibitors, e.g. "CDK9 inhibitor", to just "CDK inhibitor".
- Altered moas according to the target column, e.g. EGFR|ERBB2 target = EGFR inhibitor|HER2 inhibitor
- Separated any BCR-ABL inhibitors into BCR and ABL by targets.

# Saving to Csv:

In [None]:
# Write the ChEMBL MOA table to disk without the row index:
chembl_data.to_csv(
    path_or_buf='../data/metadata/chembl/chembl_cleaned.csv',
    index=False,
)

In [None]:
# Write the Drug Repurposing Hub inhibitor table to disk without the row index:
dhub_inhibitors.to_csv(
    path_or_buf="../data/metadata/clue_io/dhub_inhibitors.csv",
    index=False,
)