**Notebook - Aligning MOA to Jump Data**

This notebook loads the JUMP-CP metadata and cross-references it with the Broad Drug Repurposing Hub and ChEMBL data to match compound chemical IDs ("InChIKey") in order to relate mechanisms of action to specific datapoints.

# Imports:

In [2]:
import pandas as pd
import numpy as np

# Load JUMP metadata:

In [3]:
cwp_data = pd.read_csv('../data/cwp_data.csv')
print(cwp_data.shape)
cwp_data.head(2)

(945604, 8)


Unnamed: 0,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_Batch,Metadata_PlateType
0,JCP2022_000001,AAAHWCWPZPSPIW-UHFFFAOYSA-N,InChI=1S/C25H31N5O2/c1-4-23-26-14-16-30(23)24-...,source_1,UL001783,C29,Batch5_20221030,COMPOUND
1,JCP2022_000013,AABSTWCOLWSFRA-UHFFFAOYSA-N,InChI=1S/C17H19N5O2S/c1-11-20-14(16-22(11)7-8-...,source_1,UL001783,O05,Batch5_20221030,COMPOUND


# Load ChEMBL and Drug Repo. Hub (DRH) Data:

In [4]:
# Load ChEMBL Data:
chembl_moa = pd.read_csv("../data/metadata/chembl/chembl_cleaned.csv")
print(chembl_moa.shape)
chembl_moa.head(2)

(564, 14)


Unnamed: 0,molecule_chembl_id,Parent Molecule Name,Parent Molecule Type,clinical_phase,Mechanism of Action,target,moa,smiles,target_chembl_id,Action Type,Target Type,Target Organism,Metadata_InChIKey,moa_src
0,CHEMBL3991932,PEXMETINIB,Small molecule,1,Tyrosine-protein kinase TIE-2 inhibitor,Tyrosine-protein kinase TIE-2,TIE inhibitor,Cc1ccc(-n2nc(C(C)(C)C)cc2NC(=O)NCc2cc(F)ccc2Oc...,CHEMBL4128,INHIBITOR,SINGLE PROTEIN,Homo sapiens,LNMRSSIMGCDUTP-UHFFFAOYSA-N,chembl
1,CHEMBL3991932,PEXMETINIB,Small molecule,1,MAP kinase p38 alpha inhibitor,MAP kinase p38 alpha,p38 MAPK inhibitor,Cc1ccc(-n2nc(C(C)(C)C)cc2NC(=O)NCc2cc(F)ccc2Oc...,CHEMBL260,INHIBITOR,SINGLE PROTEIN,Homo sapiens,LNMRSSIMGCDUTP-UHFFFAOYSA-N,chembl


In [5]:
# Load Drug Repo. Hub Data:
dhub_inhibitors = pd.read_csv("../data/metadata/clue_io/dhub_inhibitors.csv")
print(dhub_inhibitors.shape)
dhub_inhibitors.head(2)

(3205, 19)


Unnamed: 0,broad_id,pert_iname,clinical_phase,moa,target,disease_area,indication,qc_incompatible,purity,vendor,catalog_no,vendor_name,expected_mass,smiles,Metadata_InChIKey,pubchem_cid,deprecated_broad_id,InChIKey14,moa_src
0,BRD-K75516118-001-04-1,(R)-(-)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,,,0,93.92,Tocris,1349,(R)-(-)-Rolipram,275.152,COc1ccc(cc1OC1CCCC1)[C@@H]1CNC(=O)C1,HJORMJIFDVBMOB-LBPRGKRZSA-N,448055.0,,HJORMJIFDVBMOB,dr_hub
1,BRD-K65856711-001-05-9,(S)-(+)-rolipram,Phase 1,phosphodiesterase inhibitor,PDE4B|PDE4D,,,0,94.78,Tocris,1350,(S)-(+)-Rolipram,275.152,COc1ccc(cc1OC1CCCC1)[C@H]1CNC(=O)C1,HJORMJIFDVBMOB-GFCCVEGCSA-N,158758.0,,HJORMJIFDVBMOB,dr_hub


# Merging DRH and ChEMBL MOA Data:

In [7]:
# Columns that both datasets share which will be used to merge:
keep_cols = ['moa', 'target', 'smiles', 'clinical_phase', 'Metadata_InChIKey', 'moa_src']
ds_reduced = dhub_inhibitors[keep_cols]
ch_reduced = chembl_moa[keep_cols]

In [8]:
# Concatenate the two dataframes together:
moa_combined = pd.concat([ds_reduced, ch_reduced]).reset_index(drop=True)
print(moa_combined.shape)
# Split out any datapoints with >1 moa:
moa_combined['moa'] = moa_combined['moa'].str.split('|')
moa_combined = moa_combined.explode('moa')
print(moa_combined.shape)
# Drop any duplicates with same ID and moa:
moa_combined = moa_combined.drop_duplicates(subset=['Metadata_InChIKey', 'moa'])
print(moa_combined.shape)
# Drop any rows with n/a values in:
moa_combined = moa_combined.dropna(subset=['Metadata_InChIKey', 'moa'])
print(moa_combined.shape)
moa_combined.head(2)

(3769, 6)
(4225, 6)
(3954, 6)
(3954, 6)


Unnamed: 0,moa,target,smiles,clinical_phase,Metadata_InChIKey,moa_src
0,phosphodiesterase inhibitor,PDE4A|PDE4B|PDE4C|PDE4D|PDE5A,COc1ccc(cc1OC1CCCC1)[C@@H]1CNC(=O)C1,Phase 1,HJORMJIFDVBMOB-LBPRGKRZSA-N,dr_hub
1,phosphodiesterase inhibitor,PDE4B|PDE4D,COc1ccc(cc1OC1CCCC1)[C@H]1CNC(=O)C1,Phase 1,HJORMJIFDVBMOB-GFCCVEGCSA-N,dr_hub


### Drop Columns Which Don't Contain Selected Kinase Inhibitors:

In [9]:
# Only retain the kinase inhibitors of interest:
ki_subs = moa_combined[moa_combined['moa'].str.contains(
    "PI3K|EGFR|HER2|p38 MAPK|JAK|RAF|AURK|ROCK|ALK|SRC|MEK|GSK|CDK|Bcr-Abl|PDGFR|FGFR|BTK|AKT|mTOR")
                      ].reset_index(drop=True)
print(ki_subs.shape)
ki_subs.head(3)

(767, 6)


Unnamed: 0,moa,target,smiles,clinical_phase,Metadata_InChIKey,moa_src
0,GSK inhibitor,GSK3A|GSK3B,COc1ccc2c(NC(=O)Nc3cccc(n3)C(F)(F)F)ccnc2c1,Preclinical,VQPBIJGXSXEOCU-UHFFFAOYSA-N,dr_hub
1,AKT inhibitor,AKT1|PKIA|PRKACA,Cc1[nH]nc2ccc(cc12)-c1cncc(OC[C@@H](N)Cc2ccccc...,Preclinical,BPNUQXPIQBZCMR-IBGZPJMESA-N,dr_hub
2,CDK inhibitor,CDK4|CDK6,CCN1CCN(Cc2ccc(Nc3ncc(F)c(n3)-c3cc(F)c4nc(C)n(...,Launched,UZWDCWONPYILKI-UHFFFAOYSA-N,dr_hub


### Re-combining compounds with >1 MOA:

In [10]:
# Function to combine the moa's of datapoints with the same Metadata_InChIKey:
def combine_duplicates(group):
    # Find the unique values in each column
    moa = '|'.join(sorted(set('|'.join(group['moa']).split('|'))))
    target = '|'.join(sorted(set(group['target'].fillna('-'))))
    smiles = '|'.join(sorted(set(group['smiles'].fillna('-'))))
    clinical_phase = '|'.join(sorted(set(group['clinical_phase'].astype(str))))
    moa_src = '|'.join(sorted(set(group['moa_src'])))

    # Return a new dataframe with the combined data
    return pd.DataFrame({
        'moa': [moa],
        'target': [target],
        'smiles': [smiles],
        'clinical_phase': [clinical_phase],
        'Metadata_InChIKey': [group['Metadata_InChIKey'].iloc[0]],
        'moa_src': [moa_src],
    })

In [11]:
ki_comb = ki_subs.groupby('Metadata_InChIKey').apply(combine_duplicates).reset_index(drop=True) 
print(ki_comb.shape)
ki_comb.head(3)

(606, 6)


Unnamed: 0,moa,target,smiles,clinical_phase,Metadata_InChIKey,moa_src
0,EGFR inhibitor,EGFR,O=C1NC(=O)c2cc(Nc3ccccc3)c(Nc3ccccc3)cc12,Preclinical,AAALVYBICLMAMA-UHFFFAOYSA-N,dr_hub
1,EGFR inhibitor,EGFR|NR1I2,COCCOc1cc2ncnc(Nc3cccc(c3)C#C)c2cc1OCCOC,Launched,AAKJLRGGTJKAMG-UHFFFAOYSA-N,dr_hub
2,mTOR inhibitor,MTOR,CO[C@@H](C)Cn1c2c(cnc3ccc(cc23)-c2cncc(c2)C(C)...,Phase 2,ACCFLVVUVBJNGT-AWEZNQCLSA-N,dr_hub


### Checking duplicates have been merged correctly:

In [16]:
# Select one duplicate Metadata_InChIKey to inspect:
dupe_id = 'ZBNZXTGUTAYRHI-UHFFFAOYSA-N'

# Display it in original dataframe:
ki_subs[ki_subs['Metadata_InChIKey']==dupe_id]

Unnamed: 0,moa,target,smiles,clinical_phase,Metadata_InChIKey,moa_src
189,Bcr-Abl inhibitor,ABL1|ABL2|BLK|EPHA2|FGR|FRK|FYN|HCK|KIT|LCK|LY...,Cc1nc(Nc2ncc(s2)C(=O)Nc2c(C)cccc2Cl)cc(n1)N1CC...,Launched,ZBNZXTGUTAYRHI-UHFFFAOYSA-N,dr_hub
190,PDGFR inhibitor,ABL1|ABL2|BLK|EPHA2|FGR|FRK|FYN|HCK|KIT|LCK|LY...,Cc1nc(Nc2ncc(s2)C(=O)Nc2c(C)cccc2Cl)cc(n1)N1CC...,Launched,ZBNZXTGUTAYRHI-UHFFFAOYSA-N,dr_hub
191,SRC inhibitor,ABL1|ABL2|BLK|EPHA2|FGR|FRK|FYN|HCK|KIT|LCK|LY...,Cc1nc(Nc2ncc(s2)C(=O)Nc2c(C)cccc2Cl)cc(n1)N1CC...,Launched,ZBNZXTGUTAYRHI-UHFFFAOYSA-N,dr_hub


In [14]:
# Check it's been combined correctly to form one datapoint with multiple moas:
ki_comb[ki_comb['Metadata_InChIKey']==dupe_id]

Unnamed: 0,moa,target,smiles,clinical_phase,Metadata_InChIKey,moa_src
585,Bcr-Abl inhibitor|PDGFR inhibitor|SRC inhibitor,ABL1|ABL2|BLK|EPHA2|FGR|FRK|FYN|HCK|KIT|LCK|LY...,Cc1nc(Nc2ncc(s2)C(=O)Nc2c(C)cccc2Cl)cc(n1)N1CC...,Launched,ZBNZXTGUTAYRHI-UHFFFAOYSA-N,dr_hub


# Cross-referencing moa to metadata InChlKey:

### Align JUMP Data with MOAs from DRH/ Chembl:

In [18]:
cwp_ki = pd.merge(cwp_data, ki_comb, on='Metadata_InChIKey', how='inner')
print(cwp_ki.shape)
cwp_ki.head(3)

(19765, 13)


Unnamed: 0,Metadata_JCP2022,Metadata_InChIKey,Metadata_InChI,Metadata_Source,Metadata_Plate,Metadata_Well,Metadata_Batch,Metadata_PlateType,moa,target,smiles,clinical_phase,moa_src
0,JCP2022_037716,IVUGFMLRJOCGAS-UHFFFAOYSA-N,InChI=1S/C28H21N7OS/c1-17-15-24(37-16-17)25-20...,source_1,UL001783,B03,Batch5_20221030,COMPOUND,AURK inhibitor,AURKA|AURKB|AURKC,Cc1csc(c1)-c1nnc(Nc2ccc(Oc3ncccc3-c3ccnc(N)n3)...,Phase 1,dr_hub
1,JCP2022_037716,IVUGFMLRJOCGAS-UHFFFAOYSA-N,InChI=1S/C28H21N7OS/c1-17-15-24(37-16-17)25-20...,source_1,UL001783,B46,Batch5_20221030,COMPOUND,AURK inhibitor,AURKA|AURKB|AURKC,Cc1csc(c1)-c1nnc(Nc2ccc(Oc3ncccc3-c3ccnc(N)n3)...,Phase 1,dr_hub
2,JCP2022_037716,IVUGFMLRJOCGAS-UHFFFAOYSA-N,InChI=1S/C28H21N7OS/c1-17-15-24(37-16-17)25-20...,source_1,UL001783,J03,Batch5_20221030,COMPOUND,AURK inhibitor,AURKA|AURKB|AURKC,Cc1csc(c1)-c1nnc(Nc2ccc(Oc3ncccc3-c3ccnc(N)n3)...,Phase 1,dr_hub


# !! Save/load Data:

In [19]:
cwp_ki.to_csv('../data/cwp_ki_moa.csv', index=False)