In [1]:
# conda env: pyg (Python 3.9.16)
import sys
from datacat4ml.const import *

import os
import pandas as pd
import numpy as np
import tmap as tm
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from faerun import Faerun

from rdkit import Chem
from rdkit.Chem import AllChem

In [4]:
def calc_fp_in_df(df, radius=2, nbits=1024, smiles_col='canonical_smiles_by_Std'):
    '''
    Calculate Morgan bit-vector fingerprints from a SMILES column and return them in a copy of the dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified in place.
    radius : int
        Morgan radius handed to RDKit (2 by default, matching the 'ecfp4' column name).
    nbits : int
        Length of the folded bit vector.
    smiles_col : str
        Column holding the SMILES strings to parse. Defaults to the
        standardized SMILES column 'canonical_smiles_by_Std'.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with two extra columns: 'mol' (parsed molecules) and
        'ecfp4' (one fingerprint per row).

    Raises
    ------
    KeyError
        If `smiles_col` is not a column of `df`.
    ValueError
        If at least one entry of `smiles_col` cannot be parsed into a molecule.
    '''
    if smiles_col not in df.columns:
        raise KeyError(f"calc_fp_in_df: column '{smiles_col}' not found in the dataframe")

    new_df = df.copy()

    # Non-string cells (e.g. NaN from a missing value) are treated like unparsable
    # SMILES, so they are reported below together with the invalid strings.
    new_df['mol'] = new_df[smiles_col].apply(
        lambda smi: Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    )

    # MolFromSmiles yields None for SMILES it cannot parse; stop here with the
    # offending values instead of failing later inside the fingerprint call.
    unparsed = new_df['mol'].isna().to_numpy()
    if unparsed.any():
        examples = new_df.loc[unparsed, smiles_col].head(5).tolist()
        raise ValueError(
            f"{int(unparsed.sum())} entries in '{smiles_col}' could not be parsed by RDKit, e.g. {examples}"
        )

    new_df['ecfp4'] = new_df['mol'].apply(
        lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nbits)
    )

    print(f'The shape of df is {new_df.shape}')

    return new_df

In [49]:
def tmap_plot(df, title:str ='tmap', fp:str='ecfp4',
              hf:int=1024, nb:int=32,
              category_col:str='target_chembl_id',
              node_size:float=1/32, mmm_repeats:int=2, steps:int=5, k:int=1000,
              shader:str='smoothCircle',  point_scale:float=2.5, max_point_size:int=10,
              colormap:list=['Set1', 'tab20', 'turbo', 'tab10' ]):
    '''
    Compute a TMAP layout for the rows of `df` and plot it with Faerun.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `fp`, `category_col`, 'tid', 'assay_id',
        'threshold' and 'canonical_smiles_by_Std'.
    title : str
        Name of the scatter layer and the name handed to `Faerun.plot`.
    fp : str
        Column holding one fingerprint vector per row.
    hf, nb : int
        Passed positionally to `tm.LSHForest` (per the tmap docs: the vector
        dimensionality `d` and the number of prefix trees `l`).
        NOTE(review): tmap documents LSHForest for MinHash vectors, while the
        fingerprint rows are added here as-is; `hf` presumably should match
        the fingerprint length - confirm.
    category_col : str
        Column used for the categorical colouring and the legend.
    node_size, mmm_repeats, steps, k
        Copied onto the `tm.LayoutConfiguration` fields `node_size`,
        `mmm_repeats`, `sl_extra_scaling_steps` and `k`.
    shader, point_scale, max_point_size
        Forwarded to `Faerun.add_scatter`.
    colormap
        Forwarded to `Faerun.add_scatter`; callers in this notebook pass
        either a list of colormap names or a single matplotlib colormap.

    Raises
    ------
    KeyError
        If a required column is missing (checked before the layout is computed).
    '''
    # Fail fast: the layout below is the expensive step, so check the columns first.
    required_cols = [fp, category_col, 'tid', 'assay_id', 'threshold', 'canonical_smiles_by_Std']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f'tmap_plot: missing required column(s): {missing_cols}')

    # Locality-sensitive hashing forest used for the approximate nearest-neighbour search.
    lf = tm.LSHForest(hf, nb)
    fp_rows = np.array(df[fp])  # kept separate from `fp`, which is the column name
    lf.batch_add([tm.VectorUint(row) for row in fp_rows])
    lf.index()

    cfg = tm.LayoutConfiguration()  # configuration parameters for the TMAP layout
    cfg.node_size = node_size  # size of nodes, which affects the magnitude of their repelling force
    cfg.mmm_repeats = mmm_repeats  # number of repeats of the per-level layout algorithm
    cfg.sl_extra_scaling_steps = steps  # number of repeats of the scaling
    cfg.k = k  # nearest neighbours used for the k-NN graph; higher values are more expensive
    cfg.sl_scaling_type = tm.RelativeToAvgLength  # defines the relative scale of the graph

    # x, y: node coordinates; s, t: the two ends of each tree edge (used by add_tree below).
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)

    # Encode the category column as integer codes plus (code, label) pairs for the legend.
    category_labels, category_data = Faerun.create_categories(df[category_col])
    print(f'category_labels: {category_labels}')

    f = Faerun(
        view="front",
        coords=False,
        title="",
        clear_color="#FFFFFF"
    )

    # Hand Faerun its own list so a list-valued colormap (including the shared
    # default above) is never exposed to modification by the plotting call.
    scatter_colormap = list(colormap) if isinstance(colormap, list) else colormap

    f.add_scatter(
        title,
        {
            "x": x,
            "y": y,
            # One colour series per entry; order matches `series_title` below.
            "c": [category_data,  # categorical series from `category_col`
                  df['tid'].values.tolist(),
                  df['assay_id'].values.tolist(),
                  df['threshold'].values.tolist(),
                  ],
            "labels": df['canonical_smiles_by_Std'].values.tolist(),  # SMILES as labels
        },
        # Use the caller's settings; these were previously hard-coded to
        # "sphere" / 5 / 20, so the corresponding arguments had no effect.
        shader=shader,
        point_scale=point_scale,
        max_point_size=max_point_size,
        legend_labels=[category_labels, None, None, None],
        categorical=[True, False, False, False],
        colormap=scatter_colormap,
        series_title=[category_col, 'tid', 'assay_id', 'threshold'],
        has_legend=True,
    )
    f.add_tree("AlineDB_TMAP_tree", {"from": s, "to": t}, point_helper=title)
    f.plot(title, template='smiles')

    #labels = [] # the widget labels
    #for i, s in enumerate(df['canonical_smiles']):
    #    labels.append(
    #        s
    #        + "__"
    #        # convert df['tid'].values to a list of strings and append it to the labels
    #        + str(df['tid'].values.tolist()[i])
    #        + "__"
    #        + str(df['assay_id'].tolist()[i])
    #        + "__"
    #        + str(df['pchembl_value'].tolist()[i])
    #        )


# Load files

## mor_effect 

In [2]:
import glob

# Collect every file matching 'mor*csv' in the 'cls' folder of CURA_CAT_DATASETS_DIR.
file_pattern = os.path.join(CURA_CAT_DATASETS_DIR, 'cls', 'mor*csv')
# glob returns matches in arbitrary order; sorting keeps the row order of
# mor_df reproducible between runs and machines.
file_list = sorted(glob.glob(file_pattern))
if not file_list:
    # Stop here with a clear message instead of continuing with an empty frame.
    raise FileNotFoundError(f'No dataset files match {file_pattern}')

# Read everything first and concatenate once; concatenating inside the loop
# re-copies the growing frame on every iteration.
mor_df = pd.concat([pd.read_csv(file) for file in file_list], axis=0)

print(f'The shape of mor_df is {mor_df.shape}')
mor_df.columns


The shape of mor_df is (7497, 45)


Index(['Unnamed: 0', 'assay_id', 'assay_chembl_id', 'tid', 'target_chembl_id',
       'standard_type', 'standard_relation', 'standard_value',
       'standard_units', 'pchembl_value', 'assay_type', 'assay_category',
       'assay_organism', 'assay_tax_id', 'assay_strain', 'assay_tissue',
       'assay_cell_type', 'assay_subcellular_fraction', 'bao_format',
       'variant_id', 'assay_test_type', 'assay_desc', 'cell_id', 'tissue_id',
       'curated_by', 'relationship_type', 'aidx', 'confidence_score',
       'molregno', 'compound_chembl_id', 'canonical_smiles', 'assay_info_hash',
       'canonical_smiles_by_Std', 'molecular_weight', 'num_atoms',
       'pStandard_value', 'max_num_atoms', 'max_molecular_weight',
       'activity_string', 'activity', 'threshold', 'target', 'effect', 'assay',
       'std_type'],
      dtype='object')

In [5]:
# Add the parsed molecules and fingerprints (radius 2, 1024 bits) to the combined MOR frame.
mor_with_fp = calc_fp_in_df(df=mor_df, radius=2, nbits=1024)

The shape of df is (7497, 47)


In [5]:
# Display the frame to check the two columns added by calc_fp_in_df ('mol' and 'ecfp4').
mor_with_fp

Unnamed: 0.1,Unnamed: 0,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,standard_relation,standard_value,standard_units,pchembl_value,...,max_molecular_weight,activity_string,activity,threshold,target,effect,assay,std_type,mol,ecfp4
0,0,1352793,CHEMBL3268193,129,CHEMBL233,IC50,>,24000.0,nM,,...,558.542,inactive,0.0,5.505344,mor,antag,B_arrest,IC50,<rdkit.Chem.rdchem.Mol object at 0x7fed3fcbd1c0>,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,1,1352793,CHEMBL3268193,129,CHEMBL233,IC50,>,32000.0,nM,,...,558.542,inactive,0.0,5.505344,mor,antag,B_arrest,IC50,<rdkit.Chem.rdchem.Mol object at 0x7fed3fcbd160>,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,2,1352793,CHEMBL3268193,129,CHEMBL233,IC50,>,32000.0,nM,,...,558.542,inactive,0.0,5.505344,mor,antag,B_arrest,IC50,<rdkit.Chem.rdchem.Mol object at 0x7fed3fcbd0a0>,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
3,3,1352793,CHEMBL3268193,129,CHEMBL233,IC50,>,32000.0,nM,,...,558.542,inactive,0.0,5.505344,mor,antag,B_arrest,IC50,<rdkit.Chem.rdchem.Mol object at 0x7fed3fcbda00>,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4,4,1352793,CHEMBL3268193,129,CHEMBL233,IC50,=,10300.0,nM,4.99,...,558.542,inactive,0.0,5.505344,mor,antag,B_arrest,IC50,<rdkit.Chem.rdchem.Mol object at 0x7fed3fcbd040>,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,151,1686080,CHEMBL4036559,129,CHEMBL233,EC50,,,,,...,898.083,inactive,0.0,5.000000,mor,agon,G_cAMP,EC50,<rdkit.Chem.rdchem.Mol object at 0x7fed3f9b6fa0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
431,331,2026352,CHEMBL4680510,129,CHEMBL233,EC50,,,,,...,898.083,inactive,0.0,5.000000,mor,agon,G_cAMP,EC50,<rdkit.Chem.rdchem.Mol object at 0x7fed3f9b7040>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
432,332,2026352,CHEMBL4680510,129,CHEMBL233,EC50,,,,,...,898.083,inactive,0.0,5.000000,mor,agon,G_cAMP,EC50,<rdkit.Chem.rdchem.Mol object at 0x7fed3f9b70a0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
433,524,2150927,CHEMBL5035389,129,CHEMBL233,EC50,,,,,...,898.083,inactive,0.0,5.000000,mor,agon,G_cAMP,EC50,<rdkit.Chem.rdchem.Mol object at 0x7fed3f9b7100>,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [15]:
# Number of records per value of the 'effect' column.
effect_counts = mor_with_fp['effect'].value_counts()
effect_counts

bind     5236
agon     1821
antag     440
Name: effect, dtype: int64

In [7]:
# Column listing, to confirm that 'mol' and 'ecfp4' were added by calc_fp_in_df.
mor_with_fp.columns

Index(['Unnamed: 0', 'assay_id', 'assay_chembl_id', 'tid', 'target_chembl_id',
       'standard_type', 'standard_relation', 'standard_value',
       'standard_units', 'pchembl_value', 'assay_type', 'assay_category',
       'assay_organism', 'assay_tax_id', 'assay_strain', 'assay_tissue',
       'assay_cell_type', 'assay_subcellular_fraction', 'bao_format',
       'variant_id', 'assay_test_type', 'assay_desc', 'cell_id', 'tissue_id',
       'curated_by', 'relationship_type', 'aidx', 'confidence_score',
       'molregno', 'compound_chembl_id', 'canonical_smiles', 'assay_info_hash',
       'canonical_smiles_by_Std', 'molecular_weight', 'num_atoms',
       'pStandard_value', 'max_num_atoms', 'max_molecular_weight',
       'activity_string', 'activity', 'threshold', 'target', 'effect', 'assay',
       'std_type', 'mol', 'ecfp4'],
      dtype='object')

In [None]:
# TMAP of all MOR records, coloured by the 'effect' column.
# Alternative output name kept from an earlier version of this cell:
#   os.path.join(CURA_CAT_DATASETS_DIR, 'cls', 'mor_tmap')
title = 'mor_effect_tmap'
colormap = ListedColormap(["#E2516E", "#4F77ED", "#BAE1E5"])
effect_plot_options = dict(
    title=title,
    category_col='effect',
    node_size=1/50,
    point_scale=2.5,
    max_point_size=10,
    k=1000,
    colormap=colormap,
)
tmap_plot(mor_with_fp, **effect_plot_options)
# Earlier notes left on this cell: "512, 32" and "2048, 32"
# (presumably hf, nb settings tried before - TODO confirm).

category_labels: [(0, 'agon'), (1, 'antag'), (2, 'bind')]


## mor_effect_assay

In [8]:
# Keep only the rows whose effect is 'bind', then count them per assay.
is_bind = mor_with_fp['effect'] == 'bind'
mor_bind_df = mor_with_fp.loc[is_bind]
print(f'The shape of mor_bind_df is {mor_bind_df.shape}')
mor_bind_df['assay'].value_counts()

The shape of mor_bind_df is (5236, 47)


RBA    5236
Name: assay, dtype: int64

In [7]:
# Keep only the rows whose effect is 'agon', then count them per assay.
is_agon = mor_with_fp['effect'] == 'agon'
mor_agon_df = mor_with_fp.loc[is_agon]
print(f'The shape of mor_agon_df is {mor_agon_df.shape}')
mor_agon_df['assay'].value_counts()

The shape of mor_agon_df is (1821, 47)


G_GTP       980
G_cAMP      490
B_arrest    207
G_Ca        144
Name: assay, dtype: int64

In [25]:
# Keep only the rows whose effect is 'antag', then count them per assay.
is_antag = mor_with_fp['effect'] == 'antag'
mor_antag_df = mor_with_fp.loc[is_antag]
mor_antag_df['assay'].value_counts()

G_GTP       400
B_arrest     40
Name: assay, dtype: int64

In [None]:
# TMAP of the binding subset, coloured by the 'assay' column.
colormap = ListedColormap(["#BAE1E5"])
bind_plot_options = dict(
    title='mor_bind_assay_tmap',
    category_col='assay',
    node_size=1/50,
    point_scale=1.0,
    colormap=colormap,
)
tmap_plot(mor_bind_df, **bind_plot_options)
# Earlier note left on this cell: "512, 32" (presumably an hf, nb setting - TODO confirm).

category_labels: [(0, 'RBA')]


  data_c[s] = (data_c[s] - min_c[s]) / (max_c[s] - min_c[s])


In [None]:
# Palette notes kept for reference:
#   grey        = "#D3D3D3"
#   green_dark  = "#15B392"
#   green_mid   = "#73EC8B"
#   green_light = "#D2FF72"
#   orange      = "#FF8000"
# Previously tried palette:
#   ListedColormap(["#FF7F3E", "#FFFBE6", "#347928","#C0EBA6"])

# TMAP of the agonist subset, coloured by the 'assay' column.
colormap = ListedColormap(["#FF7F3E", "#FFFBE6", "#8E4DE4","#85B8E7"])
agon_plot_options = dict(
    title='mor_agon_assay_tmap',
    category_col='assay',
    hf=256,
    nb=32,
    node_size=1/25,
    point_scale=50.0,
    colormap=colormap,
)
tmap_plot(mor_agon_df, **agon_plot_options)

category_labels: [(0, 'B_arrest'), (1, 'G_Ca'), (2, 'G_GTP'), (3, 'G_cAMP')]


In [62]:
# TMAP of the antagonist subset, coloured by the 'assay' column.
colormap = ListedColormap(["#FF7F3E","#8E4DE4"])
antag_plot_options = dict(
    title='mor_antag_assay_tmap',
    category_col='assay',
    hf=128,
    nb=128,
    node_size=1/25,
    point_scale=50.0,
    colormap=colormap,
)
tmap_plot(mor_antag_df, **antag_plot_options)
# Earlier note left on this cell: "128, 32" - it does not match nb=128 used above (TODO confirm which is intended).

category_labels: [(0, 'B_arrest'), (1, 'G_GTP')]


## mor_effect_assay_type

In [65]:
# Keep only the binding rows measured with the 'RBA' assay, then count them per standard_type.
is_rba = mor_bind_df['assay'] == 'RBA'
mor_bind_RBA_df = mor_bind_df.loc[is_rba]
mor_bind_RBA_df['standard_type'].value_counts()

Ki      4654
IC50     582
Name: standard_type, dtype: int64

In [68]:
# TMAP of the RBA binding subset, coloured by the 'standard_type' column.
colormap = ListedColormap(["#E3516E","#1942C6"])
rba_type_plot_options = dict(
    title='mor_bind_RBA_type_tmap',
    category_col='standard_type',
    hf=512,
    nb=32,
    node_size=1/25,
    point_scale=50.0,
    colormap=colormap,
)
tmap_plot(mor_bind_RBA_df, **rba_type_plot_options)
# Earlier note left on this cell: "128, 32" - it does not match hf=512 used above (TODO confirm which is intended).

category_labels: [(0, 'IC50'), (1, 'Ki')]
