In [1]:
# inner module import
import sys
sys.path.append("/storage/homefs/yc24j783/datacat4ml/datacat4ml")
from const import FETCH_DATA_DIR, FETCH_FIG_DIR, FEATURIZE_DATA_DIR, FEATURIZE_FIG_DIR

import os
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as pl

from rdkit import Chem
from rdkit.Chem import AllChem

#-------------------------------------------------------------------------------------------------
# categorical TMAP based on three categories of SMILES
#-------------------------------------------------------------------------------------------------
import tmap as tm
from faerun import Faerun
from mapchiral.mapchiral import encode
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdmolops
from matplotlib.colors import ListedColormap
from matplotlib.colors import LinearSegmentedColormap
from matplotlib import cm
import json
import random
from mhfp.encoder import MHFPEncoder
from mhfp.lsh_forest import LSHForestHelper

In [2]:
mor_chembl_id = 'CHEMBL233'
kor_chembl_id = 'CHEMBL237'
dor_chembl_id = 'CHEMBL236'
nor_chembl_id = 'CHEMBL2014'

# Functions

In [3]:
def tmap_plot_maedeh(df, title:str, fp:str='map4c',
              node_size:float=1/32, mmm_repeats:int=2, steps:int=5, k:int=15,
              shader:str='smoothCircle',  point_scale:float=2.5, max_point_size:int=10):
    '''Lay out a TMAP from the fingerprints in ``df[fp]`` and write it as a Faerun plot.

    Parameters
    ----------
    df : DataFrame holding one row per data point. The columns read here are
        ``fp`` (the fingerprint column), 'activity', 'canonical_smiles',
        'assay_chembl_id', 'target_chembl_id', 'pchembl_value', 'assay_type',
        'assay_category', 'tid' and 'assay_id'.
    title : name of the scatter series; also passed to ``Faerun.plot`` as the
        output name (written under FEATURIZE_FIG_DIR).
    fp : name of the column that stores the fingerprint of each row.
    node_size, mmm_repeats, steps, k : forwarded to ``tm.LayoutConfiguration``
        (``steps`` sets ``sl_extra_scaling_steps``).
    shader, point_scale, max_point_size : forwarded to ``Faerun.add_scatter``.

    Raises
    ------
    KeyError : if any of the required columns is missing from ``df``.
    '''
    # Fail fast: the layout below is expensive, so report missing columns first.
    required = [fp, 'activity', 'canonical_smiles', 'assay_chembl_id', 'target_chembl_id',
                'pchembl_value', 'assay_type', 'assay_category', 'tid', 'assay_id']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"tmap_plot_maedeh: missing column(s) in df: {missing}")

    # Locality-sensitive hashing forest for approximate nearest-neighbour search,
    # initialised with parameters 2048 and 32.
    # NOTE(review): presumably 2048 has to match the length of each fingerprint — confirm.
    lf = tm.LSHForest(2048, 32)
    fps = [tm.VectorUint(fingerprint) for fingerprint in np.array(df[fp])]  # FPs -> vector units
    lf.batch_add(fps)
    lf.index()

    cfg = tm.LayoutConfiguration()  # configuration parameters for TMAP layout
    cfg.node_size = node_size  # size of nodes which affects the magnitude of their repelling force.
    cfg.mmm_repeats = mmm_repeats  # number of repeats of the per-level layout algorithm
    cfg.sl_extra_scaling_steps = steps  # sets the number of repeats of the scaling
    cfg.k = k  # number of nearest neighbours used to create the k-nearest neighbour graph
    cfg.sl_scaling_type = tm.RelativeToAvgLength  # Defines the relative scale of the graph

    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)

    # Integer-encode the 'activity' column for the categorical colour channel.
    category_labels, category_data = Faerun.create_categories(df['activity'])

    f = Faerun(
        view="front",
        coords=False,
        title="",
        clear_color="#FFFFFF"
    )

    # Widget labels: fields joined by "__". Every column is converted to a list
    # once (instead of once per row) so building the labels stays linear in len(df).
    # The label text itself is unchanged, including the repeated target_chembl_id field.
    label_rows = zip(
        df['canonical_smiles'],
        df['assay_chembl_id'].tolist(),
        df['target_chembl_id'].tolist(),
        df['pchembl_value'].tolist(),
        df['assay_type'].tolist(),
        df['assay_category'].tolist(),
    )
    labels = [
        "__".join([
            smi,
            "assay_chembl_id" + str(assay_chembl_id),
            "target_chembl_id" + str(target_chembl_id),
            "pchembl_value: " + str(pchembl_value),
            "assay_type" + str(assay_type),
            "assay_category" + str(assay_category),
            "target_chembl_id: " + str(target_chembl_id),
        ])
        for smi, assay_chembl_id, target_chembl_id, pchembl_value, assay_type, assay_category
        in label_rows
    ]

    f.add_scatter(
        title,
        {
            "x": x,
            "y": y,
            # one colour series per entry; order matches series_title below
            "c": [category_data, # categories
                df['tid'].values.tolist(),
                df['assay_id'].values.tolist(),
                df['pchembl_value'].values.tolist(),
                ],
            "labels": labels, # SMILES-led labels
        },
        point_scale=point_scale,
        max_point_size=max_point_size,
        shader=shader,
        legend_labels=[category_labels, None, None, None],
        categorical=[True, False, False, False],
        colormap=['Set1', 'rainbow', 'rainbow', 'rainbow'],
        series_title=['activity', 'tid', 'assay_id', 'pchembl_value'],
        has_legend=True,
    )
    f.add_tree('Tree', {"from": s, "to": t}, point_helper=title)
    f.plot(title, path=FEATURIZE_FIG_DIR, template='smiles')

In [None]:
def tmap_plot(df, title:str,
              node_size:float=1/32, mmm_repeats:int=2, steps:int=5, k:int=15,
              shader:str='smoothCircle',  point_scale:float=2.5, max_point_size:int=10,
              fp:str='fp'):
    '''Lay out a TMAP from ``df[fp]`` and plot it coloured by opioid-receptor target.

    Parameters
    ----------
    df : DataFrame with the fingerprint column ``fp`` plus 'canonical_smiles'
        and 'target_chembl_id'.
    title : name of the scatter series; also passed to ``Faerun.plot``.
    node_size, mmm_repeats, steps, k : forwarded to ``tm.LayoutConfiguration``
        (``steps`` sets ``sl_extra_scaling_steps``).
    shader, point_scale, max_point_size : forwarded to ``Faerun.add_scatter``.
    fp : name of the fingerprint column (defaults to 'fp', the column this
        function has always read).
    '''

    # Locality-sensitive hashing forest for approximate nearest-neighbour search,
    # initialised with parameters 2048 and 32.
    # NOTE(review): presumably 2048 has to match the length of each fingerprint — confirm.
    lf = tm.LSHForest(2048, 32)
    fps = [tm.VectorUint(fingerprint) for fingerprint in np.array(df[fp])]  # FPs -> vector units
    lf.batch_add(fps)
    lf.index()

    cfg = tm.LayoutConfiguration()  # configuration parameters for TMAP layout
    cfg.node_size = node_size  # size of nodes which affects the magnitude of their repelling force.
    cfg.mmm_repeats = mmm_repeats  # number of repeats of the per-level layout algorithm
    cfg.sl_extra_scaling_steps = steps  # sets the number of repeats of the scaling
    cfg.k = k  # number of nearest neighbours used to create the k-nearest neighbour graph
    cfg.sl_scaling_type = tm.RelativeToAvgLength  # Defines the relative scale of the graph

    print("Creating layout")
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)

    # create a Faerun object
    f = Faerun(
        view="front",
        coords=False,
        title="",
        clear_color="#FFFFFF"
    )

    # Label the points. Iterate positionally (zip) rather than df[col][i], which
    # is a label lookup and breaks on frames whose index is not 0..n-1. The loop
    # variable must not be called `s`: that name holds the tree edges for add_tree.
    target_ids = df['target_chembl_id'].tolist()
    labels = [
        smiles + "__" + target_id
        for smiles, target_id in zip(df['canonical_smiles'], target_ids)
    ]

    # Colour the points. Codes are fixed per receptor so they always agree with
    # legend_labels, even when one of the receptors is absent from df.
    class_codes = {
        'CHEMBL2014': 1,  # Nociceptin receptor
        'CHEMBL233': 2,   # Mu opioid receptor
        'CHEMBL236': 3,   # Delta opioid receptor
        'CHEMBL237': 4,   # Kappa opioid receptor
    }
    legend_labels =[
        (0, 'Others'),
        (1, 'Nociceptin receptor'),
        (2, 'Mu opioid receptor'),
        (3, 'Delta opioid receptor'),
        (4, 'Kappa opioid receptor'),
    ]
    vals = [class_codes.get(target_id, 0) for target_id in target_ids]

    print("Adding scatter")
    # add the scatter plot
    f.add_scatter(
        title,
        {
            "x": x,
            "y": y,
            "c": vals,
            "labels": labels, # the point labels
        },
        point_scale=point_scale,
        max_point_size=max_point_size,
        shader=shader,
        legend_labels=legend_labels, # the color legend
        categorical=True,
        colormap="tab10",
        has_legend=True,
        title_index=1
    )
    print("Plotting")
    f.add_tree('Tree', {"from": s, "to": t}, point_helper=title)
    f.plot(title, template='smiles')

# Plot tmap

In [4]:
# load the data
# Three pickled DataFrames are read from FEATURIZE_DATA_DIR, one each for the
# IC50, Ki and EC50 "maxcur" files.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling —
# only load files produced by this project.
# NOTE(review): later cells reference `ic50_mincur_df`, which is not loaded in
# this cell — confirm where it is defined before running the notebook top to bottom.
ic50_maxcur_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR, 'ic50_maxcur_fp.pkl'))
ki_maxcur_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR, 'ki_maxcur_fp.pkl'))
ec50_maxcur_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR, 'ec50_maxcur_fp.pkl'))
print(f'The shape of the ic50_maxcur_df is {ic50_maxcur_df.shape}')
#print(f'The shape of the ki_maxcur_df is {ki_maxcur_df.shape}')
#print(f'The shape of the ec50_maxcur_df is {ec50_maxcur_df.shape}')

The shape of the ic50_maxcur_df is (65485, 30)


In [30]:
# display the first 15 columns of the first 2 rows of ic50_maxcur_df
# (only the last expression of a cell is rendered, so a single preview is kept)
ic50_maxcur_df.head(2).iloc[:, :15]

Unnamed: 0,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,pchembl_value,assay_type,assay_category,assay_organism,assay_tax_id,assay_strain,assay_tissue,assay_cell_type,assay_subcellular_fraction,bao_format
0,39161,CHEMBL650687,50,CHEMBL213,IC50,7.53,B,,Canis lupus familiaris,9615.0,,,,,BAO_0000357
1,59914,CHEMBL671073,72,CHEMBL217,IC50,5.01,B,,,,,,A9L,,BAO_0000219


In [9]:
ic50_maxcur_df['assay_type'].value_counts()

B    62105
F    30177
A      332
T        2
Name: assay_type, dtype: int64

In [32]:
ki_maxcur_df['assay_category'].value_counts()

None                                 141215
Selectivity assay                       228
confirmatory                             96
other                                    59
Affinity biochemical assay                9
screening                                 7
Affinity on-target cellular assay         1
Name: assay_category, dtype: int64

In [11]:
ic50_maxcur_df['assay_organism'].value_counts()

Homo sapiens                      69366
None                              20733
Cavia porcellus                    1144
Oryctolagus cuniculus               525
Canis lupus familiaris              299
Sus scrofa                          169
Bos taurus                          153
Mus musculus                         54
Cricetulus griseus                   48
Cercopithecidae                      36
Rattus norvegicus                    23
frog                                 14
Gerbillinae                          11
guinea pig                           10
Ovis aries                            8
Rodentia sp.                          5
Human immunodeficiency virus 1        5
Mesocricetus auratus                  4
Macaca mulatta                        4
Callithrix                            2
Bos javanicus                         1
Meriones                              1
Chlorocebus aethiops                  1
Name: assay_organism, dtype: int64

In [12]:
ic50_maxcur_df['assay_tax_id'].value_counts()

9606     67934
None     20832
32644     1333
10141     1144
9986       525
9615       299
9823       179
9913       153
10090       54
10029       48
9527        36
10116       23
8292        14
10045       11
9940         8
69158        5
11676        5
10036        4
9544         4
9481         2
9906         1
10046        1
9534         1
Name: assay_tax_id, dtype: int64

In [31]:
ic50_maxcur_df['assay_strain'].value_counts()

None    92611
Ba-L        5
Name: assay_strain, dtype: int64

In [15]:
ic50_maxcur_df['assay_tissue'].value_counts()

None                 90381
Plasma                1037
Blood                  261
Stomach                191
Brain                  105
Spleen                 100
Liver                   84
Venous blood            75
Vas deferens            63
Retina                  43
Cortex                  34
Kidney                  31
Gastric mucosa          26
Cerebral cortex         22
Uterus                  21
Artery                  20
Ileum                   16
Striatum                16
Hippocampus             13
Adrenal tissue          12
Brain/Plasma            12
Aorta                   11
Cardiac atrium           9
Heart                    6
Choroid plexus           5
Blood/Plasma             4
Prostate gland           4
Cerebellum               3
Lung                     2
Frontal cortex           2
Cerebellar cortex        2
Trachea                  1
Colon                    1
Corpus striatum          1
Olfactory bulb           1
Placenta                 1
Name: assay_tissue, dtype: i

In [24]:
ic50_mincur_df['tissue_id'].value_counts()

None        90529
1969         1024
178           205
945           191
955           105
2106          100
2107           83
1000           63
966            43
1851           34
2113           31
1199           26
956            22
995            21
1637           20
2435           16
2116           16
10000000       13
18303          12
10000013       12
947            11
2081            9
948             6
1886            5
2367            4
10000041        4
2037            3
1870            2
2048            2
1155            1
2264            1
369             1
3126            1
Name: tissue_id, dtype: int64

In [16]:
ic50_maxcur_df['assay_cell_type'].value_counts()

None      41163
CHO       19149
HEK293    10853
CHO-K1     6345
THP-1      1081
          ...  
chem1         1
SW-620        1
293           1
Y-79          1
PBL           1
Name: assay_cell_type, Length: 229, dtype: int64

In [25]:
ic50_maxcur_df['cell_id'].value_counts()

None    51839
449     17961
722      9420
485      5510
559      1063
        ...  
757         1
422         1
472         1
750         1
5639        1
Name: cell_id, Length: 87, dtype: int64

In [17]:
ic50_maxcur_df['assay_subcellular_fraction'].value_counts()

None             90399
Membrane          1685
Cell membrane      328
Microsome           88
Microsomes          78
Mitochondria        22
microsomes          16
Name: assay_subcellular_fraction, dtype: int64

In [18]:
ic50_maxcur_df['bao_format'].value_counts()

BAO_0000219    55606
BAO_0000357    21449
BAO_0000019    11103
BAO_0000249     2619
BAO_0000366      832
BAO_0000221      751
BAO_0000218      129
BAO_0000251      105
BAO_0000252       22
Name: bao_format, dtype: int64

In [5]:
ic50_maxcur_df['variant_id'].value_counts()

None    65485
Name: variant_id, dtype: int64

In [20]:
ic50_maxcur_df['assay_test_type'].value_counts()

None        86404
In vitro     6199
In vivo        13
Name: assay_test_type, dtype: int64

In [21]:
ic50_maxcur_df['assay_desc'].value_counts()

Inhibition of mTOR                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [23]:
ic50_maxcur_df['assay_test_type'].value_counts()

None        86404
In vitro     6199
In vivo        13
Name: assay_test_type, dtype: int64

In [27]:
ic50_maxcur_df['curated_by'].value_counts()

Autocuration    62481
Expert          24395
Intermediate     5740
Name: curated_by, dtype: int64

In [28]:
ic50_mincur_df['relationship_type'].value_counts()

D    67934
H    24682
Name: relationship_type, dtype: int64

In [29]:
ic50_mincur_df['aidx'].value_counts()

CLD0                                     71856
8349_1_pol_2137                            121
8461_1_pol_1406                            101
6523_1_pol_8650                            100
7828_1_pol_5447                            100
                                         ...  
1229911                                      1
1230086                                      1
1230252                                      1
1227939                                      1
GSK778_cross_screeing_panel_CHRM2_Ant        1
Name: aidx, Length: 2444, dtype: int64

In [12]:
mor_ic50_mincur_df = ic50_mincur_df[ic50_mincur_df['target_chembl_id'] == mor_chembl_id]

In [28]:
tmap_plot_maedeh(ic50_mincur_df, title='ic50_mincur_ecfp4', fp='ecfp4')

In [27]:
tmap_plot_maedeh(ic50_mincur_df, title='ic50_mincur_map4c', fp='map4c')

  data_c[s] = (data_c[s] - min_c[s]) / (max_c[s] - min_c[s])


#### Tmap: GPCR vs OR

In [None]:
# Rows whose target is one of the four opioid receptors (mu, kappa, delta, nociceptin).
# NOTE(review): `or_ic50_mincur_df` is not referenced by the cells shown below — confirm it is still needed.
or_ic50_mincur_df = ic50_mincur_df[ic50_mincur_df['target_chembl_id'].isin([mor_chembl_id, kor_chembl_id, dor_chembl_id, nor_chembl_id])]
# Work on a copy so the layout cells below do not touch ic50_mincur_df itself.
df = ic50_mincur_df.copy()
# Layout / rendering settings read by the following cells as notebook globals.
node_size = 1/32
mmm_repeats = 2
steps = 5
# NOTE(review): k=1000 is far larger than the k=15 default of the plotting functions above — confirm this is intended.
k = 1000
shader = 'smoothCircle'
title = 'IC50_mincur'
point_scale = 2.5
max_point_size = 10

In [None]:
# create a layout configuration
# Locality-sensitive hashing forest for approximate nearest-neighbour search,
# initialised with parameters 512 and 32.
# NOTE(review): the plotting functions above use 2048 instead of 512 — confirm which value fits df['map4c'].
lf = tm.LSHForest(512, 32)
fps = [tm.VectorUint(fingerprint) for fingerprint in np.array(df['map4c'])]  # FPs -> vector units
lf.batch_add(fps)
lf.index()

cfg = tm.LayoutConfiguration()  # configuration parameters for TMAP layout
cfg.node_size = node_size  # size of nodes which affects the magnitude of their repelling force.
cfg.mmm_repeats = mmm_repeats  # number of repeats of the per-level layout algorithm
cfg.sl_extra_scaling_steps = steps  # sets the number of repeats of the scaling
cfg.k = k  # number of nearest neighbours used to create the k-nearest neighbour graph
cfg.sl_scaling_type = tm.RelativeToAvgLength  # Defines the relative scale of the graph

print("Creating layout")
x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)

# Collapse every non-opioid target into a single "Other" category (code 7) and
# give each opioid receptor its own code.
target_chembl_id_labels, target_chembl_id_data = Faerun.create_categories(df['target_chembl_id'])
ors = ['CHEMBL2014', 'CHEMBL233', 'CHEMBL236', 'CHEMBL237' ]
ors_labels = [(7, "Other")]
target_chembl_id_map = [7]*len(target_chembl_id_data)
value = 1
for i, name in target_chembl_id_labels:
    # `i` is the integer category index and `name` the ChEMBL id string, so the
    # membership test against `ors` (a list of id strings) has to use `name`.
    if name in ors:
        v=value
        if v == 7:  # 7 is reserved for "Other"; wrap to 0 instead of colliding with it
            v = 0
        ors_labels.append((v, name))
        target_chembl_id_map[i] = v
        value += 1

        
## create a Faerun object
#f = Faerun(
#    view="front",
#    coords=False,
#    title="",
#    clear_color="#FFFFFF"
#)
#
## label the points
#labels = []
#for i, s in enumerate(df['canonical_smiles']):
#    labels.append(
#        s
#        + "__"
#        + df['target_chembl_id'].values[i])
#
## color the points
#target_chembl_id_map = dict([(y, x+1) for x, y in enumerate(sorted(set(df['target_chembl_id'])))])
#classes = ['CHEMBL2014', 'CHEMBL233', 'CHEMBL236', 'CHEMBL237' ]
#i = 0
#for key, value in target_chembl_id_map.items():
#    if key not in classes:
#        target_chembl_id_map[key] = 0
#    else:
#        i += 1
#        target_chembl_id_map[key] = i
#
#legend_labels =[
#    (0, 'Others'),
#    (1, 'Nociceptin receptor'),
#    (2, 'Mu opioid receptor'),
#    (3, 'Delta opioid receptor'),
#    (4, 'Kappa opioid receptor'),
#]
#
#vals = [int(target_chembl_id_map[x]) for x in df['target_chembl_id']]
#
#category_labels, category_data = Faerun.create_categories(vals)
#
#print("Adding scatter")
## add the scatter plot
#f.add_scatter(
#    title,
#    {"x": x,
#    "y": y,
#    "c": category_data,
#    "labels": labels, # the point labels
#    },
#    point_scale=point_scale,
#    max_point_size=max_point_size,
#    shader=shader,
#    legend_labels=category_labels, # the color legend
#    categorical=True,
#    colormap="tab10",
#    has_legend=True,
#)
#print("Plotting")
#f.add_tree('Tree', {"from": s, "to": t}, point_helper=title)
#f.plot(title, template='smiles')

In [None]:
# Integer-encode the targets, then collapse every non-opioid target into a single
# "Other" category (code 7) while each opioid receptor keeps its own code.
target_chembl_id_labels, target_chembl_id_data = Faerun.create_categories(df['target_chembl_id'])
# Preview only: the full list has one entry per row of df.
print(f'target_chembl_id_data (first 10) is {target_chembl_id_data[:10]}')
ors = ['CHEMBL2014', 'CHEMBL233', 'CHEMBL236', 'CHEMBL237' ]
ors_labels = [(7, "Other")]
target_chembl_id_map = [7]*len(target_chembl_id_data)
value = 1
for i, name in target_chembl_id_labels:
    # `i` is the integer category index and `name` the ChEMBL id string, so the
    # membership test against `ors` (a list of id strings) has to use `name`.
    if name in ors:
        v=value
        if v == 7:  # 7 is reserved for "Other"; wrap to 0 instead of colliding with it
            v = 0
        ors_labels.append((v, name))
        target_chembl_id_map[i] = v
        value += 1

print('The value_counts for list of target_chembl_id_map is:{}\n'.format(pd.Series(target_chembl_id_map).value_counts()))
# Translate each row's original category index into its collapsed code. Iterate
# the values directly: enumerate() would yield (position, value) tuples, which
# cannot index a list.
target_chembl_id_data = [target_chembl_id_map[val] for val in target_chembl_id_data]
target_chembl_id_data[:10]

In [None]:
ors_labels