In [1]:
# inner module import
import sys
sys.path.append("/storage/homefs/yc24j783/datacat4ml/datacat4ml")
from const import FETCH_DATA_DIR, FETCH_FIG_DIR, FEATURIZE_DATA_DIR, FEATURIZE_FIG_DIR

import os
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as pl

from rdkit import Chem
from rdkit.Chem import AllChem

#-------------------------------------------------------------------------------------------------
# categorical TMAP based on three categories of SMILES
#-------------------------------------------------------------------------------------------------
import tmap as tm
from faerun import Faerun
from mapchiral.mapchiral import encode
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import rdmolops
from matplotlib.colors import ListedColormap
from matplotlib.colors import LinearSegmentedColormap
from matplotlib import cm
import json
import random
from mhfp.encoder import MHFPEncoder
from mhfp.lsh_forest import LSHForestHelper

In [2]:
mor_chembl_id = 'CHEMBL233'
kor_chembl_id = 'CHEMBL237'
dor_chembl_id = 'CHEMBL236'
nor_chembl_id = 'CHEMBL2014'

# Functions

In [9]:
def tmap_plot_maedeh(df, title:str, fp:str='map4c',
              node_size:float=1/32, mmm_repeats:int=2, steps:int=5, k:int=15,
              shader:str='smoothCircle',  point_scale:float=2.5, max_point_size:int=10):
    '''determining the layout of the TMAP, and plotting the TMAP'''
    lf = tm.LSHForest(2048, 32) # a locality-sensitive hashing forest, used for approximate nearest neighbor search is initialized with parameters 512 and 32.
    ECFP4 = np.array(df[fp])  # converting the FPs to vector units
    fps = []
    for i in ECFP4:
        vec = tm.VectorUint(i)
        fps.append(vec)
    lf.batch_add(fps)
    lf.index()
    cfg = tm.LayoutConfiguration()  # configuration parameters for TMAP layout
    cfg.node_size = node_size  # size of nodes which affects the magnitude of their repelling force.
    cfg.mmm_repeats = mmm_repeats  # number of repeats of the per-level layout algorithm
    cfg.sl_extra_scaling_steps = steps  # sets the number of repeats of the scaling
    cfg.k = k  # number of nearest neighbours used to create the k-nearest neighbour graph
    cfg.sl_scaling_type = tm.RelativeToAvgLength  # Defines the relative scale of the graph

    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)

    # create a Faerun object
    category_labels, category_data = Faerun.create_categories(df['activity'])

    # TMAP for Aline's compounds based on the three categories, aromatic ring fraction, heavy atom counts and qualitative estimation drug-likeliness
    f = Faerun(
        view="front",
        coords=False,
        title="",
        clear_color="#FFFFFF"
    )

    labels = [] # the widget labels
    for i, s in enumerate(df['canonical_smiles']):
        labels.append(
            s
            + "__"
            # convert df['tid'].values to a list of strings and append it to the labels
            + str(df['tid'].values.tolist()[i])
            + "__"
            + str(df['assay_id'].tolist()[i])
            + "__"
            + str(df['assay_tax_id'].tolist()[i])
            )
    f.add_scatter(
        title,
        {
            "x": x,
            "y": y,
            "c": [category_data, # categories

                df['tid'].values.tolist(), # aromatic fraction
                df['assay_id'].values.tolist(), # heavy atom counts
                df['assay_tax_id'].values.tolist(), # qualitative estimation of drug likeliness

                ],
            "labels": labels, # SMILES as labels
        },
        point_scale=point_scale,
        max_point_size=max_point_size,
        shader=shader,
        legend_labels=[category_labels, None, None, None],
        categorical=[True, False, False, False],
        colormap=['Set1', 'rainbow', 'Set1', 'rainbow' ],
        #colormap=['Set1', 'tab20', 'turbo', 'tab10' ],
        series_title=['activity', 'tid', 'assay id', 'assay tax id'],
        has_legend=True,
    )
    f.add_tree('Tree', {"from": s, "to": t}, point_helper=title)
    f.plot(title, path=FEATURIZE_FIG_DIR, template='smiles') # redpoint 1


In [None]:
def tmap_plot(df, title:str,
              node_size:float=1/32, mmm_repeats:int=2, steps:int=5, k:int=15,
              shader:str='smoothCircle',  point_scale:float=2.5, max_point_size:int=10):
    '''determining the layout of the TMAP, and plotting the TMAP'''

    # create a layout configuration
    lf = tm.LSHForest(2048, 32) # a locality-sensitive hashing forest, used for approximate nearest neighbor search is initialized with parameters 512 and 32.
    ECFP4 = np.array(df['fp'])  # converting the FPs to vector units
    fps = []
    for i in ECFP4:
        vec = tm.VectorUint(i)
        fps.append(vec)
    lf.batch_add(fps)
    lf.index()
    cfg = tm.LayoutConfiguration()  # configuration parameters for TMAP layout
    cfg.node_size = node_size  # size of nodes which affects the magnitude of their repelling force.
    cfg.mmm_repeats = mmm_repeats  # number of repeats of the per-level layout algorithm
    cfg.sl_extra_scaling_steps = steps  # sets the number of repeats of the scaling
    cfg.k = k  # number of nearest neighbours used to create the k-nearest neighbour graph
    cfg.sl_scaling_type = tm.RelativeToAvgLength  # Defines the relative scale of the graph

    print("Creating layout")
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)
    
    # create a Faerun object
    f = Faerun(
        view="front",
        coords=False,
        title="",
        clear_color="#FFFFFF"
    )

    # label the points
    labels = []
    for i, s in enumerate(df['canonical_smiles']):
        labels.append(
            s
            + "__"
            + df['target_chembl_id'][i])

    # color the points
    target_chembl_id_map = dict([(y, x+1) for x, y in enumerate(sorted(set(df['target_chembl_id'])))])
    classes = ['CHEMBL2014', 'CHEMBL233', 'CHEMBL236', 'CHEMBL237' ]
    i = 0
    for key, value in target_chembl_id_map.items():
        if key not in classes:
            target_chembl_id_map[key] = 0
        else:
            i += 1
            target_chembl_id_map[key] = i

    legend_labels =[
        (0, 'Others'),
        (1, 'Nociceptin receptor'),
        (2, 'Mu opioid receptor'),
        (3, 'Delta opioid receptor'),
        (4, 'Kappa opioid receptor'),
    ]

    vals = [int(target_chembl_id_map[x]) for x in df['target_chembl_id']]

    print("Adding scatter")
    # add the scatter plot
    f.add_scatter(
        title,
        {
            "x": x,
            "y": y,
            "c": vals,
            "labels": labels, # the point labels
        },
        point_scale=point_scale,
        max_point_size=max_point_size,
        shader=shader,
        legend_labels=legend_labels, # the color legend
        categorical=True,
        colormap="tab10",
        has_legend=True,
        title_index=1
    )
    print("Plotting")
    f.add_tree('Tree', {"from": s, "to": t}, point_helper=title)
    f.plot(title, template='smiles')

# Plot tmap

In [4]:
# load the data
ic50_mincur_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR, 'ic50_mincur_fp.pkl'))
#ic50_maxcur_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR, 'ic50_maxcur_fp.pkl'))
#ki_mincur_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR, 'ki_mincur_fp.pkl'))
#ki_maxcur_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR, 'ki_maxcur_fp.pkl'))
#ec50_mincur_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR, 'ec50_mincur_fp.pkl'))
#ec50_maxcur_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR, 'ec50_maxcur_fp.pkl'))
print(f'The shape of the ic50_mincur_df is {ic50_mincur_df.shape}')
#print(f'The shape of the ic50_maxcur_df is {ic50_maxcur_df.shape}')
#print(f'The shape of the ki_mincur_df is {ki_mincur_df.shape}')
#print(f'The shape of the ki_maxcur_df is {ki_maxcur_df.shape}')
#print(f'The shape of the ec50_mincur_df is {ec50_mincur_df.shape}')
#print(f'The shape of the ec50_maxcur_df is {ec50_maxcur_df.shape}')
ic50_mincur_df.head(1)

The shape of the ic50_mincur_df is (92616, 30)


Unnamed: 0,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,pchembl_value,assay_type,assay_category,assay_organism,assay_tax_id,...,curated_by,relationship_type,aidx,molregno,compound_chembl_id,canonical_smiles,assay_info_hash,ecfp4,map4c,activity
0,39161,CHEMBL650687,50,CHEMBL213,IC50,7.53,B,,Canis lupus familiaris,9615,...,Autocuration,H,CLD0,100708,CHEMBL305153,CC(C)(C)NC[C@H](O)CON=C1c2ccccc2-c2ccccc21,abf141fd45ed982409d30f67c836fe94,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[9084181, 5067487, 7085307, 4876313, 20184192,...",active


In [11]:
mor_ic50_mincur_df = ic50_mincur_df[ic50_mincur_df['target_chembl_id'] == mor_chembl_id]
df = mor_ic50_mincur_df
node_size = 1/32
mmm_repeats = 2
steps = 5
k = 15
shader = 'smoothCircle'
title = 'MOR IC50 Mincur'
point_scale = 2.5
max_point_size = 10

lf = tm.LSHForest(2048, 32) # a locality-sensitive hashing forest, used for approximate nearest neighbor search is initialized with parameters 512 and 32.
ECFP4 = np.array(df['ecfp4'])  # converting the FPs to vector units
fps = []
for i in ECFP4:
    vec = tm.VectorUint(i)
    fps.append(vec)
lf.batch_add(fps)
lf.index()
cfg = tm.LayoutConfiguration()  # configuration parameters for TMAP layout
cfg.node_size = node_size  # size of nodes which affects the magnitude of their repelling force.
cfg.mmm_repeats = mmm_repeats  # number of repeats of the per-level layout algorithm
cfg.sl_extra_scaling_steps = steps  # sets the number of repeats of the scaling
cfg.k = k  # number of nearest neighbours used to create the k-nearest neighbour graph
cfg.sl_scaling_type = tm.RelativeToAvgLength  # Defines the relative scale of the graph

x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)

# create a Faerun object
category_labels, category_data = Faerun.create_categories(df['activity'])

# TMAP for Aline's compounds based on the three categories, aromatic ring fraction, heavy atom counts and qualitative estimation drug-likeliness
f = Faerun(
    view="front",
    coords=False,
    title="",
    clear_color="#FFFFFF"
)

labels = [] # the widget labels
for i, s in enumerate(df['canonical_smiles']):
    labels.append(
        s
        + "__"
        # convert df['tid'].values to a list of strings and append it to the labels
        + str(df['tid'].values.tolist()[i])
        + "__"
        + str(df['assay_id'].tolist()[i])
        + "__"
        + str(df['assay_tax_id'].tolist()[i])
        )
f.add_scatter(
    title,
    {
        "x": x,
        "y": y,
        "c": [category_data, # categories

            df['tid'].values.tolist(), # aromatic fraction
            df['assay_id'].values.tolist(), # heavy atom counts
            df['assay_tax_id'].values.tolist(), # qualitative estimation of drug likeliness

            ],
        "labels": labels, # SMILES as labels
    },
    point_scale=point_scale,
    max_point_size=max_point_size,
    shader=shader,
    legend_labels=[category_labels, None, None, None],
    categorical=[True, False, False, False],
    colormap=['Set1', 'rainbow', 'Set1', 'rainbow' ],
    #colormap=['Set1', 'tab20', 'turbo', 'tab10' ],
    series_title=['activity', 'tid', 'assay id', 'assay tax id'],
    has_legend=True,
)
f.add_tree('Tree', {"from": s, "to": t}, point_helper=title)
f.plot(title, path=FEATURIZE_FIG_DIR, template='smiles') # redpoint 1

  data_c[s] = (data_c[s] - min_c[s]) / (max_c[s] - min_c[s])


ValueError: could not convert string to float: 'None'

In [16]:
# print the number of rows where 'tid' is 'None'
print(f'The number of rows where tid is None is {df[df["tid"].isnull()].shape[0]}')
print(f'The number of rows where assay_id is None is {df[df["assay_id"].isnull()].shape[0]}')
print(f'The number of rows where assay_tax_id is None is {df[df["assay_tax_id"].isnull()].shape[0]}')
print(f'The number of rows where canonical_smiles is None is {df[df["canonical_smiles"].isnull()].shape[0]}')

The number of rows where tid is None is 0
The number of rows where assay_id is None is 0
The number of rows where assay_tax_id is None is 0
The number of rows where canonical_smiles is None is 0


In [10]:
#tmap_plot_maedeh(df=ic50_mincur_df, title='ic50_mincur', fp='map4c')
tmap_plot_maedeh(df=mor_ic50_mincur_df, title='mor_ic50_mincur', fp='ecfp4')

ValueError: could not convert string to float: 'None'

#### Tmap: GPCR vs OR

In [None]:
or_ic50_mincur_df = ic50_mincur_df[ic50_mincur_df['target_chembl_id'].isin([mor_chembl_id, kor_chembl_id, dor_chembl_id, nor_chembl_id])]
df = ic50_mincur_df.copy()
node_size = 1/32
mmm_repeats = 2
steps = 5
k = 1000
shader = 'smoothCircle'
title = 'IC50_mincur'
point_scale = 2.5
max_point_size = 10

In [None]:
# create a layout configuration
lf = tm.LSHForest(512, 32) # a locality-sensitive hashing forest, used for approximate nearest neighbor search is initialized with parameters 512 and 32.
ECFP4 = np.array(df['map4c'])  # converting the FPs to vector units
fps = []
for i in ECFP4:
    vec = tm.VectorUint(i)
    fps.append(vec)
lf.batch_add(fps)
lf.index()

cfg = tm.LayoutConfiguration()  # configuration parameters for TMAP layout
cfg.node_size = node_size  # size of nodes which affects the magnitude of their repelling force.
cfg.mmm_repeats = mmm_repeats  # number of repeats of the per-level layout algorithm
cfg.sl_extra_scaling_steps = steps  # sets the number of repeats of the scaling
cfg.k = k  # number of nearest neighbours used to create the k-nearest neighbour graph
cfg.sl_scaling_type = tm.RelativeToAvgLength  # Defines the relative scale of the graph

print("Creating layout")
x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)

target_chembl_id_labels, target_chembl_id_data = Faerun.create_categories(df['target_chembl_id'])
ors = ['CHEMBL2014', 'CHEMBL233', 'CHEMBL236', 'CHEMBL237' ]
ors_labels = [(7, "Other")]
target_chembl_id_map = [7]*len(target_chembl_id_data)
value = 1
for i, name in target_chembl_id_labels:
    if i not in ors:
        v=value
        if v == 7:
            v = 0
        ors_labels.append((v, name))
        target_chembl_id_map[i] = v
        value += 1

        
## create a Faerun object
#f = Faerun(
#    view="front",
#    coords=False,
#    title="",
#    clear_color="#FFFFFF"
#)
#
## label the points
#labels = []
#for i, s in enumerate(df['canonical_smiles']):
#    labels.append(
#        s
#        + "__"
#        + df['target_chembl_id'].values[i])
#
## color the points
#target_chembl_id_map = dict([(y, x+1) for x, y in enumerate(sorted(set(df['target_chembl_id'])))])
#classes = ['CHEMBL2014', 'CHEMBL233', 'CHEMBL236', 'CHEMBL237' ]
#i = 0
#for key, value in target_chembl_id_map.items():
#    if key not in classes:
#        target_chembl_id_map[key] = 0
#    else:
#        i += 1
#        target_chembl_id_map[key] = i
#
#legend_labels =[
#    (0, 'Others'),
#    (1, 'Nociceptin receptor'),
#    (2, 'Mu opioid receptor'),
#    (3, 'Delta opioid receptor'),
#    (4, 'Kappa opioid receptor'),
#]
#
#vals = [int(target_chembl_id_map[x]) for x in df['target_chembl_id']]
#
#category_labels, category_data = Faerun.create_categories(vals)
#
#print("Adding scatter")
## add the scatter plot
#f.add_scatter(
#    title,
#    {"x": x,
#    "y": y,
#    "c": category_data,
#    "labels": labels, # the point labels
#    },
#    point_scale=point_scale,
#    max_point_size=max_point_size,
#    shader=shader,
#    legend_labels=category_labels, # the color legend
#    categorical=True,
#    colormap="tab10",
#    has_legend=True,
#)
#print("Plotting")
#f.add_tree('Tree', {"from": s, "to": t}, point_helper=title)
#f.plot(title, template='smiles')

In [None]:
target_chembl_id_labels, target_chembl_id_data = Faerun.create_categories(df['target_chembl_id'])
print(f'target_chembl_id_data is {target_chembl_id_data}')  
ors = ['CHEMBL2014', 'CHEMBL233', 'CHEMBL236', 'CHEMBL237' ]
ors_labels = [(7, "Other")]
target_chembl_id_map = [7]*len(target_chembl_id_data)
value = 1
for i, name in target_chembl_id_labels:
    if i  in ors:
        v=value
        if v == 7:
            v = 0
        ors_labels.append((v, name))
        target_chembl_id_map[i] = v
        value += 1

print('The value_counts for list of target_chembl_id_map is:{}\n'.format(pd.Series(ors_labels).value_counts()))
target_chembl_id_data = [target_chembl_id_map[val] for val in enumerate(target_chembl_id_data)]
target_chembl_id_data

In [None]:
ors_labels