In [None]:
# inner module import
import sys
from const import FETCH_DATA_DIR, FETCH_FIG_DIR, FEATURIZE_DATA_DIR, FEATURIZE_FIG_DIR

import os
import pandas as pd
import numpy as np
import tmap as tm
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from faerun import Faerun

# Functions

In [None]:
def calc_fp_in_df(df, radius=2, nbits=1024):
    '''
    Cacluate the fingerprint based on column 'canonical_smiles' in the dataframe and add it to the dataframe
    '''
    new_df = df.copy()
    # Calculate the fingerprint based on the canonical_smiles
    new_df['mol'] = new_df['canonical_smiles'].apply(Chem.MolFromSmiles)
    new_df['fp'] = new_df['mol'].apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, radius, nbits))

    print(f'The shape of df is {new_df.shape}')
    
    return new_df 

def get_activity(x):
    ''' active is defined as pchembl_value > 7 '''
    if x > 7:
        return 'active'
    elif 5 < x <= 7:
        return 'intermediate'
    else:
        return 'inactive'

In [17]:
def tmap_plot(df, title:str ='tmap', fp:str='ecfp4',category_col:str='target_chembl_id',
              node_size:float=1/32, mmm_repeats:int=2, steps:int=5, k:int=1000,
              shader:str='smoothCircle',  point_scale:float=2.5, max_point_size:int=10,
              colormap:list=['Set1', 'tab20', 'turbo', 'tab10' ]):
    '''determining the layout of the TMAP, and plotting the TMAP'''
    lf = tm.LSHForest(2048, 32) # a locality-sensitive hashing forest, used for approximate nearest neighbor search is initialized with parameters 512 and 32.
    fp = np.array(df[fp])  # converting the FPs to vector units
    fps = []
    for i in fp:
        vec = tm.VectorUint(i)
        fps.append(vec)
    lf.batch_add(fps)
    lf.index()
    cfg = tm.LayoutConfiguration()  # configuration parameters for TMAP layout
    cfg.node_size = node_size  # size of nodes which affects the magnitude of their repelling force.
    cfg.mmm_repeats = mmm_repeats  # number of repeats of the per-level layout algorithm
    cfg.sl_extra_scaling_steps = steps  # sets the number of repeats of the scaling
    cfg.k = k  # number of nearest neighbours used to create the k-nearest neighbour graph. The higher the value, the more computationally expensive.
    cfg.sl_scaling_type = tm.RelativeToAvgLength  # Defines the relative scale of the graph

    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)

    # create a dict for the categories in 'dataset' column
    category_labels, category_data = Faerun.create_categories(df[category_col])

    # TMAP for Aline's compounds based on the three categories, aromatic ring fraction, heavy atom counts and qualitative estimation drug-likeliness
    f = Faerun(
        view="front",
        coords=False,
        title="",
        clear_color="#FFFFFF"
    )

    f.add_scatter(
        title,
        {
            "x": x,
            "y": y,
            "c": [category_data, # categories

                df['tid'].values.tolist(), # aromatic fraction
                df['assay_id'].values.tolist(), # heavy atom counts
                df['pchembl_value'].values.tolist(), # qualitative estimation of drug likeliness

                ],
            "labels": df['canonical_smiles'].values.tolist(), # SMILES as labels
        },
        shader="sphere",
        point_scale=5,
        max_point_size=20,
        legend_labels=[category_labels, None, None, None],
        categorical=[True, False, False, False],
        colormap=colormap,
        series_title=[category_col, 'tid', 'assay_id', 'pchembl_value'],
        has_legend=True,
    )
    f.add_tree("AlineDB_TMAP_tree", {"from": s, "to": t}, point_helper=title)
    f.plot(title, template='smiles')

    #labels = [] # the widget labels
    #for i, s in enumerate(df['canonical_smiles']):
    #    labels.append(
    #        s
    #        + "__"
    #        # convert df['tid'].values to a list of strings and append it to the labels
    #        + str(df['tid'].values.tolist()[i])
    #        + "__"
    #        + str(df['assay_id'].tolist()[i])
    #        + "__"
    #        + str(df['pchembl_value'].tolist()[i])
    #        )


# Load the data

## Ki

In [7]:
import glob

# Find all files starting with 'ki_target_CHEMBL233' in the FEATURIZE_DATA_DIR
file_pattern = os.path.join(FEATURIZE_DATA_DIR, 'ki_maxcur', 'ki_target_CHEMBL233*.pkl')
file_list = glob.glob(file_pattern)

# for file 'ki_target_CHEMBL233_1_fp.pkl', add a column 'dataset' with value 'ki_mor_1'; In a similar fashion, for all other files, add a column 'dataset' with value 'ki_mor_2' for file 'ki_target_CHEMBL233_2_fp.pkl'. Finally, concatenate all the dataframes into a single dataframe ki_mor_df.
ki_mor_df = pd.DataFrame()
for file in file_list:
    df = pd.read_pickle(file)
    dataset = os.path.basename(file).split('_')[3]
    print(dataset)
    df['dataset'] = dataset

    ki_mor_df = pd.concat([ki_mor_df, df], axis=0)

# load  all ki data for GPCR
ki_gpcr_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR,'ki_maxcur_8_fp.pkl'))
ic50_gpcr_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR,'ic50_maxcur_8_fp.pkl'))
ec50_gpcr_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR,'ec50_maxcur_8_fp.pkl'))
# add a column 'target_name', for rows with 'target_chembl_id' = 'CHEMBL233', with value 'MOR' and for rows with 'target_chembl_id' = 'CHEMBL237', with value 'KOR' and for rows with 'target_chembl_id' = 'CHEMBL236', with value 'DOR'and for rows with 'target_chembl_id' = 'CHEMBL2014', with value 'NOR', the rest with value 'Others'
ki_gpcr_df['target_name'] = ki_gpcr_df['target_chembl_id'].apply(lambda x: 'MOR' if x == 'CHEMBL233' else 'KOR' if x == 'CHEMBL237' else 'DOR' if x == 'CHEMBL236' else 'NOR' if x == 'CHEMBL2014' else 'Others')

1
1022
38
639
308
36
379
1632
2942


In [3]:
# for column 'dataset' where the value is not '1', replace the value with '2'
ki_mor_df['dataset'] = ki_mor_df['dataset'].apply(lambda x: 'others' if x != '1' else x)
print(f'The length of ki_mor_df is {len(ki_mor_df)}')
print(f'The columns in ki_mor_df is {ki_mor_df.columns}')

print(f'The length of ki_gpcr_df is {len(ki_gpcr_df)}')
print(f'The columns in ki_gpcr_df is {ki_gpcr_df.columns}')

The length of ki_mor_df is 3821
The columns in ki_mor_df is Index(['assay_id', 'assay_chembl_id', 'tid', 'target_chembl_id',
       'standard_type', 'pchembl_value', 'assay_type', 'assay_category',
       'assay_organism', 'assay_tax_id', 'assay_strain', 'assay_tissue',
       'assay_cell_type', 'assay_subcellular_fraction', 'bao_format',
       'variant_id', 'assay_test_type', 'assay_desc', 'cell_id', 'tissue_id',
       'curated_by', 'relationship_type', 'aidx', 'confidence_score',
       'molregno', 'compound_chembl_id', 'canonical_smiles', 'assay_info_hash',
       'ecfp4', 'map4c', 'activity', 'dataset'],
      dtype='object')
The length of ki_gpcr_df is 139416
The columns in ki_gpcr_df is Index(['assay_id', 'assay_chembl_id', 'tid', 'target_chembl_id',
       'standard_type', 'pchembl_value', 'assay_type', 'assay_category',
       'assay_organism', 'assay_tax_id', 'assay_strain', 'assay_tissue',
       'assay_cell_type', 'assay_subcellular_fraction', 'bao_format',
       'variant

In [11]:
#ki_mor_from_gpcr_df = ki_gpcr_df[ki_gpcr_df['target_chembl_id'] == 'CHEMBL233']
#print(f'The length of ki_mor_from_gpcr_df is {len(ki_mor_df)}')
#ki_mor_from_gpcr_df.to_csv('ki_mor_from_gpcr.csv', index=False)
#ic50_mor_from_gpcr_df = ic50_gpcr_df[ic50_gpcr_df['target_chembl_id'] == 'CHEMBL233']
#print(f'The length of ic50_mor_from_gpcr_df is {len(ic50_mor_from_gpcr_df)}')
#ic50_mor_from_gpcr_df.to_csv('ic50_mor_from_gpcr.csv', index=False)
#ec50_mor_from_gpcr_df = ec50_gpcr_df[ec50_gpcr_df['target_chembl_id'] == 'CHEMBL233']
#print(f'The length of ec50_mor_from_gpcr_df is {len(ec50_mor_from_gpcr_df)}')
#ec50_mor_from_gpcr_df.to_csv('ec50_mor_from_gpcr.csv', index=False)

The length of ki_mor_from_gpcr_df is 3821
The length of ic50_mor_from_gpcr_df is 1088
The length of ec50_mor_from_gpcr_df is 1850


In [4]:
ki_mor_df['assay_category']= ki_mor_df['dataset'].apply(lambda x : 'RBA' if x in ['1', '36', '38', ] else 'Others')

## IC50

In [14]:
import glob

# Find all files starting with 'ki_target_CHEMBL233' in the FEATURIZE_DATA_DIR
file_pattern = os.path.join(FEATURIZE_DATA_DIR, 'ic50_maxcur', 'ic50_target_CHEMBL233*.pkl')
file_list = glob.glob(file_pattern)

# for file 'ki_target_CHEMBL233_1_fp.pkl', add a column 'dataset' with value 'ki_mor_1'; In a similar fashion, for all other files, add a column 'dataset' with value 'ki_mor_2' for file 'ki_target_CHEMBL233_2_fp.pkl'. Finally, concatenate all the dataframes into a single dataframe ki_mor_df.
ic50_mor_df = pd.DataFrame()
for file in file_list:
    df = pd.read_pickle(file)
    dataset = os.path.basename(file).split('_')[3]
    print(dataset)
    df['dataset'] = dataset

    ic50_mor_df = pd.concat([ic50_mor_df, df], axis=0)

# load  all ki data for GPCR
ic50_gpcr_df = pd.read_pickle(os.path.join(FEATURIZE_DATA_DIR,'ic50_maxcur_8_fp.pkl'))

1


In [15]:
print(f'The length of ic50_gpcr_df is {len(ic50_gpcr_df)}')
print(f'The columns in ic50_gpcr_df is {ic50_gpcr_df.columns}')

The length of ic50_gpcr_df is 88622
The columns in ic50_gpcr_df is Index(['assay_id', 'assay_chembl_id', 'tid', 'target_chembl_id',
       'standard_type', 'pchembl_value', 'assay_type', 'assay_category',
       'assay_organism', 'assay_tax_id', 'assay_strain', 'assay_tissue',
       'assay_cell_type', 'assay_subcellular_fraction', 'bao_format',
       'variant_id', 'assay_test_type', 'assay_desc', 'cell_id', 'tissue_id',
       'curated_by', 'relationship_type', 'aidx', 'confidence_score',
       'molregno', 'compound_chembl_id', 'canonical_smiles', 'assay_info_hash',
       'ecfp4', 'map4c', 'activity'],
      dtype='object')


# tmap

In [29]:
tmap_plot(ki_mor_df, title='ki_mor_binary_tmap', category_col='dataset', node_size=1/50, point_scale=1.0, colormap=ListedColormap(["#D62728", "#59B4C3"]))

In [27]:
#tmap_plot(ki_gpcr_df, title='ki_gpcr_tmap', category_col='target_chembl_id', node_size=1/50, point_scale=2.0, max_point_size=100, k=1000, colormap=['viridis', 'tab20', 'turbo', 'tab10'])
#tmap_plot(ki_gpcr_df, title='ki_gpcr_tmap', category_col='target_chembl_id', node_size=50, point_scale=2.0, max_point_size=20, k=1000, colormap=['viridis', 'tab20', 'turbo', 'tab10']) # Ye's parameters
#tmap_plot(ki_gpcr_df, title='ki_gpcr_tmap', category_col='target_chembl_id', node_size=1/32, point_scale=5, max_point_size=20, k=1000, colormap=['viridis', 'tab20', 'turbo', 'tab10']) # Maedeh's parameters
#tmap_plot(ic50_gpcr_df, title='ic50_gpcr_tmap', category_col='target_chembl_id', node_size=1/32, point_scale=5, max_point_size=50, k=1000, colormap=['viridis', 'tab20', 'turbo', 'tab10'])
tmap_plot(ki_gpcr_df, title='ki_gpcr_tmap', category_col='target_name', node_size=1/50, point_scale=2.5, max_point_size=10, k=1000, colormap=ListedColormap(["#211C6A", "#74E291", "#59B4C3", "#EFF396", "#D9D9D9"])) # 2048, 32