In [None]:
# conda env: pyg(Python 3.9.16)
import sys
from datacat4ml.const import FETCH_DATA_DIR, FETCH_FIG_DIR, FEAT_DATA_DIR, FEAT_FIG_DIR
from datacat4ml.Scripts.data_prep.data_categorize.categorize_regex import ki_gpcr_df, ic50_gpcr_df, ec50_gpcr_df, OR_dfs

import os
import pandas as pd
import numpy as np
import tmap as tm
from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
from faerun import Faerun

from rdkit import Chem
from rdkit.Chem import AllChem

# Add identifier to the dfs for labeling in the tmap

In [None]:
# Map ChEMBL target IDs to the short labels used to colour the TMAP.
# Any target ID that is not listed here is labelled 'Others'.
# NOTE(review): MOR/KOR/DOR/NOR presumably stand for the mu/kappa/delta/nociceptin
# opioid receptors -- confirm against the ChEMBL target pages.
OPIOID_TARGET_NAMES = {
    'CHEMBL233': 'MOR',
    'CHEMBL237': 'KOR',
    'CHEMBL236': 'DOR',
    'CHEMBL2014': 'NOR',
}


def label_target(chembl_id):
    '''Return the short label for a ChEMBL target ID, or 'Others' if it is not listed.'''
    return OPIOID_TARGET_NAMES.get(chembl_id, 'Others')


# A single mapping and a single loop replace three copy-pasted nested-ternary lambdas,
# so the ID -> label table only has to be maintained in one place.
# The imported frames are still modified in place, because later cells read the
# 'target_name' column directly from ki_gpcr_df / ic50_gpcr_df / ec50_gpcr_df.
for gpcr_df in (ki_gpcr_df, ic50_gpcr_df, ec50_gpcr_df):
    gpcr_df['target_name'] = gpcr_df['target_chembl_id'].apply(label_target)

# Functions

In [None]:
def calc_fp_in_df(df, radius=2, nbits=1024):
    '''
    Calculate Morgan fingerprints from the 'canonical_smiles' column.

    Works on a copy of ``df``; the input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'canonical_smiles' column.
    radius : int
        Radius forwarded to ``AllChem.GetMorganFingerprintAsBitVect``.
    nbits : int
        Forwarded as the third positional argument (the bit-vector length).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with two extra columns: 'mol' (result of
        ``Chem.MolFromSmiles``) and 'fp' (the fingerprint of each 'mol').

    Notes
    -----
    Prints the shape of the resulting frame.
    NOTE(review): a SMILES that RDKit cannot parse presumably yields ``None`` in
    'mol', which would then make the fingerprint call raise -- confirm upstream
    data is clean.
    '''
    def _morgan_bitvect(mol):
        # Same positional call as before: (mol, radius, nbits).
        return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nbits)

    enriched = df.copy()
    smiles_column = enriched['canonical_smiles']
    enriched['mol'] = smiles_column.apply(Chem.MolFromSmiles)
    enriched['fp'] = enriched['mol'].apply(_morgan_bitvect)

    print(f'The shape of df is {enriched.shape}')

    return enriched

def tmap_plot(df, title:str ='tmap', category_col:str='target_chembl_id',
              node_size:float=1/32, mmm_repeats:int=2, steps:int=5, k:int=1000,
              shader:str='smoothCircle',  point_scale:float=2.5, max_point_size:int=10,
              colormap:list=['Set1', 'tab20', 'turbo', 'tab10' ]):
    '''Compute a TMAP layout for the compounds in ``df`` and render it with Faerun.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``category_col``, 'canonical_smiles', 'tid', 'assay_id' and
        'pchembl_value'. If there is no 'fp' column, fingerprints are computed
        with ``calc_fp_in_df(df, radius=2, nbits=1024)``.
    title : str
        Name of the scatter layer and of the plot passed to ``Faerun.plot``.
    category_col : str
        Column turned into the categorical colour series (first series).
    node_size, mmm_repeats, steps, k
        Forwarded to ``tm.LayoutConfiguration`` as ``node_size``, ``mmm_repeats``,
        ``sl_extra_scaling_steps`` and ``k`` respectively.
    shader, point_scale, max_point_size
        Forwarded to ``Faerun.add_scatter``.
    colormap
        One colormap or a list of colormaps, forwarded to ``Faerun.add_scatter``.

    Returns
    -------
    None
        The plot is produced as a side effect of ``Faerun.plot(title, template='smiles')``.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    '''
    # Fail fast on missing columns, before the expensive layout step.
    required_cols = [category_col, 'canonical_smiles', 'tid', 'assay_id', 'pchembl_value']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f'tmap_plot: missing required column(s): {missing_cols}')

    # Compute fingerprints only when the caller has not supplied them.
    # (The column name must be the string 'fp'; the bare name `fp` was an
    # unassigned local and raised UnboundLocalError on every call.)
    if 'fp' not in df.columns:
        df = calc_fp_in_df(df, radius=2, nbits=1024)

    # Copy a list argument so the shared default list can never be mutated downstream.
    if isinstance(colormap, list):
        colormap = list(colormap)

    # Locality-sensitive hashing forest used for approximate nearest-neighbour
    # search, initialised with parameters 2048 and 32.
    # NOTE(review): the fingerprint bits are added as-is, without a tm.Minhash
    # encoding step, and calc_fp_in_df produces 1024-bit vectors while the forest
    # is created with 2048 -- TODO confirm this is intended.
    lf = tm.LSHForest(2048, 32)
    lf.batch_add([tm.VectorUint(bits) for bits in df['fp']])
    lf.index()

    cfg = tm.LayoutConfiguration()  # configuration parameters for the TMAP layout
    cfg.node_size = node_size  # size of nodes, which affects the magnitude of their repelling force
    cfg.mmm_repeats = mmm_repeats  # number of repeats of the per-level layout algorithm
    cfg.sl_extra_scaling_steps = steps  # number of repeats of the scaling
    cfg.k = k  # nearest neighbours used to build the k-NN graph; higher is more expensive
    cfg.sl_scaling_type = tm.RelativeToAvgLength  # defines the relative scale of the graph

    # x/y are the point coordinates; s/t are the tree edge endpoints ("from"/"to").
    x, y, s, t, _ = tm.layout_from_lsh_forest(lf, cfg)

    # Legend labels and per-point category values for the categorical colour series.
    category_labels, category_data = Faerun.create_categories(df[category_col])

    f = Faerun(
        view="front",
        coords=False,
        title="",
        clear_color="#FFFFFF"
    )

    # Four colour series: the category column plus three numeric columns.
    f.add_scatter(
        title,
        {
            "x": x,
            "y": y,
            "c": [
                category_data,                        # categories from `category_col`
                df['tid'].values.tolist(),            # 'tid' column
                df['assay_id'].values.tolist(),       # 'assay_id' column
                df['pchembl_value'].values.tolist(),  # 'pchembl_value' column
            ],
            "labels": df['canonical_smiles'].values.tolist(),  # SMILES as labels
        },
        # Honour the caller's settings; these were previously hard-coded to
        # "sphere", 5 and 20, so the parameters had no effect.
        shader=shader,
        point_scale=point_scale,
        max_point_size=max_point_size,
        legend_labels=[category_labels, None, None, None],
        categorical=[True, False, False, False],
        colormap=colormap,
        series_title=[category_col, 'tid', 'assay_id', 'pchembl_value'],
        has_legend=True,
    )
    f.add_tree("AlineDB_TMAP_tree", {"from": s, "to": t}, point_helper=title)
    f.plot(title, template='smiles')

# Plot the TMAP

In [None]:
# Five-colour palette for the Ki TMAP (presumably one colour per 'target_name' label).
ki_target_palette = ListedColormap(["#211C6A", "#74E291", "#59B4C3", "#EFF396", "#D9D9D9"])

tmap_plot(
    ki_gpcr_df,
    title='ki_gpcr_tmap',
    category_col='target_name',
    node_size=1/50,
    point_scale=2.5,
    max_point_size=10,
    k=1000,
    colormap=ki_target_palette,
)