In [2]:
# conda env: pyg (Python3.9.16)
import sys
from datacat4ml.const import DATA_DIR, FIG_DIR, FETCH_DATA_DIR , FETCH_FIG_DIR

import os
from typing import List
import string
# load ipython-sql, a Jupyter Notebook magic extension. 
%load_ext sql 
import sqlite3
from sqlite3 import connect
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from sklearn.metrics import r2_score,cohen_kappa_score,balanced_accuracy_score,\
    median_absolute_error,matthews_corrcoef
import hashlib
from collections import Counter, defaultdict

from rdkit import Chem
from rdkit.Chem import AllChem

# Connect to the database

In [2]:
%sql sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.dbf

conn = sqlite3.connect('/storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db')

# Fetch GPCR data

## uniprot_ids

In [3]:
# Read the table of human GPCR proteins and keep the values of its 'Entry' column
# as a plain Python list.
GPCR_human = pd.read_csv(
    os.path.join(DATA_DIR, 'GPCR_human.tsv'),
    sep='\t',
)
gpcr_uniprot_ids_list = GPCR_human['Entry'].tolist()

# Quick sanity check on what was loaded.
print(f'The number of proteins in GPCR_human is {len(gpcr_uniprot_ids_list)}')
print(f'The first 5 proteins in GPCR_human are {gpcr_uniprot_ids_list[:5]}')

The number of proteins in GPCR_human is 898
The first 5 proteins in GPCR_human are ['Q16570', 'O00590', 'P25106', 'Q9NPB9', 'P18509']


In [4]:
# Opioid receptor proteins in human
# One identifier per receptor, kept in the order mor / kor / dor / nor.
# NOTE(review): presumably mu, kappa, delta and nociceptin receptors — confirm.
# UniProt identifiers (same kind of value as the entries of gpcr_uniprot_ids_list).
mor_uniprot_id = 'P35372'
kor_uniprot_id = 'P41145'
dor_uniprot_id = 'P41143'
nor_uniprot_id = 'P41146'

# ChEMBL identifiers for the same four receptors, in the same order.
mor_chembl_id = 'CHEMBL233'
kor_chembl_id = 'CHEMBL237'
dor_chembl_id = 'CHEMBL236'
nor_chembl_id = 'CHEMBL2014'

## write the below columns to a csv file

x
- 'canonical_smiles'
- assays.
    - assay_id,
    - chembl_id --> 'assay_chembl_id',
    - doc_id, 
    
    in the maxMatch:
    - **assay_type**, e.g. B, F.
    - **assay_category**, (very sparse), e.g. Selectivity assay, Affinity biochemical assay, Affinity on-target cellular assay, Affinity phenotypic cellular assay, GPCR beta-arrestin assay, Thermal shift assay.
    - **assay_organism**, e.g. Homo sapiens, Rattus norvegicus
    - **assay_tax_id**, e.g. 9606 (for human), 1280 (for Staphylococcus aureus)
    - **assay_strain**, (moderately sparse), e.g. LV9, BaL.
    - **assay_tissue**, (moderately sparse), e.g. Brain, Plasma
    - **assay_cell_type**, (moderately sparse), e.g. PC-3M, CHO
    - **assay_subcellular_fraction**, (very sparse), e.g. Membrane, Microsome, Mitochondria
    - **bao_format**, (e.g, BAO_0000019: could be cell-based format, organism-based format)
    - **variant_id**, (very sparse)

    something else
    - **assay_test_type**, (sparse), e.g. in vitro, in vivo.
    - **description**
    - cell_id, (moderately sparse)
    - tissue_id, (moderately sparse)
    - curated_by, e.g. Autocuration, Intermediate, Expert. 
    - relationship_type, ? e.g. H, U, N, D, M, S 
    - aidx,  ? e.g. CLD0, 1480429, 1480486

support
- target_dictionary.chembl_id --> 'target_chembl_id'
- standard_type, standard_relation, standard_units
'compound_chembl_id'
- assays.relationship_type (e.g. D, stands for direct protein target assigned)

annotation
- assays.src_id (an integer that uniquely identifies the data source in UniChem)


y
- 'pchembl_value'

## functions

### def gather_data_for_size

In [8]:
def gather_data_for_size(uniprot_id: List[str], standard_type:str,
                         onlyDocs=True,removeMutants=True,confidenceScore=8,
                         maxAssaySize=100):
    """ Get the data for the size of the dataset """
    
    if isinstance(uniprot_id, str):
        uniprot_id = [uniprot_id]
    uniprot_id_str = ",".join(f"'{id}'" for id in uniprot_id)

    print('table temp_targets')
    %sql \
        drop table if exists temp_targets
    %sql \
        create table temp_targets as \
        select tid, component_id, accession, target_type, pref_name target_pref_name, chembl_id target_chembl_id \
        from target_components \
        join component_sequences using(component_id) \
        join target_dictionary using(tid) \
        where accession in ({uniprot_id_str}) \
        and target_type = 'SINGLE PROTEIN';
    print(f"The shape of temp_targets is {pd.read_sql('select * from temp_targets', con=conn).shape}")

    # collect activities for the assays that meet the basic criteria
    print('table temp_assays')
    %sql \
        drop table if exists temp_assays
    %sql \
        create table temp_assays as \
        select assay_id, assays.chembl_id assay_chembl_id, assays.description assay_desc, assays.doc_id assay_doc_id, variant_id, \
        docs.year doc_date, docs.chembl_id doc_chembl_id, \
        tid, accession, target_type, target_pref_name, target_chembl_id, \
        count(distinct(molregno)) cnt \
        from activities \
        join assays using(assay_id) \
        join docs on (assays.doc_id = docs.doc_id) \
        join temp_targets using (tid) \
        where standard_type =:standard_type \
        and pchembl_value is not null \
        group by tid, assay_doc_id, assay_id \
        order by cnt desc;
    print(f"The shape of temp_assays is {pd.read_sql('select * from temp_assays', con=conn).shape}")

    # now remove rows for assays which are not compatible without curation steps
    print('check onlyDocs')
    if onlyDocs:
        %sql \
            delete from temp_assays where doc_date is null;

    print('check removeMutants')
    if removeMutants:
        %sql \
            delete from temp_assays where variant_id is not null or lower(assay_desc) like '%mutant%'\
                or lower(assay_desc) like '%mutantion%' or lower(assay_desc) like '%variant%';
                
    print('check confidenceScore')
    # filter out assays with confidence score less than confidenceScore
    %sql \
        drop table if exists temp_assays_filtered;
    %sql \
        create table temp_assays_filtered as \
        select ta.*, confidence_score from temp_assays ta \
            join assays using(assay_id) \
            where confidence_score >= :confidenceScore;  
    print(f"The shape of temp_assays_filtered is {pd.read_sql('select * from temp_assays_filtered', con=conn).shape}")
    
    # select activities from the assays we've identified which have between less than maxAssaySize tested compounds
    # why are 'pchembl_value' and 'standard_type' used here again? because here join table 'activities' again only on 'assay_id'.
    print('temp_acts')
    %sql \
        drop table if exists temp_acts
    %sql \
        create table temp_acts as \
        select assay_id, assay_chembl_id, tid, target_chembl_id,molregno,pchembl_value,doc_chembl_id,standard_type,activity_id \
        from activities \
        join temp_assays_filtered using (assay_id)\
        where pchembl_value is not null \
        and standard_type =:standard_type \
        and cnt <= :maxAssaySize;  
    print(f"The shape of temp_acts is {pd.read_sql('select * from temp_acts', con=conn).shape}")

    # count the number of unique compounds in each assay_id
    # `group by`statemenet groups rows that have the same values into summary rows
    print('temp_acts_counts')
    %sql \
        drop table if exists temp_acts_counts
    %sql \
        create table temp_acts_counts as \
        select assay_id, assay_chembl_id, ta.tid, target_chembl_id, count(distinct molregno) cnt, doc_chembl_id \
        from temp_acts ta \
        join assays using (assay_id) \
        group by assay_chembl_id, target_chembl_id, doc_chembl_id \
        order by cnt desc;
    print(f"The shape of temp_acts_counts is {pd.read_sql('select * from temp_acts_counts', con=conn).shape}")
    
    # count the number of unique assays for each target
    print('temp_targets_counts')
    %sql \
        drop table if exists temp_targets_counts
    %sql \
        create table temp_targets_counts as \
        select tid, target_chembl_id, count(distinct assay_chembl_id) target_assay_count, sum(cnt) target_compound_count \
        from temp_acts_counts tac \
        group by target_chembl_id \
        order by target_assay_count desc;
    print(f"The shape of temp_targets_counts is {pd.read_sql('select * from temp_targets_counts', con=conn).shape}")

#### temp_targets

In [None]:
# Step-by-step copy of the first stage of gather_data_for_size, run for all human GPCR
# accessions so that the intermediate table temp_targets can be inspected.
uniprot_id = gpcr_uniprot_ids_list
if isinstance(uniprot_id, str):
    uniprot_id = [uniprot_id]
# quoted, comma-separated accession list spliced into the `in (...)` clause below
uniprot_id_str = ",".join(f"'{id}'" for id in uniprot_id)

print('table temp_targets')
%sql \
    drop table if exists temp_targets
%sql \
    create table temp_targets as \
    select tid, component_id, accession, target_type, pref_name target_pref_name, chembl_id target_chembl_id \
    from target_components \
    join component_sequences using(component_id) \
    join target_dictionary using(tid) \
    where accession in ({uniprot_id_str}) \
    and target_type = 'SINGLE PROTEIN';

# display the table
target_df = %sql select * from temp_targets;
print(f'The shape of the target_df is {target_df.DataFrame().shape}')
target_df

#### temp_assays

In [None]:
# Step-by-step copy of the temp_assays stage of gather_data_for_size for IC50.
# Requires the table temp_targets to exist already.
standard_type = 'IC50'
print('table temp_assays')
# collect activities for the assays that meet the basic criteria;
# `cnt` is the number of distinct compounds (molregno) per target/document/assay group
%sql \
    drop table if exists temp_assays
%sql \
    create table temp_assays as \
    select assay_id, assays.chembl_id assay_chembl_id, assays.description assay_desc, assays.doc_id assay_doc_id,variant_id, \
    docs.year doc_date, docs.chembl_id doc_chembl_id, \
    tid, accession, target_type, target_pref_name, target_chembl_id, \
    count(distinct(molregno)) cnt \
    from activities \
    join assays using(assay_id) \
    join docs on (assays.doc_id = docs.doc_id) \
    join temp_targets using (tid) \
    where standard_type =:standard_type \
    and pchembl_value is not null \
    group by tid, assay_doc_id, assay_id \
    order by cnt desc;

# load temp_assays into a pandas dataframe
assay_df_pre = %sql select * from temp_assays
assay_df = assay_df_pre.DataFrame()
print(f'The shape of the assay_df is {assay_df.shape}')
assay_df.head(1)

#### temp_assays_1_cs

In [None]:
onlyDocs = True
removeMutants = True
confidenceScore = 8    

# now remove rows for assays which are not compatible without curation steps
print('check onlyDocs')
if onlyDocs:
    %sql \
        delete from temp_assays where doc_date is null;
print('check removeMutants')
if removeMutants:
    %sql \
        delete from temp_assays where variant_id is not null or lower(assay_desc) like '%mutant%'\
            or lower(assay_desc) like '%mutantion%' or lower(assay_desc) like '%variant%';
print('check confidenceScore')
# filter out assays with confidence score less than confidenceScore
%sql \
    drop table if exists temp_assays_1_cs;
%sql \
    create table temp_assays_1_cs as \
    select ta.*, confidence_score from temp_assays ta \
        join assays using(assay_id) \
        where confidence_score >= :confidenceScore;

# save temp_assays to a pandas dataframe
curated_assay_df_pre = %sql select * from temp_assays_1_cs
curated_assay_df = curated_assay_df_pre.DataFrame()
print(f'The shape of the assay_df is {curated_assay_df.shape}')
# display the rows 50-100 of the table
curated_assay_df.sort_values(by='cnt', ascending=True).iloc[:5]

#### temp_acts

In [None]:
# Step-by-step copy of the temp_acts stage; reads temp_assays_1_cs and the
# `standard_type` variable, both of which must already exist.
maxAssaySize = 100
# select activities from the assays we've identified which have at most maxAssaySize tested compounds
print('temp_acts')
%sql \
    drop table if exists temp_acts
%sql \
    create table temp_acts as \
    select assay_id, assay_chembl_id, tid, target_chembl_id,molregno,pchembl_value,doc_chembl_id,standard_type,activity_id \
    from activities \
    join temp_assays_1_cs using (assay_id)\
    where pchembl_value is not null \
    and standard_type =:standard_type \
    and cnt <= :maxAssaySize;

# load temp_acts into a pandas dataframe
temp_acts_df_pre = %sql select * from temp_acts
temp_acts_df = temp_acts_df_pre.DataFrame()
print(f'The shape of the temp_acts_df is {temp_acts_df.shape}')
temp_acts_df.head(1)

#### temp_acts_counts

In [None]:
# count the number of unique compounds (distinct molregno) per assay / target / document group
# requires the table temp_acts to exist already
print('temp_acts_counts')
%sql \
    drop table if exists temp_acts_counts
%sql \
    create table temp_acts_counts as \
    select assay_id, assay_chembl_id, ta.tid, target_chembl_id, count(distinct molregno) cnt, doc_chembl_id \
    from temp_acts ta \
    join assays using (assay_id) \
    group by assay_chembl_id, target_chembl_id, doc_chembl_id \
    order by cnt desc;

# load temp_acts_counts into a pandas dataframe
temp_acts_counts_df_pre = %sql select * from temp_acts_counts
temp_acts_counts_df = temp_acts_counts_df_pre.DataFrame()
print(f'The shape of the temp_acts_counts_df is {temp_acts_counts_df.shape}')
temp_acts_counts_df

In [None]:
# check the duplicate combinations of 'target_chembl_id and doc_chembl_id'
# here we discuss the necessity to only get one assay per target per doc --> not necessary
# test_table keeps only the (target, document) pairs that occur in more than one row of temp_acts_counts
%sql \
    drop table if exists test_table
%sql \
    create table test_table as \
    select target_chembl_id, doc_chembl_id, COUNT(*) \
    from temp_acts_counts \
    group by target_chembl_id, doc_chembl_id \
    having COUNT(*) > 1;

# load test_table into a pandas dataframe
test_table_df_pre = %sql select * from test_table
test_table_df = test_table_df_pre.DataFrame()
print(f'The shape of the test_table_df is {test_table_df.shape}')
test_table_df

#### temp_targets_counts

In [None]:
# count the number of unique assays, and sum the per-assay compound counts, for each target
# requires the table temp_acts_counts to exist already
print('temp_targets_counts')
%sql \
    drop table if exists temp_targets_counts
%sql \
    create table temp_targets_counts as \
    select tid, target_chembl_id, count(distinct assay_chembl_id) target_assay_count, sum(cnt) target_compound_count \
    from temp_acts_counts tac\
    group by target_chembl_id \
    order by target_assay_count desc;

# load temp_targets_counts into a pandas dataframe
temp_targets_counts_df_pre = %sql select * from temp_targets_counts
temp_targets_counts_df = temp_targets_counts_df_pre.DataFrame()
print(f'The shape of the temp_targets_counts_df is {temp_targets_counts_df.shape}')
temp_targets_counts_df

### Process the assay info

In [9]:
def append_assay_info(readout = 'Ki',uniprot_id = gpcr_uniprot_ids_list,
                      onlyDocs=True,removeMutants=True,confidenceScore=8,maxAssaySize=100):
    '''
    Append the assay information to the output of function gather_data_for_size

    Runs gather_data_for_size with the given settings (which rebuilds the temp_* tables),
    then selects one row per matching activity together with the assay annotation columns,
    the confidence score and the compound identifiers / SMILES.

    params:
        readout: Passed to gather_data_for_size as `standard_type` and used again as the
            `standard_type` filter of the final query.
        uniprot_id: Accessions forwarded to gather_data_for_size. The default is the value
            gpcr_uniprot_ids_list had when this function was defined.
        onlyDocs, removeMutants, confidenceScore, maxAssaySize: Forwarded unchanged to
            gather_data_for_size.

    Returns:
        The result object produced by the `%sql` select statement.
    '''
    
    gather_data_for_size(uniprot_id=uniprot_id,standard_type=readout,
                         onlyDocs=onlyDocs,removeMutants=removeMutants,confidenceScore=confidenceScore, maxAssaySize=maxAssaySize)

    print (f'raw_data for {readout}')
    
    # NOTE(review): nothing visible in this notebook creates a table called raw_data,
    # so this drop looks like a leftover — confirm before removing.
    %sql \
        drop table if exists raw_data
    # The joins on temp_targets_counts and temp_acts_counts restrict the rows to targets and
    # assays that survived the size filter applied in gather_data_for_size.
    raw_data = %sql \
        select assay_id, ta_1.assay_chembl_id, ta_1.tid, ta_1.target_chembl_id, standard_type, pchembl_value, \
        assay_type,assay_category,assay_organism, assay_tax_id, assay_strain, assay_tissue, assay_cell_type, assay_subcellular_fraction, bao_format, ta_1.variant_id, assay_test_type, \
        assay_desc, cell_id, tissue_id, curated_by, relationship_type, aidx, \
        ta_1.confidence_score, molregno, cid.chembl_id compound_chembl_id,canonical_smiles \
        from activities \
        join temp_assays_filtered ta_1 using (assay_id) \
        join temp_targets_counts using (tid) \
        join temp_acts_counts using (assay_id) \
        join assays using (assay_id) \
        join chembl_id_lookup cid on (entity_type='COMPOUND' and molregno=entity_id) \
        join compound_structures using (molregno) \
        where pchembl_value is not null \
        and standard_type=:readout;

    return raw_data
    
def hash_assay_info(raw_data, sel_cols:List[str]=['assay_type', 'assay_organism', 'assay_category',
                                                    'assay_tax_id','assay_strain','assay_tissue',
                                                    'assay_cell_type','assay_subcellular_fraction',
                                                    'bao_format', 'variant_id']):
    '''
    Attach an MD5 fingerprint of the selected assay-condition columns to every row.

    params:
        raw_data: Iterable of rows. Each row must be iterable over its values and
            support lookup by column name (row[col]).
        sel_cols: Names of the columns whose values make up the fingerprint. Their
            string forms are concatenated in this order, without separator, and hashed.
            The default covers:
            'assay_type','assay_organism','assay_category',
            'assay_tax_id','assay_strain','assay_tissue',
            'assay_cell_type','assay_subcellular_fraction','bao_format','variant_id'

    Returns:
        A list with one tuple per input row: all original values converted to str,
        followed by the hex digest as the last element. The number of rows is printed.
    '''

    def _with_hash(record):
        # every value is stringified, so None becomes the text 'None'
        as_text = [str(item) for item in record]
        fingerprint_source = ''.join(str(record[name]) for name in sel_cols)
        digest = hashlib.md5(fingerprint_source.encode()).hexdigest()
        return tuple(as_text + [digest])

    processed_data = [_with_hash(record) for record in raw_data]

    print(f'The length of the processed_data is {len(processed_data)}')

    return processed_data

#### IC50

In [10]:
# IC50, minimal curation: neither the document-year filter nor the mutant/variant filter is applied.
readout = 'IC50'
print(f'raw_data for {readout}_mincuration')
raw_data = append_assay_info(
    readout=readout,
    uniprot_id=gpcr_uniprot_ids_list,
    onlyDocs=False,
    removeMutants=False,
    confidenceScore=8,
)
ic50_mincur_8_data = hash_assay_info(raw_data)

raw_data for IC50_mincuration
table temp_targets
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_targets is (381, 6)
table temp_assays
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_assays is (9299, 13)
check onlyDocs
check removeMutants
check confidenceScore
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_assays_filtered is (9299, 14)
temp_acts
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j

In [11]:
# IC50, maximal curation: both the document-year filter and the mutant/variant filter are applied.
readout = 'IC50'
print(f'raw_data for {readout}_maxcuration')
raw_data = append_assay_info(
    readout=readout,
    uniprot_id=gpcr_uniprot_ids_list,
    onlyDocs=True,
    removeMutants=True,
    confidenceScore=8,
)
ic50_maxcur_8_data = hash_assay_info(raw_data)

raw_data for IC50_maxcuration
table temp_targets
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_targets is (381, 6)
table temp_assays
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_assays is (9299, 13)
check onlyDocs
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
124 rows affected.
check removeMutants
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
232 rows affected.
check confidenceScore
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/c

#### Ki

In [12]:
# Ki, minimal curation: neither the document-year filter nor the mutant/variant filter is applied.
readout = 'Ki'
print(f'raw_data for {readout}_mincuration')
raw_data = append_assay_info(
    readout=readout,
    uniprot_id=gpcr_uniprot_ids_list,
    onlyDocs=False,
    removeMutants=False,
    confidenceScore=8,
)
ki_mincur_8_data = hash_assay_info(raw_data)

raw_data for Ki_mincuration
table temp_targets
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_targets is (381, 6)
table temp_assays
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_assays is (14519, 13)
check onlyDocs
check removeMutants
check confidenceScore
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_assays_filtered is (14519, 14)
temp_acts
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j

In [13]:
# Ki, maximal curation: both the document-year filter and the mutant/variant filter are applied.
readout = 'Ki'
print(f'raw_data for {readout}_maxcuration')
raw_data = append_assay_info(
    readout=readout,
    uniprot_id=gpcr_uniprot_ids_list,
    onlyDocs=True,
    removeMutants=True,
    confidenceScore=8,
)
ki_maxcur_8_data = hash_assay_info(raw_data)

raw_data for Ki_maxcuration
table temp_targets
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_targets is (381, 6)
table temp_assays
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_assays is (14519, 13)
check onlyDocs
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
59 rows affected.
check removeMutants
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
181 rows affected.
check confidenceScore
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/che

#### EC50

In [14]:
# EC50, minimal curation: neither the document-year filter nor the mutant/variant filter is applied.
readout = 'EC50'
print(f'raw_data for {readout}_mincuration')
raw_data = append_assay_info(
    readout=readout,
    uniprot_id=gpcr_uniprot_ids_list,
    onlyDocs=False,
    removeMutants=False,
    confidenceScore=8,
)
ec50_mincur_8_data = hash_assay_info(raw_data)

raw_data for EC50_mincuration
table temp_targets
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_targets is (381, 6)
table temp_assays
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_assays is (6008, 13)
check onlyDocs
check removeMutants
check confidenceScore
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_assays_filtered is (6008, 14)
temp_acts
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j

In [15]:
# EC50, maximal curation: both the document-year filter and the mutant/variant filter are applied.
readout = 'EC50'
print(f'raw_data for {readout}_maxcuration')
raw_data = append_assay_info(
    readout=readout,
    uniprot_id=gpcr_uniprot_ids_list,
    onlyDocs=True,
    removeMutants=True,
    confidenceScore=8,
)
ec50_maxcur_8_data = hash_assay_info(raw_data)

raw_data for EC50_maxcuration
table temp_targets
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_targets is (381, 6)
table temp_assays
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
The shape of temp_assays is (6008, 13)
check onlyDocs
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
67 rows affected.
check removeMutants
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
209 rows affected.
check confidenceScore
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/ch

#### plot the datasets counts

In [None]:
def count_assay_cpd(mincur_data, maxcur_data):
    """
    Count unique assays and data rows per dataset for minimally and maximally curated data.

    Both inputs are sequences of row tuples as produced by hash_assay_info: element 1 is
    the assay ChEMBL id, element 3 the target ChEMBL id, and the last element the
    assay-condition hash.

    For the minimally curated data a dataset is a target; for the maximally curated data
    a dataset is a target combined with its assay-condition hash, keyed as
    '<target>|<hash>'.

    params:
        mincur_data: Rows of the minimally curated data.
        maxcur_data: Rows of the maximally curated data.

    Returns:
        (mincur_assay_count, mincur_cpd_count, maxcur_assay_count, maxcur_cpd_count),
        four Counters keyed by dataset. The assay counters hold the number of distinct
        assays; the cpd counters hold the number of rows.
    """

    def _tally(rows, with_conditions):
        # Count distinct assays and rows per dataset key.
        assay_count = Counter()
        cpd_count = Counter()
        seen_assays = set()
        for row in rows:
            key = row[3]
            if with_conditions:
                # The hash is appended last by hash_assay_info; index 26 would be canonical_smiles.
                key = key + "|" + row[-1]
            assay = (key, row[1])
            # A seen-set keeps the count correct even when rows of one assay are not adjacent.
            if assay not in seen_assays:
                seen_assays.add(assay)
                assay_count[key] += 1
            cpd_count[key] += 1
        return assay_count, cpd_count

    mincur_assay_count, mincur_cpd_count = _tally(mincur_data, with_conditions=False)
    maxcur_assay_count, maxcur_cpd_count = _tally(maxcur_data, with_conditions=True)

    return mincur_assay_count, mincur_cpd_count, maxcur_assay_count, maxcur_cpd_count

In [None]:
# Per-readout assay and row counts for the minimally and maximally curated data.
(mincur_assay_count_ic50, mincur_cpd_count_ic50,
 maxcur_assay_count_ic50, maxcur_cpd_count_ic50) = count_assay_cpd(ic50_mincur_8_data, ic50_maxcur_8_data)
(mincur_assay_count_ki, mincur_cpd_count_ki,
 maxcur_assay_count_ki, maxcur_cpd_count_ki) = count_assay_cpd(ki_mincur_8_data, ki_maxcur_8_data)
(mincur_assay_count_ec50, mincur_cpd_count_ec50,
 maxcur_assay_count_ec50, maxcur_cpd_count_ec50) = count_assay_cpd(ec50_mincur_8_data, ec50_maxcur_8_data)

In [None]:
def plot_dataset_size(mincur_cpd_count_ic50, maxcur_cpd_count_ic50,mincur_cpd_count_ki, maxcur_cpd_count_ki,mincur_cpd_count_ec50, maxcur_cpd_count_ec50,
                      ylabel:str='num_cpds'):
    """
    Draw dataset sizes, largest first, for minimal (left) and maximal (right) curation.

    Each argument is a mapping whose values are the sizes to plot; one line per readout
    (IC50, Ki, EC50) is drawn in each panel. Axis limits are fixed to x in [-5, 100] and
    y in [0, 5500]. The figure is saved as 'combined_<ylabel>_sizes.pdf' in FETCH_FIG_DIR.

    params:
        ylabel: Label of the left panel's y axis; also part of the output file name.
    """

    def _descending(counts):
        # sizes ordered from the largest dataset to the smallest
        return sorted(counts.values(), reverse=True)

    panels = (
        ('minimal curation', 'data set (target) index',
         ((mincur_cpd_count_ic50, 'IC50'), (mincur_cpd_count_ki, 'Ki'), (mincur_cpd_count_ec50, 'EC50'))),
        ('maximal curation', 'data set (target+conditions) index',
         ((maxcur_cpd_count_ic50, 'IC50'), (maxcur_cpd_count_ki, 'Ki'), (maxcur_cpd_count_ec50, 'EC50'))),
    )

    plt.figure(figsize=(12,6))
    for position, (title, xlabel, series) in enumerate(panels, start=1):
        plt.subplot(1,2,position)
        for counts, label in series:
            plt.plot(_descending(counts),label=label)

        plt.legend()
        plt.xlabel(xlabel)
        plt.xlim(-5,100)
        # only the left panel carries the y-axis label
        if position == 1:
            plt.ylabel(ylabel)
        plt.title(title)
        plt.ylim(0,5500)
    plt.tight_layout()

    plt.savefig(f'{FETCH_FIG_DIR}/combined_{ylabel}_sizes.pdf',bbox_inches='tight')

In [None]:
# Dataset sizes measured as the number of rows per dataset.
plot_dataset_size(
    mincur_cpd_count_ic50, maxcur_cpd_count_ic50,
    mincur_cpd_count_ki, maxcur_cpd_count_ki,
    mincur_cpd_count_ec50, maxcur_cpd_count_ec50,
    ylabel='num_cpds',
)

In [None]:
# Dataset sizes measured as the number of assays per dataset.
plot_dataset_size(
    mincur_assay_count_ic50, maxcur_assay_count_ic50,
    mincur_assay_count_ki, maxcur_assay_count_ki,
    mincur_assay_count_ec50, maxcur_assay_count_ec50,
    ylabel='num_assays',
)

### Write data

#### mincur and maxcur data for IC50, Ki, and EC50

In [16]:
def write_data(data, filename:str, columns=['assay_id', 'assay_chembl_id', 'tid', 'target_chembl_id', 'standard_type', 'pchembl_value',
                                   'assay_type', 'assay_category', # checkpoint 1
                                   'assay_organism', 'assay_tax_id', 'assay_strain', 'assay_tissue', 'assay_cell_type', 'assay_subcellular_fraction', 'bao_format', 'variant_id', 'assay_test_type',
                                   'assay_desc', 'cell_id', 'tissue_id', 'curated_by', 'relationship_type', 'aidx',
                                   'confidence_score', 'molregno', 'compound_chembl_id', 'canonical_smiles', 'assay_info_hash']):
    """
    Turn `data` into a DataFrame, write it to '<FETCH_DATA_DIR>/<filename>.csv', and return it.

    params:
        data: Row records accepted by pd.DataFrame, one value per entry of `columns`.
        filename: Output file name without the '.csv' extension.
        columns: Column names for the DataFrame; the default matches the rows produced
            by hash_assay_info on the output of append_assay_info.

    Returns:
        The DataFrame that was written (the index is not included in the file).
    """

    csv_path = f'{FETCH_DATA_DIR}/{filename}.csv'

    data_df = pd.DataFrame(data, columns=columns)
    data_df.to_csv(csv_path,index=False)

    return data_df

In [17]:
# One CSV (and one DataFrame) per readout and curation level.
ic50_mincur_df = write_data(ic50_mincur_8_data, filename='ic50_mincur_8_data')
ic50_maxcur_df = write_data(ic50_maxcur_8_data, filename='ic50_maxcur_8_data')
ki_mincur_df = write_data(ki_mincur_8_data, filename='ki_mincur_8_data')
ki_maxcur_df = write_data(ki_maxcur_8_data, filename='ki_maxcur_8_data')
ec50_mincur_df = write_data(ec50_mincur_8_data, filename='ec50_mincur_8_data')
ec50_maxcur_df = write_data(ec50_maxcur_8_data, filename='ec50_maxcur_8_data')

#### OR data with unique 'assay_info_hash'

In [20]:
# only for dataset with maxcuration
def write_data_uniq_cond(df, min_dataset_size=100, readout='IC50', curation='maxcur'):
    """
    write the datasets with unique combinations of 'target_chembl_id' and 'assay_info_hash'
    """
    yaml=[]
    
    collected_data = defaultdict(list)
    assay_cond_accum = defaultdict(list)
    assay_accum = defaultdict(set)

    for i, row in df.iterrows():
        assay_info_hash = row['assay_info_hash']
        target_chembl_id = row['target_chembl_id']
        assay_chembl_id = row['assay_chembl_id']

        # append all columns to the collected_data
        collected_data[(target_chembl_id, assay_info_hash)].append(row.values)
        assay_cond_accum[target_chembl_id].append(assay_info_hash)
        assay_accum[(target_chembl_id,assay_info_hash)].add(assay_chembl_id)

    sets_to_keep = [k for k, v in collected_data.items() if len(v) >= min_dataset_size]
    for k in assay_cond_accum:
        assay_cond_accum[k] = [y for y in assay_cond_accum[k] if (k, y) in sets_to_keep]

    for target_chembl_id, conds in collected_data.keys():
        if (target_chembl_id, conds) not in sets_to_keep:
            continue
        rows = collected_data[(target_chembl_id, conds)]
        cols = np.array(rows).transpose()
        cond_idx = assay_cond_accum[target_chembl_id].index(conds)+1

        columns = df.columns
        df = pd.DataFrame(dict(zip(columns, cols)))
        fname = f'{readout.lower()}_target_{target_chembl_id}_{cond_idx}.csv'  
        df.to_csv(os.path.join(FETCH_DATA_DIR, f'{readout.lower()}_{curation}', fname), index=False)

        minAct = min(df.pchembl_value)
        maxAct = max(df.pchembl_value)
        medAct = np.median(pd.to_numeric(df.pchembl_value, errors='coerce'))
        actType = readout
        targetd = %sql \
        select * from target_dictionary  \
            where chembl_id=:target_chembl_id
        targetd = dict(targetd[0])    
        template=f'''  {target_chembl_id}-{cond_idx}:
        description: "Target {target_chembl_id}: {targetd['pref_name']}"
        args:
        filename: '{{{{ CATALOG_DIR }}}}/source_data/{fname}'
        smilesColumn: canonical_smiles
        metadata:
        url: https://www.ebi.ac.uk/chembl/target_report_card/{target_chembl_id}/
        source: ChEMBL_32
        target_organism: {targetd['organism']}
        assays_included: {','.join(assay_accum[(target_chembl_id,conds)])}
        num_points: {len(rows)}
        activity_type: {actType}
        min pchembl_value: {float(minAct):.2f}
        max pchembl_value: {float(maxAct):.2f}
        median pchembl_value: {float(medAct):.2f}
        driver: intake_rdkit.smiles.SmilesSource
        '''
        yaml.append(template)
    print('\n'.join(yaml))
    with open(os.path.join(FETCH_DATA_DIR, f'{readout.lower()}_{curation}', f'{readout.lower()}_{curation}_datasets.yaml'),'w+') as outf:
        header=f'''metadata:

    summary: |
        Collection of {readout} datasets with pchembl_values for bioactivity prediction.
        
    sources:'''
        print(header,file=outf)
        print('\n'.join(yaml),file=outf)

In [21]:
# Per-condition datasets for the maxcur frames of each readout
write_data_uniq_cond(df=ic50_maxcur_df, readout='IC50', curation='maxcur', min_dataset_size=100)
write_data_uniq_cond(df=ki_maxcur_df, readout='Ki', curation='maxcur', min_dataset_size=100)
write_data_uniq_cond(df=ec50_maxcur_df, readout='EC50', curation='maxcur', min_dataset_size=100)

 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34.db
Done.
 * sqlite:////storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/chembl_34_sqlite/chembl_34

### Check the HHD dataset

#### mor_ki

In [5]:
# load the csv files with the filename containing 'CHEMBL233' as separate dataframes
# Ki dataset files of target CHEMBL233. The trailing '_' in the prefix keeps longer ids
# (e.g. CHEMBL2330) out, and sorting makes the listing reproducible because os.listdir
# returns entries in arbitrary order.
mor_ki_dir = os.path.join(FETCH_DATA_DIR, 'ki_maxcur')
mor_ki_files = sorted(f for f in os.listdir(mor_ki_dir) if f.startswith('ki_target_CHEMBL233_'))
print(f'mor_ki_files contains {mor_ki_files}')
# Load the file by name so the frame always matches its variable name, whatever the listing order
ki_target_CHEMBL233_38_df = pd.read_csv(os.path.join(mor_ki_dir, 'ki_target_CHEMBL233_38.csv'))

mor_ki_files contains ['ki_target_CHEMBL233_38.csv', 'ki_target_CHEMBL233_1.csv', 'ki_target_CHEMBL233_639.csv', 'ki_target_CHEMBL233_308.csv', 'ki_target_CHEMBL233_1022.csv', 'ki_target_CHEMBL233_36.csv', 'ki_target_CHEMBL233_1632.csv', 'ki_target_CHEMBL233_2942.csv', 'ki_target_CHEMBL233_379.csv']


In [6]:
# Show the loaded dataset (pandas abbreviates long/wide frames in the rendered output)
ki_target_CHEMBL233_38_df

Unnamed: 0,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,pchembl_value,assay_type,assay_category,assay_organism,assay_tax_id,...,cell_id,tissue_id,curated_by,relationship_type,aidx,confidence_score,molregno,compound_chembl_id,canonical_smiles,assay_info_hash
0,149324,CHEMBL882430,129,CHEMBL233,Ki,7.14,B,,,,...,,,Autocuration,H,CLD0,8,103574,CHEMBL302801,CCN(CC)C(=O)c1ccc(/C(=C2\C[C@H]3CC[C@@H](C2)N3...,22cc1cdcf2fec4da1aa585de2827efd6
1,222067,CHEMBL843459,129,CHEMBL233,Ki,8.85,B,,,,...,,,Autocuration,H,CLD0,8,609120,CHEMBL610046,O=C1[C@@H](Cc2ccccc2)C[C@@]2(O)[C@H]3Cc4ccc(O)...,22cc1cdcf2fec4da1aa585de2827efd6
2,222067,CHEMBL843459,129,CHEMBL233,Ki,8.72,B,,,,...,,,Autocuration,H,CLD0,8,164404,CHEMBL101519,O=C1/C(=C\c2ccccc2)C[C@@]2(O)[C@H]3Cc4ccc(O)c5...,22cc1cdcf2fec4da1aa585de2827efd6
3,222067,CHEMBL843459,129,CHEMBL233,Ki,9.72,B,,,,...,,,Autocuration,H,CLD0,8,609125,CHEMBL610550,Oc1ccc2c3c1O[C@H]1[C@@H](Nc4ccccc4)CC[C@@]4(O)...,22cc1cdcf2fec4da1aa585de2827efd6
4,222067,CHEMBL843459,129,CHEMBL233,Ki,7.88,B,,,,...,,,Autocuration,H,CLD0,8,418977,CHEMBL252172,O=C1/C(=C/c2ccccc2)C[C@@]2(O)[C@H]3Cc4ccc(O)c5...,22cc1cdcf2fec4da1aa585de2827efd6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,884080,CHEMBL2215556,129,CHEMBL233,Ki,5.32,B,,,,...,,,Autocuration,H,CLD0,8,1445457,CHEMBL2205829,OC12C3C4CC5C6C4C1C6C(C53)N2Cc1ccc2c(c1)OCO2,22cc1cdcf2fec4da1aa585de2827efd6
258,884080,CHEMBL2215556,129,CHEMBL233,Ki,5.90,B,,,,...,,,Autocuration,H,CLD0,8,1445455,CHEMBL2205827,COc1ccc(CN2C3C4C5CC6C7C5C3C7C2(O)C64)cc1OC,22cc1cdcf2fec4da1aa585de2827efd6
259,884080,CHEMBL2215556,129,CHEMBL233,Ki,5.67,B,,,,...,,,Autocuration,H,CLD0,8,1445453,CHEMBL2205825,COc1ccc(CN2C3C4C5CC6C7C5C3C7C2(O)C64)cc1,22cc1cdcf2fec4da1aa585de2827efd6
260,884080,CHEMBL2215556,129,CHEMBL233,Ki,5.63,B,,,,...,,,Autocuration,H,CLD0,8,1445452,CHEMBL2205824,COc1cccc(CN2C3C4C5CC6C7C5C3C7C2(O)C64)c1,22cc1cdcf2fec4da1aa585de2827efd6


In [7]:
# Number of rows per distinct assay description within this single dataset
ki_target_CHEMBL233_38_df['assay_desc'].value_counts()

Binding affinity for mu opioid receptor                                                                                70
Binding affinity to mu opioid receptor                                                                                 27
Inhibition of binding of the non-selective opioid antagonist, [3H]diprenorphine, to cloned human mu opioid receptor    27
Binding affinity towards opioid receptor mu was determined                                                             14
Displacement of [125]OXY from mu opioid receptor                                                                       13
Binding affinity determined against Opioid receptor mu 1 from human cloned receptor                                    11
Binding affinity against Opioid receptor mu 1                                                                           8
Binding affinity towards mu opioid receptor was determined                                                              8
Inhibition of [3H]DAMGO 

In [9]:
# Rows whose assay description equals the [3H]DAMGO binding text exactly
ki_target_CHEMBL233_38_df.loc[
    ki_target_CHEMBL233_38_df['assay_desc'].eq('Inhibition of [3H]DAMGO binding to mu-opioid receptor')
]

Unnamed: 0,assay_id,assay_chembl_id,tid,target_chembl_id,standard_type,pchembl_value,assay_type,assay_category,assay_organism,assay_tax_id,...,cell_id,tissue_id,curated_by,relationship_type,aidx,confidence_score,molregno,compound_chembl_id,canonical_smiles,assay_info_hash
193,302547,CHEMBL827398,129,CHEMBL233,Ki,6.04,B,,,,...,,,Expert,H,CLD0,8,547402,CHEMBL1701,CCOC(=O)C1(c2ccccc2)CCN(C)CC1.Cl,22cc1cdcf2fec4da1aa585de2827efd6
196,302547,CHEMBL827398,129,CHEMBL233,Ki,4.39,B,,,,...,,,Expert,H,CLD0,8,548924,CHEMBL536034,Cc1ccc(C2(C#N)CCN(C)CC2)cc1.Cl,22cc1cdcf2fec4da1aa585de2827efd6
197,302547,CHEMBL827398,129,CHEMBL233,Ki,4.4,B,,,,...,,,Expert,H,CLD0,8,548922,CHEMBL557576,CN1CCC(C#N)(c2ccc(Cl)c(Cl)c2)CC1.Cl,22cc1cdcf2fec4da1aa585de2827efd6
199,302547,CHEMBL827398,129,CHEMBL233,Ki,5.36,B,,,,...,,,Expert,H,CLD0,8,552826,CHEMBL539076,CCOC(=O)C1(c2ccc(Cl)cc2)CCN(C)CC1.Cl,22cc1cdcf2fec4da1aa585de2827efd6
200,302547,CHEMBL827398,129,CHEMBL233,Ki,5.63,B,,,,...,,,Expert,H,CLD0,8,548931,CHEMBL536264,CCOC(=O)C1(c2ccc(I)cc2)CCN(C)CC1.Cl,22cc1cdcf2fec4da1aa585de2827efd6
201,302547,CHEMBL827398,129,CHEMBL233,Ki,5.57,B,,,,...,,,Expert,H,CLD0,8,552824,CHEMBL539075,CCOC(=O)C1(c2ccc(C)cc2)CCN(C)CC1.Cl,22cc1cdcf2fec4da1aa585de2827efd6
202,302547,CHEMBL827398,129,CHEMBL233,Ki,5.69,B,,,,...,,,Expert,H,CLD0,8,552960,CHEMBL541124,CCOC(=O)C1(c2ccc(Cl)c(Cl)c2)CCN(C)CC1.Cl,22cc1cdcf2fec4da1aa585de2827efd6
203,302547,CHEMBL827398,129,CHEMBL233,Ki,5.69,B,,,,...,,,Expert,H,CLD0,8,548948,CHEMBL536952,CCOC(=O)C1(c2ccc3ccccc3c2)CCN(C)CC1.Cl,22cc1cdcf2fec4da1aa585de2827efd6


In [None]:


# IC50
# EC50

# Tmap

### tmap functions

In [None]:
def calc_fp_in_df(df, radius=2, nbits=1024):
    '''
    Calculate Morgan fingerprints from the 'canonical_smiles' column of the dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'canonical_smiles' column. The frame itself is not modified.
    radius : int
        Radius passed to the Morgan fingerprint.
    nbits : int
        Length of the fingerprint bit vector.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with two extra columns: 'mol' (RDKit molecule) and 'fp' (Morgan
        fingerprint bit vector). Rows whose SMILES is missing or cannot be parsed are
        dropped, and the number of dropped rows is printed.
    '''
    new_df = df.copy()

    # MolFromSmiles returns None for unparsable SMILES; missing values (None/NaN) are
    # mapped to None as well so they can be reported instead of raising.
    new_df['mol'] = new_df['canonical_smiles'].apply(
        lambda smi: Chem.MolFromSmiles(smi) if isinstance(smi, str) else None)

    unparsed = new_df['mol'].isna()
    if unparsed.any():
        print(f'Dropping {int(unparsed.sum())} rows with missing or unparsable SMILES')
        new_df = new_df[~unparsed].copy()

    new_df['fp'] = new_df['mol'].apply(
        lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, radius, nbits))

    print(f'The shape of df is {new_df.shape}')

    return new_df

def get_activity(x):
    '''
    Bin a pchembl_value into an activity class.

    Returns 'active' for x > 7, 'intermediate' for 5 < x <= 7 and 'inactive' for
    everything else (which includes NaN, because every comparison with NaN is False).
    '''
    if x > 7:
        return 'active'
    # x > 7 is already ruled out here, so x > 5 means 5 < x <= 7
    if x > 5:
        return 'intermediate'
    return 'inactive'

def tmap_plot(df, title:str,
              node_size:float=1/32, mmm_repeats:int=2, steps:int=5, k:int=1000,
              shader:str='smoothCircle',  point_scale:float=2.5, max_point_size:int=10):
    '''
    Determine the TMAP layout for the fingerprints in `df` and plot it with Faerun.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'fp', 'activity', 'canonical_smiles', 'tid',
        'assay_id' and 'assay_tax_id'.
    title : str
        Name of the scatter series and of the plot passed to `Faerun.plot`.
    node_size, mmm_repeats, steps, k :
        Layout settings copied to `tm.LayoutConfiguration` (`steps` is stored as
        `sl_extra_scaling_steps`).
    shader, point_scale, max_point_size :
        Rendering options forwarded to `Faerun.add_scatter`.

    Returns
    -------
    None
    '''
    # NOTE(review): `tm` and `Faerun` are not imported in the notebook's first import
    # cell; presumably `import tmap as tm` / `from faerun import Faerun` run elsewhere. Confirm.
    # NOTE(review): the fingerprints are added to the forest as raw bit vectors; confirm
    # that this matches the dimension (512) the forest is created with.
    lf = tm.LSHForest(512, 32)  # LSH forest used for the approximate nearest-neighbour search
    fps = [tm.VectorUint(fp) for fp in np.array(df['fp'])]
    lf.batch_add(fps)
    lf.index()

    cfg = tm.LayoutConfiguration()  # configuration parameters for the TMAP layout
    cfg.node_size = node_size  # size of nodes, which affects the magnitude of their repelling force
    cfg.mmm_repeats = mmm_repeats  # number of repeats of the per-level layout algorithm
    cfg.sl_extra_scaling_steps = steps  # number of repeats of the scaling
    cfg.k = k  # number of nearest neighbours used to create the k-nearest-neighbour graph
    cfg.sl_scaling_type = tm.RelativeToAvgLength  # defines the relative scale of the graph

    # The edge endpoints get their own names so the label loop below cannot overwrite
    # them before they are handed to add_tree.
    x, y, edge_from, edge_to, _ = tm.layout_from_lsh_forest(lf, cfg)

    # categorical colouring by activity class
    category_labels, category_data = Faerun.create_categories(df['activity'])

    f = Faerun(
        view="front",
        coords=False,
        title="",
        clear_color="#FFFFFF"
    )

    # Convert each column once instead of once per row
    tids = df['tid'].values.tolist()
    assay_ids = df['assay_id'].values.tolist()
    assay_tax_ids = df['assay_tax_id'].values.tolist()

    # widget labels: SMILES__tid__assay_id__assay_tax_id
    labels = [
        smiles + "__" + str(tid) + "__" + str(assay_id) + "__" + str(assay_tax_id)
        for smiles, tid, assay_id, assay_tax_id
        in zip(df['canonical_smiles'], tids, assay_ids, assay_tax_ids)
    ]

    f.add_scatter(
        title,
        {
            "x": x,
            "y": y,
            "c": [category_data,   # activity categories
                  tids,            # target id
                  assay_ids,       # assay id
                  assay_tax_ids,   # assay tax id
                  ],
            "labels": labels,
        },
        point_scale=point_scale,
        max_point_size=max_point_size,
        shader=shader,
        legend_labels=[category_labels, None, None, None],
        categorical=[True, False, False, False],
        colormap=['Set1', 'rainbow', 'Set1', 'rainbow' ],
        series_title=['activity', 'tid', 'assay id', 'assay tax id'],
        has_legend=True,
    )
    f.add_tree('Tree', {"from": edge_from, "to": edge_to}, point_helper=title)
    f.plot(title, template='smiles')

In [None]:
# Keep only the IC50 (mincur) rows recorded for target CHEMBL233
mor_ic50_mincur_df = ic50_mincur_df.loc[ic50_mincur_df['target_chembl_id'].eq('CHEMBL233')]

In [None]:
# NOTE(review): `new_df` is not assigned at notebook level in the cells visible here (it is
# only a local name inside calc_fp_in_df), and tmap_plot reads the 'fp' and 'activity'
# columns. Confirm which frame is meant and that it is prepared before this cell runs.
tmap_plot(new_df, title='tmap_gpcr_ki_nodesize50', node_size=1/50)