In [1]:
# conda env: pyg(Python 3.9.16)
import os
import pandas as pd

import seaborn as sns
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from datacat4ml.utils import mkdirs
from datacat4ml.const import FETCH_DATA_DIR, HET_OR_DIR, HET_GPCR_DIR, CAT_OR_DIR, CAT_GPCR_DIR, CAT_FIG_DIR
from datacat4ml.const import OR_names, OR_chemblids,OR_name_chemblids
from datacat4ml.Scripts.data_prep.data_categorize.categorize_regex import categorize_GPCRs, categorize_ORs, OR_dfs, GPCR_dfs, ki_gpcr_df

  ki_gpcr_df = pd.read_csv(os.path.join(FETCH_DATA_DIR, 'Ki_gpcr_maxcur_8_data.csv'))
  ic50_gpcr_df = pd.read_csv(os.path.join(FETCH_DATA_DIR, 'IC50_gpcr_maxcur_8_data.csv'))
  ec50_gpcr_df = pd.read_csv(os.path.join(FETCH_DATA_DIR, 'EC50_gpcr_maxcur_8_data.csv'))


In [2]:
# Show the column names of the Ki GPCR table.
ki_gpcr_df.columns

Index(['assay_id', 'assay_chembl_id', 'tid', 'target_chembl_id',
       'standard_type', 'standard_relation', 'standard_value',
       'standard_units', 'pchembl_value', 'assay_type', 'assay_category',
       'assay_organism', 'assay_tax_id', 'assay_strain', 'assay_tissue',
       'assay_cell_type', 'assay_subcellular_fraction', 'bao_format',
       'variant_id', 'assay_test_type', 'assay_desc', 'cell_id', 'tissue_id',
       'curated_by', 'relationship_type', 'aidx', 'confidence_score',
       'molregno', 'compound_chembl_id', 'canonical_smiles',
       'assay_info_hash'],
      dtype='object')

# het_datasets: ORs

In [3]:
# Write one CSV per opioid receptor and standard type to
#   <HET_OR_DIR>/<chembl_id>/<standard_type>/<chembl_id>_<standard_type>.csv
for name in OR_names:
    chembl_id, or_df = OR_name_chemblids[name], OR_dfs[name]
    for standard_type in ('Ki', 'IC50', 'EC50'):
        # keep only the rows reported with this standard type
        is_type = or_df['standard_type'] == standard_type
        or_type_df = or_df[is_type]

        file_path = os.path.join(HET_OR_DIR, chembl_id, standard_type)
        mkdirs(file_path)

        csv_name = f'{chembl_id}_{standard_type}.csv'
        or_type_df.to_csv(os.path.join(file_path, csv_name), index=False)

# het_gpcr_datasets: All GPCRs

In [4]:
# Read the Ki, IC50 and EC50 GPCR tables from FETCH_DATA_DIR (in that order).
_gpcr_csv_names = (
    'Ki_gpcr_maxcur_8_data.csv',
    'IC50_gpcr_maxcur_8_data.csv',
    'EC50_gpcr_maxcur_8_data.csv',
)
ki_gpcr_df, ic50_gpcr_df, ec50_gpcr_df = (
    pd.read_csv(os.path.join(FETCH_DATA_DIR, csv_name))
    for csv_name in _gpcr_csv_names
)

  ki_gpcr_df = pd.read_csv(os.path.join(FETCH_DATA_DIR, 'Ki_gpcr_maxcur_8_data.csv'))
  ic50_gpcr_df = pd.read_csv(os.path.join(FETCH_DATA_DIR, 'IC50_gpcr_maxcur_8_data.csv'))
  ec50_gpcr_df = pd.read_csv(os.path.join(FETCH_DATA_DIR, 'EC50_gpcr_maxcur_8_data.csv'))


In [5]:
# Split each GPCR table by target and write one CSV per (target, standard type) to
#   <HET_GPCR_DIR>/<target_chembl_id>/<standard_type>/<target_chembl_id>_<standard_type>.csv
#
# One nested loop replaces three copy-pasted loops that differed only in the
# standard type. groupby splits each table in a single pass instead of
# re-filtering the whole frame for every target, and the output directory is
# built once and reused for both mkdirs and to_csv.
gpcr_dfs_by_type = {'Ki': ki_gpcr_df, 'IC50': ic50_gpcr_df, 'EC50': ec50_gpcr_df}

for standard_type, gpcr_df in gpcr_dfs_by_type.items():
    # sort=False keeps the targets in order of first appearance, like .unique()
    for target_chembl_id, target_df in gpcr_df.groupby('target_chembl_id', sort=False):
        file_path = os.path.join(HET_GPCR_DIR, target_chembl_id, standard_type)
        mkdirs(file_path)
        target_df.to_csv(os.path.join(file_path, f'{target_chembl_id}_{standard_type}.csv'), index=False)


# cat_datasets (ORs) & cat_gpcr_datasets

## Radio
Radioligand displacement binding assay (RBA)

### binding affinity: Ki, IC50

In [3]:
# Inclusion / exclusion regexes for the binding (RBA) category. They are passed
# as `pattern` / `pattern_ex` to categorize_ORs and categorize_GPCRs, which
# presumably keep rows that match the first and do not match the second
# (the matching itself lives in categorize_regex - confirm there).
p_bind_RBA = r"(?i)affinity|displacement|3H|125I"
# "+" is escaped so that the alternative matches the literal text "Ca2+".
# Unescaped it is a quantifier, and "ca2+" would match any "ca2" substring.
p_bind_RBA_ex = r"(?i)camp|gtp|calcium|ca2\+|IP1|IP3|arrest|agonis"

# cat_datasets: ORs -- binding data (Ki, IC50) from radioligand binding assays
(
    or_type_dfs,
    or_bind_type_dfs,
    or_bind_plus_dfs,
    or_bind_exclude_dfs,
    or_bind_final_dfs,
    or_bind_len_dfs,
) = categorize_ORs(
    targets='ORs',
    effect='bind',
    assay='RBA',
    std_types=['Ki', 'IC50'],
    pattern=p_bind_RBA,
    pattern_ex=p_bind_RBA_ex,
)

target_chembl_id: CHEMBL233

Target: CHEMBL233

Effect: bind

Assay: RBA

Pattern: (?i)affinity|displacement|3H|125I

Pattern_ex: (?i)camp|gtp|calcium|ca2+|IP1|IP3|arrest|agonis

Standard type: Ki

The shape of type_df is (5682, 31)

The shape of  is (5418, 31)
#assay_desc:
count                                                  5418
unique                                                  374
top       Displacement of [3H]-diprenorphine from human ...
freq                                                    207
Name: assay_desc, dtype: object

#canonical_smiles:
count                                                  5418
unique                                                 4794
top       C[C@@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)NCC(=...
freq                                                     39
Name: canonical_smiles, dtype: object

The shape of plus_df is (139, 31)

The shape of exclude_df is (0, 31)

The shape of final_df is (5557, 31)

##########################
Standard type: IC50

In [4]:
# cat_datasets: GPCRs -- binding data (Ki, IC50) from radioligand binding assays
gpcr_type_dfs, gpcr_bind_type_dfs, gpcr_bind_len_dfs = categorize_GPCRs(
    targets='GPCRs',
    effect='bind',
    assay='RBA',
    std_types=['Ki', 'IC50'],
    pattern=p_bind_RBA,
    pattern_ex=p_bind_RBA_ex,
)

Target: CHEMBL2327

Effect: bind

Assay: RBA

Pattern: (?i)affinity|displacement|3H|125I

Pattern_ex: (?i)camp|gtp|calcium|ca2+|IP1|IP3|arrest|agonis

Standard type: Ki

The shape of type_df is (873, 31)

The shape of  is (840, 31)
#assay_desc:
count                                                   840
unique                                                   56
top       Binding affinity for human tachykinin receptor...
freq                                                     86
Name: assay_desc, dtype: object

#canonical_smiles:
count                                                   840
unique                                                  792
top       O=C1CCC(=O)N[C@@H](Cc2c[nH]c3ccccc23)C(=O)N[C@...
freq                                                      5
Name: canonical_smiles, dtype: object

##########################
Standard type: IC50

The shape of type_df is (290, 31)

The shape of  is (214, 31)
#assay_desc:
count                                                   214
u

## G_GTP

### Agonism: EC50

In [5]:
# G-protein dependent functional assay: GTPgamma binding.
# Inclusion / exclusion regexes for agonism data from this assay.
p_agon_G_GTP = r"(?i)gtp"
p_agon_G_GTP_ex = r"(?i)arrestin|camp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric"

# cat_datasets: ORs -- agonism data (EC50)
(
    or_type_dfs,
    or_agon_G_GTP_dfs,
    or_agon_G_GTP_plus_dfs,
    or_agon_G_GTP_exclude_dfs,
    or_agon_G_GTP_final_dfs,
    or_agon_G_GTP_len_dfs,
) = categorize_ORs(
    targets='ORs',
    effect='agon',
    assay='G_GTP',
    std_types=['EC50'],
    pattern=p_agon_G_GTP,
    pattern_ex=p_agon_G_GTP_ex,
)


target_chembl_id: CHEMBL233

Target: CHEMBL233

Effect: agon

Assay: G_GTP

Pattern: (?i)gtp

Pattern_ex: (?i)arrestin|camp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric

Standard type: EC50

The shape of type_df is (2296, 31)

The shape of  is (1067, 31)
#assay_desc:
count                                                  1067
unique                                                  123
top       [35S]GTPγS Functional Assay: [35S]GTPγS functi...
freq                                                     64
Name: assay_desc, dtype: object

#canonical_smiles:
count                                                  1067
unique                                                  955
top       C[C@@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)NCC(=...
freq                                                     33
Name: canonical_smiles, dtype: object

The shape of plus_df is (80, 31)

The shape of exclude_df is (23, 31)

The shape of final_df is (1124, 31)

##########################
target_chembl_id: 

In [6]:
# cat_datasets: GPCRs -- agonism data (EC50) from the GTP assay
gpcr_type_dfs, gpcr_agon_G_GTP_dfs, gpcr_agon_G_GTP_len_dfs = categorize_GPCRs(
    targets='GPCRs',
    effect='agon',
    assay='G_GTP',
    std_types=['EC50'],
    pattern=p_agon_G_GTP,
    pattern_ex=p_agon_G_GTP_ex,
)

Target: CHEMBL2327

Effect: agon

Assay: G_GTP

Pattern: (?i)gtp

Pattern_ex: (?i)arrestin|camp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric

Standard type: EC50

The shape of type_df is (5, 31)

The shape of  is (0, 31)
#assay_desc:
count       0
unique      0
top       NaN
freq      NaN
Name: assay_desc, dtype: object

#canonical_smiles:
count       0
unique      0
top       NaN
freq      NaN
Name: canonical_smiles, dtype: object

##########################
Target: CHEMBL3713916

Effect: agon

Assay: G_GTP

Pattern: (?i)gtp

Pattern_ex: (?i)arrestin|camp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric

Standard type: EC50

The shape of type_df is (2, 31)

The shape of  is (0, 31)
#assay_desc:
count       0
unique      0
top       NaN
freq      NaN
Name: assay_desc, dtype: object

#canonical_smiles:
count       0
unique      0
top       NaN
freq      NaN
Name: canonical_smiles, dtype: object

##########################
Target: CHEMBL5144

Effect: agon

Assay: G_GTP

Patte

### Antagonism: IC50, Ki, Ke, Kb

In [7]:
# GTPgammaS binding assay.
# Inclusion / exclusion regexes for antagonism data from this assay.
p_antag_G_GTP = r"(?i)gtp"
p_antag_G_GTP_ex = r"(?i)arrestin|camp|calcium|IP1|IP3|allosteric"

# cat_datasets: ORs -- antagonism data (IC50, Ki)
(
    or_type_dfs,
    or_antag_G_GTP_dfs,
    or_antag_G_GTP_plus_dfs,
    or_antag_G_GTP_exclude_dfs,
    or_antag_G_GTP_final_dfs,
    or_antag_G_GTP_len_dfs,
) = categorize_ORs(
    targets='ORs',
    effect='antag',
    assay='G_GTP',
    std_types=['IC50', 'Ki'],
    pattern=p_antag_G_GTP,
    pattern_ex=p_antag_G_GTP_ex,
)

target_chembl_id: CHEMBL233

Target: CHEMBL233

Effect: antag

Assay: G_GTP

Pattern: (?i)gtp

Pattern_ex: (?i)arrestin|camp|calcium|IP1|IP3|allosteric

Standard type: IC50

The shape of type_df is (1366, 31)

The shape of  is (440, 31)
#assay_desc:
count                                                   440
unique                                                   43
top       Antagonist activity assessed as inhibition of ...
freq                                                     56
Name: assay_desc, dtype: object

#canonical_smiles:
count                                                   440
unique                                                  396
top       C[C@@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)NCC(=...
freq                                                      7
Name: canonical_smiles, dtype: object

The shape of plus_df is (0, 31)

The shape of exclude_df is (63, 31)

The shape of final_df is (377, 31)

##########################
Standard type: Ki

The shape of type_df is (56

In [8]:
# cat_datasets: GPCRs -- antagonism data (IC50, Ki) from the GTP assay
gpcr_type_dfs, gpcr_antag_G_GTP_dfs, gpcr_antag_G_GTP_len_dfs = categorize_GPCRs(
    targets='GPCRs',
    effect='antag',
    assay='G_GTP',
    std_types=['IC50', 'Ki'],
    pattern=p_antag_G_GTP,
    pattern_ex=p_antag_G_GTP_ex,
)

Target: CHEMBL2327

Effect: antag

Assay: G_GTP

Pattern: (?i)gtp

Pattern_ex: (?i)arrestin|camp|calcium|IP1|IP3|allosteric

Standard type: IC50

The shape of type_df is (290, 31)

The shape of  is (0, 31)
#assay_desc:
count       0
unique      0
top       NaN
freq      NaN
Name: assay_desc, dtype: object

#canonical_smiles:
count       0
unique      0
top       NaN
freq      NaN
Name: canonical_smiles, dtype: object

##########################
Standard type: Ki

The shape of type_df is (873, 31)

The shape of  is (0, 31)
#assay_desc:
count       0
unique      0
top       NaN
freq      NaN
Name: assay_desc, dtype: object

#canonical_smiles:
count       0
unique      0
top       NaN
freq      NaN
Name: canonical_smiles, dtype: object

##########################
Target: CHEMBL3713916

Effect: antag

Assay: G_GTP

Pattern: (?i)gtp

Pattern_ex: (?i)arrestin|camp|calcium|IP1|IP3|allosteric

Standard type: IC50

The shape of type_df is (2, 31)

The shape of  is (0, 31)
#assay_desc:
count    

## G_cAMP

### Agonism: IC50, EC50

In [9]:
# cAMP accumulation assay.
# Inclusion / exclusion regexes for agonism data from this assay.
p_ago_G_cAMP = r"(?i)camp"
p_ago_G_cAMP_ex = r"(?i)arrestin|gtp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric"

# cat_datasets: ORs -- agonism data (IC50, EC50)
(
    or_type_dfs,
    or_agon_G_cAMP_dfs,
    or_agon_G_cAMP_plus_dfs,
    or_agon_G_cAMP_exclude_dfs,
    or_agon_G_cAMP_final_dfs,
    or_agon_G_cAMP_len_dfs,
) = categorize_ORs(
    targets='ORs',
    effect='agon',
    assay='G_cAMP',
    std_types=['IC50', 'EC50'],
    pattern=p_ago_G_cAMP,
    pattern_ex=p_ago_G_cAMP_ex,
)

target_chembl_id: CHEMBL233

Target: CHEMBL233

Effect: agon

Assay: G_cAMP

Pattern: (?i)camp

Pattern_ex: (?i)arrestin|gtp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric

Standard type: IC50

The shape of type_df is (1366, 31)

The shape of  is (79, 31)
#assay_desc:
count                                                    79
unique                                                   15
top       Agonist activity at human MOR expressed in HEK...
freq                                                     16
Name: assay_desc, dtype: object

#canonical_smiles:
count                                                    79
unique                                                   56
top       C[C@@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)NCC(=...
freq                                                      7
Name: canonical_smiles, dtype: object

The shape of plus_df is (6, 31)

The shape of exclude_df is (0, 31)

The shape of final_df is (85, 31)

##########################
Standard type: EC50

Th

In [10]:
# cat_datasets: GPCRs -- agonism data (IC50, EC50) from the cAMP assay
gpcr_type_dfs, gpcr_agon_G_cAMP_dfs, gpcr_agon_G_cAMP_len_dfs = categorize_GPCRs(
    targets='GPCRs',
    effect='agon',
    assay='G_cAMP',
    std_types=['IC50', 'EC50'],
    pattern=p_ago_G_cAMP,
    pattern_ex=p_ago_G_cAMP_ex,
)

Target: CHEMBL2327

Effect: agon

Assay: G_cAMP

Pattern: (?i)camp

Pattern_ex: (?i)arrestin|gtp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric

Standard type: IC50

The shape of type_df is (290, 31)

The shape of  is (0, 31)
#assay_desc:
count       0
unique      0
top       NaN
freq      NaN
Name: assay_desc, dtype: object

#canonical_smiles:
count       0
unique      0
top       NaN
freq      NaN
Name: canonical_smiles, dtype: object

##########################
Standard type: EC50

The shape of type_df is (5, 31)

The shape of  is (0, 31)
#assay_desc:
count       0
unique      0
top       NaN
freq      NaN
Name: assay_desc, dtype: object

#canonical_smiles:
count       0
unique      0
top       NaN
freq      NaN
Name: canonical_smiles, dtype: object

##########################
Target: CHEMBL3713916

Effect: agon

Assay: G_cAMP

Pattern: (?i)camp

Pattern_ex: (?i)arrestin|gtp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric

Standard type: IC50

The shape of type_df is (2, 

### Antagonism

Neither the IC50 nor the EC50 data from the G_cAMP assay are related to antagonism.

## G_Ca

### Agonism: EC50

In [11]:
# IP3/IP1 and Ca2+ assay: inclusion / exclusion regexes for agonism data.
# "+" is escaped so that the alternative matches the literal text "Ca2+".
# Unescaped it is a quantifier, and "ca2+" would match any "ca2" substring.
p_agon_G_Ca = r"(?i)calcium|ca2\+|IP1|IP3"
p_agon_G_Ca_ex = r"(?i)arrestin|gtp|camp|antagonis|inverse agonist|allosteric"

# cat_datasets: ORs -- agonism data (EC50) from the IP3/IP1 and Ca2+ assay
(
    or_type_dfs,
    or_agon_G_Ca_dfs,
    or_agon_G_Ca_plus_dfs,
    or_agon_G_Ca_exclude_dfs,
    or_agon_G_Ca_final_dfs,
    or_agon_G_Ca_len_dfs,
) = categorize_ORs(
    targets='ORs',
    effect='agon',
    assay='G_Ca',
    std_types=['EC50'],
    pattern=p_agon_G_Ca,
    pattern_ex=p_agon_G_Ca_ex,
)

target_chembl_id: CHEMBL233

Target: CHEMBL233

Effect: agon

Assay: G_Ca

Pattern: (?i)calcium|ca2+|IP1|IP3

Pattern_ex: (?i)arrestin|gtp|camp|antagonis|inverse agonist|allosteric

Standard type: EC50

The shape of type_df is (2296, 31)

The shape of  is (151, 31)
#assay_desc:
count                                                   151
unique                                                   14
top       Agonist activity at human MOR expressed in CHO...
freq                                                     35
Name: assay_desc, dtype: object

#canonical_smiles:
count                                                   151
unique                                                  140
top       NC(=O)[C@H](Cc1ccccc1)NC(=O)[C@H](Cc1ccccc1)NC...
freq                                                      5
Name: canonical_smiles, dtype: object

The shape of plus_df is (25, 31)

The shape of exclude_df is (0, 31)

The shape of final_df is (176, 31)

##########################
target_chembl_id:

In [12]:
# cat_datasets: GPCRs -- agonism data (EC50) from the IP3/IP1 and Ca2+ assay
gpcr_type_dfs, gpcr_agon_G_Ca_dfs, gpcr_agon_G_Ca_len_dfs = categorize_GPCRs(
    targets='GPCRs',
    effect='agon',
    assay='G_Ca',
    std_types=['EC50'],
    pattern=p_agon_G_Ca,
    pattern_ex=p_agon_G_Ca_ex,
)

Target: CHEMBL2327

Effect: agon

Assay: G_Ca

Pattern: (?i)calcium|ca2+|IP1|IP3

Pattern_ex: (?i)arrestin|gtp|camp|antagonis|inverse agonist|allosteric

Standard type: EC50

The shape of type_df is (5, 31)

The shape of  is (0, 31)
#assay_desc:
count       0
unique      0
top       NaN
freq      NaN
Name: assay_desc, dtype: object

#canonical_smiles:
count       0
unique      0
top       NaN
freq      NaN
Name: canonical_smiles, dtype: object

##########################
Target: CHEMBL3713916

Effect: agon

Assay: G_Ca

Pattern: (?i)calcium|ca2+|IP1|IP3

Pattern_ex: (?i)arrestin|gtp|camp|antagonis|inverse agonist|allosteric

Standard type: EC50

The shape of type_df is (2, 31)

The shape of  is (1, 31)
#assay_desc:
count                                                     1
unique                                                    1
top       Affinity Phenotypic Cellular interaction (FLIP...
freq                                                      1
Name: assay_desc, dtype: object

#c

### Antagonism: IC50

In [13]:
# IP3/IP1 and Ca2+ assay: inclusion / exclusion regexes for antagonism data.
# "+" is escaped so that the alternative matches the literal text "Ca2+".
# Unescaped it is a quantifier, and "ca2+" would match any "ca2" substring.
p_antag_G_Ca = r"(?i)calcium|ca2\+|IP1|IP3"
p_antag_G_Ca_ex = r"(?i)arrestin|gtp|camp|allosteric"


In [14]:
# cat_datasets: ORs
# not categorized: nearly no data points are available for the ORs

In [15]:
# cat_datasets: GPCRs -- antagonism data (IC50) from the IP3/IP1 and Ca2+ assay
gpcr_type_dfs, gpcr_antag_G_Ca_dfs, gpcr_antag_G_Ca_len_dfs = categorize_GPCRs(
    targets='GPCRs',
    effect='antag',
    assay='G_Ca',
    std_types=['IC50'],
    pattern=p_antag_G_Ca,
    pattern_ex=p_antag_G_Ca_ex,
)

Target: CHEMBL2327

Effect: antag

Assay: G_Ca

Pattern: (?i)calcium|ca2+|IP1|IP3

Pattern_ex: (?i)arrestin|gtp|camp|allosteric

Standard type: IC50

The shape of type_df is (290, 31)

The shape of  is (0, 31)
#assay_desc:
count       0
unique      0
top       NaN
freq      NaN
Name: assay_desc, dtype: object

#canonical_smiles:
count       0
unique      0
top       NaN
freq      NaN
Name: canonical_smiles, dtype: object

##########################
Target: CHEMBL3713916

Effect: antag

Assay: G_Ca

Pattern: (?i)calcium|ca2+|IP1|IP3

Pattern_ex: (?i)arrestin|gtp|camp|allosteric

Standard type: IC50

The shape of type_df is (2, 31)

The shape of  is (0, 31)
#assay_desc:
count       0
unique      0
top       NaN
freq      NaN
Name: assay_desc, dtype: object

#canonical_smiles:
count       0
unique      0
top       NaN
freq      NaN
Name: canonical_smiles, dtype: object

##########################
Target: CHEMBL5144

Effect: antag

Assay: G_Ca

Pattern: (?i)calcium|ca2+|IP1|IP3

Pattern_ex

## B_arrest

### Agonism: EC50

In [16]:
# G-protein independent functional assay: beta-arrestin recruitment.
# Inclusion / exclusion regexes for agonism data from this assay.
p_agon_B_arrest = r"(?i)arrest"
p_agon_B_arrest_ex = r"(?i)gtp|camp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric"

# cat_datasets: ORs -- agonism data (EC50)
(
    or_type_dfs,
    or_agon_B_arrest_dfs,
    or_agon_B_arrest_plus_dfs,
    or_agon_B_arrest_exclude_dfs,
    or_agon_B_arrest_final_dfs,
    or_agon_B_arrest_len_dfs,
) = categorize_ORs(
    targets='ORs',
    effect='agon',
    assay='B_arrest',
    std_types=['EC50'],
    pattern=p_agon_B_arrest,
    pattern_ex=p_agon_B_arrest_ex,
)

target_chembl_id: CHEMBL233

Target: CHEMBL233

Effect: agon

Assay: B_arrest

Pattern: (?i)arrest

Pattern_ex: (?i)gtp|camp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric

Standard type: EC50

The shape of type_df is (2296, 31)

The shape of  is (267, 31)
#assay_desc:
count                                                   267
unique                                                   24
top       Agonist activity at human mu opioid receptor e...
freq                                                     45
Name: assay_desc, dtype: object

#canonical_smiles:
count                                                   267
unique                                                  214
top       C[C@@H](NC(=O)[C@@H](N)Cc1ccc(O)cc1)C(=O)NCC(=...
freq                                                     10
Name: canonical_smiles, dtype: object

The shape of plus_df is (0, 31)

The shape of exclude_df is (0, 31)



The shape of final_df is (267, 31)

##########################
target_chembl_id: CHEMBL237

Target: CHEMBL237

Effect: agon

Assay: B_arrest

Pattern: (?i)arrest

Pattern_ex: (?i)gtp|camp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric

Standard type: EC50

The shape of type_df is (2062, 31)

The shape of  is (66, 31)
#assay_desc:
count                                                    66
unique                                                   17
top       Agonist activity at GFP-fused kappa opioid rec...
freq                                                     13
Name: assay_desc, dtype: object

#canonical_smiles:
count                                             66
unique                                            54
top       Clc1ccc(CSc2nnc(-c3ccccn3)n2Cc2ccco2)cc1Cl
freq                                               4
Name: canonical_smiles, dtype: object

The shape of plus_df is (0, 31)

The shape of exclude_df is (0, 31)

The shape of final_df is (66, 31)

###############

In [17]:
# cat_datasets: GPCRs -- agonism data (EC50) from the beta-arrestin assay
gpcr_type_dfs, gpcr_agon_B_arrest_dfs, gpcr_agon_B_arrest_len_dfs = categorize_GPCRs(
    targets='GPCRs',
    effect='agon',
    assay='B_arrest',
    std_types=['EC50'],
    pattern=p_agon_B_arrest,
    pattern_ex=p_agon_B_arrest_ex,
)

Target: CHEMBL2327

Effect: agon

Assay: B_arrest

Pattern: (?i)arrest

Pattern_ex: (?i)gtp|camp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric

Standard type: EC50

The shape of type_df is (5, 31)

The shape of  is (0, 31)
#assay_desc:
count       0
unique      0
top       NaN
freq      NaN
Name: assay_desc, dtype: object

#canonical_smiles:
count       0
unique      0
top       NaN
freq      NaN
Name: canonical_smiles, dtype: object

##########################
Target: CHEMBL3713916

Effect: agon

Assay: B_arrest

Pattern: (?i)arrest

Pattern_ex: (?i)gtp|camp|calcium|IP1|IP3|antagonis|inverse agonist|allosteric

Standard type: EC50

The shape of type_df is (2, 31)

The shape of  is (0, 31)
#assay_desc:
count       0
unique      0
top       NaN
freq      NaN
Name: assay_desc, dtype: object

#canonical_smiles:
count       0
unique      0
top       NaN
freq      NaN
Name: canonical_smiles, dtype: object

##########################
Target: CHEMBL5144

Effect: agon

Assay: B_arrest



### Antagonism: IC50

In [18]:
# G-protein independent functional assay: beta-arrestin recruitment.
# Inclusion / exclusion regexes for antagonism data from this assay.
p_antag_B_arrest = r"(?i)arrest"
p_antag_B_arrest_ex = r"(?i)gtp|camp|calcium|IP1|IP3|allosteric"

# cat_datasets: ORs -- antagonism data (IC50)
(
    or_type_dfs,
    or_antag_B_arrest_dfs,
    or_antag_B_arrest_plus_dfs,
    or_antag_B_arrest_exclude_dfs,
    or_antag_B_arrest_final_dfs,
    or_antag_B_arrest_len_dfs,
) = categorize_ORs(
    targets='ORs',
    effect='antag',
    assay='B_arrest',
    std_types=['IC50'],
    pattern=p_antag_B_arrest,
    pattern_ex=p_antag_B_arrest_ex,
)

target_chembl_id: CHEMBL233

Target: CHEMBL233

Effect: antag

Assay: B_arrest

Pattern: (?i)arrest

Pattern_ex: (?i)gtp|camp|calcium|IP1|IP3|allosteric

Standard type: IC50

The shape of type_df is (1366, 31)

The shape of  is (40, 31)
#assay_desc:
count                                                    40
unique                                                    3
top       Antagonist activity at GAL4-VP16-fused MOR (un...
freq                                                     26
Name: assay_desc, dtype: object

#canonical_smiles:
count                                                    40
unique                                                   40
top       Cc1ccc(S(=O)(=O)NCc2ccc(C(=O)NCCN(Cc3ccccc3)C(...
freq                                                      1
Name: canonical_smiles, dtype: object

The shape of plus_df is (0, 31)

The shape of exclude_df is (0, 31)

The shape of final_df is (40, 31)

##########################
target_chembl_id: CHEMBL237

Target: CHEMBL237



In [19]:
# cat_datasets: GPCRs -- antagonism data (IC50) from the beta-arrestin assay
gpcr_type_dfs, gpcr_antag_B_arrest_dfs, gpcr_antag_B_arrest_len_dfs = categorize_GPCRs(
    targets='GPCRs',
    effect='antag',
    assay='B_arrest',
    std_types=['IC50'],
    pattern=p_antag_B_arrest,
    pattern_ex=p_antag_B_arrest_ex,
)

Target: CHEMBL2327

Effect: antag

Assay: B_arrest

Pattern: (?i)arrest

Pattern_ex: (?i)gtp|camp|calcium|IP1|IP3|allosteric

Standard type: IC50

The shape of type_df is (290, 31)

The shape of  is (7, 31)
#assay_desc:
count                                                     7
unique                                                    1
top       Antagonist activity at human TACR2 expressed i...
freq                                                      7
Name: assay_desc, dtype: object

#canonical_smiles:
count                                                     7
unique                                                    7
top       CC[C@H](C)[C@@H](OC(=O)C#Cc1ccccc1)C(=O)N1CCC[...
freq                                                      1
Name: canonical_smiles, dtype: object

##########################
Target: CHEMBL3713916

Effect: antag

Assay: B_arrest

Pattern: (?i)arrest

Pattern_ex: (?i)gtp|camp|calcium|IP1|IP3|allosteric

Standard type: IC50

The shape of type_df is (2, 31)


# Data Visualization

## Create the dfs to store the stats

In [20]:
# Count tables returned by the categorize_ORs calls, one entry per category.
or_combine_len_dfs = [
    # binding
    or_bind_len_dfs,
    # agonism
    or_agon_G_GTP_len_dfs,
    or_agon_G_cAMP_len_dfs,
    or_agon_G_Ca_len_dfs,
    or_agon_B_arrest_len_dfs,
    # antagonism
    or_antag_G_GTP_len_dfs,
    or_antag_B_arrest_len_dfs,
]

In [21]:
# Stack every count table of every OR category into one frame.
# All tables are collected first and concatenated in a single call: growing a
# DataFrame with pd.concat inside a loop re-copies the accumulated rows on every
# iteration and starts from an empty frame.
final_len_df = pd.concat(
    [len_df for len_dfs in or_combine_len_dfs for len_df in len_dfs.values()],
    axis=0,
    sort=False,
)

# remove the rows where 'std_type' is 'Ke' or 'Kb'
final_len_df = final_len_df[~final_len_df['std_type'].isin(['Ke', 'Kb'])]

final_len_df
# type_df = final_df + final_out_df
# final_df = effect_type_df + plus_df - exclude_df

Unnamed: 0,target,effect,assay,std_type,type_df,effect_type_df,plus_df,exclude_df,final_df,final_out_df
0,CHEMBL233,bind,RBA,Ki,5682,5418,139,0,5557,125
0,CHEMBL233,bind,RBA,IC50,1366,668,19,0,687,679
0,CHEMBL237,bind,RBA,Ki,4857,4535,140,29,4646,211
0,CHEMBL237,bind,RBA,IC50,1049,478,7,0,485,564
0,CHEMBL236,bind,RBA,Ki,5152,4933,65,1,4997,155
0,CHEMBL236,bind,RBA,IC50,1316,930,34,0,964,352
0,CHEMBL2014,bind,RBA,Ki,1381,1320,12,0,1332,49
0,CHEMBL2014,bind,RBA,IC50,747,496,11,0,507,240
0,CHEMBL233,agon,G_GTP,EC50,2296,1067,80,23,1124,1172
0,CHEMBL237,agon,G_GTP,EC50,2062,1365,103,0,1468,594


## Effectwise

### donut plot for effect-wise activity space

In [23]:
def effect_activity_space(target='CHEMBL233', startangle=-90, fig_format='pdf'):
    """Draw and save a nested donut chart of one target's data points per effect.

    The inner ring has one wedge per effect and the outer ring one wedge per
    (effect, assay, std_type) combination. Wedge sizes are the `final_df`
    values of the notebook-level `final_len_df` for the requested target.
    The figure is written to
    `<CAT_FIG_DIR>/<target>/<target>_effect-wise_activity_space_pie.<fig_format>`.

    Parameters
    ----------
    target : str
        Value of the `target` column of `final_len_df` to plot.
    startangle : float
        Start angle passed to both pie calls so that the two rings stay aligned.
    fig_format : str
        File extension of the saved figure.

    Returns
    -------
    pandas.DataFrame
        Pivot table indexed by (effect, assay, std_type) holding the `final_df`
        values and the `assay-type` label column.
    """
    target_len_df = final_len_df[final_len_df['target'] == target]
    # extract columns needed for plotting
    target_len_df = target_len_df[['effect', 'assay', 'std_type', 'final_df']]
    # create the pivot table; its index is sorted, so rows of one effect are contiguous
    target_len_df = pd.pivot_table(target_len_df, index=['effect', 'assay', 'std_type'], values=['final_df'])
    # make a new column for labeling the minor data
    target_len_df['assay-type'] = target_len_df.index.get_level_values('std_type') + ': ' + target_len_df.index.get_level_values('assay')

    ########## Plot the nested pie chart #####################
    # create a figure and subplots
    fig, ax = plt.subplots(figsize=(10, 6))

    width = 0.3

    # data for pie charts
    # Major category values = sum of minor category values
    major_data = target_len_df.groupby('effect')['final_df'].sum()
    # take the labels from the aggregated series so they always line up with its values
    major_labels = list(major_data.index)
    # Minor category values
    minor_data = target_len_df['final_df']
    minor_labels = target_len_df['assay-type'] + '(' + target_len_df['final_df'].astype(str) + ')'

    # Colors are looked up per effect instead of by position, so an effect keeps
    # its color even when another effect is absent for this target.
    effect_colors = {
        'bind': '#b0e3e6',
        'agon': '#f5426c',
        'antag': '#4278f5',
    }
    fallback_color = '#cccccc'  # used for an effect that has no entry above
    major_colors = [effect_colors.get(effect, fallback_color) for effect in major_labels]

    # Outer ring: each effect gets exactly as many shades of its own color as it
    # has rows. A fixed-length color list drifts out of step with the wedges as
    # soon as the number of rows per effect differs from the hard-coded counts,
    # which colors wedges with another effect's palette.
    row_effects = target_len_df.index.get_level_values('effect')
    minor_colors = []
    for effect in major_labels:
        n_wedges = int((row_effects == effect).sum())
        # light -> dark; with a single wedge only the full base color is used
        shades = sns.light_palette(effect_colors.get(effect, fallback_color), n_colors=max(n_wedges, 2))
        minor_colors.extend(shades[-n_wedges:])

    # Draw pies
    hfont = {'fontname':'Times New Roman'}
    # pie for major category - 'effect', put in the inner circle
    ax.pie(major_data,
        radius=1-width,
        colors=major_colors,
        labels=major_labels,
        startangle=startangle,
        textprops={'fontsize': 13, 'fontname':'Times New Roman', 'fontweight':'bold'},
        wedgeprops=dict(width=width, edgecolor='w'),
        labeldistance=0.6)

    # pie for minor category - 'assay', put in the outer circle
    ax.pie(minor_data,
        radius=1,
        colors=minor_colors,
        labels=minor_labels,
        startangle=startangle,
        textprops={'fontsize': 11, 'fontname':'Times New Roman'},
        wedgeprops=dict(width=width, edgecolor='w'),
        labeldistance=1)

    # Set a title
    ax.set_title(f'{target}: effect-wise activity space', fontsize=16, fontweight='bold', **hfont)

    # save the figure
    file_path = os.path.join(CAT_FIG_DIR, target)
    mkdirs(file_path)

    fig.savefig(os.path.join(file_path, f"{target}_effect-wise_activity_space_pie.{fig_format}"), dpi=300, bbox_inches='tight')
    # release the figure; pyplot otherwise keeps every figure created by repeated calls alive
    plt.close(fig)

    return target_len_df

In [24]:
# Draw the effect-wise donut chart of each opioid receptor; every target has
# its own start angle.
_effect_plot_args = [
    ('CHEMBL233', -90),
    ('CHEMBL237', -73),
    ('CHEMBL236', -70),
    ('CHEMBL2014', -50),
]
mor_effect_len_df, kor_effect_len_df, dor_effect_len_df, nor_effect_len_df = [
    effect_activity_space(target=chembl_id, startangle=angle, fig_format='pdf')
    for chembl_id, angle in _effect_plot_args
]

findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times Ne

In [25]:
# Show the table returned by effect_activity_space for CHEMBL233.
mor_effect_len_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,final_df,assay-type
effect,assay,std_type,Unnamed: 3_level_1,Unnamed: 4_level_1
agon,B_arrest,EC50,267,EC50: B_arrest
agon,G_Ca,EC50,176,EC50: G_Ca
agon,G_GTP,EC50,1124,EC50: G_GTP
agon,G_cAMP,EC50,587,EC50: G_cAMP
agon,G_cAMP,IC50,85,IC50: G_cAMP
antag,B_arrest,IC50,40,IC50: B_arrest
antag,G_GTP,IC50,377,IC50: G_GTP
antag,G_GTP,Ki,63,Ki: G_GTP
bind,RBA,IC50,687,IC50: RBA
bind,RBA,Ki,5557,Ki: RBA


In [26]:
# Show the table returned by effect_activity_space for CHEMBL237.
kor_effect_len_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,final_df,assay-type
effect,assay,std_type,Unnamed: 3_level_1,Unnamed: 4_level_1
agon,B_arrest,EC50,66,EC50: B_arrest
agon,G_Ca,EC50,86,EC50: G_Ca
agon,G_GTP,EC50,1468,EC50: G_GTP
agon,G_cAMP,EC50,346,EC50: G_cAMP
agon,G_cAMP,IC50,36,IC50: G_cAMP
antag,B_arrest,IC50,73,IC50: B_arrest
antag,G_GTP,IC50,212,IC50: G_GTP
antag,G_GTP,Ki,57,Ki: G_GTP
bind,RBA,IC50,485,IC50: RBA
bind,RBA,Ki,4646,Ki: RBA


In [27]:
# Show the table returned by effect_activity_space for CHEMBL236.
dor_effect_len_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,final_df,assay-type
effect,assay,std_type,Unnamed: 3_level_1,Unnamed: 4_level_1
agon,B_arrest,EC50,53,EC50: B_arrest
agon,G_Ca,EC50,45,EC50: G_Ca
agon,G_GTP,EC50,922,EC50: G_GTP
agon,G_cAMP,EC50,202,EC50: G_cAMP
agon,G_cAMP,IC50,33,IC50: G_cAMP
antag,B_arrest,IC50,38,IC50: B_arrest
antag,G_GTP,IC50,197,IC50: G_GTP
antag,G_GTP,Ki,91,Ki: G_GTP
bind,RBA,IC50,964,IC50: RBA
bind,RBA,Ki,4997,Ki: RBA


In [28]:
# Show the table returned by effect_activity_space for CHEMBL2014.
nor_effect_len_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,final_df,assay-type
effect,assay,std_type,Unnamed: 3_level_1,Unnamed: 4_level_1
agon,B_arrest,EC50,47,EC50: B_arrest
agon,G_Ca,EC50,66,EC50: G_Ca
agon,G_GTP,EC50,314,EC50: G_GTP
agon,G_cAMP,EC50,37,EC50: G_cAMP
agon,G_cAMP,IC50,80,IC50: G_cAMP
antag,B_arrest,IC50,0,IC50: B_arrest
antag,G_GTP,IC50,157,IC50: G_GTP
antag,G_GTP,Ki,6,Ki: G_GTP
bind,RBA,IC50,507,IC50: RBA
bind,RBA,Ki,1332,Ki: RBA


## Typewise

### donut plot for type-wise activity space

In [29]:
def type_activity_space(target='CHEMBL233', startangle=-90, fig_format='pdf'):
    """Draw and save a nested donut chart of the type-wise activity space of one target.

    The inner ring has one wedge per ``std_type``; the outer ring has one wedge
    per (std_type, effect, assay) row, labelled ``effect: assay(value)``.
    Wedge colours are derived from the std_types actually present, so every
    outer wedge is a shade of its own inner wedge's colour.

    Parameters
    ----------
    target : str
        Value matched against the ``target`` column of the notebook-level
        ``final_len_df``.
    startangle : float
        Start angle handed to both ``ax.pie`` calls so the two rings stay aligned.
    fig_format : str
        File extension of the saved figure.

    Returns
    -------
    pandas.DataFrame
        Pivot table indexed by (std_type, effect, assay) with the columns
        ``final_df`` and ``effect-assay``.

    Notes
    -----
    The figure is written to
    ``CAT_FIG_DIR/<target>/<target>_type-wise_activity_space_pie.<fig_format>``
    and closed afterwards. ``pd.pivot_table`` is called with its default
    aggregation (mean), so duplicated (std_type, effect, assay) keys would be
    averaged rather than summed.
    """
    target_len_df = final_len_df[final_len_df['target'] == target]
    # extract columns needed for plotting
    target_len_df = target_len_df[['effect', 'assay', 'std_type', 'final_df']]
    # create the pivot table
    target_len_df = pd.pivot_table(target_len_df, index=['std_type', 'effect','assay'], values=['final_df'])
    # make a new column for labeling the minor data
    target_len_df['effect-assay'] = target_len_df.index.get_level_values('effect') + ': ' + target_len_df.index.get_level_values('assay')

    ##########Plot the nested pie chart#####################
    # create a figure and subplots
    fig, ax = plt.subplots(figsize=(10, 6))

    width = 0.3

    # data for pie charts
    # Major category values = sum of minor category values
    major_data = target_len_df.groupby('std_type')['final_df'].sum()
    # take the labels from the same Series as the values so they cannot get out of step
    major_labels = major_data.index
    # Minor category values
    minor_data = target_len_df['final_df']
    minor_labels = target_len_df['effect-assay'] + '(' + target_len_df['final_df'].astype(str) + ')'

    # base colour per std_type, looked up case-insensitively
    # NOTE(review): the 'kb' / 'ke' keys assume those are the std_type spellings — confirm
    base_colors = {
        'ec50': '#b0daff',
        'ic50': '#19a7ce',
        'kb': '#146c94',
        'ke': '#164b60',
        'ki': '#c4b0ff', #'#e5beec', #f9f54b
    }
    fallback_color = '#bbbbbb'  # used for any std_type without an entry above

    def base_color_of(std_type):
        return base_colors.get(str(std_type).lower(), fallback_color)

    # Colours are matched to the std_types that are really present. A fixed
    # colour list is consumed positionally by ax.pie, which mis-colours the
    # wedges whenever a std_type (e.g. Kb, Ke) is absent from the data.
    major_colors = [base_color_of(std_type) for std_type in major_labels]

    # number of outer wedges belonging to each std_type
    row_types = list(target_len_df.index.get_level_values('std_type'))
    n_rows_of = {}
    for std_type in row_types:
        n_rows_of[std_type] = n_rows_of.get(std_type, 0) + 1

    # light -> full colour ramp per std_type; a single wedge gets the full colour
    # because a one-colour light_palette only yields the near-white end
    shades_of = {}
    for std_type, n_rows in n_rows_of.items():
        color = base_color_of(std_type)
        if n_rows == 1:
            shades_of[std_type] = [color]
        else:
            shades_of[std_type] = list(sns.light_palette(color, n_colors=n_rows))

    # walk the rows in plotting order and hand out the next shade of their std_type
    minor_colors = []
    n_used = {}
    for std_type in row_types:
        position = n_used.get(std_type, 0)
        minor_colors.append(shades_of[std_type][position])
        n_used[std_type] = position + 1

    # Draw pies
    hfont = {'fontname':'Times New Roman'}
    # pie for major category - 'std_type', put in the inner circle
    ax.pie(major_data,
        radius=1-width,
        colors=major_colors,
        labels=major_labels,
        startangle=startangle,
        textprops={'fontsize': 11, 'fontweight':'bold'},
        wedgeprops=dict(width=width, edgecolor='w'),
        labeldistance=0.65)
    # pie for minor category - 'effect: assay', put in the outer circle
    ax.pie(minor_data,
        radius=1,
        colors=minor_colors,
        labels=minor_labels,
        startangle=startangle,
        textprops={'fontsize': 11, 'fontname':'Times New Roman'},
        wedgeprops=dict(width=width, edgecolor='w'),
        labeldistance=1)

    # Set a title
    ax.set_title(f'{target}: type-wise activity space', fontsize=16, fontweight='bold', **hfont)

    # save the figure
    file_path = os.path.join(CAT_FIG_DIR, target)
    mkdirs(file_path)

    fig.savefig(os.path.join(file_path, f"{target}_type-wise_activity_space_pie.{fig_format}"), dpi=300, bbox_inches='tight')
    # release the figure; otherwise every call leaves one more open figure behind
    plt.close(fig)
    print(f"Figure saved in {file_path}")

    return target_len_df

In [30]:
# One type-wise donut plot per target; each target has its own start angle.
# The returned pivot tables are kept for inspection in the cells below.
mor_type_len_df, kor_type_len_df, dor_type_len_df, nor_type_len_df = [
    type_activity_space(target=chembl_id, startangle=angle, fig_format='pdf')
    for chembl_id, angle in (
        ('CHEMBL233', -90),
        ('CHEMBL237', -103),
        ('CHEMBL236', -101),
        ('CHEMBL2014', -100),
    )
]

findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times Ne

Figure saved in /storage/homefs/yc24j783/datacat4ml/datacat4ml/Figures/data_prep/data_categorize/CHEMBL233
Figure saved in /storage/homefs/yc24j783/datacat4ml/datacat4ml/Figures/data_prep/data_categorize/CHEMBL237


findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times New Roman' not found.
findfont: Font family 'Times Ne

Figure saved in /storage/homefs/yc24j783/datacat4ml/datacat4ml/Figures/data_prep/data_categorize/CHEMBL236
Figure saved in /storage/homefs/yc24j783/datacat4ml/datacat4ml/Figures/data_prep/data_categorize/CHEMBL2014


In [31]:
# Display the pivot table returned by type_activity_space(target='CHEMBL233').
mor_type_len_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,final_df,effect-assay
std_type,effect,assay,Unnamed: 3_level_1,Unnamed: 4_level_1
EC50,agon,B_arrest,267,agon: B_arrest
EC50,agon,G_Ca,176,agon: G_Ca
EC50,agon,G_GTP,1124,agon: G_GTP
EC50,agon,G_cAMP,587,agon: G_cAMP
IC50,agon,G_cAMP,85,agon: G_cAMP
IC50,antag,B_arrest,40,antag: B_arrest
IC50,antag,G_GTP,377,antag: G_GTP
IC50,bind,RBA,687,bind: RBA
Ki,antag,G_GTP,63,antag: G_GTP
Ki,bind,RBA,5557,bind: RBA


In [32]:
# Display the pivot table returned by type_activity_space(target='CHEMBL237').
kor_type_len_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,final_df,effect-assay
std_type,effect,assay,Unnamed: 3_level_1,Unnamed: 4_level_1
EC50,agon,B_arrest,66,agon: B_arrest
EC50,agon,G_Ca,86,agon: G_Ca
EC50,agon,G_GTP,1468,agon: G_GTP
EC50,agon,G_cAMP,346,agon: G_cAMP
IC50,agon,G_cAMP,36,agon: G_cAMP
IC50,antag,B_arrest,73,antag: B_arrest
IC50,antag,G_GTP,212,antag: G_GTP
IC50,bind,RBA,485,bind: RBA
Ki,antag,G_GTP,57,antag: G_GTP
Ki,bind,RBA,4646,bind: RBA


In [33]:
# Display the pivot table returned by type_activity_space(target='CHEMBL236').
dor_type_len_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,final_df,effect-assay
std_type,effect,assay,Unnamed: 3_level_1,Unnamed: 4_level_1
EC50,agon,B_arrest,53,agon: B_arrest
EC50,agon,G_Ca,45,agon: G_Ca
EC50,agon,G_GTP,922,agon: G_GTP
EC50,agon,G_cAMP,202,agon: G_cAMP
IC50,agon,G_cAMP,33,agon: G_cAMP
IC50,antag,B_arrest,38,antag: B_arrest
IC50,antag,G_GTP,197,antag: G_GTP
IC50,bind,RBA,964,bind: RBA
Ki,antag,G_GTP,91,antag: G_GTP
Ki,bind,RBA,4997,bind: RBA


In [34]:
# Display the pivot table returned by type_activity_space(target='CHEMBL2014').
nor_type_len_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,final_df,effect-assay
std_type,effect,assay,Unnamed: 3_level_1,Unnamed: 4_level_1
EC50,agon,B_arrest,47,agon: B_arrest
EC50,agon,G_Ca,66,agon: G_Ca
EC50,agon,G_GTP,314,agon: G_GTP
EC50,agon,G_cAMP,37,agon: G_cAMP
IC50,agon,G_cAMP,80,agon: G_cAMP
IC50,antag,B_arrest,0,antag: B_arrest
IC50,antag,G_GTP,157,antag: G_GTP
IC50,bind,RBA,507,bind: RBA
Ki,antag,G_GTP,6,antag: G_GTP
Ki,bind,RBA,1332,bind: RBA
